Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,8 @@ import datetime
|
|
| 9 |
import tempfile
|
| 10 |
import soundfile as sf
|
| 11 |
import os
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
# --- Initiation ---
|
|
@@ -108,6 +110,40 @@ def call_microsoft_translate(text, src_lang, tgt_lang):
|
|
| 108 |
# TODO: implement Microsoft API call
|
| 109 |
return "Microsoft translated text"
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
def run_task(text, language, task):
|
| 113 |
# Always return: [audio_numpy, audio_filepath, text_output]
|
|
@@ -199,7 +235,11 @@ def run_task(text, language, task):
|
|
| 199 |
return None, None, segmented_text
|
| 200 |
#return None, None, xlm_tokenizer.tokenize(text)
|
| 201 |
elif language == "Sanskrit":
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
else:
|
| 204 |
return None, None, "Unsupported language"
|
| 205 |
|
|
@@ -207,6 +247,12 @@ def run_task(text, language, task):
|
|
| 207 |
return None, None, "Unsupported task"
|
| 208 |
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
# --- Gradio Interface ---
|
| 212 |
iface = gr.Interface(
|
|
|
|
| 9 |
import tempfile
|
| 10 |
import soundfile as sf
|
| 11 |
import os
|
| 12 |
+
import re
|
| 13 |
+
|
| 14 |
|
| 15 |
|
| 16 |
# --- Initiation ---
|
|
|
|
| 110 |
# TODO: implement Microsoft API call
|
| 111 |
return "Microsoft translated text"
|
| 112 |
|
| 113 |
+
def safe_tokenize_sanskrit(text):
    """
    Return a list of tokens for Sanskrit (Devanagari) text.

    Tokenizers are tried in decreasing order of preference:
      1. IndicTrans2  (``indictrans_tokenizer``)
      2. MBART-50     (``tokenizer`` — SentencePiece subwords)
      3. XLM-R        (``xlm_tokenizer``)
    and finally a plain regex split on whitespace and common punctuation.

    The broad ``except Exception`` guards are deliberate: a tokenizer that is
    missing (NameError), failed to load, or chokes on the input simply yields
    to the next candidate instead of crashing the Gradio app.
    """
    # NOTE(review): the three tokenizer globals are assumed to be initialized
    # near the top of app.py — confirm against the "--- Initiation ---" section.
    for attempt in (
        lambda t: indictrans_tokenizer.tokenize(t),
        lambda t: tokenizer.tokenize(t),
        lambda t: xlm_tokenizer.tokenize(t),
    ):
        try:
            return attempt(text)
        except Exception:
            continue

    # Regex fallback: split on whitespace and punctuation (including the
    # Devanagari dandas । and ॥, en/em dashes, brackets, and quotes).  The
    # capturing group keeps delimiters as tokens; the strip() filter drops
    # pure-whitespace and empty pieces.  (The original diff rendered this raw
    # string broken across several lines — a syntax error — reconstructed here
    # onto one line.)
    pattern = r"(\s+|[—–\-॥।,.;:!?()\[\]{}\"'])"
    return [tok for tok in re.split(pattern, text) if tok.strip()]
|
| 145 |
+
|
| 146 |
+
|
| 147 |
|
| 148 |
def run_task(text, language, task):
|
| 149 |
# Always return: [audio_numpy, audio_filepath, text_output]
|
|
|
|
| 235 |
return None, None, segmented_text
|
| 236 |
#return None, None, xlm_tokenizer.tokenize(text)
|
| 237 |
elif language == "Sanskrit":
|
| 238 |
+
raw_tokens = safe_tokenize_sanskrit(text)
|
| 239 |
+
# Return a human-readable string; if you prefer list, wrap with str(tokens)
|
| 240 |
+
tokens = normalize_sp_tokens(raw_tokens)
|
| 241 |
+
return None, None, " ".join(tokens)
|
| 242 |
+
#return None, None, indictrans_tokenizer.tokenize(text)
|
| 243 |
else:
|
| 244 |
return None, None, "Unsupported language"
|
| 245 |
|
|
|
|
| 247 |
return None, None, "Unsupported task"
|
| 248 |
|
| 249 |
|
| 250 |
+
def normalize_sp_tokens(tokens):
    """
    Strip SentencePiece word-boundary markers ("▁") from each token.

    A token consisting only of the marker becomes empty after stripping and is
    dropped, so joining the result with " " cannot produce doubled spaces —
    this is the "collapse spaces" behavior the original comment promised but
    the list comprehension did not implement.
    """
    stripped = (t.replace("▁", "") for t in tokens)
    return [t for t in stripped if t]
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
|
| 256 |
|
| 257 |
# --- Gradio Interface ---
|
| 258 |
iface = gr.Interface(
|