tsuching committed on
Commit
28b6955
·
verified ·
1 Parent(s): 20eeed2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -1
app.py CHANGED
@@ -9,6 +9,8 @@ import datetime
9
  import tempfile
10
  import soundfile as sf
11
  import os
 
 
12
 
13
 
14
  # --- Initiation ---
@@ -108,6 +110,40 @@ def call_microsoft_translate(text, src_lang, tgt_lang):
108
  # TODO: implement Microsoft API call
109
  return "Microsoft translated text"
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  def run_task(text, language, task):
113
  # Always return: [audio_numpy, audio_filepath, text_output]
@@ -199,7 +235,11 @@ def run_task(text, language, task):
199
  return None, None, segmented_text
200
  #return None, None, xlm_tokenizer.tokenize(text)
201
  elif language == "Sanskrit":
202
- return None, None, indictrans_tokenizer.tokenize(text)
 
 
 
 
203
  else:
204
  return None, None, "Unsupported language"
205
 
@@ -207,6 +247,12 @@ def run_task(text, language, task):
207
  return None, None, "Unsupported task"
208
 
209
 
 
 
 
 
 
 
210
 
211
  # --- Gradio Interface ---
212
  iface = gr.Interface(
 
9
  import tempfile
10
  import soundfile as sf
11
  import os
12
+ import re
13
+
14
 
15
 
16
  # --- Initiation ---
 
110
  # TODO: implement Microsoft API call
111
  return "Microsoft translated text"
112
 
113
def safe_tokenize_sanskrit(text):
    """Tokenize Sanskrit (Devanagari) text, degrading gracefully.

    Tries tokenizers in order of preference — IndicTrans2, then MBART-50,
    then XLM-R — and falls back to a plain regex split when none is
    usable. A tokenizer that raises (including one whose global was never
    initialized, which raises NameError) is simply skipped.

    Parameters
    ----------
    text : str
        Input text, typically in Devanagari script.

    Returns
    -------
    list[str]
        Tokens: subwords when a model tokenizer succeeds, otherwise
        whitespace/punctuation splits from the regex fallback.
    """
    # 1) IndicTrans2 tokenizer (no model weights needed for .tokenize).
    try:
        return indictrans_tokenizer.tokenize(text)
    except Exception:
        pass

    # 2) MBART-50 tokenizer (SentencePiece; yields "▁"-prefixed subwords).
    try:
        return tokenizer.tokenize(text)
    except Exception:
        pass

    # 3) XLM-R tokenizer.
    try:
        return xlm_tokenizer.tokenize(text)
    except Exception:
        pass

    # 4) Regex fallback: split on whitespace and common punctuation,
    # including the Devanagari danda (।) and double danda (॥). The
    # capturing group keeps delimiters as tokens; whitespace-only
    # pieces are dropped.
    # NOTE: the broad `except Exception` handlers above deliberately
    # swallow NameError too, so this path also covers the case where
    # none of the tokenizer globals exist.
    pattern = r"(\s+|[—–\-॥।,.;:!?()\[\]{}\"'])"
    return [tok for tok in re.split(pattern, text) if tok.strip()]
145
+
146
+
147
 
148
  def run_task(text, language, task):
149
  # Always return: [audio_numpy, audio_filepath, text_output]
 
235
  return None, None, segmented_text
236
  #return None, None, xlm_tokenizer.tokenize(text)
237
  elif language == "Sanskrit":
238
+ raw_tokens = safe_tokenize_sanskrit(text)
239
+ # Return a human-readable string; if you prefer list, wrap with str(tokens)
240
+ tokens = normalize_sp_tokens(raw_tokens)
241
+ return None, None, " ".join(tokens)
242
+ #return None, None, indictrans_tokenizer.tokenize(text)
243
  else:
244
  return None, None, "Unsupported language"
245
 
 
247
  return None, None, "Unsupported task"
248
 
249
 
250
def normalize_sp_tokens(tokens):
    """Strip SentencePiece word-boundary markers from *tokens*.

    SentencePiece prefixes word-initial subwords with "▁" (U+2581).
    Removing the marker can leave a token empty (a token that was a
    bare "▁"); such empties are dropped so that a caller joining with
    " " does not emit doubled spaces — previously they were kept.

    Parameters
    ----------
    tokens : list[str]
        Raw tokens as produced by a SentencePiece-based tokenizer.

    Returns
    -------
    list[str]
        Tokens with "▁" markers removed and empty tokens discarded.
    """
    stripped = (t.replace("▁", "") for t in tokens)
    return [t for t in stripped if t]
253
+
254
+
255
+
256
 
257
  # --- Gradio Interface ---
258
  iface = gr.Interface(