Update app.py
app.py CHANGED
@@ -199,25 +199,32 @@ def run_task(text, language, task):
         except Exception as e2:
             return None, None, f"Translation error: {e2}"
     elif language == "Tibetan":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        try:
+            # 1) Segment Tibetan text with Botok
+            tokens = [t.text for t in wt.tokenize(text)]
+            segmented_text = " ".join(tokens)
+            print("Segmented Tibetan:", segmented_text)
+
+
+            # 2) Set source and target languages
+            tokenizer.src_lang = "bo_CN"
+            forced_bos = tokenizer.lang_code_to_id["en_XX"]
+
+            # 3) Translate using MBART-50
+            inputs = tokenizer(segmented_text, return_tensors="pt")
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=256,
+                forced_bos_token_id=forced_bos
+            )
+            # 4) Decode output
+            english_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            print("Translation output:", english_text)
+
+            return None, None, english_text
+        except Exception as e:
+            return None, None, f"Tibetan translation error: {e}"
         #translated_text = translate_with_quota(text, src_lang="bo", tgt_lang="en")
         #return None, None, translated_text
     else:
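For context, the new branch relies on three globals defined elsewhere in app.py: a Botok word tokenizer (wt) and an mBART-50 tokenizer/model pair. Below is a minimal standalone sketch of the same pipeline, assuming botok and transformers are installed. The checkpoint name is a placeholder: stock mBART-50 does not include Tibetan among its 50 languages, so the Space presumably loads a fine-tune whose tokenizer adds the "bo_CN" code.

# Hypothetical module-level setup that the diff's run_task() branch assumes.
# MODEL_NAME is a placeholder; a Tibetan fine-tune of mBART-50 is assumed,
# since the stock checkpoint's language list has no "bo_CN" code.
from botok import WordTokenizer
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

wt = WordTokenizer()  # Botok segmenter for Tibetan script

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"  # placeholder checkpoint
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)

def translate_tibetan(text: str) -> str:
    # Step 1: segment Tibetan into space-separated tokens, as the diff does
    segmented = " ".join(t.text for t in wt.tokenize(text))
    # Step 2: tag the source language before encoding
    tokenizer.src_lang = "bo_CN"
    # Step 3: encode and generate, forcing English as the target language
    inputs = tokenizer(segmented, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    )
    # Step 4: strip special tokens and return the translation
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Calling translate_tibetan(text) reproduces what the new elif branch does inline, minus the debug prints and the (None, None, text) return shape that run_task's three outputs expect.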