Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,18 +15,6 @@ import os
|
|
| 15 |
# Initialize Botok
|
| 16 |
wt = WordTokenizer()
|
| 17 |
|
| 18 |
-
# Load TibetaMind (Tibetan → Chinese)
|
| 19 |
-
tibetamind_tokenizer = AutoTokenizer.from_pretrained("tibetamind/tibetan-chinese")
|
| 20 |
-
tibetamind_model = AutoModelForSeq2SeqLM.from_pretrained("tibetamind/tibetan-chinese")
|
| 21 |
-
|
| 22 |
-
# MBART for Chinese → English
|
| 23 |
-
translation_tokenizer = MBart50TokenizerFast.from_pretrained(
|
| 24 |
-
"facebook/mbart-large-50-many-to-many-mmt", use_fast=False
|
| 25 |
-
)
|
| 26 |
-
translation_model = MBartForConditionalGeneration.from_pretrained(
|
| 27 |
-
"facebook/mbart-large-50-many-to-many-mmt"
|
| 28 |
-
)
|
| 29 |
-
|
| 30 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 31 |
|
| 32 |
try:
|
|
@@ -91,6 +79,11 @@ def translate_with_quota(text, src_lang="bo", tgt_lang="en"):
|
|
| 91 |
tts_tibetan = pipeline("text-to-speech", model="facebook/mms-tts-bod")
|
| 92 |
#tts_sanskrit = pipeline("text-to-speech", model="facebook/mms-tts-san")
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
# Use the slow tokenizer to avoid the bug
|
| 95 |
##translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt",use_fast=False)
|
| 96 |
#translation_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
|
|
@@ -161,22 +154,17 @@ def run_task(text, language, task):
|
|
| 161 |
# 1) Segment Tibetan text with Botok
|
| 162 |
tokens = [t.text for t in wt.tokenize(text)]
|
| 163 |
segmented_text = " ".join(tokens)
|
| 164 |
-
|
| 165 |
-
#
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
**inputs,
|
| 176 |
-
max_new_tokens=256,
|
| 177 |
-
forced_bos_token_id=forced_bos
|
| 178 |
-
)
|
| 179 |
-
english_text = translation_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 180 |
|
| 181 |
return None, None, english_text
|
| 182 |
#translated_text = translate_with_quota(text, src_lang="bo", tgt_lang="en")
|
|
@@ -195,6 +183,9 @@ def run_task(text, language, task):
|
|
| 195 |
else:
|
| 196 |
return None, None, "Unsupported task"
|
| 197 |
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
# --- Gradio Interface ---
|
| 200 |
iface = gr.Interface(
|
|
|
|
| 15 |
# Initialize Botok
|
| 16 |
wt = WordTokenizer()
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 19 |
|
| 20 |
try:
|
|
|
|
| 79 |
tts_tibetan = pipeline("text-to-speech", model="facebook/mms-tts-bod")
|
| 80 |
#tts_sanskrit = pipeline("text-to-speech", model="facebook/mms-tts-san")
|
| 81 |
|
| 82 |
+
# Load MBART-50
|
| 83 |
+
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", use_fast=False)
|
| 84 |
+
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
# Use the slow tokenizer to avoid the bug
|
| 88 |
##translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt",use_fast=False)
|
| 89 |
#translation_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
|
|
|
|
| 154 |
# 1) Segment Tibetan text with Botok
|
| 155 |
tokens = [t.text for t in wt.tokenize(text)]
|
| 156 |
segmented_text = " ".join(tokens)
|
| 157 |
+
|
| 158 |
+
# Set source and target languages
|
| 159 |
+
tokenizer.src_lang = "bo_CN"
|
| 160 |
+
forced_bos = tokenizer.lang_code_to_id["en_XX"]
|
| 161 |
+
|
| 162 |
+
# Translate
|
| 163 |
+
inputs = tokenizer(text, return_tensors="pt")
|
| 164 |
+
outputs = model.generate(**inputs, max_new_tokens=256, forced_bos_token_id=forced_bos)
|
| 165 |
+
english_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 166 |
+
|
| 167 |
+
print(english_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
return None, None, english_text
|
| 170 |
#translated_text = translate_with_quota(text, src_lang="bo", tgt_lang="en")
|
|
|
|
| 183 |
else:
|
| 184 |
return None, None, "Unsupported task"
|
| 185 |
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
|
| 189 |
|
| 190 |
# --- Gradio Interface ---
|
| 191 |
iface = gr.Interface(
|