tsuching committed on
Commit
79e22d1
·
verified ·
1 Parent(s): 498b332

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -28
app.py CHANGED
@@ -15,18 +15,6 @@ import os
15
  # Initialize Botok
16
  wt = WordTokenizer()
17
 
18
- # Load TibetaMind (Tibetan → Chinese)
19
- tibetamind_tokenizer = AutoTokenizer.from_pretrained("tibetamind/tibetan-chinese")
20
- tibetamind_model = AutoModelForSeq2SeqLM.from_pretrained("tibetamind/tibetan-chinese")
21
-
22
- # MBART for Chinese → English
23
- translation_tokenizer = MBart50TokenizerFast.from_pretrained(
24
- "facebook/mbart-large-50-many-to-many-mmt", use_fast=False
25
- )
26
- translation_model = MBartForConditionalGeneration.from_pretrained(
27
- "facebook/mbart-large-50-many-to-many-mmt"
28
- )
29
-
30
  HF_TOKEN = os.getenv("HF_TOKEN")
31
 
32
  try:
@@ -91,6 +79,11 @@ def translate_with_quota(text, src_lang="bo", tgt_lang="en"):
91
  tts_tibetan = pipeline("text-to-speech", model="facebook/mms-tts-bod")
92
  #tts_sanskrit = pipeline("text-to-speech", model="facebook/mms-tts-san")
93
 
 
 
 
 
 
94
  # Use the slow tokenizer to avoid the bug
95
  ##translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt",use_fast=False)
96
  #translation_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
@@ -161,22 +154,17 @@ def run_task(text, language, task):
161
  # 1) Segment Tibetan text with Botok
162
  tokens = [t.text for t in wt.tokenize(text)]
163
  segmented_text = " ".join(tokens)
164
-
165
- # 2) Tibetan → Chinese via TibetaMind
166
- inputs = tibetamind_tokenizer(segmented_text, return_tensors="pt")
167
- outputs = tibetamind_model.generate(**inputs, max_new_tokens=256)
168
- chinese_text = tibetamind_tokenizer.decode(outputs[0], skip_special_tokens=True)
169
-
170
- # 3) Chinese → English via MBART
171
- translation_tokenizer.src_lang = "zh_CN"
172
- forced_bos = translation_tokenizer.lang_code_to_id["en_XX"]
173
- inputs = translation_tokenizer(chinese_text, return_tensors="pt")
174
- outputs = translation_model.generate(
175
- **inputs,
176
- max_new_tokens=256,
177
- forced_bos_token_id=forced_bos
178
- )
179
- english_text = translation_tokenizer.decode(outputs[0], skip_special_tokens=True)
180
 
181
  return None, None, english_text
182
  #translated_text = translate_with_quota(text, src_lang="bo", tgt_lang="en")
@@ -195,6 +183,9 @@ def run_task(text, language, task):
195
  else:
196
  return None, None, "Unsupported task"
197
 
 
 
 
198
 
199
  # --- Gradio Interface ---
200
  iface = gr.Interface(
 
15
  # Initialize Botok
16
  wt = WordTokenizer()
17
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  HF_TOKEN = os.getenv("HF_TOKEN")
19
 
20
  try:
 
79
  tts_tibetan = pipeline("text-to-speech", model="facebook/mms-tts-bod")
80
  #tts_sanskrit = pipeline("text-to-speech", model="facebook/mms-tts-san")
81
 
82
+ # Load MBART-50
83
+ tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", use_fast=False)
84
+ model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
85
+
86
+
87
  # Use the slow tokenizer to avoid the bug
88
  ##translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt",use_fast=False)
89
  #translation_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
 
154
  # 1) Segment Tibetan text with Botok
155
  tokens = [t.text for t in wt.tokenize(text)]
156
  segmented_text = " ".join(tokens)
157
+
158
+ # Set source and target languages
159
+ tokenizer.src_lang = "bo_CN"
160
+ forced_bos = tokenizer.lang_code_to_id["en_XX"]
161
+
162
+ # Translate
163
+ inputs = tokenizer(text, return_tensors="pt")
164
+ outputs = model.generate(**inputs, max_new_tokens=256, forced_bos_token_id=forced_bos)
165
+ english_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
166
+
167
+ print(english_text)
 
 
 
 
 
168
 
169
  return None, None, english_text
170
  #translated_text = translate_with_quota(text, src_lang="bo", tgt_lang="en")
 
183
  else:
184
  return None, None, "Unsupported task"
185
 
186
+
187
+
188
+
189
 
190
  # --- Gradio Interface ---
191
  iface = gr.Interface(