Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,28 +7,23 @@ langs = """Afrikaans (af), Amharic (am), Arabic (ar), Asturian (ast), Azerbaijan
|
|
| 7 |
Korean (ko), Luxembourgish; Letzeburgesch (lb), Ganda (lg), Lingala (ln), Lao (lo), Lithuanian (lt), Latvian (lv), Malagasy (mg), Macedonian (mk), Malayalam (ml), Mongolian (mn), Marathi (mr), Malay (ms), Burmese (my), Nepali (ne), Dutch; Flemish (nl), Norwegian (no), Northern Sotho (ns), Occitan (post 1500) (oc), Oriya (or), Panjabi; Punjabi (pa), Polish (pl), Pushto; Pashto (ps), Portuguese (pt), Romanian; Moldavian; Moldovan (ro), Russian (ru), Sindhi (sd), Sinhala; Sinhalese (si), Slovak (sk),
|
| 8 |
Slovenian (sl), Somali (so), Albanian (sq), Serbian (sr), Swati (ss), Sundanese (su), Swedish (sv), Swahili (sw), Tamil (ta), Thai (th), Tagalog (tl), Tswana (tn),
|
| 9 |
Turkish (tr), Ukrainian (uk), Urdu (ur), Uzbek (uz), Vietnamese (vi), Wolof (wo), Xhosa (xh), Yiddish (yi), Yoruba (yo), Chinese (zh), Zulu (zu)"""
|
| 10 |
-
# Dropdown choices: one display entry per comma-separated item in `langs`,
# e.g. "Afrikaans (af)" (surrounding whitespace trimmed).
lang_list = [lang.strip() for lang in langs.split(',')]

# Load the SMALL-100 multilingual MT model and its tokenizer from the Hub.
# NOTE(review): runs at import time — first launch downloads the weights.
model = M2M100ForConditionalGeneration.from_pretrained("alirezamsh/small100")
tokenizer = SMALL100Tokenizer.from_pretrained("alirezamsh/small100")
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
In this paper, they propose a compact and shallow massively multilingual MT model, and achieve competitive results with M2M-100, while being super smaller and faster. More details are provided [here](https://huggingface.co/alirezamsh/small100). Currently running on 2 vCPU - 16GB RAM."""
|
| 18 |
-
|
| 19 |
-
def small100_tr(lang, text):
    """Translate `text` into the dropdown-selected language.

    `lang` is a display entry such as "Afrikaans (af)"; the ISO code is the
    parenthesized final token, which SMALL-100 uses as the generation target.
    """
    # Strip the code out of the trailing "(xx)" token.
    code = lang.split(" ")[-1][1:-1]
    # SMALL-100 conditions generation on the tokenizer's tgt_lang attribute.
    tokenizer.tgt_lang = code
    model_inputs = tokenizer(text, return_tensors="pt")
    output_ids = model.generate(**model_inputs)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
| 7 |
Korean (ko), Luxembourgish; Letzeburgesch (lb), Ganda (lg), Lingala (ln), Lao (lo), Lithuanian (lt), Latvian (lv), Malagasy (mg), Macedonian (mk), Malayalam (ml), Mongolian (mn), Marathi (mr), Malay (ms), Burmese (my), Nepali (ne), Dutch; Flemish (nl), Norwegian (no), Northern Sotho (ns), Occitan (post 1500) (oc), Oriya (or), Panjabi; Punjabi (pa), Polish (pl), Pushto; Pashto (ps), Portuguese (pt), Romanian; Moldavian; Moldovan (ro), Russian (ru), Sindhi (sd), Sinhala; Sinhalese (si), Slovak (sk),
|
| 8 |
Slovenian (sl), Somali (so), Albanian (sq), Serbian (sr), Swati (ss), Sundanese (su), Swedish (sv), Swahili (sw), Tamil (ta), Thai (th), Tagalog (tl), Tswana (tn),
|
| 9 |
Turkish (tr), Ukrainian (uk), Urdu (ur), Uzbek (uz), Vietnamese (vi), Wolof (wo), Xhosa (xh), Yiddish (yi), Yoruba (yo), Chinese (zh), Zulu (zu)"""
|
| 10 |
+
# Reduce every "Name (code)" entry to its bare ISO code, e.g.
# "Afrikaans (af)" -> "af": the code is the parenthesized last token.
lang_list = [entry.strip().rsplit(" ", 1)[-1][1:-1] for entry in langs.split(',')]
|
| 11 |
|
| 12 |
model = M2M100ForConditionalGeneration.from_pretrained("alirezamsh/small100")
|
| 13 |
tokenizer = SMALL100Tokenizer.from_pretrained("alirezamsh/small100")
|
| 14 |
|
| 15 |
+
def translate(lang, text):
    """Translate `text` into the target language `lang` (an ISO code, e.g. "fr")."""
    # SMALL-100 reads the target language from the tokenizer, not from the input.
    tokenizer.tgt_lang = lang
    batch = tokenizer(text, return_tensors="pt")
    token_ids = model.generate(**batch)
    return tokenizer.batch_decode(token_ids, skip_special_tokens=True)[0]
|
| 20 |
|
| 21 |
+
# Gradio UI wiring. The original used `gr.outputs.Textbox` / `gr.inputs.Dropdown`,
# which were deprecated in Gradio 3.x and removed in 4.x — consistent with the
# Space's "Runtime error" banner. The top-level component classes are the
# current, behavior-equivalent API.
Output = gr.Textbox(label="Translation")
gr.Interface(
    fn=translate,
    inputs=[
        gr.Dropdown(choices=lang_list, label="To Language"),
        gr.Textbox(label="Text"),
    ],
    outputs=[Output],
).launch()
|