Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -153,7 +153,7 @@ def safe_tokenize_sanskrit(text):
|
|
| 153 |
|
| 154 |
|
| 155 |
|
| 156 |
-
def
|
| 157 |
"""
|
| 158 |
Converts Tibetan text into parallel Unicode, MST (IPA), and KVP (Romanization) output,
|
| 159 |
formatted clearly by segmented word.
|
|
@@ -164,13 +164,21 @@ def get_ipa_phonetics(text):
|
|
| 164 |
# Botok tokens include words, punctuation, and whitespace elements.
|
| 165 |
tokens = [t.text for t in wt.tokenize(text)]
|
| 166 |
|
| 167 |
-
output_lines = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
# Define headers for the output
|
| 170 |
-
HEADER_UNICODE = "Unicode:"
|
| 171 |
-
HEADER_WYLIE = " Wylie (Morphological):"
|
| 172 |
-
HEADER_MST = " MST (IPA):"
|
| 173 |
-
HEADER_KVP = " KVP (Phonetic):"
|
| 174 |
|
| 175 |
# 2. Process each token (word, punctuation, or space)
|
| 176 |
for tok in tokens:
|
|
@@ -179,15 +187,27 @@ def get_ipa_phonetics(text):
|
|
| 179 |
continue
|
| 180 |
|
| 181 |
# Punctuation/Whitespace Handling: Pass through for spacing
|
| 182 |
-
if not tok.strip() or len(tok) == 1 and tok in '།།.':
|
| 183 |
# Add a vertical space to clearly separate output by word/phrase
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
unicode_str = tok
|
| 188 |
-
wylie_str = "" # Initialized to prevent UnboundLocalError
|
| 189 |
-
mst_ipa = ""
|
| 190 |
-
kvp_phonetic = ""
|
|
|
|
|
|
|
| 191 |
|
| 192 |
try:
|
| 193 |
# Calculate Wylie first (always needed)
|
|
@@ -206,17 +226,25 @@ def get_ipa_phonetics(text):
|
|
| 206 |
kvp_phonetic = "(Conversion Failed)"
|
| 207 |
|
| 208 |
# 3. Format the output for one word
|
| 209 |
-
output = (
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
)
|
| 215 |
|
| 216 |
-
output_lines.append(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# 4. Join all formatted outputs into a single string
|
| 219 |
-
return "\n".join(output_lines)
|
|
|
|
|
|
|
| 220 |
|
| 221 |
|
| 222 |
# Tibetan TTS function
|
|
@@ -395,9 +423,29 @@ def run_task(text, language, task):
|
|
| 395 |
|
| 396 |
elif task == "Phonetics":
|
| 397 |
if language == "Tibetan":
|
| 398 |
-
# The
|
| 399 |
-
formatted_output =
|
| 400 |
-
return formatted_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
elif language == "Sanskrit":
|
| 402 |
return "Phonetics conversion for Sanskrit is not supported by the current Bophono scheme."
|
| 403 |
else:
|
|
@@ -419,7 +467,7 @@ iface_text = gr.Interface(
|
|
| 419 |
gr.Radio(choices=["Translate", "Tokenize", "Phonetics"], label="Task")
|
| 420 |
],
|
| 421 |
outputs=gr.Textbox(label="Text Output", lines=20),
|
| 422 |
-
title="Translation & Tokenization"
|
| 423 |
)
|
| 424 |
|
| 425 |
iface_tts = gr.Interface(
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
|
| 156 |
+
def get_all_phonetics(text):
|
| 157 |
"""
|
| 158 |
Converts Tibetan text into parallel Unicode, MST (IPA), and KVP (Romanization) output,
|
| 159 |
formatted clearly by segmented word.
|
|
|
|
| 164 |
# Botok tokens include words, punctuation, and whitespace elements.
|
| 165 |
tokens = [t.text for t in wt.tokenize(text)]
|
| 166 |
|
| 167 |
+
#output_lines = []
|
| 168 |
+
|
| 169 |
+
# Dictionaries to store the results by token
|
| 170 |
+
results = {
|
| 171 |
+
"unicode": [],
|
| 172 |
+
"wylie": [],
|
| 173 |
+
"mst_ipa": [],
|
| 174 |
+
"kvp_phonetic": []
|
| 175 |
+
}
|
| 176 |
|
| 177 |
# Define headers for the output
|
| 178 |
+
#HEADER_UNICODE = "Unicode:"
|
| 179 |
+
#HEADER_WYLIE = " Wylie (Morphological):"
|
| 180 |
+
#HEADER_MST = " MST (IPA):"
|
| 181 |
+
#HEADER_KVP = " KVP (Phonetic):"
|
| 182 |
|
| 183 |
# 2. Process each token (word, punctuation, or space)
|
| 184 |
for tok in tokens:
|
|
|
|
| 187 |
continue
|
| 188 |
|
| 189 |
# Punctuation/Whitespace Handling: Pass through for spacing
|
| 190 |
+
#if not tok.strip() or len(tok) == 1 and tok in '།།.':
|
| 191 |
# Add a vertical space to clearly separate output by word/phrase
|
| 192 |
+
# output_lines.append("\n")
|
| 193 |
+
# continue
|
| 194 |
+
|
| 195 |
+
# Punctuation/Whitespace Handling: Use a consistent placeholder for spacing
|
| 196 |
+
is_separator = not tok.strip() or len(tok) == 1 and tok in '།།.'
|
| 197 |
+
if is_separator:
|
| 198 |
+
# Append an empty-string placeholder to every scheme so the lists stay aligned token-by-token
|
| 199 |
+
results["unicode"].append("")
|
| 200 |
+
results["wylie"].append("")
|
| 201 |
+
results["mst_ipa"].append("")
|
| 202 |
+
results["kvp_phonetic"].append("")
|
| 203 |
+
continue
|
| 204 |
|
| 205 |
unicode_str = tok
|
| 206 |
+
#wylie_str = "" # Initialized to prevent UnboundLocalError
|
| 207 |
+
#mst_ipa = ""
|
| 208 |
+
#kvp_phonetic = ""
|
| 209 |
+
# Initialize to avoid UnboundLocalError during failure
|
| 210 |
+
wylie_str, mst_ipa, kvp_phonetic = "(Failed)", "(Failed)", "(Failed)"
|
| 211 |
|
| 212 |
try:
|
| 213 |
# Calculate Wylie first (always needed)
|
|
|
|
| 226 |
kvp_phonetic = "(Conversion Failed)"
|
| 227 |
|
| 228 |
# 3. Format the output for one word
|
| 229 |
+
#output = (
|
| 230 |
+
# f"{HEADER_UNICODE} {unicode_str}\n"
|
| 231 |
+
# f"{HEADER_WYLIE} {wylie_str}\n"
|
| 232 |
+
# f"{HEADER_MST} {mst_ipa}\n"
|
| 233 |
+
# f"{HEADER_KVP} {kvp_phonetic}\n"
|
| 234 |
+
#)
|
| 235 |
|
| 236 |
+
#output_lines.append(output)
|
| 237 |
+
|
| 238 |
+
# Store results
|
| 239 |
+
results["unicode"].append(tok)
|
| 240 |
+
results["wylie"].append(wylie_str)
|
| 241 |
+
results["mst_ipa"].append(mst_ipa)
|
| 242 |
+
results["kvp_phonetic"].append(kvp_phonetic)
|
| 243 |
|
| 244 |
# 4. Return the per-scheme token lists (the strings are no longer joined here)
|
| 245 |
+
#return "\n".join(output_lines)
|
| 246 |
+
|
| 247 |
+
return results
|
| 248 |
|
| 249 |
|
| 250 |
# Tibetan TTS function
|
|
|
|
| 423 |
|
| 424 |
elif task == "Phonetics":
|
| 425 |
if language == "Tibetan":
|
| 426 |
+
# get_all_phonetics used to return one formatted multi-line string; it now returns a dict of per-token lists, so the old call below is disabled
|
| 427 |
+
#formatted_output = get_all_phonetics(text)
|
| 428 |
+
#return formatted_output
|
| 429 |
+
|
| 430 |
+
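# 1. Get all schemes data. NOTE(review): the definition visible in this diff is named get_all_phonetics; confirm get_all_phonetics_schemes is defined elsewhere, otherwise this call raises NameError.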
|
| 431 |
+
schemes_data = get_all_phonetics_schemes(text)
|
| 432 |
+
|
| 433 |
+
# 2. Format the four outputs in parallel (Unicode, Wylie, MST, KVP)
|
| 434 |
+
unicode_output = " ".join([t for t in schemes_data['unicode'] if t.strip()]) # Cleaned up display
|
| 435 |
+
wylie_output = " ".join([t for t in schemes_data['wylie'] if t.strip()])
|
| 436 |
+
mst_output = " ".join([t for t in schemes_data['mst_ipa'] if t.strip()])
|
| 437 |
+
kvp_output = " ".join([t for t in schemes_data['kvp_phonetic'] if t.strip()])
|
| 438 |
+
|
| 439 |
+
# 3. Present all outputs in a single, formatted string for the Textbox
|
| 440 |
+
# You can copy and paste from this single box now.
|
| 441 |
+
output = (
|
| 442 |
+
f"--- Tibetan Phonetic Analysis ---\n\n"
|
| 443 |
+
f"1. Unicode Text (Input):\n{unicode_output}\n\n"
|
| 444 |
+
f"2. Wylie (Morphological):\n{wylie_output}\n\n"
|
| 445 |
+
f"3. MST (IPA):\n{mst_output}\n\n"
|
| 446 |
+
f"4. KVP (Phonetic):\n{kvp_output}"
|
| 447 |
+
)
|
| 448 |
+
return output
|
| 449 |
elif language == "Sanskrit":
|
| 450 |
return "Phonetics conversion for Sanskrit is not supported by the current Bophono scheme."
|
| 451 |
else:
|
|
|
|
| 467 |
gr.Radio(choices=["Translate", "Tokenize", "Phonetics"], label="Task")
|
| 468 |
],
|
| 469 |
outputs=gr.Textbox(label="Text Output", lines=20),
|
| 470 |
+
title="Translation & Tokenization & Phonetics"
|
| 471 |
)
|
| 472 |
|
| 473 |
iface_tts = gr.Interface(
|