tsuching committed on
Commit
bbcda01
·
verified ·
1 Parent(s): 58a6962

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -24
app.py CHANGED
@@ -153,7 +153,7 @@ def safe_tokenize_sanskrit(text):
153
 
154
 
155
 
156
- def get_ipa_phonetics(text):
157
  """
158
  Converts Tibetan text into parallel Unicode, MST (IPA), and KVP (Romanization) output,
159
  formatted clearly by segmented word.
@@ -164,13 +164,21 @@ def get_ipa_phonetics(text):
164
  # Botok tokens include words, punctuation, and whitespace elements.
165
  tokens = [t.text for t in wt.tokenize(text)]
166
 
167
- output_lines = []
 
 
 
 
 
 
 
 
168
 
169
  # Define headers for the output
170
- HEADER_UNICODE = "Unicode:"
171
- HEADER_WYLIE = " Wylie (Morphological):"
172
- HEADER_MST = " MST (IPA):"
173
- HEADER_KVP = " KVP (Phonetic):"
174
 
175
  # 2. Process each token (word, punctuation, or space)
176
  for tok in tokens:
@@ -179,15 +187,27 @@ def get_ipa_phonetics(text):
179
  continue
180
 
181
  # Punctuation/Whitespace Handling: Pass through for spacing
182
- if not tok.strip() or len(tok) == 1 and tok in '།།.':
183
  # Add a vertical space to clearly separate output by word/phrase
184
- output_lines.append("\n")
185
- continue
 
 
 
 
 
 
 
 
 
 
186
 
187
  unicode_str = tok
188
- wylie_str = "" # Initialized to prevent UnboundLocalError
189
- mst_ipa = ""
190
- kvp_phonetic = ""
 
 
191
 
192
  try:
193
  # Calculate Wylie first (always needed)
@@ -206,17 +226,25 @@ def get_ipa_phonetics(text):
206
  kvp_phonetic = "(Conversion Failed)"
207
 
208
  # 3. Format the output for one word
209
- output = (
210
- f"{HEADER_UNICODE} {unicode_str}\n"
211
- f"{HEADER_WYLIE} {wylie_str}\n"
212
- f"{HEADER_MST} {mst_ipa}\n"
213
- f"{HEADER_KVP} {kvp_phonetic}\n"
214
- )
215
 
216
- output_lines.append(output)
 
 
 
 
 
 
217
 
218
  # 4. Join all formatted outputs into a single string
219
- return "\n".join(output_lines)
 
 
220
 
221
 
222
  # Tibetan TTS function
@@ -395,9 +423,29 @@ def run_task(text, language, task):
395
 
396
  elif task == "Phonetics":
397
  if language == "Tibetan":
398
- # The get_ipa_phonetics function now returns the formatted multi-line string
399
- formatted_output = get_ipa_phonetics(text)
400
- return formatted_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  elif language == "Sanskrit":
402
  return "Phonetics conversion for Sanskrit is not supported by the current Bophono scheme."
403
  else:
@@ -419,7 +467,7 @@ iface_text = gr.Interface(
419
  gr.Radio(choices=["Translate", "Tokenize", "Phonetics"], label="Task")
420
  ],
421
  outputs=gr.Textbox(label="Text Output", lines=20),
422
- title="Translation & Tokenization"
423
  )
424
 
425
  iface_tts = gr.Interface(
 
153
 
154
 
155
 
156
+ def get_all_phonetics(text):
157
  """
158
  Converts Tibetan text into parallel Unicode, MST (IPA), and KVP (Romanization) output,
159
  formatted clearly by segmented word.
 
164
  # Botok tokens include words, punctuation, and whitespace elements.
165
  tokens = [t.text for t in wt.tokenize(text)]
166
 
167
+ #output_lines = []
168
+
169
+ # Dictionary of parallel lists holding the per-token results
170
+ results = {
171
+ "unicode": [],
172
+ "wylie": [],
173
+ "mst_ipa": [],
174
+ "kvp_phonetic": []
175
+ }
176
 
177
  # Define headers for the output
178
+ #HEADER_UNICODE = "Unicode:"
179
+ #HEADER_WYLIE = " Wylie (Morphological):"
180
+ #HEADER_MST = " MST (IPA):"
181
+ #HEADER_KVP = " KVP (Phonetic):"
182
 
183
  # 2. Process each token (word, punctuation, or space)
184
  for tok in tokens:
 
187
  continue
188
 
189
  # Punctuation/Whitespace Handling: Pass through for spacing
190
+ #if not tok.strip() or len(tok) == 1 and tok in '།།.':
191
  # Add a vertical space to clearly separate output by word/phrase
192
+ # output_lines.append("\n")
193
+ # continue
194
+
195
+ # Punctuation/Whitespace Handling: Use a consistent placeholder for spacing
196
+ is_separator = not tok.strip() or len(tok) == 1 and tok in '།།.'
197
+ if is_separator:
198
+ # Use a placeholder that will be converted to a break later
199
+ results["unicode"].append("")
200
+ results["wylie"].append("")
201
+ results["mst_ipa"].append("")
202
+ results["kvp_phonetic"].append("")
203
+ continue
204
 
205
  unicode_str = tok
206
+ #wylie_str = "" # Initialized to prevent UnboundLocalError
207
+ #mst_ipa = ""
208
+ #kvp_phonetic = ""
209
+ # Initialize to avoid UnboundLocalError during failure
210
+ wylie_str, mst_ipa, kvp_phonetic = "(Failed)", "(Failed)", "(Failed)"
211
 
212
  try:
213
  # Calculate Wylie first (always needed)
 
226
  kvp_phonetic = "(Conversion Failed)"
227
 
228
  # 3. Format the output for one word
229
+ #output = (
230
+ # f"{HEADER_UNICODE} {unicode_str}\n"
231
+ # f"{HEADER_WYLIE} {wylie_str}\n"
232
+ # f"{HEADER_MST} {mst_ipa}\n"
233
+ # f"{HEADER_KVP} {kvp_phonetic}\n"
234
+ #)
235
 
236
+ #output_lines.append(output)
237
+
238
+ # Store results
239
+ results["unicode"].append(tok)
240
+ results["wylie"].append(wylie_str)
241
+ results["mst_ipa"].append(mst_ipa)
242
+ results["kvp_phonetic"].append(kvp_phonetic)
243
 
244
  # 4. Join all formatted outputs into a single string
245
+ #return "\n".join(output_lines)
246
+
247
+ return results
248
 
249
 
250
  # Tibetan TTS function
 
423
 
424
  elif task == "Phonetics":
425
  if language == "Tibetan":
426
+ # get_all_phonetics now returns a dict of per-token lists rather than a formatted multi-line string
427
+ #formatted_output = get_all_phonetics(text)
428
+ #return formatted_output
429
+
430
+ # 1. Get all schemes data
431
+ schemes_data = get_all_phonetics_schemes(text)
432
+
433
+ # 2. Format the four outputs in parallel (Unicode, Wylie, MST, KVP)
434
+ unicode_output = " ".join([t for t in schemes_data['unicode'] if t.strip()]) # Cleaned up display
435
+ wylie_output = " ".join([t for t in schemes_data['wylie'] if t.strip()])
436
+ mst_output = " ".join([t for t in schemes_data['mst_ipa'] if t.strip()])
437
+ kvp_output = " ".join([t for t in schemes_data['kvp_phonetic'] if t.strip()])
438
+
439
+ # 3. Present all outputs in a single, formatted string for the Textbox
440
+ # You can copy and paste from this single box now.
441
+ output = (
442
+ f"--- Tibetan Phonetic Analysis ---\n\n"
443
+ f"1. Unicode Text (Input):\n{unicode_output}\n\n"
444
+ f"2. Wylie (Morphological):\n{wylie_output}\n\n"
445
+ f"3. MST (IPA):\n{mst_output}\n\n"
446
+ f"4. KVP (Phonetic):\n{kvp_output}"
447
+ )
448
+ return output
449
  elif language == "Sanskrit":
450
  return "Phonetics conversion for Sanskrit is not supported by the current Bophono scheme."
451
  else:
 
467
  gr.Radio(choices=["Translate", "Tokenize", "Phonetics"], label="Task")
468
  ],
469
  outputs=gr.Textbox(label="Text Output", lines=20),
470
+ title="Translation & Tokenization & Phonetics"
471
  )
472
 
473
  iface_tts = gr.Interface(