cs2764 committed on
Commit
11cf4ef
·
verified ·
1 Parent(s): f9ed71b

Bug fix for very long text

Browse files
Files changed (2) hide show
  1. app.py +34 -34
  2. text_cleaning.py +11 -6
app.py CHANGED
@@ -134,60 +134,60 @@ def split_text_by_paragraphs(text, max_duration_minutes=5, max_chars=500):
134
  import io
135
 
136
  async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
137
- """Generate audio for a single text segment and return as BytesIO"""
138
  logger.info(f"Generating segment {segment_index}...")
139
  communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
140
 
141
- audio_data = io.BytesIO()
 
 
 
 
142
  try:
143
- async for chunk in communicate.stream():
144
- if chunk["type"] == "audio":
145
- audio_data.write(chunk["data"])
146
  except Exception as e:
147
  logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
 
 
148
  raise gr.Error(f"Error generating segment {segment_index}: {e}")
149
 
150
- audio_data.seek(0)
151
-
152
  # Verify segment duration
153
  try:
154
- # Make a copy for verification so we don't consume the main buffer
155
- verify_buffer = io.BytesIO(audio_data.getvalue())
156
- seg_audio = AudioSegment.from_mp3(verify_buffer)
157
  duration_min = len(seg_audio) / 1000 / 60
158
- logger.info(f"Segment {segment_index} generated in memory (Duration: {duration_min:.2f} min)")
159
  except Exception as e:
160
  logger.error(f"Error checking segment {segment_index} duration: {e}")
161
 
162
- audio_data.seek(0)
163
- return audio_data
164
 
165
- async def merge_audio_files(audio_objects):
166
- """Merge multiple audio BytesIO objects into one file"""
167
- if not audio_objects:
168
  return None
169
 
170
- logger.info(f"Merging {len(audio_objects)} audio segments...")
171
-
172
- # Load and merge audio segments
173
- combined = AudioSegment.empty()
174
- for i, audio_obj in enumerate(audio_objects):
175
- try:
176
- audio_obj.seek(0)
177
- segment = AudioSegment.from_mp3(audio_obj)
178
- combined += segment
179
- # Explicitly close/clear the BytesIO object to free memory
180
- audio_obj.close()
181
- except Exception as e:
182
- logger.error(f"Error merging segment {i+1}: {e}")
183
-
184
- # Save merged audio to a single temporary file
185
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
186
  merged_path = tmp_file.name
187
- combined.export(merged_path, format="mp3")
188
 
189
- total_duration_min = len(combined) / 1000 / 60
190
- logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  return merged_path
192
 
193
  async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
 
134
  import io
135
 
136
  async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
137
+ """Generate audio for a single text segment and save to temporary file"""
138
  logger.info(f"Generating segment {segment_index}...")
139
  communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
140
 
141
+ # Save directly to temporary file instead of memory
142
+ tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f"_seg{segment_index}.mp3")
143
+ tmp_path = tmp_file.name
144
+ tmp_file.close()
145
+
146
  try:
147
+ await communicate.save(tmp_path)
 
 
148
  except Exception as e:
149
  logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
150
+ if os.path.exists(tmp_path):
151
+ os.remove(tmp_path)
152
  raise gr.Error(f"Error generating segment {segment_index}: {e}")
153
 
 
 
154
  # Verify segment duration
155
  try:
156
+ seg_audio = AudioSegment.from_mp3(tmp_path)
 
 
157
  duration_min = len(seg_audio) / 1000 / 60
158
+ logger.info(f"Segment {segment_index} saved to temp file (Duration: {duration_min:.2f} min)")
159
  except Exception as e:
160
  logger.error(f"Error checking segment {segment_index} duration: {e}")
161
 
162
+ return tmp_path
 
163
 
164
+ async def merge_audio_files(audio_paths):
165
+ """Merge multiple audio files into one file using binary concatenation"""
166
+ if not audio_paths:
167
  return None
168
 
169
+ logger.info(f"Merging {len(audio_paths)} audio segments...")
170
+
171
+ # Create output file
 
 
 
 
 
 
 
 
 
 
 
 
172
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
173
  merged_path = tmp_file.name
 
174
 
175
+ # Binary concatenation of MP3 files (avoids WAV size limit)
176
+ total_size = 0
177
+ with open(merged_path, 'wb') as outfile:
178
+ for i, audio_path in enumerate(audio_paths):
179
+ try:
180
+ with open(audio_path, 'rb') as infile:
181
+ data = infile.read()
182
+ outfile.write(data)
183
+ total_size += len(data)
184
+ # Delete temporary segment file after merging
185
+ os.remove(audio_path)
186
+ logger.info(f"Merged and deleted segment {i+1}")
187
+ except Exception as e:
188
+ logger.error(f"Error merging segment {i+1}: {e}")
189
+
190
+ logger.info(f"Merged audio saved to {merged_path} (Total size: {total_size / 1024 / 1024:.2f} MB)")
191
  return merged_path
192
 
193
  async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
text_cleaning.py CHANGED
@@ -125,12 +125,17 @@ class TextCleaner:
125
 
126
  @staticmethod
127
  def remove_special_chars(text):
128
- """Remove excessive special characters"""
129
- # Keep alphanumeric, basic punctuation, and common CJK characters
130
- # This is a conservative regex to avoid removing valid text
131
- # \w matches [a-zA-Z0-9_] and unicode word chars (including Chinese)
132
- # We add some common punctuation
133
- return re.sub(r'[^\w\s.,!?;:()"\'-,。!?;:()“”‘’]', '', text)
 
 
 
 
 
134
 
135
  @staticmethod
136
  def wetext_normalize(text):
 
125
 
126
  @staticmethod
127
  def remove_special_chars(text):
128
+ """Remove special characters that affect TTS but keep normal punctuation"""
129
+ # Only remove characters that TTS engines typically read aloud incorrectly
130
+ # Keep: letters, numbers, spaces, newlines, and common punctuation
131
+
132
+ # Characters to remove (symbols that TTS might read literally)
133
+ text = re.sub(r'[@#$%^&*+=|\\<>{}\[\]~`]', '', text)
134
+
135
+ # Remove multiple consecutive special punctuation (like *** or ---)
136
+ text = re.sub(r'([!?.,;:\-])\1{2,}', r'\1', text)
137
+
138
+ return text
139
 
140
  @staticmethod
141
  def wetext_normalize(text):