Edge-TTS-WebUI-Long-Text

Sleeping

App Files Files Community

cs2764 committed on 15 days ago

Commit

11cf4ef

verified ·

1 Parent(s): f9ed71b

Bug fix for very long text

Browse files

Files changed (2) hide show

app.py +34 -34
text_cleaning.py +11 -6

app.py CHANGED Viewed

@@ -134,60 +134,60 @@ def split_text_by_paragraphs(text, max_duration_minutes=5, max_chars=500):
 import io
 async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
-    """Generate audio for a single text segment and return as BytesIO"""
     logger.info(f"Generating segment {segment_index}...")
     communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
-    audio_data = io.BytesIO()
     try:
-        async for chunk in communicate.stream():
-            if chunk["type"] == "audio":
-                audio_data.write(chunk["data"])
     except Exception as e:
         logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
         raise gr.Error(f"Error generating segment {segment_index}: {e}")
-    audio_data.seek(0)
     # Verify segment duration
     try:
-        # Make a copy for verification so we don't consume the main buffer
-        verify_buffer = io.BytesIO(audio_data.getvalue())
-        seg_audio = AudioSegment.from_mp3(verify_buffer)
         duration_min = len(seg_audio) / 1000 / 60
-        logger.info(f"Segment {segment_index} generated in memory (Duration: {duration_min:.2f} min)")
     except Exception as e:
         logger.error(f"Error checking segment {segment_index} duration: {e}")
-    audio_data.seek(0)
-    return audio_data
-async def merge_audio_files(audio_objects):
-    """Merge multiple audio BytesIO objects into one file"""
-    if not audio_objects:
         return None
-    logger.info(f"Merging {len(audio_objects)} audio segments...")
-    # Load and merge audio segments
-    combined = AudioSegment.empty()
-    for i, audio_obj in enumerate(audio_objects):
-        try:
-            audio_obj.seek(0)
-            segment = AudioSegment.from_mp3(audio_obj)
-            combined += segment
-            # Explicitly close/clear the BytesIO object to free memory
-            audio_obj.close()
-        except Exception as e:
-            logger.error(f"Error merging segment {i+1}: {e}")
-    # Save merged audio to a single temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         merged_path = tmp_file.name
-        combined.export(merged_path, format="mp3")
-    total_duration_min = len(combined) / 1000 / 60
-    logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
     return merged_path
 async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):

 import io
 async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
+    """Generate audio for a single text segment and save to temporary file"""
     logger.info(f"Generating segment {segment_index}...")
     communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
+    # Save directly to temporary file instead of memory
+    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f"_seg{segment_index}.mp3")
+    tmp_path = tmp_file.name
+    tmp_file.close()
     try:
+        await communicate.save(tmp_path)
     except Exception as e:
         logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
         raise gr.Error(f"Error generating segment {segment_index}: {e}")
     # Verify segment duration
     try:
+        seg_audio = AudioSegment.from_mp3(tmp_path)
         duration_min = len(seg_audio) / 1000 / 60
+        logger.info(f"Segment {segment_index} saved to temp file (Duration: {duration_min:.2f} min)")
     except Exception as e:
         logger.error(f"Error checking segment {segment_index} duration: {e}")
+    return tmp_path
+async def merge_audio_files(audio_paths):
+    """Merge multiple audio files into one file using binary concatenation"""
+    if not audio_paths:
         return None
+    logger.info(f"Merging {len(audio_paths)} audio segments...")
+    # Create output file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         merged_path = tmp_file.name
+    # Binary concatenation of MP3 files (avoids WAV size limit)
+    total_size = 0
+    with open(merged_path, 'wb') as outfile:
+        for i, audio_path in enumerate(audio_paths):
+            try:
+                with open(audio_path, 'rb') as infile:
+                    data = infile.read()
+                    outfile.write(data)
+                    total_size += len(data)
+                # Delete temporary segment file after merging
+                os.remove(audio_path)
+                logger.info(f"Merged and deleted segment {i+1}")
+            except Exception as e:
+                logger.error(f"Error merging segment {i+1}: {e}")
+    logger.info(f"Merged audio saved to {merged_path} (Total size: {total_size / 1024 / 1024:.2f} MB)")
     return merged_path
 async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):

text_cleaning.py CHANGED Viewed

@@ -125,12 +125,17 @@ class TextCleaner:
     @staticmethod
     def remove_special_chars(text):
-        """Remove excessive special characters"""
-        # Keep alphanumeric, basic punctuation, and common CJK characters
-        # This is a conservative regex to avoid removing valid text
-        # \w matches [a-zA-Z0-9_] and unicode word chars (including Chinese)
-        # We add some common punctuation
-        return re.sub(r'[^\w\s.,!?;:()"\'-，。！？；：（）“”‘’]', '', text)
     @staticmethod
     def wetext_normalize(text):

     @staticmethod
     def remove_special_chars(text):
+        """Remove special characters that affect TTS but keep normal punctuation"""
+        # Only remove characters that TTS engines typically read aloud incorrectly
+        # Keep: letters, numbers, spaces, newlines, and common punctuation
+        # Characters to remove (symbols that TTS might read literally)
+        text = re.sub(r'[@#$%^&*+=|\\<>{}\[\]~`]', '', text)
+        # Remove multiple consecutive special punctuation (like *** or ---)
+        text = re.sub(r'([!?.,;:\-])\1{2,}', r'\1', text)
+        return text
     @staticmethod
     def wetext_normalize(text):