Spaces:

asbgig
/

talkclone

Sleeping

App Files Files Community

asbgig committed on Aug 23, 2025

Commit

50f1b46

verified ·

1 Parent(s): 9aaaf3c

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -43

app.py CHANGED Viewed

@@ -1,16 +1,14 @@
-# app.py — TalkClone (HF Space, 1-column, persistent output, CPU-friendly)
-import os, re, tempfile, shutil
 import numpy as np
 import soundfile as sf
 import gradio as gr
-# Agree to Coqui CPML non-interactively on Spaces
 os.environ.setdefault("COQUI_TOS_AGREED", "1")
 MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
-# Show labels, send codes (XTTS v2 supported only)
 LANGS = [
     ("English", "en"),
     ("Spanish", "es"),
@@ -35,7 +33,6 @@ LANG_MAP = {name: code for name, code in LANGS}
 _tts = None
 def get_tts():
-    """Lazy-load TTS; try GPU if available, else CPU."""
     global _tts
     if _tts is not None:
         return _tts
@@ -48,7 +45,6 @@ def get_tts():
         use_gpu = torch.cuda.is_available()
     except Exception:
         use_gpu = False
     from TTS.api import TTS
     try:
         _tts = TTS(MODEL_NAME, gpu=use_gpu)
@@ -67,24 +63,27 @@ def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
         tts.tts_to_file(text=txt, file_path=out_path,
                         speaker_wav=wav_path, language=lang)
 def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
     if ref_audio is None:
         raise gr.Error("Upload a reference voice (10–60s, clean speech).")
     text = clean_text(text)
     if not text:
         raise gr.Error("Please enter some text.")
-    # Limit extremely long jobs on free CPU
     if len(text) > 1400 and not split_sentences:
         raise gr.Error("Text is very long. Enable 'Auto split' or paste a shorter chunk on CPU.")
     lang = LANG_MAP.get(lang_label, "en")
     wav_path = ref_audio
-    # Sentence split + also break very long sentences into ~200 chars
     chunks = [text]
     if split_sentences:
-        rough = [s.strip() for s in re.split(r'(?<=[.!?؟。．。،،]|[\u0964\u0965])\s+', text) if s.strip()]
         chunks = []
         for s in rough:
             if len(s) <= 220:
@@ -95,8 +94,6 @@ def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.P
     tts = get_tts()
     out_wavs = []
-    # Use a temp dir for parts, but write the FINAL file to a persistent temp path
     with tempfile.TemporaryDirectory() as td:
         total = max(len(chunks), 1)
         for i, chunk in enumerate(chunks, 1):
@@ -106,68 +103,67 @@ def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.P
             data, sr = sf.read(part_path)
             out_wavs.append((data, sr))
-    # Concatenate and save to a persistent temp file that survives function return
     if len(out_wavs) == 1:
         final_data, sr = out_wavs[0]
     else:
         sr = out_wavs[0][1]
         final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
-    persistent_tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    persistent_tmp_path = persistent_tmp.name
-    persistent_tmp.close()  # path remains; we write to it next
-    sf.write(persistent_tmp_path, final_data, sr)
-    return persistent_tmp_path
-# ==== Styles (1 column + colors + hide HF/Gradio UI chrome) ====
 CUSTOM_CSS = """
 .gradio-container { max-width: 860px !important; margin: 0 auto; }
 #wrap, #ref, #lang, #txt, #spd, #split, #out_audio, #dl {
-  background: #f8fafc !important; /* slate-50 */
-  border: 1px solid #e5e7eb !important; /* gray-200 */
   border-radius: 14px !important;
   padding: 14px !important;
 }
-/* Make the component surfaces non-white */
-#ref, #out_audio, #dl { background: #eef2ff !important; } /* indigo-50-ish */
-/* Primary button color */
 #gen button, #gen { background: #10b981 !important; color: #fff !important; }
 #gen button:hover { filter: brightness(0.95); }
-/* Hide footer/API/Settings & obvious Space links */
 footer, .footer, #footer,
 a[href*="gradio.live"], a[href*="gradio.app"], a[href*="/api"], a[href*="hf.space"],
 button[aria-label="Settings"],
 [data-testid="block-analytics"], [data-testid="embed-info"] { display: none !important; }
 """
-with gr.Blocks(
-    title="TalkClone - Voice Cloning & TTS",
-    css=CUSTOM_CSS,
-    analytics_enabled=False
-) as demo:
     with gr.Column(elem_id="wrap"):
         gr.Markdown("## TalkClone — Text-to-Speech with Voice Cloning")
         gr.Markdown("Upload a short **reference voice** (10–60s), choose **language**, enter **text**, then **Generate**. "
                     "On free CPU, keep text short or enable **Auto split** for speed.")
         ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath", elem_id="ref")
-        language = gr.Dropdown(choices=[name for name, _ in LANGS], value="English", label="Language", elem_id="lang")
-        text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…", elem_id="txt")
-        speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed", elem_id="spd")
-        split = gr.Checkbox(value=True, label="Auto split long text by sentence", elem_id="split")
-        submit = gr.Button("Generate", variant="primary", elem_id="gen")
-        output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False, elem_id="out_audio")
-        download = gr.File(label="Download audio", elem_id="dl")
         def run_and_return(text, ref_audio, language, speed, split):
-            p = tts_clone(text, ref_audio, language, speed, split)
-            return p, p
         submit.click(run_and_return,
                      inputs=[text, ref_audio, language, speed, split],

+# app.py — TalkClone (HF Space, 1-column, persistent output, DownloadButton)
+import os, re, tempfile, shutil, time
 import numpy as np
 import soundfile as sf
 import gradio as gr
 os.environ.setdefault("COQUI_TOS_AGREED", "1")
 MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
 LANGS = [
     ("English", "en"),
     ("Spanish", "es"),
 _tts = None
 def get_tts():
     global _tts
     if _tts is not None:
         return _tts
         use_gpu = torch.cuda.is_available()
     except Exception:
         use_gpu = False
     from TTS.api import TTS
     try:
         _tts = TTS(MODEL_NAME, gpu=use_gpu)
         tts.tts_to_file(text=txt, file_path=out_path,
                         speaker_wav=wav_path, language=lang)
+def safe_filename(seed_text: str, lang_code: str) -> str:
+    """Build a filesystem-safe, timestamped ``.wav`` file name.
+
+    The first 40 characters of ``clean_text(seed_text)`` are reduced to
+    ASCII letters, digits, ``_`` and ``-`` (every other run of characters
+    becomes a single ``_``), and leading/trailing underscores are removed.
+
+    Args:
+        seed_text: Text the name is derived from.
+        lang_code: Language code inserted verbatim after the base name.
+
+    Returns:
+        A name of the form ``<base>_<lang_code>_<YYYYMMDD-HHMMSS>.wav``,
+        where ``<base>`` is ``"talkclone"`` when nothing usable remains.
+    """
+    base = re.sub(r"[^A-Za-z0-9_-]+", "_", clean_text(seed_text)[:40]).strip("_")
+    # Apply the fallback only after sanitising: text with no ASCII
+    # letters/digits would otherwise leave an empty base and produce
+    # a name starting with "_".
+    base = base or "talkclone"
+    ts = time.strftime("%Y%m%d-%H%M%S")
+    return f"{base}_{lang_code}_{ts}.wav"
 def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
     if ref_audio is None:
         raise gr.Error("Upload a reference voice (10–60s, clean speech).")
     text = clean_text(text)
     if not text:
         raise gr.Error("Please enter some text.")
     if len(text) > 1400 and not split_sentences:
         raise gr.Error("Text is very long. Enable 'Auto split' or paste a shorter chunk on CPU.")
     lang = LANG_MAP.get(lang_label, "en")
     wav_path = ref_audio
     chunks = [text]
     if split_sentences:
+        rough = [s.strip() for s in re.split(r'(?<=[.!?؟۔]|[\u0964\u0965])\s+', text) if s.strip()]
         chunks = []
         for s in rough:
             if len(s) <= 220:
     tts = get_tts()
     out_wavs = []
     with tempfile.TemporaryDirectory() as td:
         total = max(len(chunks), 1)
         for i, chunk in enumerate(chunks, 1):
             data, sr = sf.read(part_path)
             out_wavs.append((data, sr))
+    # concat
     if len(out_wavs) == 1:
         final_data, sr = out_wavs[0]
     else:
         sr = out_wavs[0][1]
         final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
+    # write to persistent temp + copy to a nice-named path for downloading
+    ntf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    ntf_path = ntf.name
+    ntf.close()
+    sf.write(ntf_path, final_data, sr)
+    pretty_name = os.path.join("/tmp", safe_filename(text, lang))
+    try:
+        shutil.copyfile(ntf_path, pretty_name)
+        dl_path = pretty_name
+    except Exception:
+        dl_path = ntf_path  # fallback
+    # return both: audio preview path, and a file path for DownloadButton
+    return ntf_path, dl_path
 CUSTOM_CSS = """
 .gradio-container { max-width: 860px !important; margin: 0 auto; }
 #wrap, #ref, #lang, #txt, #spd, #split, #out_audio, #dl {
+  background: #f8fafc !important;
+  border: 1px solid #e5e7eb !important;
   border-radius: 14px !important;
   padding: 14px !important;
 }
+#ref, #out_audio, #dl { background: #eef2ff !important; }
 #gen button, #gen { background: #10b981 !important; color: #fff !important; }
 #gen button:hover { filter: brightness(0.95); }
+/* hide HF/Gradio chrome */
 footer, .footer, #footer,
 a[href*="gradio.live"], a[href*="gradio.app"], a[href*="/api"], a[href*="hf.space"],
 button[aria-label="Settings"],
 [data-testid="block-analytics"], [data-testid="embed-info"] { display: none !important; }
 """
+with gr.Blocks(title="TalkClone - Voice Cloning & TTS", css=CUSTOM_CSS, analytics_enabled=False) as demo:
     with gr.Column(elem_id="wrap"):
         gr.Markdown("## TalkClone — Text-to-Speech with Voice Cloning")
         gr.Markdown("Upload a short **reference voice** (10–60s), choose **language**, enter **text**, then **Generate**. "
                     "On free CPU, keep text short or enable **Auto split** for speed.")
         ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath", elem_id="ref")
+        language  = gr.Dropdown(choices=LANG_LABELS, value="English", label="Language", elem_id="lang")
+        text      = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…", elem_id="txt")
+        speed     = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed", elem_id="spd")
+        split     = gr.Checkbox(value=True, label="Auto split long text by sentence", elem_id="split")
+        submit    = gr.Button("Generate", variant="primary", elem_id="gen")
+        output   = gr.Audio(label="Cloned Speech", type="filepath", interactive=False, elem_id="out_audio")
+        download = gr.DownloadButton(label="Download audio", elem_id="dl")
         def run_and_return(text, ref_audio, language, speed, split):
+            audio_path, dl_path = tts_clone(text, ref_audio, language, speed, split)
+            # set button to download the file we just wrote
+            return audio_path, gr.update(value=dl_path, label=f"Download ({os.path.basename(dl_path)})")
         submit.click(run_and_return,
                      inputs=[text, ref_audio, language, speed, split],