Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import pipeline, VitsTokenizer, VitsModel, set_seed | |
| import numpy as np | |
| import torch | |
| import io | |
| import soundfile as sf | |
# Checkpoints used by the three pipeline stages, named once so they are easy
# to locate and swap.
ASR_MODEL_ID = "facebook/s2t-small-librispeech-asr"
LLM_MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"
TTS_MODEL_ID = "facebook/mms-tts-eng"

# Stage 1: speech -> text.
transcriber = pipeline("automatic-speech-recognition", model=ASR_MODEL_ID)

# Stage 2: text -> text.
# NOTE(review): trust_remote_code=True lets the model repository supply its
# own Python code at load time; keep it only for checkpoints you trust.
generator = pipeline(
    "text-generation",
    model=LLM_MODEL_ID,
    trust_remote_code=True,
)

# Stage 3: text -> speech (VITS tokenizer and model from the same checkpoint).
tokenizer = VitsTokenizer.from_pretrained(TTS_MODEL_ID)
model = VitsModel.from_pretrained(TTS_MODEL_ID)
def transcribe_generate_and_speak(audio):
    """Run one recording through the ASR -> LLM -> TTS chain.

    Args:
        audio: A ``(sample_rate, samples)`` pair as delivered by the Gradio
            audio input, or ``None`` when nothing was recorded.

    Returns:
        The path of the WAV file containing the synthesized speech, or
        ``None`` when there is no audio to process.
    """
    # Submitting without a recording hands us None; there is nothing to do.
    if audio is None:
        return None

    sr, y = audio
    y = np.asarray(y, dtype=np.float32)
    if y.size == 0:
        # np.max would raise on an empty array.
        return None

    # The ASR pipeline takes a 1-D waveform. Multi-channel input is assumed
    # to be laid out as (samples, channels), so average over axis 1.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalise, but leave an all-zero (silent) signal untouched rather
    # than dividing by zero and filling it with NaN.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    # Transcribe audio
    asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]

    # Generate text based on ASR output
    # NOTE(review): max_length bounds prompt + generated tokens together, and
    # 'generated_text' normally includes the prompt; confirm that is intended.
    generated_text = generator(
        asr_output, max_length=100, num_return_sequences=1
    )[0]['generated_text']

    # Generate audio from text
    inputs = tokenizer(text=generated_text, return_tensors="pt")
    set_seed(555)  # fixed seed so the same text always yields the same audio
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]

    # NOTE(review): the output path is fixed, so every call overwrites the
    # previous result.
    waveform_path = "output.wav"
    # Take the sample rate from the TTS model config instead of hard-coding
    # it, so the file header stays correct if the checkpoint is swapped.
    sf.write(
        waveform_path,
        waveform.numpy(),
        model.config.sampling_rate,
        format='wav',
    )
    return waveform_path
# Build the web UI: one microphone recording in, one audio clip out.
audio_input = gr.Interface(
    fn=transcribe_generate_and_speak,
    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
    outputs="audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Start serving the app.
audio_input.launch()