gagndeep committed on
Commit
fa63eb7
·
verified ·
1 Parent(s): 32bb546

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +206 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import io
4
+ import numpy as np
5
+ from pydub import AudioSegment
6
+ import tempfile
7
+ import os
8
+
9
+ # Create a custom theme for the application
10
# Base look: Gradio's built-in "Soft" theme with a blue/indigo/slate palette,
# the Inter web font, and large text/spacing with medium corner radius.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
)

# Fine-tune individual theme variables on top of the base theme. Values that
# start with "*" point at other theme variables (here, shades of the primary
# hue) rather than literal CSS values.
custom_theme = custom_theme.set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)
23
+
24
def _decode_audio_payload(item):
    """Return raw audio bytes from one element of the API response's "data" list.

    JSON cannot carry raw bytes, so the audio is expected to arrive as text.
    Accepted shapes: a base64 string, a ``data:...;base64,...`` URI, or a dict
    whose ``"data"`` key holds one of those.

    NOTE(review): the remote response schema is not visible from this file;
    confirm these shapes against the actual Space.

    Raises:
        ValueError: If the element holds no decodable audio payload.
    """
    import base64  # local import keeps this helper self-contained

    if isinstance(item, dict):
        item = item.get("data")
    if isinstance(item, (bytes, bytearray)):
        return bytes(item)
    if not isinstance(item, str) or not item:
        raise ValueError("Unsupported audio payload in VibeVoice API response")

    # Drop an optional "data:<mime>;base64," header; plain base64 has no comma.
    _, _, encoded = item.rpartition(",")
    return base64.b64decode(encoded)


def vibevoice_conversion(audio_file, speaker_id="default"):
    """
    Convert audio using the VibeVoice Realtime 0.5B model.

    The input is normalized to WAV, sent to the remote VibeVoice Space, and the
    returned audio is written to a new temporary ``.wav`` file.

    Args:
        audio_file: Either a filesystem path to an audio file, or a
            ``(sample_rate, numpy_array)`` tuple. A 2-D array is treated as
            ``(samples, channels)``.
        speaker_id: Speaker style identifier forwarded to the API.

    Returns:
        Path of a temporary WAV file holding the converted audio. The file is
        created with ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If no audio was supplied, the API call fails, or the
            response carries no usable audio.
    """
    import base64  # local import: only needed to make the audio JSON-safe

    # Validate before the try block so the message reaches the user unwrapped.
    if audio_file is None:
        raise gr.Error("Please upload an audio file")

    # Hostname follows the Space id "anycoderapps/VibeVoice-Realtime-0.5B".
    # NOTE(review): confirm the endpoint path against the deployed Space.
    api_url = "https://anycoderapps-vibevoice-realtime-0-5b.hf.space/run/predict"

    temp_audio_path = None
    try:
        # Reserve a temp path; pydub writes the normalized WAV to it below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name

        if isinstance(audio_file, tuple):
            sample_rate, audio_data = audio_file
            # For 2-D input the channel count is the second axis; the first
            # axis is the number of samples.
            channels = 1 if len(audio_data.shape) == 1 else audio_data.shape[1]
            audio_segment = AudioSegment(
                audio_data.tobytes(),
                frame_rate=sample_rate,
                sample_width=audio_data.dtype.itemsize,
                channels=channels,
            )
        else:
            audio_segment = AudioSegment.from_file(audio_file)
        audio_segment.export(temp_audio_path, format="wav")

        with open(temp_audio_path, "rb") as f:
            audio_bytes = f.read()

        # Raw bytes are not JSON-serializable, so ship the WAV as a base64
        # data URI.
        # NOTE(review): payload layout assumed; verify with the Space's API.
        encoded_audio = base64.b64encode(audio_bytes).decode("ascii")
        payload = {
            "data": [
                {
                    "name": "input.wav",
                    "data": f"data:audio/wav;base64,{encoded_audio}",
                },
                speaker_id,
            ]
        }

        # A timeout keeps a stalled remote Space from hanging the worker.
        response = requests.post(api_url, json=payload, timeout=120)

        if response.status_code != 200:
            raise gr.Error(
                f"VibeVoice API request failed with status code: {response.status_code}"
            )

        result = response.json()
        if "data" not in result or len(result["data"]) == 0:
            raise gr.Error("No audio data received from VibeVoice API")

        converted_audio_bytes = _decode_audio_payload(result["data"][0])

        # delete=False: the caller (Gradio) needs the file after we return.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_converted:
            temp_converted.write(converted_audio_bytes)
            return temp_converted.name

    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred during voice conversion: {str(e)}") from e
    finally:
        # Always remove the intermediate input file, on success or failure.
        if temp_audio_path is not None and os.path.exists(temp_audio_path):
            os.unlink(temp_audio_path)
95
+
96
def process_audio(audio_file, speaker_id):
    """
    Run voice conversion for the UI and report a status message.

    Args:
        audio_file: Audio input from the Gradio component (forwarded as-is).
        speaker_id: Selected speaker style (forwarded as-is).

    Returns:
        A ``(converted_audio_path, status_message)`` tuple: one value for each
        of the two output components wired to the convert button.

    Raises:
        gr.Error: If the conversion fails.
    """
    try:
        converted_audio_path = vibevoice_conversion(audio_file, speaker_id)
    except gr.Error:
        # Already user-facing; re-wrapping would stack redundant prefixes.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing audio: {str(e)}") from e

    # The click handler declares two outputs (audio + status textbox), so two
    # values must be returned.
    return converted_audio_path, "Voice conversion complete!"
109
+
110
+ # Create the Gradio interface
111
# The browser-tab title is set here, on the Blocks constructor.
with gr.Blocks(title="VibeVoice Realtime 0.5B - Voice Conversion") as demo:
    gr.Markdown("# 🎤 VibeVoice Realtime 0.5B - Voice Conversion")
    gr.Markdown("""
    ### Convert your voice to different styles using the VibeVoice Realtime 0.5B model

    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**

    Upload an audio file and select a speaker style to convert your voice. The VibeVoice model can transform your voice while preserving the emotional content and prosody.
    """)

    with gr.Row():
        # Left column: audio input, style picker and the trigger button.
        with gr.Column():
            gr.Markdown("### Input Audio")
            input_audio = gr.Audio(
                label="Upload your audio file",
                type="filepath",
                sources=["upload", "microphone"],
                format="wav"
            )

            speaker_style = gr.Dropdown(
                choices=[
                    "default",
                    "female_1",
                    "male_1",
                    "child",
                    "elderly",
                    "emotional"
                ],
                value="default",
                label="Select Speaker Style"
            )

            convert_btn = gr.Button("🔄 Convert Voice", variant="primary", size="lg")

        # Right column: converted audio plus a read-only status line.
        with gr.Column():
            gr.Markdown("### Converted Audio")
            output_audio = gr.Audio(
                label="Converted Audio",
                type="filepath",
                format="wav"
            )

            status_text = gr.Textbox(
                label="Status",
                value="Ready to convert your voice!",
                interactive=False
            )

    # Add examples
    # NOTE(review): these example.com URLs are placeholders and do not point
    # at real audio; replace them with real sample files before release.
    examples = gr.Examples(
        examples=[
            ["https://example.com/sample1.wav", "female_1"],
            ["https://example.com/sample2.wav", "male_1"],
            ["https://example.com/sample3.wav", "emotional"]
        ],
        inputs=[input_audio, speaker_style],
        label="Try these examples:"
    )

    # Set up the conversion event. Two output components are listed, so the
    # handler must return two values (audio path, status text).
    convert_btn.click(
        fn=process_audio,
        inputs=[input_audio, speaker_style],
        outputs=[output_audio, status_text],
        api_visibility="public",
        api_name="convert_voice"
    )

    gr.Markdown("""
    ### About VibeVoice Realtime 0.5B
    - **Model**: VibeVoice Realtime 0.5B
    - **Size**: 0.5 Billion parameters
    - **Features**: Real-time voice conversion with emotional preservation
    - **Capabilities**: Speaker style transfer, emotional content preservation, high-quality voice conversion

    ### Tips for Best Results
    - Use clear, high-quality audio recordings
    - Speak naturally and expressively
    - For best results, use audio samples of 5-15 seconds
    - The model preserves emotional content and prosody from the original voice
    """)
193
+
194
+ # Launch the application with custom theme and settings
195
# Start the server with the custom theme and footer links.
# `title` and `description` are not passed here: they are not launch()
# keyword arguments and would raise TypeError at startup. A page title
# belongs on gr.Blocks(title=...); the description text is already rendered
# by the Markdown header inside the layout.
demo.launch(
    theme=custom_theme,
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "VibeVoice Model", "url": "https://huggingface.co/spaces/anycoderapps/VibeVoice-Realtime-0.5B"},
        {"label": "Gradio", "url": "https://gradio.app"},
        {"label": "Hugging Face", "url": "https://huggingface.co"}
    ],
    show_error=True
)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ requests
2
+ pydub
3
+ gradio>=6.0
4
+ numpy
5
+ Pillow
6
+ scipy
7
+ librosa
8
+ soundfile
9
+ pandas
10
+ uvicorn
11
+ fastapi
12
+ pydantic
13
+ python-multipart
14
+ aiofiles