lachieandmitch committed on
Commit 7477637 · verified · Parent: f754715

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ outputs/tts_20250608_125559.mp3 filter=lfs diff=lfs merge=lfs -text
+ outputs/tts_20250608_125559.wav filter=lfs diff=lfs merge=lfs -text
+ outputs/tts_20250608_125703.mp3 filter=lfs diff=lfs merge=lfs -text
+ outputs/tts_20250608_125703.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,40 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Project specific
+ output*.wav
+ *.pth
+ *.onnx
+ voices/
+ voices/*.pt
+ voices/**/*.pt
+ config.json
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
LICENSE ADDED
@@ -0,0 +1,190 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ Copyright 2025 PierrunoYT (Kokoro TTS Local)
179
+
180
+ Licensed under the Apache License, Version 2.0 (the "License");
181
+ you may not use this file except in compliance with the License.
182
+ You may obtain a copy of the License at
183
+
184
+ http://www.apache.org/licenses/LICENSE-2.0
185
+
186
+ Unless required by applicable law or agreed to in writing, software
187
+ distributed under the License is distributed on an "AS IS" BASIS,
188
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189
+ See the License for the specific language governing permissions and
190
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,369 @@
  ---
- title: Kokoro TTS Local
- emoji: 🏃
- colorFrom: purple
- colorTo: green
  sdk: gradio
  sdk_version: 5.33.0
- app_file: app.py
- pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Kokoro-TTS-Local
3
+ app_file: gradio_interface.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.33.0
 
 
6
  ---
7
+ # Kokoro TTS Local
8
 
9
+ A local implementation of the Kokoro Text-to-Speech model, featuring dynamic module loading, automatic dependency management, and a web interface.
10
+
11
+ ## Features
12
+
13
+ - Local text-to-speech synthesis using the Kokoro-82M model
14
+ - Multiple voice support with easy voice selection (54 voices available across 8 languages)
15
+ - Automatic model and voice downloading from Hugging Face
16
+ - Phoneme output support and visualization
17
+ - Interactive CLI and web interface
18
+ - Voice listing functionality
19
+ - Cross-platform support (Windows, Linux, macOS)
20
+ - Real-time generation progress display
21
+ - Multiple output formats (WAV, MP3, AAC)
22
+
23
+ ## Prerequisites
24
+
25
+ - Python 3.8 or higher
26
+ - FFmpeg (optional, for MP3/AAC conversion)
27
+ - CUDA-compatible GPU (optional, for faster generation)
28
+ - Git (for version control and package management)
29
+
30
+ ## Installation
31
+
32
+ 1. Clone the repository and create a Python virtual environment:
33
+ ```bash
34
+ # Windows
35
+ python -m venv venv
36
+ .\venv\Scripts\activate
37
+
38
+ # Linux/macOS
39
+ python3 -m venv venv
40
+ source venv/bin/activate
41
+ ```
42
+
43
+ 2. Install dependencies:
44
+ ```bash
45
+ pip install -r requirements.txt
46
+ ```
47
+
48
+ **Alternative Installation (Simplified):**
49
+ For a simpler setup, you can also install the official Kokoro package directly:
50
+ ```bash
51
+ pip install kokoro>=0.9.2 soundfile
52
+ apt-get install espeak-ng # On Linux
53
+ # or brew install espeak # On macOS
54
+ ```
55
+
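+ If you go this simplified route, a minimal generation loop looks roughly like the sketch below. This is not one of this repository's scripts; the voice name, output filenames, and input text are only examples:
+
+ ```python
+ import soundfile as sf
+ from kokoro import KPipeline
+
+ # 'a' selects American English; see the language codes listed later in this README.
+ pipeline = KPipeline(lang_code='a')
+
+ # The pipeline yields (graphemes, phonemes, audio) for each text segment.
+ for i, (gs, ps, audio) in enumerate(pipeline("Hello, world!", voice="af_bella", speed=1.0)):
+     sf.write(f"output_{i}.wav", audio, 24000)  # Kokoro produces 24 kHz audio
+ ```
+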
56
+ 3. (Optional) For GPU acceleration, install PyTorch with CUDA support:
57
+ ```bash
58
+ # For CUDA 11.8
59
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
60
+
61
+ # For CUDA 12.1
62
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
63
+
64
+ # For CUDA 12.6
65
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
66
+
67
+ # For CUDA 12.8 (for RTX 50-series cards)
68
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
69
+ ```
70
+
71
+ You can verify CUDA support is enabled with:
72
+ ```python
73
+ import torch
74
+ print(torch.cuda.is_available()) # Should print True if CUDA is available
75
+ ```
76
+
77
+ The system will automatically download required models and voice files on first run.
78
+
79
+ ## Usage
80
+
81
+ You can use either the command-line interface or the web interface:
82
+
83
+ ### Command Line Interface
84
+
85
+ Run the interactive CLI:
86
+ ```bash
87
+ python tts_demo.py
88
+ ```
89
+
90
+ The CLI provides an interactive menu with the following options:
91
+ 1. List available voices - Shows all available voice options
92
+ 2. Generate speech - Interactive process to:
93
+ - Select a voice from the numbered list
94
+ - Enter text to convert to speech
95
+ - Adjust speech speed (0.5-2.0)
96
+ 3. Exit - Quit the program
97
+
98
+ Example session:
99
+ ```
100
+ === Kokoro TTS Menu ===
101
+ 1. List available voices
102
+ 2. Generate speech
103
+ 3. Exit
104
+ Select an option (1-3): 2
105
+
106
+ Available voices:
107
+ 1. af_alloy
108
+ 2. af_aoede
109
+ 3. af_bella
110
+ ...
111
+
112
+ Select a voice number (or press Enter for default 'af_bella'): 3
113
+
114
+ Enter the text you want to convert to speech
115
+ (or press Enter for default text)
116
+ > Hello, world!
117
+
118
+ Enter speech speed (0.5-2.0, default 1.0): 1.2
119
+
120
+ Generating speech for: 'Hello, world!'
121
+ Using voice: af_bella
122
+ Speed: 1.2x
123
+ ...
124
+ ```
125
+
126
+ ### Web Interface
127
+
128
+ For a more user-friendly experience, launch the web interface:
129
+
130
+ ```bash
131
+ python gradio_interface.py
132
+ ```
133
+
134
+ Then open your browser to the URL shown in the console (typically http://localhost:7860).
135
+
136
+ The web interface provides:
137
+ - Easy voice selection from a dropdown menu
138
+ - Text input field with examples
139
+ - Real-time generation progress
140
+ - Audio playback in the browser
141
+ - Multiple output format options (WAV, MP3, AAC)
142
+ - Download options for generated audio
143
+
144
+ ## Available Voices
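+ If port 7860 is already taken, `create_interface()` in `gradio_interface.py` accepts `server_name` and `server_port` arguments. A small launcher script might look like this (a sketch; the host and port values are just examples):
+
+ ```python
+ # run_on_custom_port.py -- hypothetical helper script
+ from gradio_interface import create_interface
+
+ # Bind to localhost only and use a non-default port.
+ create_interface(server_name="127.0.0.1", server_port=7861)
+ ```
+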
145
+
146
+ The system includes 54 different voices across 8 languages:
147
+
148
+ ### 🇺🇸 American English (20 voices)
149
+ **Language code: 'a'**
150
+
151
+ **Female voices (af_*):**
152
+ - af_heart: ❤️ Premium quality voice (Grade A)
153
+ - af_alloy: Clear and professional (Grade C)
154
+ - af_aoede: Smooth and melodic (Grade C+)
155
+ - af_bella: 🔥 Warm and friendly (Grade A-)
156
+ - af_jessica: Natural and engaging (Grade D)
157
+ - af_kore: Bright and energetic (Grade C+)
158
+ - af_nicole: 🎧 Professional and articulate (Grade B-)
159
+ - af_nova: Modern and dynamic (Grade C)
160
+ - af_river: Soft and flowing (Grade D)
161
+ - af_sarah: Casual and approachable (Grade C+)
162
+ - af_sky: Light and airy (Grade C-)
163
+
164
+ **Male voices (am_*):**
165
+ - am_adam: Strong and confident (Grade F+)
166
+ - am_echo: Resonant and clear (Grade D)
167
+ - am_eric: Professional and authoritative (Grade D)
168
+ - am_fenrir: Deep and powerful (Grade C+)
169
+ - am_liam: Friendly and conversational (Grade D)
170
+ - am_michael: Warm and trustworthy (Grade C+)
171
+ - am_onyx: Rich and sophisticated (Grade D)
172
+ - am_puck: Playful and energetic (Grade C+)
173
+ - am_santa: Holiday-themed voice (Grade D-)
174
+
175
+ ### 🇬🇧 British English (8 voices)
176
+ **Language code: 'b'**
177
+
178
+ **Female voices (bf_*):**
179
+ - bf_alice: Refined and elegant (Grade D)
180
+ - bf_emma: Warm and professional (Grade B-)
181
+ - bf_isabella: Sophisticated and clear (Grade C)
182
+ - bf_lily: Sweet and gentle (Grade D)
183
+
184
+ **Male voices (bm_*):**
185
+ - bm_daniel: Polished and professional (Grade D)
186
+ - bm_fable: Storytelling and engaging (Grade C)
187
+ - bm_george: Classic British accent (Grade C)
188
+ - bm_lewis: Modern British accent (Grade D+)
189
+
190
+ ### 🇯🇵 Japanese (5 voices)
191
+ **Language code: 'j'**
192
+
193
+ **Female voices (jf_*):**
194
+ - jf_alpha: Standard Japanese female (Grade C+)
195
+ - jf_gongitsune: Based on classic tale (Grade C)
196
+ - jf_nezumi: Mouse bride tale voice (Grade C-)
197
+ - jf_tebukuro: Glove story voice (Grade C)
198
+
199
+ **Male voices (jm_*):**
200
+ - jm_kumo: Spider thread tale voice (Grade C-)
201
+
202
+ ### 🇨🇳 Mandarin Chinese (8 voices)
203
+ **Language code: 'z'**
204
+
205
+ **Female voices (zf_*):**
206
+ - zf_xiaobei: Chinese female voice (Grade D)
207
+ - zf_xiaoni: Chinese female voice (Grade D)
208
+ - zf_xiaoxiao: Chinese female voice (Grade D)
209
+ - zf_xiaoyi: Chinese female voice (Grade D)
210
+
211
+ **Male voices (zm_*):**
212
+ - zm_yunjian: Chinese male voice (Grade D)
213
+ - zm_yunxi: Chinese male voice (Grade D)
214
+ - zm_yunxia: Chinese male voice (Grade D)
215
+ - zm_yunyang: Chinese male voice (Grade D)
216
+
217
+ ### 🇪🇸 Spanish (3 voices)
218
+ **Language code: 'e'**
219
+
220
+ **Female voices (ef_*):**
221
+ - ef_dora: Spanish female voice
222
+
223
+ **Male voices (em_*):**
224
+ - em_alex: Spanish male voice
225
+ - em_santa: Spanish holiday voice
226
+
227
+ ### 🇫🇷 French (1 voice)
228
+ **Language code: 'f'**
229
+
230
+ **Female voices (ff_*):**
231
+ - ff_siwis: French female voice (Grade B-)
232
+
233
+ ### 🇮🇳 Hindi (4 voices)
234
+ **Language code: 'h'**
235
+
236
+ **Female voices (hf_*):**
237
+ - hf_alpha: Hindi female voice (Grade C)
238
+ - hf_beta: Hindi female voice (Grade C)
239
+
240
+ **Male voices (hm_*):**
241
+ - hm_omega: Hindi male voice (Grade C)
242
+ - hm_psi: Hindi male voice (Grade C)
243
+
244
+ ### 🇮🇹 Italian (2 voices)
245
+ **Language code: 'i'**
246
+
247
+ **Female voices (if_*):**
248
+ - if_sara: Italian female voice (Grade C)
249
+
250
+ **Male voices (im_*):**
251
+ - im_nicola: Italian male voice (Grade C)
252
+
253
+ ### 🇧🇷 Brazilian Portuguese (3 voices)
254
+ **Language code: 'p'**
255
+
256
+ **Female voices (pf_*):**
257
+ - pf_dora: Portuguese female voice
258
+
259
+ **Male voices (pm_*):**
260
+ - pm_alex: Portuguese male voice
261
+ - pm_santa: Portuguese holiday voice
262
+
263
+ **Note:** Quality grades (A to F) indicate the overall quality based on training data quality and duration. Higher grades generally produce better speech quality.
264
+
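+ The first letters of a voice name encode its language (for example `af_`/`am_` are American English, `jf_`/`jm_` are Japanese), which is how the interface picks the matching pipeline. Below is a small sketch for grouping locally downloaded voices by language; it assumes the voice files already exist in `voices/`:
+
+ ```python
+ from collections import defaultdict
+ from pathlib import Path
+
+ LANGUAGES = {'a': 'American English', 'b': 'British English', 'j': 'Japanese',
+              'z': 'Mandarin Chinese', 'e': 'Spanish', 'f': 'French',
+              'h': 'Hindi', 'i': 'Italian', 'p': 'Brazilian Portuguese'}
+
+ voices_by_language = defaultdict(list)
+ for voice_file in sorted(Path("voices").glob("*.pt")):
+     code = voice_file.stem[0]  # first letter of the voice name is the language code
+     voices_by_language[LANGUAGES.get(code, code)].append(voice_file.stem)
+
+ for language, names in voices_by_language.items():
+     print(f"{language}: {', '.join(names)}")
+ ```
+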
265
+ ## Project Structure
266
+
267
+ ```
268
+ .
269
+ ├── .cache/ # Cache directory for downloaded models
270
+ │ └── huggingface/ # Hugging Face model cache
271
+ ├── .git/ # Git repository data
272
+ ├── .gitignore # Git ignore rules
273
+ ├── __pycache__/ # Python cache files
274
+ ├── voices/ # Voice model files (downloaded on demand)
275
+ │ └── *.pt # Individual voice files
276
+ ├── venv/ # Python virtual environment
277
+ ├── outputs/ # Generated audio files directory
278
+ ├── LICENSE # Apache 2.0 License file
279
+ ├── README.md # Project documentation
280
+ ├── models.py # Core TTS model implementation
281
+ ├── gradio_interface.py # Web interface implementation
282
+ ├── config.json # Model configuration file
283
+ ├── requirements.txt # Python dependencies
284
+ └── tts_demo.py # CLI implementation
285
+ ```
286
+
287
+ ## Model Information
288
+
289
+ The project uses the latest Kokoro model from Hugging Face:
290
+ - Repository: [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)
291
+ - Model file: `kokoro-v1_0.pth` (downloaded automatically)
292
+ - Sample rate: 24kHz
293
+ - Voice files: Located in the `voices/` directory (downloaded automatically)
294
+ - Available voices: 54 voices across 8 languages
295
+ - Languages: American English ('a'), British English ('b'), Japanese ('j'), Mandarin Chinese ('z'), Spanish ('e'), French ('f'), Hindi ('h'), Italian ('i'), Brazilian Portuguese ('p')
296
+ - Model size: 82M parameters
297
+
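+ Downloads go through `huggingface_hub`, the same mechanism `models.py` uses for voice files. As a rough sketch, a single voice can be pre-fetched manually like this (the voice name is only an example, and `local_dir` behavior is assumed to preserve the `voices/` prefix):
+
+ ```python
+ from pathlib import Path
+ from huggingface_hub import hf_hub_download
+
+ Path("voices").mkdir(exist_ok=True)
+
+ # Pull one voice tensor from the hexgrad/Kokoro-82M repository.
+ local_path = hf_hub_download(
+     repo_id="hexgrad/Kokoro-82M",
+     filename="voices/af_bella.pt",
+     local_dir=".",  # the file lands under ./voices/ relative to local_dir
+ )
+ print(f"Voice saved to {local_path}")
+ ```
+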
298
+ ## Troubleshooting
299
+
300
+ Common issues and solutions:
301
+
302
+ 1. **Model Download Issues**
303
+ - Ensure stable internet connection
304
+ - Check Hugging Face is accessible
305
+ - Verify sufficient disk space
306
+ - Try clearing the `.cache/huggingface` directory
307
+
308
+ 2. **CUDA/GPU Issues**
309
+ - Verify CUDA installation with `nvidia-smi`
310
+ - Update GPU drivers
311
+ - Install PyTorch with CUDA support using the appropriate command:
312
+ ```bash
313
+ # For CUDA 11.8
314
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
315
+
316
+ # For CUDA 12.1
317
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
318
+
319
+ # For CUDA 12.6
320
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
321
+
322
+ # For CUDA 12.8 (for RTX 50-series cards)
323
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
324
+ ```
325
+ - Verify CUDA is available in PyTorch:
326
+ ```python
327
+ import torch
328
+ print(torch.cuda.is_available()) # Should print True
329
+ ```
330
+ - Fall back to CPU if needed
331
+
332
+ 3. **Audio Output Issues**
333
+ - Check system audio settings
334
+ - Verify output directory permissions
335
+ - Install FFmpeg for MP3/AAC support
336
+ - Try different output formats
337
+
338
+ 4. **Voice File Issues**
339
+ - Delete and let system redownload voice files
340
+ - Check `voices/` directory permissions
341
+ - Verify voice file integrity
342
+ - Try using a different voice
343
+
344
+ 5. **Web Interface Issues**
345
+ - Check port 7860 availability
346
+ - Try different browser
347
+ - Clear browser cache
348
+ - Check network firewall settings
349
+
350
+ For any other issues:
351
+ 1. Check the console output for error messages
352
+ 2. Verify all prerequisites are installed (a quick environment check sketch follows this list)
353
+ 3. Ensure virtual environment is activated
354
+ 4. Check system resource usage
355
+ 5. Try reinstalling dependencies
356
+
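+ The following is a minimal environment check pulling together the points above; it only reports status and changes nothing:
+
+ ```python
+ import shutil
+ from pathlib import Path
+ import torch
+
+ # FFmpeg is needed by pydub for MP3/AAC export.
+ print("FFmpeg found:", shutil.which("ffmpeg") is not None)
+
+ # CUDA is optional; generation falls back to CPU without it.
+ print("CUDA available:", torch.cuda.is_available())
+
+ # Voice files are downloaded into voices/ on first run.
+ voices = list(Path("voices").glob("*.pt"))
+ print(f"Voice files present: {len(voices)}")
+ ```
+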
357
+ ## Contributing
358
+
359
+ Feel free to contribute by:
360
+ 1. Opening issues for bugs or feature requests
361
+ 2. Submitting pull requests with improvements
362
+ 3. Helping with documentation
363
+ 4. Testing different voices and reporting issues
364
+ 5. Suggesting new features or optimizations
365
+ 6. Testing on different platforms and reporting results
366
+
367
+ ## License
368
+
369
+ Apache 2.0 - See LICENSE file for details
gradio_interface.py ADDED
@@ -0,0 +1,536 @@
1
+ """
2
+ Kokoro-TTS Local Generator
3
+ -------------------------
4
+ A Gradio interface for the Kokoro-TTS-Local text-to-speech system.
5
+ Supports multiple voices and audio formats, with cross-platform compatibility.
6
+
7
+ Key Features:
8
+ - Multiple voice models support (54 voices across 8 languages)
9
+ - Real-time generation with progress logging
10
+ - WAV, MP3, and AAC output formats
11
+ - Network sharing capabilities
12
+ - Cross-platform compatibility (Windows, macOS, Linux)
13
+
14
+ Dependencies:
15
+ - kokoro: Official Kokoro TTS library
16
+ - gradio: Web interface framework
17
+ - soundfile: Audio file handling
18
+ - pydub: Audio format conversion
19
+ """
20
+
21
+ import gradio as gr
22
+ import os
23
+ import sys
24
+ import platform
25
+ from datetime import datetime
26
+ import shutil
27
+ from pathlib import Path
28
+ import soundfile as sf
29
+ from pydub import AudioSegment
30
+ import torch
31
+ import numpy as np
32
+ from typing import Union, List, Optional, Tuple, Dict, Any
33
+ from models import (
34
+ list_available_voices, build_model,
35
+ generate_speech, download_voice_files
36
+ )
37
+ from kokoro import KPipeline
38
+ import speed_dial
39
+
40
+ # Define path type for consistent handling
41
+ PathLike = Union[str, Path]
42
+
43
+ # Configuration validation
44
+ def validate_sample_rate(rate: int) -> int:
45
+ """Validate sample rate is within acceptable range"""
46
+ valid_rates = [16000, 22050, 24000, 44100, 48000]
47
+ if rate not in valid_rates:
48
+ print(f"Warning: Unusual sample rate {rate}. Valid rates are {valid_rates}")
49
+ return 24000 # Default to safe value
50
+ return rate
51
+
52
+ # Global configuration
53
+ CONFIG_FILE = Path("tts_config.json") # Stores user preferences and paths
54
+ DEFAULT_OUTPUT_DIR = Path("outputs") # Directory for generated audio files
55
+ SAMPLE_RATE = validate_sample_rate(24000) # Validated sample rate
56
+
57
+ # Initialize model globally
58
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
59
+ model = None
60
+
61
+ LANG_MAP = {
62
+ "af_": "a", "am_": "a",
63
+ "bf_": "b", "bm_": "b",
64
+ "jf_": "j", "jm_": "j",
65
+ "zf_": "z", "zm_": "z",
66
+ "ef_": "e", "em_": "e",
67
+ "ff_": "f",
68
+ "hf_": "h", "hm_": "h",
69
+ "if_": "i", "im_": "i",
70
+ "pf_": "p", "pm_": "p",
71
+ }
72
+ pipelines = {}
73
+
74
+ def get_available_voices():
75
+ """Get list of available voice models."""
76
+ try:
77
+ # Initialize model to trigger voice downloads
78
+ global model
79
+ if model is None:
80
+ print("Initializing model and downloading voices...")
81
+ model = build_model(None, device)
82
+
83
+ voices = list_available_voices()
84
+ if not voices:
85
+ print("No voices found after initialization. Attempting to download...")
86
+ download_voice_files() # Try downloading again
87
+ voices = list_available_voices()
88
+
89
+ print("Available voices:", voices)
90
+ return voices
91
+ except Exception as e:
92
+ print(f"Error getting voices: {e}")
93
+ return []
94
+
95
+ def get_pipeline_for_voice(voice_name: str) -> KPipeline:
96
+ """
97
+ Determine the language code from the voice prefix and return the associated pipeline.
98
+ """
99
+ prefix = voice_name[:3].lower()
100
+ lang_code = LANG_MAP.get(prefix, "a")
101
+ if lang_code not in pipelines:
102
+ print(f"[INFO] Creating pipeline for lang_code='{lang_code}'")
103
+ pipelines[lang_code] = KPipeline(lang_code=lang_code, model=True)
104
+ return pipelines[lang_code]
105
+
106
+ def convert_audio(input_path: PathLike, output_path: PathLike, format: str) -> Optional[PathLike]:
107
+ """Convert audio to specified format.
108
+
109
+ Args:
110
+ input_path: Path to input audio file
111
+ output_path: Path to output audio file
112
+ format: Output format ('wav', 'mp3', or 'aac')
113
+
114
+ Returns:
115
+ Path to output file or None on error
116
+ """
117
+ try:
118
+ # Normalize paths
119
+ input_path = Path(input_path).absolute()
120
+ output_path = Path(output_path).absolute()
121
+
122
+ # Validate input file
123
+ if not input_path.exists():
124
+ raise FileNotFoundError(f"Input file not found: {input_path}")
125
+
126
+ # For WAV format, just return the input path
127
+ if format.lower() == "wav":
128
+ return input_path
129
+
130
+ # Create output directory if it doesn't exist
131
+ output_path.parent.mkdir(parents=True, exist_ok=True)
132
+
133
+ # Convert format
134
+ audio = AudioSegment.from_wav(str(input_path))
135
+
136
+ # Select proper format and options
137
+ if format.lower() == "mp3":
138
+ audio.export(str(output_path), format="mp3", bitrate="192k")
139
+ elif format.lower() == "aac":
140
+ audio.export(str(output_path), format="aac", bitrate="192k")
141
+ else:
142
+ raise ValueError(f"Unsupported format: {format}")
143
+
144
+ # Verify file was created
145
+ if not output_path.exists() or output_path.stat().st_size == 0:
146
+ raise IOError(f"Failed to create {format} file")
147
+
148
+ return output_path
149
+
150
+ except (IOError, FileNotFoundError, ValueError) as e:
151
+ print(f"Error converting audio: {type(e).__name__}: {e}")
152
+ return None
153
+ except Exception as e:
154
+ print(f"Unexpected error converting audio: {type(e).__name__}: {e}")
155
+ import traceback
156
+ traceback.print_exc()
157
+ return None
158
+
159
+ def generate_tts_with_logs(voice_name: str, text: str, format: str, speed: float = 1.0) -> Optional[PathLike]:
160
+ """Generate TTS audio with progress logging.
161
+
162
+ Args:
163
+ voice_name: Name of the voice to use
164
+ text: Text to convert to speech
165
+ format: Output format ('wav', 'mp3', 'aac')
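+ speed: Speech speed multiplier (0.5-2.0, default 1.0)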
166
+
167
+ Returns:
168
+ Path to generated audio file or None on error
169
+ """
170
+ global model
171
+
172
+ try:
173
+ # Initialize model if needed
174
+ if model is None:
175
+ print("Initializing model...")
176
+ model = build_model(None, device)
177
+
178
+ # Create output directory
179
+ DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
180
+
181
+ # Validate input text
182
+ if not text or not text.strip():
183
+ raise ValueError("Text input cannot be empty")
184
+
185
+ # Limit extremely long texts to prevent memory issues
186
+ MAX_CHARS = 5000
187
+ if len(text) > MAX_CHARS:
188
+ print(f"Warning: Text exceeds {MAX_CHARS} characters. Truncating to prevent memory issues.")
189
+ text = text[:MAX_CHARS] + "..."
190
+
191
+ # Generate base filename from text
192
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
193
+ base_name = f"tts_{timestamp}"
194
+ wav_path = DEFAULT_OUTPUT_DIR / f"{base_name}.wav"
195
+
196
+ # Generate speech
197
+ print(f"\nGenerating speech for: '{text}'")
198
+ print(f"Using voice: {voice_name}")
199
+
200
+ # Validate voice path using Path for consistent handling
201
+ voice_path = Path("voices").absolute() / f"{voice_name}.pt"
202
+ if not voice_path.exists():
203
+ raise FileNotFoundError(f"Voice file not found: {voice_path}")
204
+
205
+ try:
206
+ if voice_name.startswith(tuple(LANG_MAP.keys())):
207
+ pipeline = get_pipeline_for_voice(voice_name)
208
+ generator = pipeline(text, voice=voice_path, speed=speed, split_pattern=r'\n+')
209
+ else:
210
+ generator = model(text, voice=voice_path, speed=speed, split_pattern=r'\n+')
211
+
212
+ all_audio = []
213
+ max_segments = 100 # Safety limit for very long texts
214
+ segment_count = 0
215
+
216
+ for gs, ps, audio in generator:
217
+ segment_count += 1
218
+ if segment_count > max_segments:
219
+ print(f"Warning: Reached maximum segment limit ({max_segments})")
220
+ break
221
+
222
+ if audio is not None:
223
+ if isinstance(audio, np.ndarray):
224
+ audio = torch.from_numpy(audio).float()
225
+ all_audio.append(audio)
226
+ print(f"Generated segment: {gs}")
227
+ if ps: # Only print phonemes if available
228
+ print(f"Phonemes: {ps}")
229
+
230
+ if not all_audio:
231
+ raise Exception("No audio generated")
232
+ except Exception as e:
233
+ raise Exception(f"Error in speech generation: {e}")
234
+
235
+ # Combine audio segments and save
236
+ if not all_audio:
237
+ raise Exception("No audio segments were generated")
238
+
239
+ # Handle single segment case without concatenation
240
+ if len(all_audio) == 1:
241
+ final_audio = all_audio[0]
242
+ else:
243
+ try:
244
+ final_audio = torch.cat(all_audio, dim=0)
245
+ except RuntimeError as e:
246
+ raise Exception(f"Failed to concatenate audio segments: {e}")
247
+
248
+ # Save audio file
249
+ try:
250
+ sf.write(wav_path, final_audio.numpy(), SAMPLE_RATE)
251
+ except Exception as e:
252
+ raise Exception(f"Failed to save audio file: {e}")
253
+
254
+ # Convert to requested format if needed
255
+ if format.lower() != "wav":
256
+ output_path = DEFAULT_OUTPUT_DIR / f"{base_name}.{format.lower()}"
257
+ return convert_audio(wav_path, output_path, format.lower())
258
+
259
+ return wav_path
260
+
261
+ except Exception as e:
262
+ print(f"Error generating speech: {e}")
263
+ import traceback
264
+ traceback.print_exc()
265
+ return None
266
+
267
+ def create_interface(server_name="0.0.0.0", server_port=7860):
268
+ """Create and launch the Gradio interface."""
269
+
270
+ # Get available voices
271
+ voices = get_available_voices()
272
+ if not voices:
273
+ print("No voices found! Please check the voices directory.")
274
+ return
275
+
276
+ # Get speed dial presets
277
+ preset_names = speed_dial.get_preset_names()
278
+
279
+ # Create interface
280
+ with gr.Blocks(title="Kokoro TTS Generator") as interface:
281
+ gr.Markdown("# Kokoro TTS Generator")
282
+
283
+ with gr.Row():
284
+ with gr.Column(scale=2):
285
+ # Main TTS controls
286
+ voice = gr.Dropdown(
287
+ choices=voices,
288
+ value=voices[0] if voices else None,
289
+ label="Voice"
290
+ )
291
+ text = gr.Textbox(
292
+ lines=3,
293
+ placeholder="Enter text to convert to speech...",
294
+ label="Text"
295
+ )
296
+ with gr.Row():
297
+ format = gr.Radio(
298
+ choices=["wav", "mp3", "aac"],
299
+ value="wav",
300
+ label="Output Format"
301
+ )
302
+ speed = gr.Slider(
303
+ minimum=0.5,
304
+ maximum=2.0,
305
+ value=1.0,
306
+ step=0.1,
307
+ label="Speed"
308
+ )
309
+ generate = gr.Button("Generate Speech")
310
+
311
+ with gr.Column(scale=1):
312
+ # Speed dial section
313
+ gr.Markdown("## Speed Dial")
314
+ preset_dropdown = gr.Dropdown(
315
+ choices=preset_names,
316
+ value=preset_names[0] if preset_names else None,
317
+ label="Saved Presets",
318
+ interactive=True
319
+ )
320
+ preset_name = gr.Textbox(
321
+ placeholder="Enter preset name...",
322
+ label="New Preset Name"
323
+ )
324
+ with gr.Row():
325
+ load_preset = gr.Button("Load")
326
+ save_preset = gr.Button("Save Current")
327
+ delete_preset = gr.Button("Delete")
328
+
329
+ # Output section
330
+ output = gr.Audio(label="Generated Audio")
331
+
332
+ # Function to load a preset
333
+ def load_preset_fn(preset_name):
334
+ if not preset_name:
335
+ return None, None, None, None
336
+
337
+ preset = speed_dial.get_preset(preset_name)
338
+ if not preset:
339
+ return None, None, None, None
340
+
341
+ return preset["voice"], preset["text"], preset["format"], preset["speed"]
342
+
343
+ # Function to save a preset
344
+ def save_preset_fn(name, voice, text, format, speed):
345
+ if not name or not voice or not text:
346
+ return gr.update(value="Please provide a name, voice, and text")
347
+
348
+ success = speed_dial.save_preset(name, voice, text, format, speed)
349
+
350
+ # Update the dropdown with the new preset list
351
+ preset_names = speed_dial.get_preset_names()
352
+
353
+ if success:
354
+ return gr.update(choices=preset_names, value=name)
355
+ else:
356
+ return gr.update(choices=preset_names)
357
+
358
+ # Function to delete a preset
359
+ def delete_preset_fn(name):
360
+ if not name:
361
+ return gr.update(value="Please select a preset to delete")
362
+
363
+ success = speed_dial.delete_preset(name)
364
+
365
+ # Update the dropdown with the new preset list
366
+ preset_names = speed_dial.get_preset_names()
367
+
368
+ if success:
369
+ return gr.update(choices=preset_names, value=None)
370
+ else:
371
+ return gr.update(choices=preset_names)
372
+
373
+ # Connect the buttons to their functions
374
+ load_preset.click(
375
+ fn=load_preset_fn,
376
+ inputs=preset_dropdown,
377
+ outputs=[voice, text, format, speed]
378
+ )
379
+
380
+ save_preset.click(
381
+ fn=save_preset_fn,
382
+ inputs=[preset_name, voice, text, format, speed],
383
+ outputs=preset_dropdown
384
+ )
385
+
386
+ delete_preset.click(
387
+ fn=delete_preset_fn,
388
+ inputs=preset_dropdown,
389
+ outputs=preset_dropdown
390
+ )
391
+
392
+ # Connect the generate button
393
+ generate.click(
394
+ fn=generate_tts_with_logs,
395
+ inputs=[voice, text, format, speed],
396
+ outputs=output
397
+ )
398
+
399
+ # Launch interface
400
+ interface.launch(
401
+ server_name=server_name,
402
+ server_port=server_port,
403
+ share=True
404
+ )
405
+
406
+ def cleanup_resources():
407
+ """Properly clean up resources when the application exits"""
408
+ global model
409
+
410
+ try:
411
+ print("Cleaning up resources...")
412
+
413
+ # Clean up model resources
414
+ if model is not None:
415
+ print("Releasing model resources...")
416
+
417
+ # Clear voice dictionary to release memory
418
+ if hasattr(model, 'voices') and model.voices is not None:
419
+ try:
420
+ voice_count = len(model.voices)
421
+ for voice_name in list(model.voices.keys()):
422
+ try:
423
+ # Release each voice explicitly
424
+ model.voices[voice_name] = None
425
+ except:
426
+ pass
427
+ model.voices.clear()
428
+ print(f"Cleared {voice_count} voice references")
429
+ except Exception as ve:
430
+ print(f"Error clearing voices: {type(ve).__name__}: {ve}")
431
+
432
+ # Clear model attributes that might hold tensors
433
+ for attr_name in dir(model):
434
+ if not attr_name.startswith('__') and hasattr(model, attr_name):
435
+ try:
436
+ attr = getattr(model, attr_name)
437
+ # Handle specific tensor attributes
438
+ if isinstance(attr, torch.Tensor):
439
+ if attr.is_cuda:
440
+ print(f"Releasing CUDA tensor: {attr_name}")
441
+ setattr(model, attr_name, None)
442
+ elif hasattr(attr, 'to'): # Module or Tensor-like object
443
+ setattr(model, attr_name, None)
444
+ except:
445
+ pass
446
+
447
+ # Delete model reference
448
+ try:
449
+ del model
450
+ model = None
451
+ print("Model reference deleted")
452
+ except Exception as me:
453
+ print(f"Error deleting model: {type(me).__name__}: {me}")
454
+
455
+ # Clear CUDA memory explicitly
456
+ if torch.cuda.is_available():
457
+ try:
458
+ # Get initial memory usage
459
+ try:
460
+ initial = torch.cuda.memory_allocated()
461
+ initial_mb = initial / (1024 * 1024)
462
+ print(f"CUDA memory before cleanup: {initial_mb:.2f} MB")
463
+ except:
464
+ pass
465
+
466
+ # Free memory
467
+ print("Clearing CUDA cache...")
468
+ torch.cuda.empty_cache()
469
+
470
+ # Force synchronization
471
+ try:
472
+ torch.cuda.synchronize()
473
+ except:
474
+ pass
475
+
476
+ # Get final memory usage
477
+ try:
478
+ final = torch.cuda.memory_allocated()
479
+ final_mb = final / (1024 * 1024)
480
+ freed_mb = (initial - final) / (1024 * 1024)
481
+ print(f"CUDA memory after cleanup: {final_mb:.2f} MB (freed {freed_mb:.2f} MB)")
482
+ except:
483
+ pass
484
+ except Exception as ce:
485
+ print(f"Error clearing CUDA memory: {type(ce).__name__}: {ce}")
486
+
487
+ # Restore original functions
488
+ try:
489
+ from models import _cleanup_monkey_patches
490
+ _cleanup_monkey_patches()
491
+ print("Monkey patches restored")
492
+ except Exception as pe:
493
+ print(f"Error restoring monkey patches: {type(pe).__name__}: {pe}")
494
+
495
+ # Final garbage collection
496
+ try:
497
+ import gc
498
+ collected = gc.collect()
499
+ print(f"Garbage collection completed: {collected} objects collected")
500
+ except Exception as gce:
501
+ print(f"Error during garbage collection: {type(gce).__name__}: {gce}")
502
+
503
+ print("Cleanup completed")
504
+
505
+ except Exception as e:
506
+ print(f"Error during cleanup: {type(e).__name__}: {e}")
507
+ import traceback
508
+ traceback.print_exc()
509
+
510
+ # Register cleanup for normal exit
511
+ import atexit
512
+ atexit.register(cleanup_resources)
513
+
514
+ # Register cleanup for signals
515
+ import signal
516
+ import sys
517
+
518
+ def signal_handler(signum, frame):
519
+ print(f"\nReceived signal {signum}, shutting down...")
520
+ cleanup_resources()
521
+ sys.exit(0)
522
+
523
+ # Register for common signals
524
+ for sig in [signal.SIGINT, signal.SIGTERM]:
525
+ try:
526
+ signal.signal(sig, signal_handler)
527
+ except (ValueError, AttributeError):
528
+ # Some signals might not be available on all platforms
529
+ pass
530
+
531
+ if __name__ == "__main__":
532
+ try:
533
+ create_interface()
534
+ finally:
535
+ # Ensure cleanup even if Gradio encounters an error
536
+ cleanup_resources()
models.py ADDED
@@ -0,0 +1,651 @@
1
+ """Models module for Kokoro TTS Local"""
2
+ from typing import Optional, Tuple, List
3
+ import torch
4
+ from kokoro import KPipeline
5
+ import os
6
+ import json
7
+ import codecs
8
+ from pathlib import Path
9
+ import numpy as np
10
+ import shutil
11
+ import threading
12
+
13
+ # Set environment variables for proper encoding
14
+ os.environ["PYTHONIOENCODING"] = "utf-8"
15
+ # Disable symlinks warning
16
+ os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
17
+
18
+ # Setup for safer monkey-patching
19
+ import atexit
20
+ import signal
21
+ import sys
22
+
23
+ # Track whether patches have been applied
24
+ _patches_applied = {
25
+ 'json_load': False,
26
+ 'load_voice': False
27
+ }
28
+
29
+ def _cleanup_monkey_patches():
30
+ """Restore original functions that were monkey-patched"""
31
+ try:
32
+ if _patches_applied['json_load'] and _original_json_load is not None:
33
+ restore_json_load()
34
+ _patches_applied['json_load'] = False
35
+ print("Restored original json.load function")
36
+ except Exception as e:
37
+ print(f"Warning: Error restoring json.load: {e}")
38
+
39
+ try:
40
+ if _patches_applied['load_voice']:
41
+ restore_original_load_voice()
42
+ _patches_applied['load_voice'] = False
43
+ print("Restored original KPipeline.load_voice function")
44
+ except Exception as e:
45
+ print(f"Warning: Error restoring KPipeline.load_voice: {e}")
46
+
47
+ # Register cleanup for normal exit
48
+ atexit.register(_cleanup_monkey_patches)
49
+
50
+ # Register cleanup for signals
51
+ for sig in [signal.SIGINT, signal.SIGTERM]:
52
+ try:
53
+ signal.signal(sig, lambda signum, frame: (
54
+ print(f"\nReceived signal {signum}, cleaning up..."),
55
+ _cleanup_monkey_patches(),
56
+ sys.exit(1)
57
+ ))
58
+ except (ValueError, AttributeError):
59
+ # Some signals might not be available on all platforms
60
+ pass
61
+
62
+ # List of available voice files (54 voices across 8 languages)
63
+ VOICE_FILES = [
64
+ # American English Female voices (11 voices)
65
+ "af_heart.pt", "af_alloy.pt", "af_aoede.pt", "af_bella.pt", "af_jessica.pt",
66
+ "af_kore.pt", "af_nicole.pt", "af_nova.pt", "af_river.pt", "af_sarah.pt", "af_sky.pt",
67
+
68
+ # American English Male voices (9 voices)
69
+ "am_adam.pt", "am_echo.pt", "am_eric.pt", "am_fenrir.pt", "am_liam.pt",
70
+ "am_michael.pt", "am_onyx.pt", "am_puck.pt", "am_santa.pt",
71
+
72
+ # British English Female voices (4 voices)
73
+ "bf_alice.pt", "bf_emma.pt", "bf_isabella.pt", "bf_lily.pt",
74
+
75
+ # British English Male voices (4 voices)
76
+ "bm_daniel.pt", "bm_fable.pt", "bm_george.pt", "bm_lewis.pt",
77
+
78
+ # Japanese voices (5 voices)
79
+ "jf_alpha.pt", "jf_gongitsune.pt", "jf_nezumi.pt", "jf_tebukuro.pt", "jm_kumo.pt",
80
+
81
+ # Mandarin Chinese voices (8 voices)
82
+ "zf_xiaobei.pt", "zf_xiaoni.pt", "zf_xiaoxiao.pt", "zf_xiaoyi.pt",
83
+ "zm_yunjian.pt", "zm_yunxi.pt", "zm_yunxia.pt", "zm_yunyang.pt",
84
+
85
+ # Spanish voices (3 voices)
86
+ "ef_dora.pt", "em_alex.pt", "em_santa.pt",
87
+
88
+ # French voices (1 voice)
89
+ "ff_siwis.pt",
90
+
91
+ # Hindi voices (4 voices)
92
+ "hf_alpha.pt", "hf_beta.pt", "hm_omega.pt", "hm_psi.pt",
93
+
94
+ # Italian voices (2 voices)
95
+ "if_sara.pt", "im_nicola.pt",
96
+
97
+ # Brazilian Portuguese voices (3 voices)
98
+ "pf_dora.pt", "pm_alex.pt", "pm_santa.pt"
99
+ ]
100
+
101
+ # Language code mapping for different languages
102
+ LANGUAGE_CODES = {
103
+ 'a': 'American English',
104
+ 'b': 'British English',
105
+ 'j': 'Japanese',
106
+ 'z': 'Mandarin Chinese',
107
+ 'e': 'Spanish',
108
+ 'f': 'French',
109
+ 'h': 'Hindi',
110
+ 'i': 'Italian',
111
+ 'p': 'Brazilian Portuguese'
112
+ }
113
+
114
+ # Patch KPipeline's load_voice method to use weights_only=False
115
+ original_load_voice = KPipeline.load_voice
116
+
117
+ def patched_load_voice(self, voice_path):
118
+ """Load voice model with weights_only=False for compatibility"""
119
+ if not os.path.exists(voice_path):
120
+ raise FileNotFoundError(f"Voice file not found: {voice_path}")
121
+ voice_name = Path(voice_path).stem
122
+ try:
123
+ voice_model = torch.load(voice_path, weights_only=False)
124
+ if voice_model is None:
125
+ raise ValueError(f"Failed to load voice model from {voice_path}")
126
+ # Ensure device is set
127
+ if not hasattr(self, 'device'):
128
+ self.device = 'cpu'
129
+ # Move model to device and store in voices dictionary
130
+ self.voices[voice_name] = voice_model.to(self.device)
131
+ return self.voices[voice_name]
132
+ except Exception as e:
133
+ print(f"Error loading voice {voice_name}: {e}")
134
+ raise
135
+
136
+ # Apply the patch
137
+ KPipeline.load_voice = patched_load_voice
138
+ _patches_applied['load_voice'] = True
139
+
140
+ # Store original function for restoration if needed
141
+ def restore_original_load_voice():
142
+ global _patches_applied
143
+ if _patches_applied['load_voice']:
144
+ KPipeline.load_voice = original_load_voice
145
+ _patches_applied['load_voice'] = False
146
+
147
+ def patch_json_load():
148
+ """Patch json.load to handle UTF-8 encoded files with special characters"""
149
+ global _patches_applied, _original_json_load
150
+ original_load = json.load
151
+ _original_json_load = original_load # Store for restoration
152
+
153
+ def custom_load(fp, *args, **kwargs):
154
+ try:
155
+ # Try reading with UTF-8 encoding
156
+ if hasattr(fp, 'buffer'):
157
+ content = fp.buffer.read().decode('utf-8')
158
+ else:
159
+ content = fp.read()
160
+ try:
161
+ return json.loads(content)
162
+ except json.JSONDecodeError as e:
163
+ print(f"JSON parsing error: {e}")
164
+ raise
165
+ except UnicodeDecodeError:
166
+ # If UTF-8 fails, try with utf-8-sig for files with BOM
167
+ fp.seek(0)
168
+ content = fp.read()
169
+ if isinstance(content, bytes):
170
+ content = content.decode('utf-8-sig', errors='replace')
171
+ try:
172
+ return json.loads(content)
173
+ except json.JSONDecodeError as e:
174
+ print(f"JSON parsing error: {e}")
175
+ raise
176
+
177
+ json.load = custom_load
178
+ _patches_applied['json_load'] = True
179
+ return original_load # Return original for restoration
180
+
181
+ # Store the original load function for potential restoration
182
+ _original_json_load = None
183
+
184
+ def restore_json_load():
185
+ """Restore the original json.load function"""
186
+ global _original_json_load, _patches_applied
187
+ if _original_json_load is not None and _patches_applied['json_load']:
188
+ json.load = _original_json_load
189
+ _original_json_load = None
190
+ _patches_applied['json_load'] = False
191
+
192
+ def load_config(config_path: str) -> dict:
193
+ """Load configuration file with proper encoding handling"""
194
+ try:
195
+ with codecs.open(config_path, 'r', encoding='utf-8') as f:
196
+ return json.load(f)
197
+ except UnicodeDecodeError:
198
+ # Fallback to utf-8-sig if regular utf-8 fails
199
+ with codecs.open(config_path, 'r', encoding='utf-8-sig') as f:
200
+ return json.load(f)
201
+
202
+ # Initialize espeak-ng
203
+ phonemizer_available = False # Global flag to track if phonemizer is working
204
+ try:
205
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
206
+ from phonemizer import phonemize
207
+ import espeakng_loader
208
+
209
+ # Make library available first
210
+ library_path = espeakng_loader.get_library_path()
211
+ data_path = espeakng_loader.get_data_path()
212
+ espeakng_loader.make_library_available()
213
+
214
+ # Set up espeak-ng paths
215
+ EspeakWrapper.library_path = library_path
216
+ EspeakWrapper.data_path = data_path
217
+
218
+ # Verify espeak-ng is working
219
+ try:
220
+ test_phonemes = phonemize('test', language='en-us')
221
+ if test_phonemes:
222
+ phonemizer_available = True
223
+ print("Phonemizer successfully initialized")
224
+ else:
225
+ print("Note: Phonemization returned empty result")
226
+ print("TTS will work, but phoneme visualization will be disabled")
227
+ except Exception as e:
228
+ # Continue without espeak functionality
229
+ print(f"Note: Phonemizer not available: {e}")
230
+ print("TTS will work, but phoneme visualization will be disabled")
231
+
232
+ except ImportError as e:
233
+ print(f"Note: Phonemizer packages not installed: {e}")
234
+ print("TTS will work, but phoneme visualization will be disabled")
235
+ # Rather than automatically installing packages, inform the user
236
+ print("If you want phoneme visualization, manually install required packages:")
237
+ print("pip install espeakng-loader phonemizer-fork")
238
+
239
+ # Initialize pipeline globally with thread safety
240
+ _pipeline = None
241
+ _pipeline_lock = threading.RLock() # Reentrant lock for thread safety
242
+
243
+ def download_voice_files(voice_files=None, repo_version="main", required_count=1):
244
+ """Download voice files from Hugging Face.
245
+
246
+ Args:
247
+ voice_files: Optional list of voice files to download. If None, download all VOICE_FILES.
248
+ repo_version: Version/tag of the repository to use (default: "main")
249
+ required_count: Minimum number of voices required (default: 1)
250
+
251
+ Returns:
252
+ List of successfully downloaded voice files
253
+
254
+ Raises:
255
+ ValueError: If fewer than required_count voices could be downloaded
256
+ """
257
+ # Use absolute path for voices directory
258
+ voices_dir = Path(os.path.abspath("voices"))
259
+ voices_dir.mkdir(exist_ok=True)
260
+
261
+ # Import here to avoid startup dependency
262
+ from huggingface_hub import hf_hub_download
263
+ downloaded_voices = []
264
+ failed_voices = []
265
+
266
+ # If specific voice files are requested, use those. Otherwise use all.
267
+ files_to_download = voice_files if voice_files is not None else VOICE_FILES
268
+ total_files = len(files_to_download)
269
+
270
+ print(f"\nDownloading voice files... ({total_files} total files)")
271
+
272
+ # Check for existing voice files first
273
+ existing_files = []
274
+ for voice_file in files_to_download:
275
+ voice_path = voices_dir / voice_file
276
+ if voice_path.exists():
277
+ print(f"Voice file {voice_file} already exists")
278
+ downloaded_voices.append(voice_file)
279
+ existing_files.append(voice_file)
280
+
281
+ # Remove existing files from the download list
282
+ files_to_download = [f for f in files_to_download if f not in existing_files]
283
+ if not files_to_download and downloaded_voices:
284
+ print(f"All required voice files already exist ({len(downloaded_voices)} files)")
285
+ return downloaded_voices
286
+
287
+ # Proceed with downloading missing files
288
+ retry_count = 3
289
+ try:
290
+ import tempfile
291
+ with tempfile.TemporaryDirectory() as temp_dir:
292
+ for voice_file in files_to_download:
293
+ # Full path where the voice file should be
294
+ voice_path = voices_dir / voice_file
295
+
296
+ # Try with retries
297
+ for attempt in range(retry_count):
298
+ try:
299
+ print(f"Downloading {voice_file}... (attempt {attempt+1}/{retry_count})")
300
+ # Download to a temporary location first
301
+ temp_path = hf_hub_download(
302
+ repo_id="hexgrad/Kokoro-82M",
303
+ filename=f"voices/{voice_file}",
304
+ local_dir=temp_dir,
305
+ force_download=True,
306
+ revision=repo_version
307
+ )
308
+
309
+ # Move the file to the correct location
310
+ os.makedirs(os.path.dirname(str(voice_path)), exist_ok=True)
311
+ shutil.copy2(temp_path, str(voice_path)) # Use copy2 instead of move
312
+
313
+ # Verify file integrity
314
+ if os.path.getsize(str(voice_path)) > 0:
315
+ downloaded_voices.append(voice_file)
316
+ print(f"Successfully downloaded {voice_file}")
317
+ break # Success, exit retry loop
318
+ else:
319
+ print(f"Warning: Downloaded file {voice_file} has zero size, retrying...")
320
+ os.remove(str(voice_path)) # Remove invalid file
321
+ if attempt == retry_count - 1:
322
+ failed_voices.append(voice_file)
323
+ except (IOError, OSError, ValueError, FileNotFoundError, ConnectionError) as e:
324
+ print(f"Warning: Failed to download {voice_file} (attempt {attempt+1}): {e}")
325
+ if attempt == retry_count - 1:
326
+ failed_voices.append(voice_file)
327
+ print(f"Error: Failed all {retry_count} attempts to download {voice_file}")
328
+ except Exception as e:
329
+ print(f"Error during voice download process: {e}")
330
+ import traceback
331
+ traceback.print_exc()
332
+
333
+ # Report results
334
+ if failed_voices:
335
+ print(f"Warning: Failed to download {len(failed_voices)} voice files: {', '.join(failed_voices)}")
336
+
337
+ if not downloaded_voices:
338
+ error_msg = "No voice files could be downloaded. Please check your internet connection."
339
+ print(f"Error: {error_msg}")
340
+ raise ValueError(error_msg)
341
+ elif len(downloaded_voices) < required_count:
342
+ error_msg = f"Only {len(downloaded_voices)} voice files could be downloaded, but {required_count} were required."
343
+ print(f"Error: {error_msg}")
344
+ raise ValueError(error_msg)
345
+ else:
346
+ print(f"Successfully processed {len(downloaded_voices)} voice files")
347
+
348
+ return downloaded_voices
349
+
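# Illustrative sketch (editorial addition, not part of the committed file):
# downloading only a subset of voices. The file name "af_bella.pt" is an
# assumption based on the .pt naming used elsewhere in this module.
import models

ready = models.download_voice_files(voice_files=["af_bella.pt"], required_count=1)
print(f"Voices available locally: {ready}")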
350
+ def build_model(model_path: str, device: str, repo_version: str = "main") -> KPipeline:
351
+ """Build and return the Kokoro pipeline with proper encoding configuration
352
+
353
+ Args:
354
+ model_path: Path to the model file or None to use default
355
+ device: Device to use ('cuda' or 'cpu')
356
+ repo_version: Version/tag of the repository to use (default: "main")
357
+
358
+ Returns:
359
+ Initialized KPipeline instance
360
+ """
361
+ global _pipeline, _pipeline_lock, _original_json_load
362
+
363
+ # Use a lock for thread safety
364
+ with _pipeline_lock:
365
+ # Double-check pattern to avoid race conditions
366
+ if _pipeline is not None:
367
+ return _pipeline
368
+
369
+ try:
370
+ # Patch json loading before initializing pipeline
371
+ _original_json_load = patch_json_load()  # keep the original so restore_json_load() can undo the patch
372
+
373
+ # Download model if it doesn't exist
374
+ if model_path is None:
375
+ model_path = 'kokoro-v1_0.pth'
376
+
377
+ model_path = os.path.abspath(model_path)
378
+ if not os.path.exists(model_path):
379
+ print(f"Downloading model file {model_path}...")
380
+ try:
381
+ from huggingface_hub import hf_hub_download
382
+ model_path = hf_hub_download(
383
+ repo_id="hexgrad/Kokoro-82M",
384
+ filename="kokoro-v1_0.pth",
385
+ local_dir=".",
386
+ force_download=True,
387
+ revision=repo_version
388
+ )
389
+ print(f"Model downloaded to {model_path}")
390
+ except Exception as e:
391
+ print(f"Error downloading model: {e}")
392
+ raise ValueError(f"Could not download model: {e}") from e
393
+
394
+ # Download config if it doesn't exist
395
+ config_path = os.path.abspath("config.json")
396
+ if not os.path.exists(config_path):
397
+ print("Downloading config file...")
398
+ try:
399
+ from huggingface_hub import hf_hub_download  # may not be imported yet if the model file already existed
+ config_path = hf_hub_download(
400
+ repo_id="hexgrad/Kokoro-82M",
401
+ filename="config.json",
402
+ local_dir=".",
403
+ force_download=True,
404
+ revision=repo_version
405
+ )
406
+ print(f"Config downloaded to {config_path}")
407
+ except Exception as e:
408
+ print(f"Error downloading config: {e}")
409
+ raise ValueError(f"Could not download config: {e}") from e
410
+
411
+ # Download voice files - require at least one voice
412
+ try:
413
+ downloaded_voices = download_voice_files(repo_version=repo_version, required_count=1)
414
+ except ValueError as e:
415
+ print(f"Error: Voice files download failed: {e}")
416
+ raise ValueError("Voice files download failed") from e
417
+
418
+ # Validate language code
419
+ lang_code = 'a' # Default to 'a' for American English
420
+ supported_codes = list(LANGUAGE_CODES.keys())
421
+ if lang_code not in supported_codes:
422
+ print(f"Warning: Unsupported language code '{lang_code}'. Using 'a' (American English).")
423
+ print(f"Supported language codes: {', '.join(supported_codes)}")
424
+ lang_code = 'a'
425
+
426
+ # Initialize pipeline with validated language code
427
+ pipeline_instance = KPipeline(lang_code=lang_code)
428
+ if pipeline_instance is None:
429
+ raise ValueError("Failed to initialize KPipeline - pipeline is None")
430
+
431
+ # Store device parameter for reference in other operations
432
+ pipeline_instance.device = device
433
+
434
+ # Initialize voices dictionary if it doesn't exist
435
+ if not hasattr(pipeline_instance, 'voices'):
436
+ pipeline_instance.voices = {}
437
+
438
+ # Try to load the first available voice with improved error handling
439
+ voice_loaded = False
440
+ for voice_file in downloaded_voices:
441
+ voice_path = os.path.abspath(os.path.join("voices", voice_file))
442
+ if os.path.exists(voice_path):
443
+ try:
444
+ pipeline_instance.load_voice(voice_path)
445
+ print(f"Successfully loaded voice: {voice_file}")
446
+ voice_loaded = True
447
+ break # Successfully loaded a voice
448
+ except Exception as e:
449
+ print(f"Warning: Failed to load voice {voice_file}: {e}")
450
+ continue
451
+
452
+ if not voice_loaded:
453
+ print("Warning: Could not load any voice models")
454
+
455
+ # Set the global _pipeline only after successful initialization
456
+ _pipeline = pipeline_instance
457
+
458
+ except Exception as e:
459
+ print(f"Error initializing pipeline: {e}")
460
+ # Restore original json.load on error
461
+ restore_json_load()
462
+ raise
463
+
464
+ return _pipeline
465
+
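# Illustrative sketch (editorial addition, not part of the committed file):
# build_model() lazily creates one shared KPipeline behind _pipeline_lock, so
# repeated calls are cheap and return the same cached object.
import models

p1 = models.build_model(None, device="cpu")
p2 = models.build_model("kokoro-v1_0.pth", device="cpu")
assert p1 is p2  # the second call short-circuits and reuses the cached pipeline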
466
+ def list_available_voices() -> List[str]:
467
+ """List all available voice models"""
468
+ # Always use absolute path for consistency
469
+ voices_dir = Path(os.path.abspath("voices"))
470
+
471
+ # Create voices directory if it doesn't exist
472
+ if not voices_dir.exists():
473
+ print(f"Creating voices directory at {voices_dir}")
474
+ voices_dir.mkdir(exist_ok=True)
475
+ return []
476
+
477
+ # Get all .pt files in the voices directory
478
+ voice_files = list(voices_dir.glob("*.pt"))
479
+
480
+ # If we found voice files, return them
481
+ if voice_files:
482
+ return [f.stem for f in sorted(voice_files, key=lambda f: f.stem.lower())]
483
+
484
+ # If no voice files in standard location, check if we need to do a one-time migration
485
+ # This is legacy support for older installations
486
+ alt_voices_path = Path(".") / "voices"
487
+ if alt_voices_path.exists() and alt_voices_path.is_dir() and alt_voices_path != voices_dir:
488
+ print(f"Checking alternative voice location: {alt_voices_path.absolute()}")
489
+ alt_voice_files = list(alt_voices_path.glob("*.pt"))
490
+
491
+ if alt_voice_files:
492
+ print(f"Found {len(alt_voice_files)} voice files in alternate location")
493
+ print("Moving files to the standard voices directory...")
494
+
495
+ # Process files in a batch for efficiency
496
+ files_moved = 0
497
+ for voice_file in alt_voice_files:
498
+ target_path = voices_dir / voice_file.name
499
+ if not target_path.exists():
500
+ try:
501
+ # Use copy2 to preserve metadata; the original file is left in place
502
+ shutil.copy2(str(voice_file), str(target_path))
503
+ files_moved += 1
504
+ except (OSError, IOError) as e:
505
+ print(f"Error copying {voice_file.name}: {e}")
506
+
507
+ if files_moved > 0:
508
+ print(f"Successfully moved {files_moved} voice files")
509
+ return [f.stem for f in sorted(voices_dir.glob("*.pt"), key=lambda f: f.stem.lower())]
510
+
511
+ print("No voice files found. Please run the application again to download voices.")
512
+ return []
513
+
514
+ def get_language_code_from_voice(voice_name: str) -> str:
515
+ """Get the appropriate language code from a voice name
516
+
517
+ Args:
518
+ voice_name: Name of the voice (e.g., 'af_bella', 'jf_alpha')
519
+
520
+ Returns:
521
+ Language code for the voice
522
+ """
523
+ # Extract prefix from voice name
524
+ prefix = voice_name[:2] if len(voice_name) >= 2 else 'af'
525
+
526
+ # Map voice prefixes to language codes
527
+ prefix_to_lang = {
528
+ 'af': 'a', 'am': 'a', # American English
529
+ 'bf': 'b', 'bm': 'b', # British English
530
+ 'jf': 'j', 'jm': 'j', # Japanese
531
+ 'zf': 'z', 'zm': 'z', # Mandarin Chinese
532
+ 'ef': 'e', 'em': 'e', # Spanish
533
+ 'ff': 'f', 'fm': 'f', # French
534
+ 'hf': 'h', 'hm': 'h', # Hindi
535
+ 'if': 'i', 'im': 'i', # Italian
536
+ 'pf': 'p', 'pm': 'p', # Brazilian Portuguese
537
+ }
538
+
539
+ return prefix_to_lang.get(prefix, 'a') # Default to American English
540
+
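# Illustrative sketch (editorial addition, not part of the committed file):
# the prefix-to-language mapping above in practice.
import models

assert models.get_language_code_from_voice("af_bella") == "a"    # American English
assert models.get_language_code_from_voice("jf_alpha") == "j"    # Japanese
assert models.get_language_code_from_voice("xx_unknown") == "a"  # unknown prefix falls back to 'a'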
541
+ def load_voice(voice_name: str, device: str) -> torch.Tensor:
542
+ """Load a voice model in a thread-safe manner
543
+
544
+ Args:
545
+ voice_name: Name of the voice to load (with or without .pt extension)
546
+ device: Device to use ('cuda' or 'cpu')
547
+
548
+ Returns:
549
+ Loaded voice model tensor
550
+
551
+ Raises:
552
+ ValueError: If voice file not found or loading fails
553
+ """
554
+ pipeline = build_model(None, device)
555
+
556
+ # Format voice path correctly - strip .pt if it was included
557
+ voice_name = voice_name.replace('.pt', '')
558
+ voice_path = os.path.abspath(os.path.join("voices", f"{voice_name}.pt"))
559
+
560
+ if not os.path.exists(voice_path):
561
+ raise ValueError(f"Voice file not found: {voice_path}")
562
+
563
+ # Use a lock to ensure thread safety when loading voices
564
+ with _pipeline_lock:
565
+ # Check if voice is already loaded
566
+ if hasattr(pipeline, 'voices') and voice_name in pipeline.voices:
567
+ return pipeline.voices[voice_name]
568
+
569
+ # Load voice if not already loaded
570
+ return pipeline.load_voice(voice_path)
571
+
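# Illustrative sketch (editorial addition, not part of the committed file):
# load_voice() accepts the name with or without the .pt suffix and reuses the
# pipeline's voice cache on repeat calls.
import models

pack = models.load_voice("af_bella.pt", device="cpu")
pack_again = models.load_voice("af_bella", device="cpu")  # served from pipeline.voices if already cached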
572
+ def generate_speech(
573
+ model: KPipeline,
574
+ text: str,
575
+ voice: str,
576
+ lang: str = 'a',
577
+ device: str = 'cpu',
578
+ speed: float = 1.0
579
+ ) -> Tuple[Optional[torch.Tensor], Optional[str]]:
580
+ """Generate speech using the Kokoro pipeline in a thread-safe manner
581
+
582
+ Args:
583
+ model: KPipeline instance
584
+ text: Text to synthesize
585
+ voice: Voice name (e.g. 'af_bella')
586
+ lang: Language code ('a' for American English, 'b' for British English; see LANGUAGE_CODES for the full set)
587
+ device: Device to use ('cuda' or 'cpu')
588
+ speed: Speech speed multiplier (default: 1.0)
589
+
590
+ Returns:
591
+ Tuple of (audio tensor, phonemes string) or (None, None) on error
592
+ """
593
+ global _pipeline_lock
594
+
595
+ try:
596
+ if model is None:
597
+ raise ValueError("Model is None - pipeline not properly initialized")
598
+
599
+ # Format voice name and path
600
+ voice_name = voice.replace('.pt', '')
601
+ voice_path = os.path.abspath(os.path.join("voices", f"{voice_name}.pt"))
602
+
603
+ # Check if voice file exists
604
+ if not os.path.exists(voice_path):
605
+ raise ValueError(f"Voice file not found: {voice_path}")
606
+
607
+ # Thread-safe initialization of model properties and voice loading
608
+ with _pipeline_lock:
609
+ # Initialize voices dictionary if it doesn't exist
610
+ if not hasattr(model, 'voices'):
611
+ model.voices = {}
612
+
613
+ # Ensure device is set
614
+ if not hasattr(model, 'device'):
615
+ model.device = device
616
+
617
+ # Ensure voice is loaded before generating
618
+ if voice_name not in model.voices:
619
+ print(f"Loading voice {voice_name}...")
620
+ try:
621
+ model.load_voice(voice_path)
622
+ if voice_name not in model.voices:
623
+ raise ValueError("Voice load succeeded but voice not in model.voices dictionary")
624
+ except Exception as e:
625
+ raise ValueError(f"Failed to load voice {voice_name}: {e}")
626
+
627
+ # Generate speech (outside the lock for better concurrency)
628
+ print(f"Generating speech with device: {model.device}")
629
+ generator = model(
630
+ text,
631
+ voice=voice_path,
632
+ speed=speed,
633
+ split_pattern=r'\n+'
634
+ )
635
+
636
+ # Get first generated segment and convert numpy array to tensor if needed
637
+ for gs, ps, audio in generator:
638
+ if audio is not None:
639
+ if isinstance(audio, np.ndarray):
640
+ audio = torch.from_numpy(audio).float()
641
+ return audio, ps
642
+
643
+ return None, None
644
+ except (ValueError, FileNotFoundError, RuntimeError, KeyError, AttributeError, TypeError) as e:
645
+ print(f"Error generating speech: {e}")
646
+ return None, None
647
+ except Exception as e:
648
+ print(f"Unexpected error during speech generation: {e}")
649
+ import traceback
650
+ traceback.print_exc()
651
+ return None, None
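Taken together, the functions above form a small programmatic API. The sketch below is a minimal end-to-end example, assuming this file is saved as models.py (tts_demo.py imports it under that name) and that a voice such as af_bella has been downloaded; it is illustrative rather than part of the commit.

import soundfile as sf
import torch
from models import build_model, generate_speech

device = "cuda" if torch.cuda.is_available() else "cpu"
pipeline = build_model(None, device)  # downloads model, config and voices on first run
audio, phonemes = generate_speech(pipeline, "Hello from Kokoro.", voice="af_bella", device=device)
if audio is not None:
    sf.write("output.wav", audio.numpy(), 24000)  # 24 kHz, matching SAMPLE_RATE in tts_demo.py
    print("Saved output.wav | phonemes:", phonemes)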
outputs/tts_20250608_125559.mp3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad3317e6fef86203f76bc9bd0d47267575594a30d76af064a0427d868c8d04f3
3
+ size 4414125
outputs/tts_20250608_125559.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71e8cd896e8f7c5df4bbf721945771d44fe947375dab70ea3f34bc17f3c42017
3
+ size 10590044
outputs/tts_20250608_125703.mp3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e950dc144d38fa862cf1306cfa602d7f1a3181d1ab9130dab192c82a2b76fe6e
3
+ size 4677165
outputs/tts_20250608_125703.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7a99e011575460595ef14ae32b698775b264da96ca8ba96a29d4f8139cc7ad3
3
+ size 11221244
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ kokoro>=0.9.2 # Official Kokoro TTS library (v1.0 model support)
2
+ misaki # G2P library for Kokoro (multi-language support)
3
+ torch>=2.0.0 # PyTorch for model inference (for GPU support, see README.md for CUDA-specific installation)
4
+ soundfile>=0.12.1 # Audio file handling
5
+ huggingface-hub>=0.16.0 # Model downloads from Hugging Face
6
+ gradio>=4.0.0 # Web interface
7
+ pydub>=0.25.1 # For audio format conversion
8
+ espeakng-loader>=0.1.0 # For loading espeak-ng library
9
+ phonemizer-fork>=3.2.1 # For phoneme generation
10
+ wheel>=0.38.0 # For building packages
11
+ setuptools>=65.0.0 # For installing packages
12
+ num2words>=0.5.12 # For number to word conversion
13
+ spacy>=3.4.0 # For text processing
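A quick way to confirm these pinned dependencies resolved in the active environment is to import them. The sketch below is illustrative and assumes the distribution-to-module name mapping shown (notably, phonemizer-fork installs the phonemizer module and espeakng-loader installs espeakng_loader).

import importlib

for module in ("kokoro", "misaki", "torch", "soundfile", "huggingface_hub",
               "gradio", "pydub", "espeakng_loader", "phonemizer",
               "num2words", "spacy"):
    try:
        importlib.import_module(module)
        print(f"OK      {module}")
    except ImportError as exc:
        print(f"MISSING {module}: {exc}")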
speed_dial.py ADDED
@@ -0,0 +1,179 @@
1
+ """
2
+ Speed Dial Module for Kokoro-TTS-Local
3
+ --------------------------------------
4
+ Manages speed dial presets for quick access to frequently used voice and text combinations.
5
+
6
+ This module provides functions to:
7
+ - Load speed dial presets from a JSON file
8
+ - Save new presets to the JSON file
9
+ - Delete presets from the JSON file
10
+ - Validate preset data
11
+ """
12
+
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ from typing import Dict, List, Optional, Any
17
+
18
+ # Define the path for the speed dial presets file
19
+ SPEED_DIAL_FILE = Path("speed_dial.json")
20
+
21
+ def load_presets() -> Dict[str, Dict[str, Any]]:
22
+ """
23
+ Load speed dial presets from the JSON file.
24
+
25
+ Returns:
26
+ Dictionary of presets where keys are preset names and values are preset data
27
+ """
28
+ if not SPEED_DIAL_FILE.exists():
29
+ # If file doesn't exist, return an empty dictionary
30
+ return {}
31
+
32
+ try:
33
+ with open(SPEED_DIAL_FILE, 'r', encoding='utf-8') as f:
34
+ presets = json.load(f)
35
+
36
+ # Validate the loaded presets
37
+ validated_presets = {}
38
+ for name, preset in presets.items():
39
+ if validate_preset(preset):
40
+ validated_presets[name] = preset
41
+
42
+ return validated_presets
43
+ except (json.JSONDecodeError, IOError) as e:
44
+ print(f"Error loading speed dial presets: {e}")
45
+ return {}
46
+
47
+ def save_preset(name: str, voice: str, text: str, format: str = "wav", speed: float = 1.0) -> bool:
48
+ """
49
+ Save a new speed dial preset.
50
+
51
+ Args:
52
+ name: Name of the preset
53
+ voice: Voice to use
54
+ text: Text to convert to speech
55
+ format: Output format (default: "wav")
56
+ speed: Speech speed (default: 1.0)
57
+
58
+ Returns:
59
+ True if successful, False otherwise
60
+ """
61
+ # Create preset data
62
+ preset = {
63
+ "voice": voice,
64
+ "text": text,
65
+ "format": format,
66
+ "speed": speed
67
+ }
68
+
69
+ # Validate preset data
70
+ if not validate_preset(preset):
71
+ return False
72
+
73
+ # Load existing presets
74
+ presets = load_presets()
75
+
76
+ # Add or update the preset
77
+ presets[name] = preset
78
+
79
+ # Save presets to file
80
+ try:
81
+ with open(SPEED_DIAL_FILE, 'w', encoding='utf-8') as f:
82
+ json.dump(presets, f, indent=2, ensure_ascii=False)
83
+ return True
84
+ except IOError as e:
85
+ print(f"Error saving speed dial preset: {e}")
86
+ return False
87
+
88
+ def delete_preset(name: str) -> bool:
89
+ """
90
+ Delete a speed dial preset.
91
+
92
+ Args:
93
+ name: Name of the preset to delete
94
+
95
+ Returns:
96
+ True if successful, False otherwise
97
+ """
98
+ # Load existing presets
99
+ presets = load_presets()
100
+
101
+ # Check if preset exists
102
+ if name not in presets:
103
+ return False
104
+
105
+ # Remove the preset
106
+ del presets[name]
107
+
108
+ # Save presets to file
109
+ try:
110
+ with open(SPEED_DIAL_FILE, 'w', encoding='utf-8') as f:
111
+ json.dump(presets, f, indent=2, ensure_ascii=False)
112
+ return True
113
+ except IOError as e:
114
+ print(f"Error deleting speed dial preset: {e}")
115
+ return False
116
+
117
+ def validate_preset(preset: Dict[str, Any]) -> bool:
118
+ """
119
+ Validate a preset's data structure.
120
+
121
+ Args:
122
+ preset: Preset data to validate
123
+
124
+ Returns:
125
+ True if valid, False otherwise
126
+ """
127
+ # Check required fields
128
+ required_fields = ["voice", "text"]
129
+ for field in required_fields:
130
+ if field not in preset:
131
+ print(f"Preset missing required field: {field}")
132
+ return False
133
+
134
+ # Check field types
135
+ if not isinstance(preset.get("voice"), str):
136
+ print("Preset voice must be a string")
137
+ return False
138
+
139
+ if not isinstance(preset.get("text"), str):
140
+ print("Preset text must be a string")
141
+ return False
142
+
143
+ # Optional fields with defaults
144
+ if "format" not in preset:
145
+ preset["format"] = "wav"
146
+ elif not isinstance(preset["format"], str):
147
+ print("Preset format must be a string")
148
+ return False
149
+
150
+ if "speed" not in preset:
151
+ preset["speed"] = 1.0
152
+ elif not isinstance(preset["speed"], (int, float)):
153
+ print("Preset speed must be a number")
154
+ return False
155
+
156
+ return True
157
+
158
+ def get_preset_names() -> List[str]:
159
+ """
160
+ Get a list of all preset names.
161
+
162
+ Returns:
163
+ List of preset names
164
+ """
165
+ presets = load_presets()
166
+ return list(presets.keys())
167
+
168
+ def get_preset(name: str) -> Optional[Dict[str, Any]]:
169
+ """
170
+ Get a specific preset by name.
171
+
172
+ Args:
173
+ name: Name of the preset to get
174
+
175
+ Returns:
176
+ Preset data or None if not found
177
+ """
178
+ presets = load_presets()
179
+ return presets.get(name)
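The preset helpers above are most easily understood through a round trip. The sketch below is illustrative, assumes the module is importable as speed_dial (its committed file name), and uses made-up preset values.

import speed_dial

speed_dial.save_preset("morning greeting", voice="af_bella",
                       text="Good morning!", format="mp3", speed=1.1)
print(speed_dial.get_preset_names())            # includes 'morning greeting'
print(speed_dial.get_preset("morning greeting"))
speed_dial.delete_preset("morning greeting")    # True on success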
test.py ADDED
@@ -0,0 +1,2 @@
1
+ import torch
2
+ print(torch.cuda.is_available())
tts_demo.py ADDED
@@ -0,0 +1,447 @@
1
+ import torch
2
+ from typing import Optional, Tuple, List, Union
3
+ from models import build_model, generate_speech, list_available_voices
4
+ from tqdm.auto import tqdm
5
+ import soundfile as sf
6
+ from pathlib import Path
7
+ import numpy as np
8
+ import time
9
+ import os
10
+ import sys
11
+
12
+ # Define path type for consistent handling
13
+ PathLike = Union[str, Path]
14
+
15
+ # Constants with validation
16
+ def validate_sample_rate(rate: int) -> int:
17
+ """Validate sample rate is within acceptable range"""
18
+ valid_rates = [16000, 22050, 24000, 44100, 48000]
19
+ if rate not in valid_rates:
20
+ print(f"Warning: Unusual sample rate {rate}. Valid rates are {valid_rates}")
21
+ return 24000 # Default to safe value
22
+ return rate
23
+
24
+ def validate_language(lang: str) -> str:
25
+ """Validate language code"""
26
+ # Import here to avoid circular imports
27
+ from models import LANGUAGE_CODES
28
+ valid_langs = list(LANGUAGE_CODES.keys())
29
+ if lang not in valid_langs:
30
+ print(f"Warning: Invalid language code '{lang}'. Using 'a' (American English).")
31
+ print(f"Supported language codes: {', '.join(valid_langs)}")
32
+ return 'a' # Default to American English
33
+ return lang
34
+
35
+ # Define and validate constants
36
+ SAMPLE_RATE = validate_sample_rate(24000)
37
+ DEFAULT_MODEL_PATH = Path('kokoro-v1_0.pth').absolute()
38
+ DEFAULT_OUTPUT_FILE = Path('output.wav').absolute()
39
+ DEFAULT_LANGUAGE = validate_language('a') # 'a' for American English, 'b' for British English
40
+ DEFAULT_TEXT = "Hello, welcome to this text-to-speech test."
41
+
42
+ # Ensure output directory exists
43
+ DEFAULT_OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
44
+
45
+ # Configure tqdm for better Windows console support
46
+ tqdm.monitor_interval = 0
47
+
48
+ def print_menu():
49
+ """Print the main menu options."""
50
+ print("\n=== Kokoro TTS Menu ===")
51
+ print("1. List available voices")
52
+ print("2. Generate speech")
53
+ print("3. Exit")
54
+ return input("Select an option (1-3): ").strip()
55
+
56
+ def select_voice(voices: List[str]) -> str:
57
+ """Interactive voice selection."""
58
+ print("\nAvailable voices:")
59
+ for i, voice in enumerate(voices, 1):
60
+ print(f"{i}. {voice}")
61
+
62
+ while True:
63
+ try:
64
+ choice = input("\nSelect a voice number (or press Enter for default 'af_bella'): ").strip()
65
+ if not choice:
66
+ return "af_bella"
67
+ choice = int(choice)
68
+ if 1 <= choice <= len(voices):
69
+ return voices[choice - 1]
70
+ print("Invalid choice. Please try again.")
71
+ except ValueError:
72
+ print("Please enter a valid number.")
73
+
74
+ def get_text_input() -> str:
75
+ """Get text input from user."""
76
+ print("\nEnter the text you want to convert to speech")
77
+ print("(or press Enter for default text)")
78
+ text = input("> ").strip()
79
+ return text if text else DEFAULT_TEXT
80
+
81
+ def get_speed() -> float:
82
+ """Get speech speed from user."""
83
+ while True:
84
+ try:
85
+ speed = input("\nEnter speech speed (0.5-2.0, default 1.0): ").strip()
86
+ if not speed:
87
+ return 1.0
88
+ speed = float(speed)
89
+ if 0.5 <= speed <= 2.0:
90
+ return speed
91
+ print("Speed must be between 0.5 and 2.0")
92
+ except ValueError:
93
+ print("Please enter a valid number.")
94
+
95
+ def save_audio_with_retry(audio_data: np.ndarray, sample_rate: int, output_path: PathLike, max_retries: int = 3, retry_delay: float = 1.0) -> bool:
96
+ """
97
+ Attempt to save audio data to file with retry logic.
98
+
99
+ Args:
100
+ audio_data: Audio data as numpy array
101
+ sample_rate: Sample rate in Hz
102
+ output_path: Path to save the audio file
103
+ max_retries: Maximum number of retry attempts
104
+ retry_delay: Delay between retries in seconds
105
+
106
+ Returns:
107
+ True if successful, False otherwise
108
+ """
109
+ # Convert and normalize path to Path object
110
+ output_path = Path(output_path).absolute()
111
+
112
+ # Create parent directory if it doesn't exist
113
+ output_path.parent.mkdir(parents=True, exist_ok=True)
114
+
115
+ # Try to remove the file if it exists to avoid "file in use" issues
116
+ try:
117
+ if output_path.exists():
118
+ print(f"Removing existing file: {output_path}")
119
+ output_path.unlink()
120
+ except Exception as e:
121
+ print(f"Warning: Could not remove existing file: {e}")
122
+ print("This might indicate the file is in use by another program.")
123
+
124
+ for attempt in range(max_retries):
125
+ try:
126
+ # Validate audio data before saving
127
+ if audio_data is None or len(audio_data) == 0:
128
+ raise ValueError("Empty audio data")
129
+
130
+ # Check write permissions for the directory
131
+ if not os.access(str(output_path.parent), os.W_OK):
132
+ raise PermissionError(f"No write permission for directory: {output_path.parent}")
133
+
134
+ # Try to use a temporary file first, then rename it
135
+ temp_path = output_path.with_name(f"temp_{output_path.name}")
136
+
137
+ # Save audio file to temporary location
138
+ print(f"Saving audio to temporary file: {temp_path}")
139
+ sf.write(str(temp_path), audio_data, sample_rate)
140
+
141
+ # If successful, rename to final location
142
+ if temp_path.exists():
143
+ # Remove target file if it exists
144
+ if output_path.exists():
145
+ output_path.unlink()
146
+ # Rename temp file to target file
147
+ temp_path.rename(output_path)
148
+ print(f"Successfully renamed temporary file to: {output_path}")
149
+
150
+ return True
151
+
152
+ except (IOError, PermissionError) as e:
153
+ if attempt < max_retries - 1:
154
+ print(f"\nFailed to save audio (attempt {attempt + 1}/{max_retries}): {e}")
155
+ print("The output file might be in use by another program (e.g., media player).")
156
+ print(f"Please close any programs that might be using '{output_path}'")
157
+ print(f"Retrying in {retry_delay} seconds...")
158
+ time.sleep(retry_delay)
159
+ else:
160
+ print(f"\nError: Could not save audio after {max_retries} attempts: {e}")
161
+ print(f"Please ensure '{output_path}' is not open in any other program and try again.")
162
+ print(f"You might need to restart your computer if the file remains locked.")
163
+ return False
164
+ except Exception as e:
165
+ print(f"\nUnexpected error saving audio: {type(e).__name__}: {e}")
166
+ if attempt < max_retries - 1:
167
+ print(f"Retrying in {retry_delay} seconds...")
168
+ time.sleep(retry_delay)
169
+ else:
170
+ return False
171
+ finally:
172
+ # Clean up temp file if it exists and we failed
173
+ try:
174
+ temp_path = output_path.with_name(f"temp_{output_path.name}")
175
+ if temp_path.exists():
176
+ temp_path.unlink()
177
+ except Exception:
178
+ pass
179
+
180
+ return False
181
+
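# Illustrative sketch (editorial addition, not part of the committed file):
# save_audio_with_retry() works on any float numpy array; here a one-second
# 440 Hz sine tone stands in for generated speech. SAMPLE_RATE, Path and np
# are the names defined earlier in this file.
t = np.linspace(0, 1.0, SAMPLE_RATE, endpoint=False)
tone = 0.2 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
save_audio_with_retry(tone, SAMPLE_RATE, Path("sine_test.wav"))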
182
+ def main() -> None:
183
+ try:
184
+ # Set up device safely
185
+ try:
186
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
187
+ except (RuntimeError, AttributeError, ImportError) as e:
188
+ print(f"CUDA initialization error: {e}. Using CPU instead.")
189
+ device = 'cpu' # Fallback if CUDA check fails
190
+ print(f"Using device: {device}")
191
+
192
+ # Build model
193
+ print("\nInitializing model...")
194
+ with tqdm(total=1, desc="Building model") as pbar:
195
+ model = build_model(DEFAULT_MODEL_PATH, device)
196
+ pbar.update(1)
197
+
198
+ # Cache for voices to avoid redundant calls
199
+ voices_cache = None
200
+
201
+ while True:
202
+ choice = print_menu()
203
+
204
+ if choice == "1":
205
+ # List voices
206
+ voices_cache = list_available_voices()
207
+ print("\nAvailable voices:")
208
+ for voice in voices_cache:
209
+ print(f"- {voice}")
210
+
211
+ elif choice == "2":
212
+ # Generate speech
213
+ # Use cached voices if available
214
+ if voices_cache is None:
215
+ voices_cache = list_available_voices()
216
+
217
+ if not voices_cache:
218
+ print("No voices found! Please check the voices directory.")
219
+ continue
220
+
221
+ # Get user inputs
222
+ voice = select_voice(voices_cache)
223
+ text = get_text_input()
224
+
225
+ # Validate text (don't allow extremely long inputs)
226
+ if len(text) > 10000: # Reasonable limit for text length
227
+ print("Text is too long. Please enter a shorter text.")
228
+ continue
229
+
230
+ speed = get_speed()
231
+
232
+ print(f"\nGenerating speech for: '{text}'")
233
+ print(f"Using voice: {voice}")
234
+ print(f"Speed: {speed}x")
235
+
236
+ # Generate speech
237
+ all_audio = []
238
+ # Use Path object for consistent path handling
239
+ voice_path = Path("voices").absolute() / f"{voice}.pt"
240
+
241
+ # Verify voice file exists
242
+ if not voice_path.exists():
243
+ print(f"Error: Voice file not found: {voice_path}")
244
+ continue
245
+
246
+ # Set a timeout for generation with per-segment timeout
247
+ max_gen_time = 300 # 5 minutes max total
248
+ max_segment_time = 60 # 60 seconds max per segment
249
+ start_time = time.time()
250
+ segment_start_time = start_time
251
+
252
+ try:
253
+ # Setup watchdog timer for overall process
254
+ import threading
255
+ generation_complete = False
256
+
257
+ def watchdog_timer():
258
+ if not generation_complete:
259
+ print("\nWatchdog: Generation taking too long, process will be cancelled")
260
+ # Can't directly interrupt generator, but this will inform user
261
+
262
+ # Start watchdog timer
263
+ watchdog = threading.Timer(max_gen_time, watchdog_timer)
264
+ watchdog.daemon = True # Don't prevent program exit
265
+ watchdog.start()
266
+
267
+ # Initialize generator
268
+ try:
269
+ generator = model(text, voice=voice_path, speed=speed, split_pattern=r'\n+')
270
+ except (ValueError, TypeError, RuntimeError) as e:
271
+ print(f"Error initializing speech generator: {e}")
272
+ watchdog.cancel()
273
+ continue
274
+ except Exception as e:
275
+ print(f"Unexpected error initializing generator: {type(e).__name__}: {e}")
276
+ watchdog.cancel()
277
+ continue
278
+
279
+ # Process segments
280
+ with tqdm(desc="Generating speech") as pbar:
281
+ for gs, ps, audio in generator:
282
+ # Check overall timeout
283
+ current_time = time.time()
284
+ if current_time - start_time > max_gen_time:
285
+ print("\nWarning: Total generation time exceeded limit, stopping")
286
+ break
287
+
288
+ # Check per-segment timeout
289
+ segment_elapsed = current_time - segment_start_time
290
+ if segment_elapsed > max_segment_time:
291
+ print(f"\nWarning: Segment took too long ({segment_elapsed:.1f}s), stopping")
292
+ break
293
+
294
+ # Reset segment timer
295
+ segment_start_time = current_time
296
+
297
+ # Process audio if available
298
+ if audio is not None:
299
+ # Only convert if it's a numpy array, not if already tensor
300
+ audio_tensor = audio if isinstance(audio, torch.Tensor) else torch.from_numpy(audio).float()
301
+
302
+ all_audio.append(audio_tensor)
303
+ print(f"\nGenerated segment: {gs}")
304
+ if ps: # Only print phonemes if available
305
+ print(f"Phonemes: {ps}")
306
+ pbar.update(1)
307
+
308
+ # Mark generation as complete (for watchdog)
309
+ generation_complete = True
310
+ watchdog.cancel()
311
+
312
+ except ValueError as e:
313
+ print(f"Value error during speech generation: {e}")
314
+ except RuntimeError as e:
315
+ print(f"Runtime error during speech generation: {e}")
316
+ # If CUDA out of memory, provide more helpful message
317
+ if "CUDA out of memory" in str(e):
318
+ print("CUDA out of memory error - try using a shorter text or switching to CPU")
319
+ except KeyError as e:
320
+ print(f"Key error during speech generation: {e}")
321
+ print("This might be caused by a missing voice configuration")
322
+ except FileNotFoundError as e:
323
+ print(f"File not found: {e}")
324
+ except Exception as e:
325
+ print(f"Unexpected error during speech generation: {type(e).__name__}: {e}")
326
+ import traceback
327
+ traceback.print_exc()
328
+
329
+ # Save audio
330
+ if all_audio:
331
+ try:
332
+ # Handle single segment case without concatenation
333
+ if len(all_audio) == 1:
334
+ final_audio = all_audio[0]
335
+ else:
336
+ try:
337
+ final_audio = torch.cat(all_audio, dim=0)
338
+ except RuntimeError as e:
339
+ print(f"Error concatenating audio segments: {e}")
340
+ continue
341
+
342
+ # Use consistent Path object
343
+ output_path = Path(DEFAULT_OUTPUT_FILE)
344
+ if save_audio_with_retry(final_audio.numpy(), SAMPLE_RATE, output_path):
345
+ print(f"\nAudio saved to {output_path}")
346
+ # Play a system beep to indicate completion
347
+ try:
348
+ print('\a') # ASCII bell - should make a sound on most systems
349
+ except Exception:
350
+ pass
351
+ else:
352
+ print("Failed to save audio file")
353
+ except Exception as e:
354
+ print(f"Error processing audio: {type(e).__name__}: {e}")
355
+ else:
356
+ print("Error: Failed to generate audio")
357
+
358
+ elif choice == "3":
359
+ print("\nGoodbye!")
360
+ break
361
+
362
+ else:
363
+ print("\nInvalid choice. Please try again.")
364
+
365
+ except Exception as e:
366
+ print(f"Error in main: {e}")
367
+ import traceback
368
+ traceback.print_exc()
369
+ finally:
370
+ # Comprehensive cleanup with error handling
371
+ try:
372
+ print("\nPerforming cleanup...")
373
+
374
+ # Ensure model is properly released
375
+ if 'model' in locals() and model is not None:
376
+ print("Cleaning up model resources...")
377
+ # First clear any references to voice models
378
+ if hasattr(model, 'voices'):
379
+ try:
380
+ voices_count = len(model.voices)
381
+ model.voices.clear()
382
+ print(f"Cleared {voices_count} voice references")
383
+ except Exception as voice_error:
384
+ print(f"Error clearing voice references: {voice_error}")
385
+
386
+ # Clear any other model attributes that might hold references
387
+ try:
388
+ for attr in list(model.__dict__.keys()):
389
+ if hasattr(model, attr) and not attr.startswith('__'):
390
+ try:
391
+ delattr(model, attr)
392
+ except Exception:
393
+ pass
394
+ except Exception as attr_error:
395
+ print(f"Error clearing model attributes: {attr_error}")
396
+
397
+ # Then delete the model
398
+ try:
399
+ del model
400
+ model = None
401
+ print("Model reference deleted")
402
+ except Exception as del_error:
403
+ print(f"Error deleting model: {del_error}")
404
+
405
+ # Clean up voice cache
406
+ if 'voices_cache' in locals() and voices_cache is not None:
407
+ try:
408
+ voices_cache.clear()
409
+ voices_cache = None
410
+ print("Voice cache cleared")
411
+ except Exception as cache_error:
412
+ print(f"Error clearing voice cache: {cache_error}")
413
+
414
+ # Clean up any CUDA resources
415
+ if torch.cuda.is_available():
416
+ try:
417
+ print("Cleaning up CUDA resources...")
418
+ torch.cuda.empty_cache()
419
+ print("CUDA cache emptied")
420
+ except Exception as cuda_error:
421
+ print(f"Error clearing CUDA cache: {cuda_error}")
422
+
423
+ # Make sure patched functions are restored
424
+ try:
425
+ from models import _cleanup_monkey_patches
426
+ _cleanup_monkey_patches()
427
+ print("Monkey patches restored")
428
+ except Exception as patch_error:
429
+ print(f"Error restoring monkey patches: {patch_error}")
430
+
431
+ # Final garbage collection
432
+ try:
433
+ import gc
434
+ gc.collect()
435
+ print("Garbage collection completed")
436
+ except Exception as gc_error:
437
+ print(f"Error during garbage collection: {gc_error}")
438
+
439
+ print("Cleanup completed")
440
+
441
+ except Exception as e:
442
+ print(f"Error during cleanup: {type(e).__name__}: {e}")
443
+ import traceback
444
+ traceback.print_exc()
445
+
446
+ if __name__ == "__main__":
447
+ main()