lachieandmitch committed on
Commit 7477637 · verified · Parent: f754715

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ outputs/tts_20250608_125559.mp3 filter=lfs diff=lfs merge=lfs -text
+ outputs/tts_20250608_125559.wav filter=lfs diff=lfs merge=lfs -text
+ outputs/tts_20250608_125703.mp3 filter=lfs diff=lfs merge=lfs -text
+ outputs/tts_20250608_125703.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,40 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Project specific
+ output*.wav
+ *.pth
+ *.onnx
+ voices/
+ voices/*.pt
+ voices/**/*.pt
+ config.json
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
LICENSE ADDED
@@ -0,0 +1,190 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ Copyright 2025 PierrunoYT (Kokoro TTS Local)
179
+
180
+ Licensed under the Apache License, Version 2.0 (the "License");
181
+ you may not use this file except in compliance with the License.
182
+ You may obtain a copy of the License at
183
+
184
+ http://www.apache.org/licenses/LICENSE-2.0
185
+
186
+ Unless required by applicable law or agreed to in writing, software
187
+ distributed under the License is distributed on an "AS IS" BASIS,
188
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189
+ See the License for the specific language governing permissions and
190
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,369 @@
  ---
- title: Kokoro TTS Local
- emoji: 🏃
- colorFrom: purple
- colorTo: green
  sdk: gradio
  sdk_version: 5.33.0
- app_file: app.py
- pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Kokoro-TTS-Local
3
+ app_file: gradio_interface.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.33.0
 
 
6
  ---
7
+ # Kokoro TTS Local
8
 
9
+ A local implementation of the Kokoro Text-to-Speech model, featuring dynamic module loading, automatic dependency management, and a web interface.
10
+
11
+ ## Features
12
+
13
+ - Local text-to-speech synthesis using the Kokoro-82M model
14
+ - Multiple voice support with easy voice selection (54 voices available across 8 languages)
15
+ - Automatic model and voice downloading from Hugging Face
16
+ - Phoneme output support and visualization
17
+ - Interactive CLI and web interface
18
+ - Voice listing functionality
19
+ - Cross-platform support (Windows, Linux, macOS)
20
+ - Real-time generation progress display
21
+ - Multiple output formats (WAV, MP3, AAC)
22
+
23
+ ## Prerequisites
24
+
25
+ - Python 3.8 or higher
26
+ - FFmpeg (optional, for MP3/AAC conversion)
27
+ - CUDA-compatible GPU (optional, for faster generation)
28
+ - Git (for version control and package management)
29
+
30
+ ## Installation
31
+
32
+ 1. Clone the repository and create a Python virtual environment:
33
+ ```bash
34
+ # Windows
35
+ python -m venv venv
36
+ .\venv\Scripts\activate
37
+
38
+ # Linux/macOS
39
+ python3 -m venv venv
40
+ source venv/bin/activate
41
+ ```
42
+
43
+ 2. Install dependencies:
44
+ ```bash
45
+ pip install -r requirements.txt
46
+ ```
47
+
48
+ **Alternative Installation (Simplified):**
49
+ For a simpler setup, you can also install the official Kokoro package directly:
50
+ ```bash
51
+ pip install kokoro>=0.9.2 soundfile
52
+ apt-get install espeak-ng # On Linux
53
+ # or brew install espeak # On macOS
54
+ ```
55
+
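+ If you go this simplified route, a minimal generation loop looks roughly like the sketch below. This is not one of this repository's scripts; the voice name, output filenames, and input text are only examples:
+
+ ```python
+ import soundfile as sf
+ from kokoro import KPipeline
+
+ # 'a' selects American English; see the language codes listed later in this README.
+ pipeline = KPipeline(lang_code='a')
+
+ # The pipeline yields (graphemes, phonemes, audio) for each text segment.
+ for i, (gs, ps, audio) in enumerate(pipeline("Hello, world!", voice="af_bella", speed=1.0)):
+     sf.write(f"output_{i}.wav", audio, 24000)  # Kokoro produces 24 kHz audio
+ ```
+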
56
+ 3. (Optional) For GPU acceleration, install PyTorch with CUDA support:
57
+ ```bash
58
+ # For CUDA 11.8
59
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
60
+
61
+ # For CUDA 12.1
62
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
63
+
64
+ # For CUDA 12.6
65
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
66
+
67
+ # For CUDA 12.8 (for RTX 50-series cards)
68
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
69
+ ```
70
+
71
+ You can verify CUDA support is enabled with:
72
+ ```python
73
+ import torch
74
+ print(torch.cuda.is_available()) # Should print True if CUDA is available
75
+ ```
76
+
77
+ The system will automatically download required models and voice files on first run.
78
+
79
+ ## Usage
80
+
81
+ You can use either the command-line interface or the web interface:
82
+
83
+ ### Command Line Interface
84
+
85
+ Run the interactive CLI:
86
+ ```bash
87
+ python tts_demo.py
88
+ ```
89
+
90
+ The CLI provides an interactive menu with the following options:
91
+ 1. List available voices - Shows all available voice options
92
+ 2. Generate speech - Interactive process to:
93
+ - Select a voice from the numbered list
94
+ - Enter text to convert to speech
95
+ - Adjust speech speed (0.5-2.0)
96
+ 3. Exit - Quit the program
97
+
98
+ Example session:
99
+ ```
100
+ === Kokoro TTS Menu ===
101
+ 1. List available voices
102
+ 2. Generate speech
103
+ 3. Exit
104
+ Select an option (1-3): 2
105
+
106
+ Available voices:
107
+ 1. af_alloy
108
+ 2. af_aoede
109
+ 3. af_bella
110
+ ...
111
+
112
+ Select a voice number (or press Enter for default 'af_bella'): 3
113
+
114
+ Enter the text you want to convert to speech
115
+ (or press Enter for default text)
116
+ > Hello, world!
117
+
118
+ Enter speech speed (0.5-2.0, default 1.0): 1.2
119
+
120
+ Generating speech for: 'Hello, world!'
121
+ Using voice: af_bella
122
+ Speed: 1.2x
123
+ ...
124
+ ```
125
+
126
+ ### Web Interface
127
+
128
+ For a more user-friendly experience, launch the web interface:
129
+
130
+ ```bash
131
+ python gradio_interface.py
132
+ ```
133
+
134
+ Then open your browser to the URL shown in the console (typically http://localhost:7860).
135
+
136
+ The web interface provides:
137
+ - Easy voice selection from a dropdown menu
138
+ - Text input field with examples
139
+ - Real-time generation progress
140
+ - Audio playback in the browser
141
+ - Multiple output format options (WAV, MP3, AAC)
142
+ - Download options for generated audio
143
+
144
+ ## Available Voices
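+ If port 7860 is already taken, `create_interface()` in `gradio_interface.py` accepts `server_name` and `server_port` arguments. A small launcher script might look like this (a sketch; the host and port values are just examples):
+
+ ```python
+ # run_on_custom_port.py -- hypothetical helper script
+ from gradio_interface import create_interface
+
+ # Bind to localhost only and use a non-default port.
+ create_interface(server_name="127.0.0.1", server_port=7861)
+ ```
+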
145
+
146
+ The system includes 54 different voices across 8 languages:
147
+
148
+ ### 🇺🇸 American English (20 voices)
149
+ **Language code: 'a'**
150
+
151
+ **Female voices (af_*):**
152
+ - af_heart: ❤️ Premium quality voice (Grade A)
153
+ - af_alloy: Clear and professional (Grade C)
154
+ - af_aoede: Smooth and melodic (Grade C+)
155
+ - af_bella: 🔥 Warm and friendly (Grade A-)
156
+ - af_jessica: Natural and engaging (Grade D)
157
+ - af_kore: Bright and energetic (Grade C+)
158
+ - af_nicole: 🎧 Professional and articulate (Grade B-)
159
+ - af_nova: Modern and dynamic (Grade C)
160
+ - af_river: Soft and flowing (Grade D)
161
+ - af_sarah: Casual and approachable (Grade C+)
162
+ - af_sky: Light and airy (Grade C-)
163
+
164
+ **Male voices (am_*):**
165
+ - am_adam: Strong and confident (Grade F+)
166
+ - am_echo: Resonant and clear (Grade D)
167
+ - am_eric: Professional and authoritative (Grade D)
168
+ - am_fenrir: Deep and powerful (Grade C+)
169
+ - am_liam: Friendly and conversational (Grade D)
170
+ - am_michael: Warm and trustworthy (Grade C+)
171
+ - am_onyx: Rich and sophisticated (Grade D)
172
+ - am_puck: Playful and energetic (Grade C+)
173
+ - am_santa: Holiday-themed voice (Grade D-)
174
+
175
+ ### 🇬🇧 British English (8 voices)
176
+ **Language code: 'b'**
177
+
178
+ **Female voices (bf_*):**
179
+ - bf_alice: Refined and elegant (Grade D)
180
+ - bf_emma: Warm and professional (Grade B-)
181
+ - bf_isabella: Sophisticated and clear (Grade C)
182
+ - bf_lily: Sweet and gentle (Grade D)
183
+
184
+ **Male voices (bm_*):**
185
+ - bm_daniel: Polished and professional (Grade D)
186
+ - bm_fable: Storytelling and engaging (Grade C)
187
+ - bm_george: Classic British accent (Grade C)
188
+ - bm_lewis: Modern British accent (Grade D+)
189
+
190
+ ### 🇯🇵 Japanese (5 voices)
191
+ **Language code: 'j'**
192
+
193
+ **Female voices (jf_*):**
194
+ - jf_alpha: Standard Japanese female (Grade C+)
195
+ - jf_gongitsune: Based on classic tale (Grade C)
196
+ - jf_nezumi: Mouse bride tale voice (Grade C-)
197
+ - jf_tebukuro: Glove story voice (Grade C)
198
+
199
+ **Male voices (jm_*):**
200
+ - jm_kumo: Spider thread tale voice (Grade C-)
201
+
202
+ ### 🇨🇳 Mandarin Chinese (8 voices)
203
+ **Language code: 'z'**
204
+
205
+ **Female voices (zf_*):**
206
+ - zf_xiaobei: Chinese female voice (Grade D)
207
+ - zf_xiaoni: Chinese female voice (Grade D)
208
+ - zf_xiaoxiao: Chinese female voice (Grade D)
209
+ - zf_xiaoyi: Chinese female voice (Grade D)
210
+
211
+ **Male voices (zm_*):**
212
+ - zm_yunjian: Chinese male voice (Grade D)
213
+ - zm_yunxi: Chinese male voice (Grade D)
214
+ - zm_yunxia: Chinese male voice (Grade D)
215
+ - zm_yunyang: Chinese male voice (Grade D)
216
+
217
+ ### 🇪🇸 Spanish (3 voices)
218
+ **Language code: 'e'**
219
+
220
+ **Female voices (ef_*):**
221
+ - ef_dora: Spanish female voice
222
+
223
+ **Male voices (em_*):**
224
+ - em_alex: Spanish male voice
225
+ - em_santa: Spanish holiday voice
226
+
227
+ ### 🇫🇷 French (1 voice)
228
+ **Language code: 'f'**
229
+
230
+ **Female voices (ff_*):**
231
+ - ff_siwis: French female voice (Grade B-)
232
+
233
+ ### 🇮🇳 Hindi (4 voices)
234
+ **Language code: 'h'**
235
+
236
+ **Female voices (hf_*):**
237
+ - hf_alpha: Hindi female voice (Grade C)
238
+ - hf_beta: Hindi female voice (Grade C)
239
+
240
+ **Male voices (hm_*):**
241
+ - hm_omega: Hindi male voice (Grade C)
242
+ - hm_psi: Hindi male voice (Grade C)
243
+
244
+ ### 🇮🇹 Italian (2 voices)
245
+ **Language code: 'i'**
246
+
247
+ **Female voices (if_*):**
248
+ - if_sara: Italian female voice (Grade C)
249
+
250
+ **Male voices (im_*):**
251
+ - im_nicola: Italian male voice (Grade C)
252
+
253
+ ### 🇧🇷 Brazilian Portuguese (3 voices)
254
+ **Language code: 'p'**
255
+
256
+ **Female voices (pf_*):**
257
+ - pf_dora: Portuguese female voice
258
+
259
+ **Male voices (pm_*):**
260
+ - pm_alex: Portuguese male voice
261
+ - pm_santa: Portuguese holiday voice
262
+
263
+ **Note:** Quality grades (A to F) indicate the overall quality based on training data quality and duration. Higher grades generally produce better speech quality.
264
+
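+ The first letters of a voice name encode its language (for example `af_`/`am_` are American English, `jf_`/`jm_` are Japanese), which is how the interface picks the matching pipeline. Below is a small sketch for grouping locally downloaded voices by language; it assumes the voice files already exist in `voices/`:
+
+ ```python
+ from collections import defaultdict
+ from pathlib import Path
+
+ LANGUAGES = {'a': 'American English', 'b': 'British English', 'j': 'Japanese',
+              'z': 'Mandarin Chinese', 'e': 'Spanish', 'f': 'French',
+              'h': 'Hindi', 'i': 'Italian', 'p': 'Brazilian Portuguese'}
+
+ voices_by_language = defaultdict(list)
+ for voice_file in sorted(Path("voices").glob("*.pt")):
+     code = voice_file.stem[0]  # first letter of the voice name is the language code
+     voices_by_language[LANGUAGES.get(code, code)].append(voice_file.stem)
+
+ for language, names in voices_by_language.items():
+     print(f"{language}: {', '.join(names)}")
+ ```
+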
265
+ ## Project Structure
266
+
267
+ ```
268
+ .
269
+ ├── .cache/ # Cache directory for downloaded models
270
+ │ └── huggingface/ # Hugging Face model cache
271
+ ├── .git/ # Git repository data
272
+ ├── .gitignore # Git ignore rules
273
+ ├── __pycache__/ # Python cache files
274
+ ├── voices/ # Voice model files (downloaded on demand)
275
+ │ └── *.pt # Individual voice files
276
+ ├── venv/ # Python virtual environment
277
+ ├── outputs/ # Generated audio files directory
278
+ ├── LICENSE # Apache 2.0 License file
279
+ ├── README.md # Project documentation
280
+ ├── models.py # Core TTS model implementation
281
+ ├── gradio_interface.py # Web interface implementation
282
+ ├── config.json # Model configuration file
283
+ ├── requirements.txt # Python dependencies
284
+ └── tts_demo.py # CLI implementation
285
+ ```
286
+
287
+ ## Model Information
288
+
289
+ The project uses the latest Kokoro model from Hugging Face:
290
+ - Repository: [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)
291
+ - Model file: `kokoro-v1_0.pth` (downloaded automatically)
292
+ - Sample rate: 24kHz
293
+ - Voice files: Located in the `voices/` directory (downloaded automatically)
294
+ - Available voices: 54 voices across 8 languages
295
+ - Languages: American English ('a'), British English ('b'), Japanese ('j'), Mandarin Chinese ('z'), Spanish ('e'), French ('f'), Hindi ('h'), Italian ('i'), Brazilian Portuguese ('p')
296
+ - Model size: 82M parameters
297
+
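+ Downloads go through `huggingface_hub`, the same mechanism `models.py` uses for voice files. As a rough sketch, a single voice can be pre-fetched manually like this (the voice name is only an example, and `local_dir` behavior is assumed to preserve the `voices/` prefix):
+
+ ```python
+ from pathlib import Path
+ from huggingface_hub import hf_hub_download
+
+ Path("voices").mkdir(exist_ok=True)
+
+ # Pull one voice tensor from the hexgrad/Kokoro-82M repository.
+ local_path = hf_hub_download(
+     repo_id="hexgrad/Kokoro-82M",
+     filename="voices/af_bella.pt",
+     local_dir=".",  # the file lands under ./voices/ relative to local_dir
+ )
+ print(f"Voice saved to {local_path}")
+ ```
+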
298
+ ## Troubleshooting
299
+
300
+ Common issues and solutions:
301
+
302
+ 1. **Model Download Issues**
303
+ - Ensure stable internet connection
304
+ - Check Hugging Face is accessible
305
+ - Verify sufficient disk space
306
+ - Try clearing the `.cache/huggingface` directory
307
+
308
+ 2. **CUDA/GPU Issues**
309
+ - Verify CUDA installation with `nvidia-smi`
310
+ - Update GPU drivers
311
+ - Install PyTorch with CUDA support using the appropriate command:
312
+ ```bash
313
+ # For CUDA 11.8
314
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
315
+
316
+ # For CUDA 12.1
317
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
318
+
319
+ # For CUDA 12.6
320
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
321
+
322
+ # For CUDA 12.8 (for RTX 50-series cards)
323
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
324
+ ```
325
+ - Verify CUDA is available in PyTorch:
326
+ ```python
327
+ import torch
328
+ print(torch.cuda.is_available()) # Should print True
329
+ ```
330
+ - Fall back to CPU if needed
331
+
332
+ 3. **Audio Output Issues**
333
+ - Check system audio settings
334
+ - Verify output directory permissions
335
+ - Install FFmpeg for MP3/AAC support
336
+ - Try different output formats
337
+
338
+ 4. **Voice File Issues**
339
+ - Delete and let system redownload voice files
340
+ - Check `voices/` directory permissions
341
+ - Verify voice file integrity
342
+ - Try using a different voice
343
+
344
+ 5. **Web Interface Issues**
345
+ - Check port 7860 availability
346
+ - Try different browser
347
+ - Clear browser cache
348
+ - Check network firewall settings
349
+
350
+ For any other issues:
351
+ 1. Check the console output for error messages
352
+ 2. Verify all prerequisites are installed (a quick environment check sketch follows this list)
353
+ 3. Ensure virtual environment is activated
354
+ 4. Check system resource usage
355
+ 5. Try reinstalling dependencies
356
+
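+ The following is a minimal environment check pulling together the points above; it only reports status and changes nothing:
+
+ ```python
+ import shutil
+ from pathlib import Path
+ import torch
+
+ # FFmpeg is needed by pydub for MP3/AAC export.
+ print("FFmpeg found:", shutil.which("ffmpeg") is not None)
+
+ # CUDA is optional; generation falls back to CPU without it.
+ print("CUDA available:", torch.cuda.is_available())
+
+ # Voice files are downloaded into voices/ on first run.
+ voices = list(Path("voices").glob("*.pt"))
+ print(f"Voice files present: {len(voices)}")
+ ```
+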
357
+ ## Contributing
358
+
359
+ Feel free to contribute by:
360
+ 1. Opening issues for bugs or feature requests
361
+ 2. Submitting pull requests with improvements
362
+ 3. Helping with documentation
363
+ 4. Testing different voices and reporting issues
364
+ 5. Suggesting new features or optimizations
365
+ 6. Testing on different platforms and reporting results
366
+
367
+ ## License
368
+
369
+ Apache 2.0 - See LICENSE file for details
gradio_interface.py ADDED
@@ -0,0 +1,536 @@
1
+ """
2
+ Kokoro-TTS Local Generator
3
+ -------------------------
4
+ A Gradio interface for the Kokoro-TTS-Local text-to-speech system.
5
+ Supports multiple voices and audio formats, with cross-platform compatibility.
6
+
7
+ Key Features:
8
+ - Multiple voice models support (54 voices across 8 languages)
9
+ - Real-time generation with progress logging
10
+ - WAV, MP3, and AAC output formats
11
+ - Network sharing capabilities
12
+ - Cross-platform compatibility (Windows, macOS, Linux)
13
+
14
+ Dependencies:
15
+ - kokoro: Official Kokoro TTS library
16
+ - gradio: Web interface framework
17
+ - soundfile: Audio file handling
18
+ - pydub: Audio format conversion
19
+ """
20
+
21
+ import gradio as gr
22
+ import os
23
+ import sys
24
+ import platform
25
+ from datetime import datetime
26
+ import shutil
27
+ from pathlib import Path
28
+ import soundfile as sf
29
+ from pydub import AudioSegment
30
+ import torch
31
+ import numpy as np
32
+ from typing import Union, List, Optional, Tuple, Dict, Any
33
+ from models import (
34
+ list_available_voices, build_model,
35
+ generate_speech, download_voice_files
36
+ )
37
+ from kokoro import KPipeline
38
+ import speed_dial
39
+
40
+ # Define path type for consistent handling
41
+ PathLike = Union[str, Path]
42
+
43
+ # Configuration validation
44
+ def validate_sample_rate(rate: int) -> int:
45
+ """Validate sample rate is within acceptable range"""
46
+ valid_rates = [16000, 22050, 24000, 44100, 48000]
47
+ if rate not in valid_rates:
48
+ print(f"Warning: Unusual sample rate {rate}. Valid rates are {valid_rates}")
49
+ return 24000 # Default to safe value
50
+ return rate
51
+
52
+ # Global configuration
53
+ CONFIG_FILE = Path("tts_config.json") # Stores user preferences and paths
54
+ DEFAULT_OUTPUT_DIR = Path("outputs") # Directory for generated audio files
55
+ SAMPLE_RATE = validate_sample_rate(24000) # Validated sample rate
56
+
57
+ # Initialize model globally
58
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
59
+ model = None
60
+
61
+ LANG_MAP = {
62
+ "af_": "a", "am_": "a",
63
+ "bf_": "b", "bm_": "b",
64
+ "jf_": "j", "jm_": "j",
65
+ "zf_": "z", "zm_": "z",
66
+ "ef_": "e", "em_": "e",
67
+ "ff_": "f",
68
+ "hf_": "h", "hm_": "h",
69
+ "if_": "i", "im_": "i",
70
+ "pf_": "p", "pm_": "p",
71
+ }
72
+ pipelines = {}
73
+
74
+ def get_available_voices():
75
+ """Get list of available voice models."""
76
+ try:
77
+ # Initialize model to trigger voice downloads
78
+ global model
79
+ if model is None:
80
+ print("Initializing model and downloading voices...")
81
+ model = build_model(None, device)
82
+
83
+ voices = list_available_voices()
84
+ if not voices:
85
+ print("No voices found after initialization. Attempting to download...")
86
+ download_voice_files() # Try downloading again
87
+ voices = list_available_voices()
88
+
89
+ print("Available voices:", voices)
90
+ return voices
91
+ except Exception as e:
92
+ print(f"Error getting voices: {e}")
93
+ return []
94
+
95
+ def get_pipeline_for_voice(voice_name: str) -> KPipeline:
96
+ """
97
+ Determine the language code from the voice prefix and return the associated pipeline.
98
+ """
99
+ prefix = voice_name[:3].lower()
100
+ lang_code = LANG_MAP.get(prefix, "a")
101
+ if lang_code not in pipelines:
102
+ print(f"[INFO] Creating pipeline for lang_code='{lang_code}'")
103
+ pipelines[lang_code] = KPipeline(lang_code=lang_code, model=True)
104
+ return pipelines[lang_code]
105
+
106
+ def convert_audio(input_path: PathLike, output_path: PathLike, format: str) -> Optional[PathLike]:
107
+ """Convert audio to specified format.
108
+
109
+ Args:
110
+ input_path: Path to input audio file
111
+ output_path: Path to output audio file
112
+ format: Output format ('wav', 'mp3', or 'aac')
113
+
114
+ Returns:
115
+ Path to output file or None on error
116
+ """
117
+ try:
118
+ # Normalize paths
119
+ input_path = Path(input_path).absolute()
120
+ output_path = Path(output_path).absolute()
121
+
122
+ # Validate input file
123
+ if not input_path.exists():
124
+ raise FileNotFoundError(f"Input file not found: {input_path}")
125
+
126
+ # For WAV format, just return the input path
127
+ if format.lower() == "wav":
128
+ return input_path
129
+
130
+ # Create output directory if it doesn't exist
131
+ output_path.parent.mkdir(parents=True, exist_ok=True)
132
+
133
+ # Convert format
134
+ audio = AudioSegment.from_wav(str(input_path))
135
+
136
+ # Select proper format and options
137
+ if format.lower() == "mp3":
138
+ audio.export(str(output_path), format="mp3", bitrate="192k")
139
+ elif format.lower() == "aac":
140
+ audio.export(str(output_path), format="aac", bitrate="192k")
141
+ else:
142
+ raise ValueError(f"Unsupported format: {format}")
143
+
144
+ # Verify file was created
145
+ if not output_path.exists() or output_path.stat().st_size == 0:
146
+ raise IOError(f"Failed to create {format} file")
147
+
148
+ return output_path
149
+
150
+ except (IOError, FileNotFoundError, ValueError) as e:
151
+ print(f"Error converting audio: {type(e).__name__}: {e}")
152
+ return None
153
+ except Exception as e:
154
+ print(f"Unexpected error converting audio: {type(e).__name__}: {e}")
155
+ import traceback
156
+ traceback.print_exc()
157
+ return None
158
+
159
+ def generate_tts_with_logs(voice_name: str, text: str, format: str, speed: float = 1.0) -> Optional[PathLike]:
160
+ """Generate TTS audio with progress logging.
161
+
162
+ Args:
163
+ voice_name: Name of the voice to use
164
+ text: Text to convert to speech
165
+ format: Output format ('wav', 'mp3', 'aac')
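+ speed: Speech speed multiplier (0.5-2.0, default 1.0)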
166
+
167
+ Returns:
168
+ Path to generated audio file or None on error
169
+ """
170
+ global model
171
+
172
+ try:
173
+ # Initialize model if needed
174
+ if model is None:
175
+ print("Initializing model...")
176
+ model = build_model(None, device)
177
+
178
+ # Create output directory
179
+ DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
180
+
181
+ # Validate input text
182
+ if not text or not text.strip():
183
+ raise ValueError("Text input cannot be empty")
184
+
185
+ # Limit extremely long texts to prevent memory issues
186
+ MAX_CHARS = 5000
187
+ if len(text) > MAX_CHARS:
188
+ print(f"Warning: Text exceeds {MAX_CHARS} characters. Truncating to prevent memory issues.")
189
+ text = text[:MAX_CHARS] + "..."
190
+
191
+ # Generate base filename from text
192
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
193
+ base_name = f"tts_{timestamp}"
194
+ wav_path = DEFAULT_OUTPUT_DIR / f"{base_name}.wav"
195
+
196
+ # Generate speech
197
+ print(f"\nGenerating speech for: '{text}'")
198
+ print(f"Using voice: {voice_name}")
199
+
200
+ # Validate voice path using Path for consistent handling
201
+ voice_path = Path("voices").absolute() / f"{voice_name}.pt"
202
+ if not voice_path.exists():
203
+ raise FileNotFoundError(f"Voice file not found: {voice_path}")
204
+
205
+ try:
206
+ if voice_name.startswith(tuple(LANG_MAP.keys())):
207
+ pipeline = get_pipeline_for_voice(voice_name)
208
+ generator = pipeline(text, voice=voice_path, speed=speed, split_pattern=r'\n+')
209
+ else:
210
+ generator = model(text, voice=voice_path, speed=speed, split_pattern=r'\n+')
211
+
212
+ all_audio = []
213
+ max_segments = 100 # Safety limit for very long texts
214
+ segment_count = 0
215
+
216
+ for gs, ps, audio in generator:
217
+ segment_count += 1
218
+ if segment_count > max_segments:
219
+ print(f"Warning: Reached maximum segment limit ({max_segments})")
220
+ break
221
+
222
+ if audio is not None:
223
+ if isinstance(audio, np.ndarray):
224
+ audio = torch.from_numpy(audio).float()
225
+ all_audio.append(audio)
226
+ print(f"Generated segment: {gs}")
227
+ if ps: # Only print phonemes if available
228
+ print(f"Phonemes: {ps}")
229
+
230
+ if not all_audio:
231
+ raise Exception("No audio generated")
232
+ except Exception as e:
233
+ raise Exception(f"Error in speech generation: {e}")
234
+
235
+ # Combine audio segments and save
236
+ if not all_audio:
237
+ raise Exception("No audio segments were generated")
238
+
239
+ # Handle single segment case without concatenation
240
+ if len(all_audio) == 1:
241
+ final_audio = all_audio[0]
242
+ else:
243
+ try:
244
+ final_audio = torch.cat(all_audio, dim=0)
245
+ except RuntimeError as e:
246
+ raise Exception(f"Failed to concatenate audio segments: {e}")
247
+
248
+ # Save audio file
249
+ try:
250
+ sf.write(wav_path, final_audio.numpy(), SAMPLE_RATE)
251
+ except Exception as e:
252
+ raise Exception(f"Failed to save audio file: {e}")
253
+
254
+ # Convert to requested format if needed
255
+ if format.lower() != "wav":
256
+ output_path = DEFAULT_OUTPUT_DIR / f"{base_name}.{format.lower()}"
257
+ return convert_audio(wav_path, output_path, format.lower())
258
+
259
+ return wav_path
260
+
261
+ except Exception as e:
262
+ print(f"Error generating speech: {e}")
263
+ import traceback
264
+ traceback.print_exc()
265
+ return None
266
+
267
+ def create_interface(server_name="0.0.0.0", server_port=7860):
268
+ """Create and launch the Gradio interface."""
269
+
270
+ # Get available voices
271
+ voices = get_available_voices()
272
+ if not voices:
273
+ print("No voices found! Please check the voices directory.")
274
+ return
275
+
276
+ # Get speed dial presets
277
+ preset_names = speed_dial.get_preset_names()
278
+
279
+ # Create interface
280
+ with gr.Blocks(title="Kokoro TTS Generator") as interface:
281
+ gr.Markdown("# Kokoro TTS Generator")
282
+
283
+ with gr.Row():
284
+ with gr.Column(scale=2):
285
+ # Main TTS controls
286
+ voice = gr.Dropdown(
287
+ choices=voices,
288
+ value=voices[0] if voices else None,
289
+ label="Voice"
290
+ )
291
+ text = gr.Textbox(
292
+ lines=3,
293
+ placeholder="Enter text to convert to speech...",
294
+ label="Text"
295
+ )
296
+ with gr.Row():
297
+ format = gr.Radio(
298
+ choices=["wav", "mp3", "aac"],
299
+ value="wav",
300
+ label="Output Format"
301
+ )
302
+ speed = gr.Slider(
303
+ minimum=0.5,
304
+ maximum=2.0,
305
+ value=1.0,
306
+ step=0.1,
307
+ label="Speed"
308
+ )
309
+ generate = gr.Button("Generate Speech")
310
+
311
+ with gr.Column(scale=1):
312
+ # Speed dial section
313
+ gr.Markdown("## Speed Dial")
314
+ preset_dropdown = gr.Dropdown(
315
+ choices=preset_names,
316
+ value=preset_names[0] if preset_names else None,
317
+ label="Saved Presets",
318
+ interactive=True
319
+ )
320
+ preset_name = gr.Textbox(
321
+ placeholder="Enter preset name...",
322
+ label="New Preset Name"
323
+ )
324
+ with gr.Row():
325
+ load_preset = gr.Button("Load")
326
+ save_preset = gr.Button("Save Current")
327
+ delete_preset = gr.Button("Delete")
328
+
329
+ # Output section
330
+ output = gr.Audio(label="Generated Audio")
331
+
332
+ # Function to load a preset
333
+ def load_preset_fn(preset_name):
334
+ if not preset_name:
335
+ return None, None, None, None
336
+
337
+ preset = speed_dial.get_preset(preset_name)
338
+ if not preset:
339
+ return None, None, None, None
340
+
341
+ return preset["voice"], preset["text"], preset["format"], preset["speed"]
342
+
343
+ # Function to save a preset
344
+ def save_preset_fn(name, voice, text, format, speed):
345
+ if not name or not voice or not text:
346
+ return gr.update(value="Please provide a name, voice, and text")
347
+
348
+ success = speed_dial.save_preset(name, voice, text, format, speed)
349
+
350
+ # Update the dropdown with the new preset list
351
+ preset_names = speed_dial.get_preset_names()
352
+
353
+ if success:
354
+ return gr.update(choices=preset_names, value=name)
355
+ else:
356
+ return gr.update(choices=preset_names)
357
+
358
+ # Function to delete a preset
359
+ def delete_preset_fn(name):
360
+ if not name:
361
+ return gr.update(value="Please select a preset to delete")
362
+
363
+ success = speed_dial.delete_preset(name)
364
+
365
+ # Update the dropdown with the new preset list
366
+ preset_names = speed_dial.get_preset_names()
367
+
368
+ if success:
369
+ return gr.update(choices=preset_names, value=None)
370
+ else:
371
+ return gr.update(choices=preset_names)
372
+
373
+ # Connect the buttons to their functions
374
+ load_preset.click(
375
+ fn=load_preset_fn,
376
+ inputs=preset_dropdown,
377
+ outputs=[voice, text, format, speed]
378
+ )
379
+
380
+ save_preset.click(
381
+ fn=save_preset_fn,
382
+ inputs=[preset_name, voice, text, format, speed],
383
+ outputs=preset_dropdown
384
+ )
385
+
386
+ delete_preset.click(
387
+ fn=delete_preset_fn,
388
+ inputs=preset_dropdown,
389
+ outputs=preset_dropdown
390
+ )
391
+
392
+ # Connect the generate button
393
+ generate.click(
394
+ fn=generate_tts_with_logs,
395
+ inputs=[voice, text, format, speed],
396
+ outputs=output
397
+ )
398
+
399
+ # Launch interface
400
+ interface.launch(
401
+ server_name=server_name,
402
+ server_port=server_port,
403
+ share=True
404
+ )
405
+
406
+ def cleanup_resources():
407
+ """Properly clean up resources when the application exits"""
408
+ global model
409
+
410
+ try:
411
+ print("Cleaning up resources...")
412
+
413
+ # Clean up model resources
414
+ if model is not None:
415
+ print("Releasing model resources...")
416
+
417
+ # Clear voice dictionary to release memory
418
+ if hasattr(model, 'voices') and model.voices is not None:
419
+ try:
420
+ voice_count = len(model.voices)
421
+ for voice_name in list(model.voices.keys()):
422
+ try:
423
+ # Release each voice explicitly
424
+ model.voices[voice_name] = None
425
+ except:
426
+ pass
427
+ model.voices.clear()
428
+ print(f"Cleared {voice_count} voice references")
429
+ except Exception as ve:
430
+ print(f"Error clearing voices: {type(ve).__name__}: {ve}")
431
+
432
+ # Clear model attributes that might hold tensors
433
+ for attr_name in dir(model):
434
+ if not attr_name.startswith('__') and hasattr(model, attr_name):
435
+ try:
436
+ attr = getattr(model, attr_name)
437
+ # Handle specific tensor attributes
438
+ if isinstance(attr, torch.Tensor):
439
+ if attr.is_cuda:
440
+ print(f"Releasing CUDA tensor: {attr_name}")
441
+ setattr(model, attr_name, None)
442
+ elif hasattr(attr, 'to'): # Module or Tensor-like object
443
+ setattr(model, attr_name, None)
444
+ except:
445
+ pass
446
+
447
+ # Delete model reference
448
+ try:
449
+ del model
450
+ model = None
451
+ print("Model reference deleted")
452
+ except Exception as me:
453
+ print(f"Error deleting model: {type(me).__name__}: {me}")
454
+
455
+ # Clear CUDA memory explicitly
456
+ if torch.cuda.is_available():
457
+ try:
458
+ # Get initial memory usage
459
+ try:
460
+ initial = torch.cuda.memory_allocated()
461
+ initial_mb = initial / (1024 * 1024)
462
+ print(f"CUDA memory before cleanup: {initial_mb:.2f} MB")
463
+ except:
464
+ pass
465
+
466
+ # Free memory
467
+ print("Clearing CUDA cache...")
468
+ torch.cuda.empty_cache()
469
+
470
+ # Force synchronization
471
+ try:
472
+ torch.cuda.synchronize()
473
+ except:
474
+ pass
475
+
476
+ # Get final memory usage
477
+ try:
478
+ final = torch.cuda.memory_allocated()
479
+ final_mb = final / (1024 * 1024)
480
+ freed_mb = (initial - final) / (1024 * 1024)
481
+ print(f"CUDA memory after cleanup: {final_mb:.2f} MB (freed {freed_mb:.2f} MB)")
482
+ except:
483
+ pass
484
+ except Exception as ce:
485
+ print(f"Error clearing CUDA memory: {type(ce).__name__}: {ce}")
486
+
487
+ # Restore original functions
488
+ try:
489
+ from models import _cleanup_monkey_patches
490
+ _cleanup_monkey_patches()
491
+ print("Monkey patches restored")
492
+ except Exception as pe:
493
+ print(f"Error restoring monkey patches: {type(pe).__name__}: {pe}")
494
+
495
+ # Final garbage collection
496
+ try:
497
+ import gc
498
+ collected = gc.collect()
499
+ print(f"Garbage collection completed: {collected} objects collected")
500
+ except Exception as gce:
501
+ print(f"Error during garbage collection: {type(gce).__name__}: {gce}")
502
+
503
+ print("Cleanup completed")
504
+
505
+ except Exception as e:
506
+ print(f"Error during cleanup: {type(e).__name__}: {e}")
507
+ import traceback
508
+ traceback.print_exc()
509
+
510
+ # Register cleanup for normal exit
511
+ import atexit
512
+ atexit.register(cleanup_resources)
513
+
514
+ # Register cleanup for signals
515
+ import signal
516
+ import sys
517
+
518
+ def signal_handler(signum, frame):
519
+ print(f"\nReceived signal {signum}, shutting down...")
520
+ cleanup_resources()
521
+ sys.exit(0)
522
+
523
+ # Register for common signals
524
+ for sig in [signal.SIGINT, signal.SIGTERM]:
525
+ try:
526
+ signal.signal(sig, signal_handler)
527
+ except (ValueError, AttributeError):
528
+ # Some signals might not be available on all platforms
529
+ pass
530
+
531
+ if __name__ == "__main__":
532
+ try:
533
+ create_interface()
534
+ finally:
535
+ # Ensure cleanup even if Gradio encounters an error
536
+ cleanup_resources()
models.py ADDED
@@ -0,0 +1,651 @@
1
+ """Models module for Kokoro TTS Local"""
2
+ from typing import Optional, Tuple, List
3
+ import torch
4
+ from kokoro import KPipeline
5
+ import os
6
+ import json
7
+ import codecs
8
+ from pathlib import Path
9
+ import numpy as np
10
+ import shutil
11
+ import threading
12
+
13
+ # Set environment variables for proper encoding
14
+ os.environ["PYTHONIOENCODING"] = "utf-8"
15
+ # Disable symlinks warning
16
+ os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
17
+
18
+ # Setup for safer monkey-patching
19
+ import atexit
20
+ import signal
21
+ import sys
22
+
23
+ # Track whether patches have been applied
24
+ _patches_applied = {
25
+ 'json_load': False,
26
+ 'load_voice': False
27
+ }
28
+
29
+ def _cleanup_monkey_patches():
30
+ """Restore original functions that were monkey-patched"""
31
+ try:
32
+ if _patches_applied['json_load'] and _original_json_load is not None:
33
+ restore_json_load()
34
+ _patches_applied['json_load'] = False
35
+ print("Restored original json.load function")
36
+ except Exception as e:
37
+ print(f"Warning: Error restoring json.load: {e}")
38
+
39
+ try:
40
+ if _patches_applied['load_voice']:
41
+ restore_original_load_voice()
42
+ _patches_applied['load_voice'] = False
43
+ print("Restored original KPipeline.load_voice function")
44
+ except Exception as e:
45
+ print(f"Warning: Error restoring KPipeline.load_voice: {e}")
46
+
47
+ # Register cleanup for normal exit
48
+ atexit.register(_cleanup_monkey_patches)
49
+
50
+ # Register cleanup for signals
51
+ for sig in [signal.SIGINT, signal.SIGTERM]:
52
+ try:
53
+ signal.signal(sig, lambda signum, frame: (
54
+ print(f"\nReceived signal {signum}, cleaning up..."),
55
+ _cleanup_monkey_patches(),
56
+ sys.exit(1)
57
+ ))
58
+ except (ValueError, AttributeError):
59
+ # Some signals might not be available on all platforms
60
+ pass
61
+
62
+ # List of available voice files (54 voices across 8 languages)
63
+ VOICE_FILES = [
64
+ # American English Female voices (11 voices)
65
+ "af_heart.pt", "af_alloy.pt", "af_aoede.pt", "af_bella.pt", "af_jessica.pt",
66
+ "af_kore.pt", "af_nicole.pt", "af_nova.pt", "af_river.pt", "af_sarah.pt", "af_sky.pt",
67
+
68
+ # American English Male voices (9 voices)
69
+ "am_adam.pt", "am_echo.pt", "am_eric.pt", "am_fenrir.pt", "am_liam.pt",
70
+ "am_michael.pt", "am_onyx.pt", "am_puck.pt", "am_santa.pt",
71
+
72
+ # British English Female voices (4 voices)
73
+ "bf_alice.pt", "bf_emma.pt", "bf_isabella.pt", "bf_lily.pt",
74
+
75
+ # British English Male voices (4 voices)
76
+ "bm_daniel.pt", "bm_fable.pt", "bm_george.pt", "bm_lewis.pt",
77
+
78
+ # Japanese voices (5 voices)
79
+ "jf_alpha.pt", "jf_gongitsune.pt", "jf_nezumi.pt", "jf_tebukuro.pt", "jm_kumo.pt",
80
+
81
+ # Mandarin Chinese voices (8 voices)
82
+ "zf_xiaobei.pt", "zf_xiaoni.pt", "zf_xiaoxiao.pt", "zf_xiaoyi.pt",
83
+ "zm_yunjian.pt", "zm_yunxi.pt", "zm_yunxia.pt", "zm_yunyang.pt",
84
+
85
+ # Spanish voices (3 voices)
86
+ "ef_dora.pt", "em_alex.pt", "em_santa.pt",
87
+
88
+ # French voices (1 voice)
89
+ "ff_siwis.pt",
90
+
91
+ # Hindi voices (4 voices)
92
+ "hf_alpha.pt", "hf_beta.pt", "hm_omega.pt", "hm_psi.pt",
93
+
94
+ # Italian voices (2 voices)
95
+ "if_sara.pt", "im_nicola.pt",
96
+
97
+ # Brazilian Portuguese voices (3 voices)
98
+ "pf_dora.pt", "pm_alex.pt", "pm_santa.pt"
99
+ ]
100
+
101
+ # Language code mapping for different languages
102
+ LANGUAGE_CODES = {
103
+ 'a': 'American English',
104
+ 'b': 'British English',
105
+ 'j': 'Japanese',
106
+ 'z': 'Mandarin Chinese',
107
+ 'e': 'Spanish',
108
+ 'f': 'French',
109
+ 'h': 'Hindi',
110
+ 'i': 'Italian',
111
+ 'p': 'Brazilian Portuguese'
112
+ }
113
+
114
+ # Patch KPipeline's load_voice method to use weights_only=False
115
+ original_load_voice = KPipeline.load_voice
116
+
117
+ def patched_load_voice(self, voice_path):
118
+ """Load voice model with weights_only=False for compatibility"""
119
+ if not os.path.exists(voice_path):
120
+ raise FileNotFoundError(f"Voice file not found: {voice_path}")
121
+ voice_name = Path(voice_path).stem
122
+ try:
123
+ voice_model = torch.load(voice_path, weights_only=False)
124
+ if voice_model is None:
125
+ raise ValueError(f"Failed to load voice model from {voice_path}")
126
+ # Ensure device is set
127
+ if not hasattr(self, 'device'):
128
+ self.device = 'cpu'
129
+ # Move model to device and store in voices dictionary
130
+ self.voices[voice_name] = voice_model.to(self.device)
131
+ return self.voices[voice_name]
132
+ except Exception as e:
133
+ print(f"Error loading voice {voice_name}: {e}")
134
+ raise
135
+
136
+ # Apply the patch
137
+ KPipeline.load_voice = patched_load_voice
138
+ _patches_applied['load_voice'] = True
139
+
140
+ # Store original function for restoration if needed
141
+ def restore_original_load_voice():
142
+ global _patches_applied
143
+ if _patches_applied['load_voice']:
144
+ KPipeline.load_voice = original_load_voice
145
+ _patches_applied['load_voice'] = False
146
+
147
+ def patch_json_load():
148
+ """Patch json.load to handle UTF-8 encoded files with special characters"""
149
+ global _patches_applied, _original_json_load
150
+ original_load = json.load
151
+ _original_json_load = original_load # Store for restoration
152
+
153
+ def custom_load(fp, *args, **kwargs):
154
+ try:
155
+ # Try reading with UTF-8 encoding
156
+ if hasattr(fp, 'buffer'):
157
+ content = fp.buffer.read().decode('utf-8')
158
+ else:
159
+ content = fp.read()
160
+ try:
161
+ return json.loads(content)
162
+ except json.JSONDecodeError as e:
163
+ print(f"JSON parsing error: {e}")
164
+ raise
165
+ except UnicodeDecodeError:
166
+ # If UTF-8 fails, try with utf-8-sig for files with BOM
167
+ fp.seek(0)
168
+ content = fp.read()
169
+ if isinstance(content, bytes):
170
+ content = content.decode('utf-8-sig', errors='replace')
171
+ try:
172
+ return json.loads(content)
173
+ except json.JSONDecodeError as e:
174
+ print(f"JSON parsing error: {e}")
175
+ raise
176
+
177
+ json.load = custom_load
178
+ _patches_applied['json_load'] = True
179
+ return original_load # Return original for restoration
180
+
181
+ # Store the original load function for potential restoration
182
+ _original_json_load = None
183
+
184
+ def restore_json_load():
185
+ """Restore the original json.load function"""
186
+ global _original_json_load, _patches_applied
187
+ if _original_json_load is not None and _patches_applied['json_load']:
188
+ json.load = _original_json_load
189
+ _original_json_load = None
190
+ _patches_applied['json_load'] = False
191
+
192
+ def load_config(config_path: str) -> dict:
193
+ """Load configuration file with proper encoding handling"""
194
+ try:
195
+ with codecs.open(config_path, 'r', encoding='utf-8') as f:
196
+ return json.load(f)
197
+ except UnicodeDecodeError:
198
+ # Fallback to utf-8-sig if regular utf-8 fails
199
+ with codecs.open(config_path, 'r', encoding='utf-8-sig') as f:
200
+ return json.load(f)
201
+
202
+ # Initialize espeak-ng
203
+ phonemizer_available = False # Global flag to track if phonemizer is working
204
+ try:
205
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
206
+ from phonemizer import phonemize
207
+ import espeakng_loader
208
+
209
+ # Make library available first
210
+ library_path = espeakng_loader.get_library_path()
211
+ data_path = espeakng_loader.get_data_path()
212
+ espeakng_loader.make_library_available()
213
+
214
+ # Set up espeak-ng paths
215
+ EspeakWrapper.library_path = library_path
216
+ EspeakWrapper.data_path = data_path
217
+
218
+ # Verify espeak-ng is working
219
+ try:
220
+ test_phonemes = phonemize('test', language='en-us')
221
+ if test_phonemes:
222
+ phonemizer_available = True
223
+ print("Phonemizer successfully initialized")
224
+ else:
225
+ print("Note: Phonemization returned empty result")
226
+ print("TTS will work, but phoneme visualization will be disabled")
227
+ except Exception as e:
228
+ # Continue without espeak functionality
229
+ print(f"Note: Phonemizer not available: {e}")
230
+ print("TTS will work, but phoneme visualization will be disabled")
231
+
232
+ except ImportError as e:
233
+ print(f"Note: Phonemizer packages not installed: {e}")
234
+ print("TTS will work, but phoneme visualization will be disabled")
235
+ # Rather than automatically installing packages, inform the user
236
+ print("If you want phoneme visualization, manually install required packages:")
237
+ print("pip install espeakng-loader phonemizer-fork")
238
+
239
+ # Initialize pipeline globally with thread safety
240
+ _pipeline = None
241
+ _pipeline_lock = threading.RLock() # Reentrant lock for thread safety
242
+
243
+ def download_voice_files(voice_files=None, repo_version="main", required_count=1):
244
+ """Download voice files from Hugging Face.
245
+
246
+ Args:
247
+ voice_files: Optional list of voice files to download. If None, download all VOICE_FILES.
248
+ repo_version: Version/tag of the repository to use (default: "main")
249
+ required_count: Minimum number of voices required (default: 1)
250
+
251
+ Returns:
252
+ List of successfully downloaded voice files
253
+
254
+ Raises:
255
+ ValueError: If fewer than required_count voices could be downloaded
256
+ """
257
+ # Use absolute path for voices directory
258
+ voices_dir = Path(os.path.abspath("voices"))
259
+ voices_dir.mkdir(exist_ok=True)
260
+
261
+ # Import here to avoid startup dependency
262
+ from huggingface_hub import hf_hub_download
263
+ downloaded_voices = []
264
+ failed_voices = []
265
+
266
+ # If specific voice files are requested, use those. Otherwise use all.
267
+ files_to_download = voice_files if voice_files is not None else VOICE_FILES
268
+ total_files = len(files_to_download)
269
+
270
+ print(f"\nDownloading voice files... ({total_files} total files)")
271
+
272
+ # Check for existing voice files first
273
+ existing_files = []
274
+ for voice_file in files_to_download:
275
+ voice_path = voices_dir / voice_file
276
+ if voice_path.exists():
277
+ print(f"Voice file {voice_file} already exists")
278
+ downloaded_voices.append(voice_file)
279
+ existing_files.append(voice_file)
280
+
281
+ # Remove existing files from the download list
282
+ files_to_download = [f for f in files_to_download if f not in existing_files]
283
+ if not files_to_download and downloaded_voices:
284
+ print(f"All required voice files already exist ({len(downloaded_voices)} files)")
285
+ return downloaded_voices
286
+
287
+ # Proceed with downloading missing files
288
+ retry_count = 3
289
+ try:
290
+ import tempfile
291
+ with tempfile.TemporaryDirectory() as temp_dir:
292
+ for voice_file in files_to_download:
293
+ # Full path where the voice file should be
294
+ voice_path = voices_dir / voice_file
295
+
296
+ # Try with retries
297
+ for attempt in range(retry_count):
298
+ try:
299
+ print(f"Downloading {voice_file}... (attempt {attempt+1}/{retry_count})")
300
+ # Download to a temporary location first
301
+ temp_path = hf_hub_download(
302
+ repo_id="hexgrad/Kokoro-82M",
303
+ filename=f"voices/{voice_file}",
304
+ local_dir=temp_dir,
305
+ force_download=True,
306
+ revision=repo_version
307
+ )
308
+
309
+ # Move the file to the correct location
310
+ os.makedirs(os.path.dirname(str(voice_path)), exist_ok=True)
311
+ shutil.copy2(temp_path, str(voice_path)) # Use copy2 instead of move
312
+
313
+ # Verify file integrity
314
+ if os.path.getsize(str(voice_path)) > 0:
315
+ downloaded_voices.append(voice_file)
316
+ print(f"Successfully downloaded {voice_file}")
317
+ break # Success, exit retry loop
318
+ else:
319
+ print(f"Warning: Downloaded file {voice_file} has zero size, retrying...")
320
+ os.remove(str(voice_path)) # Remove invalid file
321
+ if attempt == retry_count - 1:
322
+ failed_voices.append(voice_file)
323
+ except (IOError, OSError, ValueError, FileNotFoundError, ConnectionError) as e:
324
+ print(f"Warning: Failed to download {voice_file} (attempt {attempt+1}): {e}")
325
+ if attempt == retry_count - 1:
326
+ failed_voices.append(voice_file)
327
+ print(f"Error: Failed all {retry_count} attempts to download {voice_file}")
328
+ except Exception as e:
329
+ print(f"Error during voice download process: {e}")
330
+ import traceback
331
+ traceback.print_exc()
332
+
333
+ # Report results
334
+ if failed_voices:
335
+ print(f"Warning: Failed to download {len(failed_voices)} voice files: {', '.join(failed_voices)}")
336
+
337
+ if not downloaded_voices:
338
+ error_msg = "No voice files could be downloaded. Please check your internet connection."
339
+ print(f"Error: {error_msg}")
340
+ raise ValueError(error_msg)
341
+ elif len(downloaded_voices) < required_count:
342
+ error_msg = f"Only {len(downloaded_voices)} voice files could be downloaded, but {required_count} were required."
343
+ print(f"Error: {error_msg}")
344
+ raise ValueError(error_msg)
345
+ else:
346
+ print(f"Successfully processed {len(downloaded_voices)} voice files")
347
+
348
+ return downloaded_voices
349
+
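# Illustrative sketch (editorial addition, not part of the committed file):
# downloading only a subset of voices. The file name "af_bella.pt" is an
# assumption based on the .pt naming used elsewhere in this module.
import models

ready = models.download_voice_files(voice_files=["af_bella.pt"], required_count=1)
print(f"Voices available locally: {ready}")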
350
+ def build_model(model_path: str, device: str, repo_version: str = "main") -> KPipeline:
351
+ """Build and return the Kokoro pipeline with proper encoding configuration
352
+
353
+ Args:
354
+ model_path: Path to the model file or None to use default
355
+ device: Device to use ('cuda' or 'cpu')
356
+ repo_version: Version/tag of the repository to use (default: "main")
357
+
358
+ Returns:
359
+ Initialized KPipeline instance
360
+ """
361
+ global _pipeline, _pipeline_lock, _original_json_load
362
+
363
+ # Use a lock for thread safety
364
+ with _pipeline_lock:
365
+ # Double-check pattern to avoid race conditions
366
+ if _pipeline is not None:
367
+ return _pipeline
368
+
369
+ try:
370
+ # Patch json loading before initializing pipeline
371
+ _original_json_load = patch_json_load()  # keep the original so restore_json_load() can undo the patch
372
+
373
+ # Download model if it doesn't exist
374
+ if model_path is None:
375
+ model_path = 'kokoro-v1_0.pth'
376
+
377
+ model_path = os.path.abspath(model_path)
378
+ if not os.path.exists(model_path):
379
+ print(f"Downloading model file {model_path}...")
380
+ try:
381
+ from huggingface_hub import hf_hub_download
382
+ model_path = hf_hub_download(
383
+ repo_id="hexgrad/Kokoro-82M",
384
+ filename="kokoro-v1_0.pth",
385
+ local_dir=".",
386
+ force_download=True,
387
+ revision=repo_version
388
+ )
389
+ print(f"Model downloaded to {model_path}")
390
+ except Exception as e:
391
+ print(f"Error downloading model: {e}")
392
+ raise ValueError(f"Could not download model: {e}") from e
393
+
394
+ # Download config if it doesn't exist
395
+ config_path = os.path.abspath("config.json")
396
+ if not os.path.exists(config_path):
397
+ print("Downloading config file...")
398
+ try:
399
+ from huggingface_hub import hf_hub_download  # may not be imported yet if the model file already existed
+ config_path = hf_hub_download(
400
+ repo_id="hexgrad/Kokoro-82M",
401
+ filename="config.json",
402
+ local_dir=".",
403
+ force_download=True,
404
+ revision=repo_version
405
+ )
406
+ print(f"Config downloaded to {config_path}")
407
+ except Exception as e:
408
+ print(f"Error downloading config: {e}")
409
+ raise ValueError(f"Could not download config: {e}") from e
410
+
411
+ # Download voice files - require at least one voice
412
+ try:
413
+ downloaded_voices = download_voice_files(repo_version=repo_version, required_count=1)
414
+ except ValueError as e:
415
+ print(f"Error: Voice files download failed: {e}")
416
+ raise ValueError("Voice files download failed") from e
417
+
418
+ # Validate language code
419
+ lang_code = 'a' # Default to 'a' for American English
420
+ supported_codes = list(LANGUAGE_CODES.keys())
421
+ if lang_code not in supported_codes:
422
+ print(f"Warning: Unsupported language code '{lang_code}'. Using 'a' (American English).")
423
+ print(f"Supported language codes: {', '.join(supported_codes)}")
424
+ lang_code = 'a'
425
+
426
+ # Initialize pipeline with validated language code
427
+ pipeline_instance = KPipeline(lang_code=lang_code)
428
+ if pipeline_instance is None:
429
+ raise ValueError("Failed to initialize KPipeline - pipeline is None")
430
+
431
+ # Store device parameter for reference in other operations
432
+ pipeline_instance.device = device
433
+
434
+ # Initialize voices dictionary if it doesn't exist
435
+ if not hasattr(pipeline_instance, 'voices'):
436
+ pipeline_instance.voices = {}
437
+
438
+ # Try to load the first available voice with improved error handling
439
+ voice_loaded = False
440
+ for voice_file in downloaded_voices:
441
+ voice_path = os.path.abspath(os.path.join("voices", voice_file))
442
+ if os.path.exists(voice_path):
443
+ try:
444
+ pipeline_instance.load_voice(voice_path)
445
+ print(f"Successfully loaded voice: {voice_file}")
446
+ voice_loaded = True
447
+ break # Successfully loaded a voice
448
+ except Exception as e:
449
+ print(f"Warning: Failed to load voice {voice_file}: {e}")
450
+ continue
451
+
452
+ if not voice_loaded:
453
+ print("Warning: Could not load any voice models")
454
+
455
+ # Set the global _pipeline only after successful initialization
456
+ _pipeline = pipeline_instance
457
+
458
+ except Exception as e:
459
+ print(f"Error initializing pipeline: {e}")
460
+ # Restore original json.load on error
461
+ restore_json_load()
462
+ raise
463
+
464
+ return _pipeline
465
+
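# Illustrative sketch (editorial addition, not part of the committed file):
# build_model() lazily creates one shared KPipeline behind _pipeline_lock, so
# repeated calls are cheap and return the same cached object.
import models

p1 = models.build_model(None, device="cpu")
p2 = models.build_model("kokoro-v1_0.pth", device="cpu")
assert p1 is p2  # the second call short-circuits and reuses the cached pipeline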
466
+ def list_available_voices() -> List[str]:
467
+ """List all available voice models"""
468
+ # Always use absolute path for consistency
469
+ voices_dir = Path(os.path.abspath("voices"))
470
+
471
+ # Create voices directory if it doesn't exist
472
+ if not voices_dir.exists():
473
+ print(f"Creating voices directory at {voices_dir}")
474
+ voices_dir.mkdir(exist_ok=True)
475
+ return []
476
+
477
+ # Get all .pt files in the voices directory
478
+ voice_files = list(voices_dir.glob("*.pt"))
479
+
480
+ # If we found voice files, return them
481
+ if voice_files:
482
+ return [f.stem for f in sorted(voice_files, key=lambda f: f.stem.lower())]
483
+
484
+ # If no voice files in standard location, check if we need to do a one-time migration
485
+ # This is legacy support for older installations
486
+ alt_voices_path = Path(".") / "voices"
487
+ if alt_voices_path.exists() and alt_voices_path.is_dir() and alt_voices_path != voices_dir:
488
+ print(f"Checking alternative voice location: {alt_voices_path.absolute()}")
489
+ alt_voice_files = list(alt_voices_path.glob("*.pt"))
490
+
491
+ if alt_voice_files:
492
+ print(f"Found {len(alt_voice_files)} voice files in alternate location")
493
+ print("Moving files to the standard voices directory...")
494
+
495
+ # Process files in a batch for efficiency
496
+ files_moved = 0
497
+ for voice_file in alt_voice_files:
498
+ target_path = voices_dir / voice_file.name
499
+ if not target_path.exists():
500
+ try:
501
+ # Use copy2 to preserve metadata; the original file is left in place
502
+ shutil.copy2(str(voice_file), str(target_path))
503
+ files_moved += 1
504
+ except (OSError, IOError) as e:
505
+ print(f"Error copying {voice_file.name}: {e}")
506
+
507
+ if files_moved > 0:
508
+ print(f"Successfully moved {files_moved} voice files")
509
+ return [f.stem for f in sorted(voices_dir.glob("*.pt"), key=lambda f: f.stem.lower())]
510
+
511
+ print("No voice files found. Please run the application again to download voices.")
512
+ return []
513
+
514
+ def get_language_code_from_voice(voice_name: str) -> str:
515
+ """Get the appropriate language code from a voice name
516
+
517
+ Args:
518
+ voice_name: Name of the voice (e.g., 'af_bella', 'jf_alpha')
519
+
520
+ Returns:
521
+ Language code for the voice
522
+ """
523
+ # Extract prefix from voice name
524
+ prefix = voice_name[:2] if len(voice_name) >= 2 else 'af'
525
+
526
+ # Map voice prefixes to language codes
527
+ prefix_to_lang = {
528
+ 'af': 'a', 'am': 'a', # American English
529
+ 'bf': 'b', 'bm': 'b', # British English
530
+ 'jf': 'j', 'jm': 'j', # Japanese
531
+ 'zf': 'z', 'zm': 'z', # Mandarin Chinese
532
+ 'ef': 'e', 'em': 'e', # Spanish
533
+ 'ff': 'f', 'fm': 'f', # French
534
+ 'hf': 'h', 'hm': 'h', # Hindi
535
+ 'if': 'i', 'im': 'i', # Italian
536
+ 'pf': 'p', 'pm': 'p', # Brazilian Portuguese
537
+ }
538
+
539
+ return prefix_to_lang.get(prefix, 'a') # Default to American English
540
+
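# Illustrative sketch (editorial addition, not part of the committed file):
# the prefix-to-language mapping above in practice.
import models

assert models.get_language_code_from_voice("af_bella") == "a"    # American English
assert models.get_language_code_from_voice("jf_alpha") == "j"    # Japanese
assert models.get_language_code_from_voice("xx_unknown") == "a"  # unknown prefix falls back to 'a'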
541
+ def load_voice(voice_name: str, device: str) -> torch.Tensor:
542
+ """Load a voice model in a thread-safe manner
543
+
544
+ Args:
545
+ voice_name: Name of the voice to load (with or without .pt extension)
546
+ device: Device to use ('cuda' or 'cpu')
547
+
548
+ Returns:
549
+ Loaded voice model tensor
550
+
551
+ Raises:
552
+ ValueError: If voice file not found or loading fails
553
+ """
554
+ pipeline = build_model(None, device)
555
+
556
+ # Format voice path correctly - strip .pt if it was included
557
+ voice_name = voice_name.replace('.pt', '')
558
+ voice_path = os.path.abspath(os.path.join("voices", f"{voice_name}.pt"))
559
+
560
+ if not os.path.exists(voice_path):
561
+ raise ValueError(f"Voice file not found: {voice_path}")
562
+
563
+ # Use a lock to ensure thread safety when loading voices
564
+ with _pipeline_lock:
565
+ # Check if voice is already loaded
566
+ if hasattr(pipeline, 'voices') and voice_name in pipeline.voices:
567
+ return pipeline.voices[voice_name]
568
+
569
+ # Load voice if not already loaded
570
+ return pipeline.load_voice(voice_path)
571
+
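# Illustrative sketch (editorial addition, not part of the committed file):
# load_voice() accepts the name with or without the .pt suffix and reuses the
# pipeline's voice cache on repeat calls.
import models

pack = models.load_voice("af_bella.pt", device="cpu")
pack_again = models.load_voice("af_bella", device="cpu")  # served from pipeline.voices if already cached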
572
+ def generate_speech(
573
+ model: KPipeline,
574
+ text: str,
575
+ voice: str,
576
+ lang: str = 'a',
577
+ device: str = 'cpu',
578
+ speed: float = 1.0
579
+ ) -> Tuple[Optional[torch.Tensor], Optional[str]]:
580
+ """Generate speech using the Kokoro pipeline in a thread-safe manner
581
+
582
+ Args:
583
+ model: KPipeline instance
584
+ text: Text to synthesize
585
+ voice: Voice name (e.g. 'af_bella')
586
+ lang: Language code ('a' for American English, 'b' for British English; see LANGUAGE_CODES for the full set)
587
+ device: Device to use ('cuda' or 'cpu')
588
+ speed: Speech speed multiplier (default: 1.0)
589
+
590
+ Returns:
591
+ Tuple of (audio tensor, phonemes string) or (None, None) on error
592
+ """
593
+ global _pipeline_lock
594
+
595
+ try:
596
+ if model is None:
597
+ raise ValueError("Model is None - pipeline not properly initialized")
598
+
599
+ # Format voice name and path
600
+ voice_name = voice.replace('.pt', '')
601
+ voice_path = os.path.abspath(os.path.join("voices", f"{voice_name}.pt"))
602
+
603
+ # Check if voice file exists
604
+ if not os.path.exists(voice_path):
605
+ raise ValueError(f"Voice file not found: {voice_path}")
606
+
607
+ # Thread-safe initialization of model properties and voice loading
608
+ with _pipeline_lock:
609
+ # Initialize voices dictionary if it doesn't exist
610
+ if not hasattr(model, 'voices'):
611
+ model.voices = {}
612
+
613
+ # Ensure device is set
614
+ if not hasattr(model, 'device'):
615
+ model.device = device
616
+
617
+ # Ensure voice is loaded before generating
618
+ if voice_name not in model.voices:
619
+ print(f"Loading voice {voice_name}...")
620
+ try:
621
+ model.load_voice(voice_path)
622
+ if voice_name not in model.voices:
623
+ raise ValueError("Voice load succeeded but voice not in model.voices dictionary")
624
+ except Exception as e:
625
+ raise ValueError(f"Failed to load voice {voice_name}: {e}")
626
+
627
+ # Generate speech (outside the lock for better concurrency)
628
+ print(f"Generating speech with device: {model.device}")
629
+ generator = model(
630
+ text,
631
+ voice=voice_path,
632
+ speed=speed,
633
+ split_pattern=r'\n+'
634
+ )
635
+
636
+ # Get first generated segment and convert numpy array to tensor if needed
637
+ for gs, ps, audio in generator:
638
+ if audio is not None:
639
+ if isinstance(audio, np.ndarray):
640
+ audio = torch.from_numpy(audio).float()
641
+ return audio, ps
642
+
643
+ return None, None
644
+ except (ValueError, FileNotFoundError, RuntimeError, KeyError, AttributeError, TypeError) as e:
645
+ print(f"Error generating speech: {e}")
646
+ return None, None
647
+ except Exception as e:
648
+ print(f"Unexpected error during speech generation: {e}")
649
+ import traceback
650
+ traceback.print_exc()
651
+ return None, None
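Taken together, the functions above form a small programmatic API. The sketch below is a minimal end-to-end example, assuming this file is saved as models.py (tts_demo.py imports it under that name) and that a voice such as af_bella has been downloaded; it is illustrative rather than part of the commit.

import soundfile as sf
import torch
from models import build_model, generate_speech

device = "cuda" if torch.cuda.is_available() else "cpu"
pipeline = build_model(None, device)  # downloads model, config and voices on first run
audio, phonemes = generate_speech(pipeline, "Hello from Kokoro.", voice="af_bella", device=device)
if audio is not None:
    sf.write("output.wav", audio.numpy(), 24000)  # 24 kHz, matching SAMPLE_RATE in tts_demo.py
    print("Saved output.wav | phonemes:", phonemes)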
outputs/tts_20250608_125559.mp3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad3317e6fef86203f76bc9bd0d47267575594a30d76af064a0427d868c8d04f3
3
+ size 4414125
outputs/tts_20250608_125559.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71e8cd896e8f7c5df4bbf721945771d44fe947375dab70ea3f34bc17f3c42017
3
+ size 10590044
outputs/tts_20250608_125703.mp3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e950dc144d38fa862cf1306cfa602d7f1a3181d1ab9130dab192c82a2b76fe6e
3
+ size 4677165
outputs/tts_20250608_125703.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7a99e011575460595ef14ae32b698775b264da96ca8ba96a29d4f8139cc7ad3
3
+ size 11221244
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ kokoro>=0.9.2 # Official Kokoro TTS library (v1.0 model support)
2
+ misaki # G2P library for Kokoro (multi-language support)
3
+ torch>=2.0.0 # PyTorch for model inference (for GPU support, see README.md for CUDA-specific installation)
4
+ soundfile>=0.12.1 # Audio file handling
5
+ huggingface-hub>=0.16.0 # Model downloads from Hugging Face
6
+ gradio>=4.0.0 # Web interface
7
+ pydub>=0.25.1 # For audio format conversion
8
+ espeakng-loader>=0.1.0 # For loading espeak-ng library
9
+ phonemizer-fork>=3.2.1 # For phoneme generation
10
+ wheel>=0.38.0 # For building packages
11
+ setuptools>=65.0.0 # For installing packages
12
+ num2words>=0.5.12 # For number to word conversion
13
+ spacy>=3.4.0 # For text processing
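A quick way to confirm these pinned dependencies resolved in the active environment is to import them. The sketch below is illustrative and assumes the distribution-to-module name mapping shown (notably, phonemizer-fork installs the phonemizer module and espeakng-loader installs espeakng_loader).

import importlib

for module in ("kokoro", "misaki", "torch", "soundfile", "huggingface_hub",
               "gradio", "pydub", "espeakng_loader", "phonemizer",
               "num2words", "spacy"):
    try:
        importlib.import_module(module)
        print(f"OK      {module}")
    except ImportError as exc:
        print(f"MISSING {module}: {exc}")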
speed_dial.py ADDED
@@ -0,0 +1,179 @@
1
+ """
2
+ Speed Dial Module for Kokoro-TTS-Local
3
+ --------------------------------------
4
+ Manages speed dial presets for quick access to frequently used voice and text combinations.
5
+
6
+ This module provides functions to:
7
+ - Load speed dial presets from a JSON file
8
+ - Save new presets to the JSON file
9
+ - Delete presets from the JSON file
10
+ - Validate preset data
11
+ """
12
+
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ from typing import Dict, List, Optional, Any
17
+
18
+ # Define the path for the speed dial presets file
19
+ SPEED_DIAL_FILE = Path("speed_dial.json")
20
+
21
+ def load_presets() -> Dict[str, Dict[str, Any]]:
22
+ """
23
+ Load speed dial presets from the JSON file.
24
+
25
+ Returns:
26
+ Dictionary of presets where keys are preset names and values are preset data
27
+ """
28
+ if not SPEED_DIAL_FILE.exists():
29
+ # If file doesn't exist, return an empty dictionary
30
+ return {}
31
+
32
+ try:
33
+ with open(SPEED_DIAL_FILE, 'r', encoding='utf-8') as f:
34
+ presets = json.load(f)
35
+
36
+ # Validate the loaded presets
37
+ validated_presets = {}
38
+ for name, preset in presets.items():
39
+ if validate_preset(preset):
40
+ validated_presets[name] = preset
41
+
42
+ return validated_presets
43
+ except (json.JSONDecodeError, IOError) as e:
44
+ print(f"Error loading speed dial presets: {e}")
45
+ return {}
46
+
47
+ def save_preset(name: str, voice: str, text: str, format: str = "wav", speed: float = 1.0) -> bool:
48
+ """
49
+ Save a new speed dial preset.
50
+
51
+ Args:
52
+ name: Name of the preset
53
+ voice: Voice to use
54
+ text: Text to convert to speech
55
+ format: Output format (default: "wav")
56
+ speed: Speech speed (default: 1.0)
57
+
58
+ Returns:
59
+ True if successful, False otherwise
60
+ """
61
+ # Create preset data
62
+ preset = {
63
+ "voice": voice,
64
+ "text": text,
65
+ "format": format,
66
+ "speed": speed
67
+ }
68
+
69
+ # Validate preset data
70
+ if not validate_preset(preset):
71
+ return False
72
+
73
+ # Load existing presets
74
+ presets = load_presets()
75
+
76
+ # Add or update the preset
77
+ presets[name] = preset
78
+
79
+ # Save presets to file
80
+ try:
81
+ with open(SPEED_DIAL_FILE, 'w', encoding='utf-8') as f:
82
+ json.dump(presets, f, indent=2, ensure_ascii=False)
83
+ return True
84
+ except IOError as e:
85
+ print(f"Error saving speed dial preset: {e}")
86
+ return False
87
+
88
+ def delete_preset(name: str) -> bool:
89
+ """
90
+ Delete a speed dial preset.
91
+
92
+ Args:
93
+ name: Name of the preset to delete
94
+
95
+ Returns:
96
+ True if successful, False otherwise
97
+ """
98
+ # Load existing presets
99
+ presets = load_presets()
100
+
101
+ # Check if preset exists
102
+ if name not in presets:
103
+ return False
104
+
105
+ # Remove the preset
106
+ del presets[name]
107
+
108
+ # Save presets to file
109
+ try:
110
+ with open(SPEED_DIAL_FILE, 'w', encoding='utf-8') as f:
111
+ json.dump(presets, f, indent=2, ensure_ascii=False)
112
+ return True
113
+ except IOError as e:
114
+ print(f"Error deleting speed dial preset: {e}")
115
+ return False
116
+
117
+ def validate_preset(preset: Dict[str, Any]) -> bool:
118
+ """
119
+ Validate a preset's data structure.
120
+
121
+ Args:
122
+ preset: Preset data to validate
123
+
124
+ Returns:
125
+ True if valid, False otherwise
126
+ """
127
+ # Check required fields
128
+ required_fields = ["voice", "text"]
129
+ for field in required_fields:
130
+ if field not in preset:
131
+ print(f"Preset missing required field: {field}")
132
+ return False
133
+
134
+ # Check field types
135
+ if not isinstance(preset.get("voice"), str):
136
+ print("Preset voice must be a string")
137
+ return False
138
+
139
+ if not isinstance(preset.get("text"), str):
140
+ print("Preset text must be a string")
141
+ return False
142
+
143
+ # Optional fields with defaults
144
+ if "format" not in preset:
145
+ preset["format"] = "wav"
146
+ elif not isinstance(preset["format"], str):
147
+ print("Preset format must be a string")
148
+ return False
149
+
150
+ if "speed" not in preset:
151
+ preset["speed"] = 1.0
152
+ elif not isinstance(preset["speed"], (int, float)):
153
+ print("Preset speed must be a number")
154
+ return False
155
+
156
+ return True
157
+
158
+ def get_preset_names() -> List[str]:
159
+ """
160
+ Get a list of all preset names.
161
+
162
+ Returns:
163
+ List of preset names
164
+ """
165
+ presets = load_presets()
166
+ return list(presets.keys())
167
+
168
+ def get_preset(name: str) -> Optional[Dict[str, Any]]:
169
+ """
170
+ Get a specific preset by name.
171
+
172
+ Args:
173
+ name: Name of the preset to get
174
+
175
+ Returns:
176
+ Preset data or None if not found
177
+ """
178
+ presets = load_presets()
179
+ return presets.get(name)
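The preset helpers above are most easily understood through a round trip. The sketch below is illustrative, assumes the module is importable as speed_dial (its committed file name), and uses made-up preset values.

import speed_dial

speed_dial.save_preset("morning greeting", voice="af_bella",
                       text="Good morning!", format="mp3", speed=1.1)
print(speed_dial.get_preset_names())            # includes 'morning greeting'
print(speed_dial.get_preset("morning greeting"))
speed_dial.delete_preset("morning greeting")    # True on success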
test.py ADDED
@@ -0,0 +1,2 @@
1
+ import torch
2
+ print(torch.cuda.is_available())
tts_demo.py ADDED
@@ -0,0 +1,447 @@
1
+ import torch
2
+ from typing import Optional, Tuple, List, Union
3
+ from models import build_model, generate_speech, list_available_voices
4
+ from tqdm.auto import tqdm
5
+ import soundfile as sf
6
+ from pathlib import Path
7
+ import numpy as np
8
+ import time
9
+ import os
10
+ import sys
11
+
12
+ # Define path type for consistent handling
13
+ PathLike = Union[str, Path]
14
+
15
+ # Constants with validation
16
+ def validate_sample_rate(rate: int) -> int:
17
+ """Validate sample rate is within acceptable range"""
18
+ valid_rates = [16000, 22050, 24000, 44100, 48000]
19
+ if rate not in valid_rates:
20
+ print(f"Warning: Unusual sample rate {rate}. Valid rates are {valid_rates}")
21
+ return 24000 # Default to safe value
22
+ return rate
23
+
24
+ def validate_language(lang: str) -> str:
25
+ """Validate language code"""
26
+ # Import here to avoid circular imports
27
+ from models import LANGUAGE_CODES
28
+ valid_langs = list(LANGUAGE_CODES.keys())
29
+ if lang not in valid_langs:
30
+ print(f"Warning: Invalid language code '{lang}'. Using 'a' (American English).")
31
+ print(f"Supported language codes: {', '.join(valid_langs)}")
32
+ return 'a' # Default to American English
33
+ return lang
34
+
35
+ # Define and validate constants
36
+ SAMPLE_RATE = validate_sample_rate(24000)
37
+ DEFAULT_MODEL_PATH = Path('kokoro-v1_0.pth').absolute()
38
+ DEFAULT_OUTPUT_FILE = Path('output.wav').absolute()
39
+ DEFAULT_LANGUAGE = validate_language('a') # 'a' for American English, 'b' for British English
40
+ DEFAULT_TEXT = "Hello, welcome to this text-to-speech test."
41
+
42
+ # Ensure output directory exists
43
+ DEFAULT_OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
44
+
45
+ # Configure tqdm for better Windows console support
46
+ tqdm.monitor_interval = 0
47
+
48
+ def print_menu():
49
+ """Print the main menu options."""
50
+ print("\n=== Kokoro TTS Menu ===")
51
+ print("1. List available voices")
52
+ print("2. Generate speech")
53
+ print("3. Exit")
54
+ return input("Select an option (1-3): ").strip()
55
+
56
+ def select_voice(voices: List[str]) -> str:
57
+ """Interactive voice selection."""
58
+ print("\nAvailable voices:")
59
+ for i, voice in enumerate(voices, 1):
60
+ print(f"{i}. {voice}")
61
+
62
+ while True:
63
+ try:
64
+ choice = input("\nSelect a voice number (or press Enter for default 'af_bella'): ").strip()
65
+ if not choice:
66
+ return "af_bella"
67
+ choice = int(choice)
68
+ if 1 <= choice <= len(voices):
69
+ return voices[choice - 1]
70
+ print("Invalid choice. Please try again.")
71
+ except ValueError:
72
+ print("Please enter a valid number.")
73
+
74
+ def get_text_input() -> str:
75
+ """Get text input from user."""
76
+ print("\nEnter the text you want to convert to speech")
77
+ print("(or press Enter for default text)")
78
+ text = input("> ").strip()
79
+ return text if text else DEFAULT_TEXT
80
+
81
+ def get_speed() -> float:
82
+ """Get speech speed from user."""
83
+ while True:
84
+ try:
85
+ speed = input("\nEnter speech speed (0.5-2.0, default 1.0): ").strip()
86
+ if not speed:
87
+ return 1.0
88
+ speed = float(speed)
89
+ if 0.5 <= speed <= 2.0:
90
+ return speed
91
+ print("Speed must be between 0.5 and 2.0")
92
+ except ValueError:
93
+ print("Please enter a valid number.")
94
+
95
+ def save_audio_with_retry(audio_data: np.ndarray, sample_rate: int, output_path: PathLike, max_retries: int = 3, retry_delay: float = 1.0) -> bool:
96
+ """
97
+ Attempt to save audio data to file with retry logic.
98
+
99
+ Args:
100
+ audio_data: Audio data as numpy array
101
+ sample_rate: Sample rate in Hz
102
+ output_path: Path to save the audio file
103
+ max_retries: Maximum number of retry attempts
104
+ retry_delay: Delay between retries in seconds
105
+
106
+ Returns:
107
+ True if successful, False otherwise
108
+ """
109
+ # Convert and normalize path to Path object
110
+ output_path = Path(output_path).absolute()
111
+
112
+ # Create parent directory if it doesn't exist
113
+ output_path.parent.mkdir(parents=True, exist_ok=True)
114
+
115
+ # Try to remove the file if it exists to avoid "file in use" issues
116
+ try:
117
+ if output_path.exists():
118
+ print(f"Removing existing file: {output_path}")
119
+ output_path.unlink()
120
+ except Exception as e:
121
+ print(f"Warning: Could not remove existing file: {e}")
122
+ print("This might indicate the file is in use by another program.")
123
+
124
+ for attempt in range(max_retries):
125
+ try:
126
+ # Validate audio data before saving
127
+ if audio_data is None or len(audio_data) == 0:
128
+ raise ValueError("Empty audio data")
129
+
130
+ # Check write permissions for the directory
131
+ if not os.access(str(output_path.parent), os.W_OK):
132
+ raise PermissionError(f"No write permission for directory: {output_path.parent}")
133
+
134
+ # Try to use a temporary file first, then rename it
135
+ temp_path = output_path.with_name(f"temp_{output_path.name}")
136
+
137
+ # Save audio file to temporary location
138
+ print(f"Saving audio to temporary file: {temp_path}")
139
+ sf.write(str(temp_path), audio_data, sample_rate)
140
+
141
+ # If successful, rename to final location
142
+ if temp_path.exists():
143
+ # Remove target file if it exists
144
+ if output_path.exists():
145
+ output_path.unlink()
146
+ # Rename temp file to target file
147
+ temp_path.rename(output_path)
148
+ print(f"Successfully renamed temporary file to: {output_path}")
149
+
150
+ return True
151
+
152
+ except (IOError, PermissionError) as e:
153
+ if attempt < max_retries - 1:
154
+ print(f"\nFailed to save audio (attempt {attempt + 1}/{max_retries}): {e}")
155
+ print("The output file might be in use by another program (e.g., media player).")
156
+ print(f"Please close any programs that might be using '{output_path}'")
157
+ print(f"Retrying in {retry_delay} seconds...")
158
+ time.sleep(retry_delay)
159
+ else:
160
+ print(f"\nError: Could not save audio after {max_retries} attempts: {e}")
161
+ print(f"Please ensure '{output_path}' is not open in any other program and try again.")
162
+ print(f"You might need to restart your computer if the file remains locked.")
163
+ return False
164
+ except Exception as e:
165
+ print(f"\nUnexpected error saving audio: {type(e).__name__}: {e}")
166
+ if attempt < max_retries - 1:
167
+ print(f"Retrying in {retry_delay} seconds...")
168
+ time.sleep(retry_delay)
169
+ else:
170
+ return False
171
+ finally:
172
+ # Clean up temp file if it exists and we failed
173
+ try:
174
+ temp_path = output_path.with_name(f"temp_{output_path.name}")
175
+ if temp_path.exists():
176
+ temp_path.unlink()
177
+ except Exception:
178
+ pass
179
+
180
+ return False
181
+
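# Illustrative sketch (editorial addition, not part of the committed file):
# save_audio_with_retry() works on any float numpy array; here a one-second
# 440 Hz sine tone stands in for generated speech. SAMPLE_RATE, Path and np
# are the names defined earlier in this file.
t = np.linspace(0, 1.0, SAMPLE_RATE, endpoint=False)
tone = 0.2 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
save_audio_with_retry(tone, SAMPLE_RATE, Path("sine_test.wav"))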
182
+ def main() -> None:
183
+ try:
184
+ # Set up device safely
185
+ try:
186
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
187
+ except (RuntimeError, AttributeError, ImportError) as e:
188
+ print(f"CUDA initialization error: {e}. Using CPU instead.")
189
+ device = 'cpu' # Fallback if CUDA check fails
190
+ print(f"Using device: {device}")
191
+
192
+ # Build model
193
+ print("\nInitializing model...")
194
+ with tqdm(total=1, desc="Building model") as pbar:
195
+ model = build_model(DEFAULT_MODEL_PATH, device)
196
+ pbar.update(1)
197
+
198
+ # Cache for voices to avoid redundant calls
199
+ voices_cache = None
200
+
201
+ while True:
202
+ choice = print_menu()
203
+
204
+ if choice == "1":
205
+ # List voices
206
+ voices_cache = list_available_voices()
207
+ print("\nAvailable voices:")
208
+ for voice in voices_cache:
209
+ print(f"- {voice}")
210
+
211
+ elif choice == "2":
212
+ # Generate speech
213
+ # Use cached voices if available
214
+ if voices_cache is None:
215
+ voices_cache = list_available_voices()
216
+
217
+ if not voices_cache:
218
+ print("No voices found! Please check the voices directory.")
219
+ continue
220
+
221
+ # Get user inputs
222
+ voice = select_voice(voices_cache)
223
+ text = get_text_input()
224
+
225
+ # Validate text (don't allow extremely long inputs)
226
+ if len(text) > 10000: # Reasonable limit for text length
227
+ print("Text is too long. Please enter a shorter text.")
228
+ continue
229
+
230
+ speed = get_speed()
231
+
232
+ print(f"\nGenerating speech for: '{text}'")
233
+ print(f"Using voice: {voice}")
234
+ print(f"Speed: {speed}x")
235
+
236
+ # Generate speech
237
+ all_audio = []
238
+ # Use Path object for consistent path handling
239
+ voice_path = Path("voices").absolute() / f"{voice}.pt"
240
+
241
+ # Verify voice file exists
242
+ if not voice_path.exists():
243
+ print(f"Error: Voice file not found: {voice_path}")
244
+ continue
245
+
246
+ # Set a timeout for generation with per-segment timeout
247
+ max_gen_time = 300 # 5 minutes max total
248
+ max_segment_time = 60 # 60 seconds max per segment
249
+ start_time = time.time()
250
+ segment_start_time = start_time
251
+
252
+ try:
253
+ # Setup watchdog timer for overall process
254
+ import threading
255
+ generation_complete = False
256
+
257
+ def watchdog_timer():
258
+ if not generation_complete:
259
+ print("\nWatchdog: Generation taking too long, process will be cancelled")
260
+ # Can't directly interrupt generator, but this will inform user
261
+
262
+ # Start watchdog timer
263
+ watchdog = threading.Timer(max_gen_time, watchdog_timer)
264
+ watchdog.daemon = True # Don't prevent program exit
265
+ watchdog.start()
266
+
267
+ # Initialize generator
268
+ try:
269
+ generator = model(text, voice=voice_path, speed=speed, split_pattern=r'\n+')
270
+ except (ValueError, TypeError, RuntimeError) as e:
271
+ print(f"Error initializing speech generator: {e}")
272
+ watchdog.cancel()
273
+ continue
274
+ except Exception as e:
275
+ print(f"Unexpected error initializing generator: {type(e).__name__}: {e}")
276
+ watchdog.cancel()
277
+ continue
278
+
279
+ # Process segments
280
+ with tqdm(desc="Generating speech") as pbar:
281
+ for gs, ps, audio in generator:
282
+ # Check overall timeout
283
+ current_time = time.time()
284
+ if current_time - start_time > max_gen_time:
285
+ print("\nWarning: Total generation time exceeded limit, stopping")
286
+ break
287
+
288
+ # Check per-segment timeout
289
+ segment_elapsed = current_time - segment_start_time
290
+ if segment_elapsed > max_segment_time:
291
+ print(f"\nWarning: Segment took too long ({segment_elapsed:.1f}s), stopping")
292
+ break
293
+
294
+ # Reset segment timer
295
+ segment_start_time = current_time
296
+
297
+ # Process audio if available
298
+ if audio is not None:
299
+ # Only convert if it's a numpy array, not if already tensor
300
+ audio_tensor = audio if isinstance(audio, torch.Tensor) else torch.from_numpy(audio).float()
301
+
302
+ all_audio.append(audio_tensor)
303
+ print(f"\nGenerated segment: {gs}")
304
+ if ps: # Only print phonemes if available
305
+ print(f"Phonemes: {ps}")
306
+ pbar.update(1)
307
+
308
+ # Mark generation as complete (for watchdog)
309
+ generation_complete = True
310
+ watchdog.cancel()
311
+
312
+ except ValueError as e:
313
+ print(f"Value error during speech generation: {e}")
314
+ except RuntimeError as e:
315
+ print(f"Runtime error during speech generation: {e}")
316
+ # If CUDA out of memory, provide more helpful message
317
+ if "CUDA out of memory" in str(e):
318
+ print("CUDA out of memory error - try using a shorter text or switching to CPU")
319
+ except KeyError as e:
320
+ print(f"Key error during speech generation: {e}")
321
+ print("This might be caused by a missing voice configuration")
322
+ except FileNotFoundError as e:
323
+ print(f"File not found: {e}")
324
+ except Exception as e:
325
+ print(f"Unexpected error during speech generation: {type(e).__name__}: {e}")
326
+ import traceback
327
+ traceback.print_exc()
328
+
329
+ # Save audio
330
+ if all_audio:
331
+ try:
332
+ # Handle single segment case without concatenation
333
+ if len(all_audio) == 1:
334
+ final_audio = all_audio[0]
335
+ else:
336
+ try:
337
+ final_audio = torch.cat(all_audio, dim=0)
338
+ except RuntimeError as e:
339
+ print(f"Error concatenating audio segments: {e}")
340
+ continue
341
+
342
+ # Use consistent Path object
343
+ output_path = Path(DEFAULT_OUTPUT_FILE)
344
+ if save_audio_with_retry(final_audio.numpy(), SAMPLE_RATE, output_path):
345
+ print(f"\nAudio saved to {output_path}")
346
+ # Play a system beep to indicate completion
347
+ try:
348
+ print('\a') # ASCII bell - should make a sound on most systems
349
+ except Exception:
350
+ pass
351
+ else:
352
+ print("Failed to save audio file")
353
+ except Exception as e:
354
+ print(f"Error processing audio: {type(e).__name__}: {e}")
355
+ else:
356
+ print("Error: Failed to generate audio")
357
+
358
+ elif choice == "3":
359
+ print("\nGoodbye!")
360
+ break
361
+
362
+ else:
363
+ print("\nInvalid choice. Please try again.")
364
+
365
+ except Exception as e:
366
+ print(f"Error in main: {e}")
367
+ import traceback
368
+ traceback.print_exc()
369
+ finally:
370
+ # Comprehensive cleanup with error handling
371
+ try:
372
+ print("\nPerforming cleanup...")
373
+
374
+ # Ensure model is properly released
375
+ if 'model' in locals() and model is not None:
376
+ print("Cleaning up model resources...")
377
+ # First clear any references to voice models
378
+ if hasattr(model, 'voices'):
379
+ try:
380
+ voices_count = len(model.voices)
381
+ model.voices.clear()
382
+ print(f"Cleared {voices_count} voice references")
383
+ except Exception as voice_error:
384
+ print(f"Error clearing voice references: {voice_error}")
385
+
386
+ # Clear any other model attributes that might hold references
387
+ try:
388
+ for attr in list(model.__dict__.keys()):
389
+ if hasattr(model, attr) and not attr.startswith('__'):
390
+ try:
391
+ delattr(model, attr)
392
+ except Exception:
393
+ pass
394
+ except Exception as attr_error:
395
+ print(f"Error clearing model attributes: {attr_error}")
396
+
397
+ # Then delete the model
398
+ try:
399
+ del model
400
+ model = None
401
+ print("Model reference deleted")
402
+ except Exception as del_error:
403
+ print(f"Error deleting model: {del_error}")
404
+
405
+ # Clean up voice cache
406
+ if 'voices_cache' in locals() and voices_cache is not None:
407
+ try:
408
+ voices_cache.clear()
409
+ voices_cache = None
410
+ print("Voice cache cleared")
411
+ except Exception as cache_error:
412
+ print(f"Error clearing voice cache: {cache_error}")
413
+
414
+ # Clean up any CUDA resources
415
+ if torch.cuda.is_available():
416
+ try:
417
+ print("Cleaning up CUDA resources...")
418
+ torch.cuda.empty_cache()
419
+ print("CUDA cache emptied")
420
+ except Exception as cuda_error:
421
+ print(f"Error clearing CUDA cache: {cuda_error}")
422
+
423
+ # Make sure patched functions are restored
424
+ try:
425
+ from models import _cleanup_monkey_patches
426
+ _cleanup_monkey_patches()
427
+ print("Monkey patches restored")
428
+ except Exception as patch_error:
429
+ print(f"Error restoring monkey patches: {patch_error}")
430
+
431
+ # Final garbage collection
432
+ try:
433
+ import gc
434
+ gc.collect()
435
+ print("Garbage collection completed")
436
+ except Exception as gc_error:
437
+ print(f"Error during garbage collection: {gc_error}")
438
+
439
+ print("Cleanup completed")
440
+
441
+ except Exception as e:
442
+ print(f"Error during cleanup: {type(e).__name__}: {e}")
443
+ import traceback
444
+ traceback.print_exc()
445
+
446
+ if __name__ == "__main__":
447
+ main()