Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +4 -0
- .gitignore +40 -0
- .gradio/certificate.pem +31 -0
- LICENSE +190 -0
- README.md +364 -7
- gradio_interface.py +536 -0
- models.py +651 -0
- outputs/tts_20250608_125559.mp3 +3 -0
- outputs/tts_20250608_125559.wav +3 -0
- outputs/tts_20250608_125703.mp3 +3 -0
- outputs/tts_20250608_125703.wav +3 -0
- requirements.txt +13 -0
- speed_dial.py +179 -0
- test.py +2 -0
- tts_demo.py +447 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
outputs/tts_20250608_125559.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
outputs/tts_20250608_125559.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
outputs/tts_20250608_125703.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
outputs/tts_20250608_125703.wav filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual Environment
|
| 24 |
+
venv/
|
| 25 |
+
ENV/
|
| 26 |
+
|
| 27 |
+
# IDE
|
| 28 |
+
.idea/
|
| 29 |
+
.vscode/
|
| 30 |
+
*.swp
|
| 31 |
+
*.swo
|
| 32 |
+
|
| 33 |
+
# Project specific
|
| 34 |
+
output*.wav
|
| 35 |
+
*.pth
|
| 36 |
+
*.onnx
|
| 37 |
+
voices/
|
| 38 |
+
voices/*.pt
|
| 39 |
+
voices/**/*.pt
|
| 40 |
+
config.json
|
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
LICENSE
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
Copyright 2025 PierrunoYT (Kokoro TTS Local)
|
| 179 |
+
|
| 180 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 181 |
+
you may not use this file except in compliance with the License.
|
| 182 |
+
You may obtain a copy of the License at
|
| 183 |
+
|
| 184 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 185 |
+
|
| 186 |
+
Unless required by applicable law or agreed to in writing, software
|
| 187 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 188 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 189 |
+
See the License for the specific language governing permissions and
|
| 190 |
+
limitations under the License.
|
README.md
CHANGED
|
@@ -1,12 +1,369 @@
|
|
| 1 |
---
|
| 2 |
-
title: Kokoro
|
| 3 |
-
|
| 4 |
-
colorFrom: purple
|
| 5 |
-
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.33.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
---
|
|
|
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Kokoro-TTS-Local
|
| 3 |
+
app_file: gradio_interface.py
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
sdk_version: 5.33.0
|
|
|
|
|
|
|
| 6 |
---
|
| 7 |
+
# Kokoro TTS Local
|
| 8 |
|
| 9 |
+
A local implementation of the Kokoro Text-to-Speech model, featuring dynamic module loading, automatic dependency management, and a web interface.
|
| 10 |
+
|
| 11 |
+
## Features
|
| 12 |
+
|
| 13 |
+
- Local text-to-speech synthesis using the Kokoro-82M model
|
| 14 |
+
- Multiple voice support with easy voice selection (54 voices available across 8 languages)
|
| 15 |
+
- Automatic model and voice downloading from Hugging Face
|
| 16 |
+
- Phoneme output support and visualization
|
| 17 |
+
- Interactive CLI and web interface
|
| 18 |
+
- Voice listing functionality
|
| 19 |
+
- Cross-platform support (Windows, Linux, macOS)
|
| 20 |
+
- Real-time generation progress display
|
| 21 |
+
- Multiple output formats (WAV, MP3, AAC)
|
| 22 |
+
|
| 23 |
+
## Prerequisites
|
| 24 |
+
|
| 25 |
+
- Python 3.8 or higher
|
| 26 |
+
- FFmpeg (optional, for MP3/AAC conversion)
|
| 27 |
+
- CUDA-compatible GPU (optional, for faster generation)
|
| 28 |
+
- Git (for version control and package management)
|
| 29 |
+
|
| 30 |
+
## Installation
|
| 31 |
+
|
| 32 |
+
1. Clone the repository and create a Python virtual environment:
|
| 33 |
+
```bash
|
| 34 |
+
# Windows
|
| 35 |
+
python -m venv venv
|
| 36 |
+
.\venv\Scripts\activate
|
| 37 |
+
|
| 38 |
+
# Linux/macOS
|
| 39 |
+
python3 -m venv venv
|
| 40 |
+
source venv/bin/activate
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
2. Install dependencies:
|
| 44 |
+
```bash
|
| 45 |
+
pip install -r requirements.txt
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
**Alternative Installation (Simplified):**
|
| 49 |
+
For a simpler setup, you can also install the official Kokoro package directly:
|
| 50 |
+
```bash
|
| 51 |
+
pip install kokoro>=0.9.2 soundfile
|
| 52 |
+
apt-get install espeak-ng # On Linux
|
| 53 |
+
# or brew install espeak # On macOS
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
3. (Optional) For GPU acceleration, install PyTorch with CUDA support:
|
| 57 |
+
```bash
|
| 58 |
+
# For CUDA 11.8
|
| 59 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 60 |
+
|
| 61 |
+
# For CUDA 12.1
|
| 62 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
| 63 |
+
|
| 64 |
+
# For CUDA 12.6
|
| 65 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
|
| 66 |
+
|
| 67 |
+
# For CUDA 12.8 (for RTX 50-series cards)
|
| 68 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
You can verify CUDA support is enabled with:
|
| 72 |
+
```python
|
| 73 |
+
import torch
|
| 74 |
+
print(torch.cuda.is_available()) # Should print True if CUDA is available
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
The system will automatically download required models and voice files on first run.
|
| 78 |
+
|
| 79 |
+
## Usage
|
| 80 |
+
|
| 81 |
+
You can use either the command-line interface or the web interface:
|
| 82 |
+
|
| 83 |
+
### Command Line Interface
|
| 84 |
+
|
| 85 |
+
Run the interactive CLI:
|
| 86 |
+
```bash
|
| 87 |
+
python tts_demo.py
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
The CLI provides an interactive menu with the following options:
|
| 91 |
+
1. List available voices - Shows all available voice options
|
| 92 |
+
2. Generate speech - Interactive process to:
|
| 93 |
+
- Select a voice from the numbered list
|
| 94 |
+
- Enter text to convert to speech
|
| 95 |
+
- Adjust speech speed (0.5-2.0)
|
| 96 |
+
3. Exit - Quit the program
|
| 97 |
+
|
| 98 |
+
Example session:
|
| 99 |
+
```
|
| 100 |
+
=== Kokoro TTS Menu ===
|
| 101 |
+
1. List available voices
|
| 102 |
+
2. Generate speech
|
| 103 |
+
3. Exit
|
| 104 |
+
Select an option (1-3): 2
|
| 105 |
+
|
| 106 |
+
Available voices:
|
| 107 |
+
1. af_alloy
|
| 108 |
+
2. af_aoede
|
| 109 |
+
3. af_bella
|
| 110 |
+
...
|
| 111 |
+
|
| 112 |
+
Select a voice number (or press Enter for default 'af_bella'): 3
|
| 113 |
+
|
| 114 |
+
Enter the text you want to convert to speech
|
| 115 |
+
(or press Enter for default text)
|
| 116 |
+
> Hello, world!
|
| 117 |
+
|
| 118 |
+
Enter speech speed (0.5-2.0, default 1.0): 1.2
|
| 119 |
+
|
| 120 |
+
Generating speech for: 'Hello, world!'
|
| 121 |
+
Using voice: af_bella
|
| 122 |
+
Speed: 1.2x
|
| 123 |
+
...
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
### Web Interface
|
| 127 |
+
|
| 128 |
+
For a more user-friendly experience, launch the web interface:
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
python gradio_interface.py
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
Then open your browser to the URL shown in the console (typically http://localhost:7860).
|
| 135 |
+
|
| 136 |
+
The web interface provides:
|
| 137 |
+
- Easy voice selection from a dropdown menu
|
| 138 |
+
- Text input field with examples
|
| 139 |
+
- Real-time generation progress
|
| 140 |
+
- Audio playback in the browser
|
| 141 |
+
- Multiple output format options (WAV, MP3, AAC)
|
| 142 |
+
- Download options for generated audio
|
| 143 |
+
|
| 144 |
+
## Available Voices
|
| 145 |
+
|
| 146 |
+
The system includes 54 different voices across 8 languages:
|
| 147 |
+
|
| 148 |
+
### 🇺🇸 American English (20 voices)
|
| 149 |
+
**Language code: 'a'**
|
| 150 |
+
|
| 151 |
+
**Female voices (af_*):**
|
| 152 |
+
- af_heart: ❤️ Premium quality voice (Grade A)
|
| 153 |
+
- af_alloy: Clear and professional (Grade C)
|
| 154 |
+
- af_aoede: Smooth and melodic (Grade C+)
|
| 155 |
+
- af_bella: 🔥 Warm and friendly (Grade A-)
|
| 156 |
+
- af_jessica: Natural and engaging (Grade D)
|
| 157 |
+
- af_kore: Bright and energetic (Grade C+)
|
| 158 |
+
- af_nicole: 🎧 Professional and articulate (Grade B-)
|
| 159 |
+
- af_nova: Modern and dynamic (Grade C)
|
| 160 |
+
- af_river: Soft and flowing (Grade D)
|
| 161 |
+
- af_sarah: Casual and approachable (Grade C+)
|
| 162 |
+
- af_sky: Light and airy (Grade C-)
|
| 163 |
+
|
| 164 |
+
**Male voices (am_*):**
|
| 165 |
+
- am_adam: Strong and confident (Grade F+)
|
| 166 |
+
- am_echo: Resonant and clear (Grade D)
|
| 167 |
+
- am_eric: Professional and authoritative (Grade D)
|
| 168 |
+
- am_fenrir: Deep and powerful (Grade C+)
|
| 169 |
+
- am_liam: Friendly and conversational (Grade D)
|
| 170 |
+
- am_michael: Warm and trustworthy (Grade C+)
|
| 171 |
+
- am_onyx: Rich and sophisticated (Grade D)
|
| 172 |
+
- am_puck: Playful and energetic (Grade C+)
|
| 173 |
+
- am_santa: Holiday-themed voice (Grade D-)
|
| 174 |
+
|
| 175 |
+
### 🇬🇧 British English (8 voices)
|
| 176 |
+
**Language code: 'b'**
|
| 177 |
+
|
| 178 |
+
**Female voices (bf_*):**
|
| 179 |
+
- bf_alice: Refined and elegant (Grade D)
|
| 180 |
+
- bf_emma: Warm and professional (Grade B-)
|
| 181 |
+
- bf_isabella: Sophisticated and clear (Grade C)
|
| 182 |
+
- bf_lily: Sweet and gentle (Grade D)
|
| 183 |
+
|
| 184 |
+
**Male voices (bm_*):**
|
| 185 |
+
- bm_daniel: Polished and professional (Grade D)
|
| 186 |
+
- bm_fable: Storytelling and engaging (Grade C)
|
| 187 |
+
- bm_george: Classic British accent (Grade C)
|
| 188 |
+
- bm_lewis: Modern British accent (Grade D+)
|
| 189 |
+
|
| 190 |
+
### 🇯🇵 Japanese (5 voices)
|
| 191 |
+
**Language code: 'j'**
|
| 192 |
+
|
| 193 |
+
**Female voices (jf_*):**
|
| 194 |
+
- jf_alpha: Standard Japanese female (Grade C+)
|
| 195 |
+
- jf_gongitsune: Based on classic tale (Grade C)
|
| 196 |
+
- jf_nezumi: Mouse bride tale voice (Grade C-)
|
| 197 |
+
- jf_tebukuro: Glove story voice (Grade C)
|
| 198 |
+
|
| 199 |
+
**Male voices (jm_*):**
|
| 200 |
+
- jm_kumo: Spider thread tale voice (Grade C-)
|
| 201 |
+
|
| 202 |
+
### 🇨🇳 Mandarin Chinese (8 voices)
|
| 203 |
+
**Language code: 'z'**
|
| 204 |
+
|
| 205 |
+
**Female voices (zf_*):**
|
| 206 |
+
- zf_xiaobei: Chinese female voice (Grade D)
|
| 207 |
+
- zf_xiaoni: Chinese female voice (Grade D)
|
| 208 |
+
- zf_xiaoxiao: Chinese female voice (Grade D)
|
| 209 |
+
- zf_xiaoyi: Chinese female voice (Grade D)
|
| 210 |
+
|
| 211 |
+
**Male voices (zm_*):**
|
| 212 |
+
- zm_yunjian: Chinese male voice (Grade D)
|
| 213 |
+
- zm_yunxi: Chinese male voice (Grade D)
|
| 214 |
+
- zm_yunxia: Chinese male voice (Grade D)
|
| 215 |
+
- zm_yunyang: Chinese male voice (Grade D)
|
| 216 |
+
|
| 217 |
+
### 🇪🇸 Spanish (3 voices)
|
| 218 |
+
**Language code: 'e'**
|
| 219 |
+
|
| 220 |
+
**Female voices (ef_*):**
|
| 221 |
+
- ef_dora: Spanish female voice
|
| 222 |
+
|
| 223 |
+
**Male voices (em_*):**
|
| 224 |
+
- em_alex: Spanish male voice
|
| 225 |
+
- em_santa: Spanish holiday voice
|
| 226 |
+
|
| 227 |
+
### 🇫🇷 French (1 voice)
|
| 228 |
+
**Language code: 'f'**
|
| 229 |
+
|
| 230 |
+
**Female voices (ff_*):**
|
| 231 |
+
- ff_siwis: French female voice (Grade B-)
|
| 232 |
+
|
| 233 |
+
### 🇮🇳 Hindi (4 voices)
|
| 234 |
+
**Language code: 'h'**
|
| 235 |
+
|
| 236 |
+
**Female voices (hf_*):**
|
| 237 |
+
- hf_alpha: Hindi female voice (Grade C)
|
| 238 |
+
- hf_beta: Hindi female voice (Grade C)
|
| 239 |
+
|
| 240 |
+
**Male voices (hm_*):**
|
| 241 |
+
- hm_omega: Hindi male voice (Grade C)
|
| 242 |
+
- hm_psi: Hindi male voice (Grade C)
|
| 243 |
+
|
| 244 |
+
### 🇮🇹 Italian (2 voices)
|
| 245 |
+
**Language code: 'i'**
|
| 246 |
+
|
| 247 |
+
**Female voices (if_*):**
|
| 248 |
+
- if_sara: Italian female voice (Grade C)
|
| 249 |
+
|
| 250 |
+
**Male voices (im_*):**
|
| 251 |
+
- im_nicola: Italian male voice (Grade C)
|
| 252 |
+
|
| 253 |
+
### 🇧🇷 Brazilian Portuguese (3 voices)
|
| 254 |
+
**Language code: 'p'**
|
| 255 |
+
|
| 256 |
+
**Female voices (pf_*):**
|
| 257 |
+
- pf_dora: Portuguese female voice
|
| 258 |
+
|
| 259 |
+
**Male voices (pm_*):**
|
| 260 |
+
- pm_alex: Portuguese male voice
|
| 261 |
+
- pm_santa: Portuguese holiday voice
|
| 262 |
+
|
| 263 |
+
**Note:** Quality grades (A to F) indicate the overall quality based on training data quality and duration. Higher grades generally produce better speech quality.
|
| 264 |
+
|
| 265 |
+
## Project Structure
|
| 266 |
+
|
| 267 |
+
```
|
| 268 |
+
.
|
| 269 |
+
├── .cache/ # Cache directory for downloaded models
|
| 270 |
+
│ └── huggingface/ # Hugging Face model cache
|
| 271 |
+
├── .git/ # Git repository data
|
| 272 |
+
├── .gitignore # Git ignore rules
|
| 273 |
+
├── __pycache__/ # Python cache files
|
| 274 |
+
├── voices/ # Voice model files (downloaded on demand)
|
| 275 |
+
│ └── *.pt # Individual voice files
|
| 276 |
+
├── venv/ # Python virtual environment
|
| 277 |
+
├── outputs/ # Generated audio files directory
|
| 278 |
+
├── LICENSE # Apache 2.0 License file
|
| 279 |
+
├── README.md # Project documentation
|
| 280 |
+
├── models.py # Core TTS model implementation
|
| 281 |
+
├── gradio_interface.py # Web interface implementation
|
| 282 |
+
├── config.json # Model configuration file
|
| 283 |
+
├── requirements.txt # Python dependencies
|
| 284 |
+
└── tts_demo.py # CLI implementation
|
| 285 |
+
```
|
| 286 |
+
|
| 287 |
+
## Model Information
|
| 288 |
+
|
| 289 |
+
The project uses the latest Kokoro model from Hugging Face:
|
| 290 |
+
- Repository: [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)
|
| 291 |
+
- Model file: `kokoro-v1_0.pth` (downloaded automatically)
|
| 292 |
+
- Sample rate: 24kHz
|
| 293 |
+
- Voice files: Located in the `voices/` directory (downloaded automatically)
|
| 294 |
+
- Available voices: 54 voices across 8 languages
|
| 295 |
+
- Languages: American English ('a'), British English ('b'), Japanese ('j'), Mandarin Chinese ('z'), Spanish ('e'), French ('f'), Hindi ('h'), Italian ('i'), Brazilian Portuguese ('p')
|
| 296 |
+
- Model size: 82M parameters
|
| 297 |
+
|
| 298 |
+
## Troubleshooting
|
| 299 |
+
|
| 300 |
+
Common issues and solutions:
|
| 301 |
+
|
| 302 |
+
1. **Model Download Issues**
|
| 303 |
+
- Ensure stable internet connection
|
| 304 |
+
- Check Hugging Face is accessible
|
| 305 |
+
- Verify sufficient disk space
|
| 306 |
+
- Try clearing the `.cache/huggingface` directory
|
| 307 |
+
|
| 308 |
+
2. **CUDA/GPU Issues**
|
| 309 |
+
- Verify CUDA installation with `nvidia-smi`
|
| 310 |
+
- Update GPU drivers
|
| 311 |
+
- Install PyTorch with CUDA support using the appropriate command:
|
| 312 |
+
```bash
|
| 313 |
+
# For CUDA 11.8
|
| 314 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 315 |
+
|
| 316 |
+
# For CUDA 12.1
|
| 317 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
| 318 |
+
|
| 319 |
+
# For CUDA 12.6
|
| 320 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
|
| 321 |
+
|
| 322 |
+
# For CUDA 12.8 (for RTX 50-series cards)
|
| 323 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 324 |
+
```
|
| 325 |
+
- Verify CUDA is available in PyTorch:
|
| 326 |
+
```python
|
| 327 |
+
import torch
|
| 328 |
+
print(torch.cuda.is_available()) # Should print True
|
| 329 |
+
```
|
| 330 |
+
- Fall back to CPU if needed
|
| 331 |
+
|
| 332 |
+
3. **Audio Output Issues**
|
| 333 |
+
- Check system audio settings
|
| 334 |
+
- Verify output directory permissions
|
| 335 |
+
- Install FFmpeg for MP3/AAC support
|
| 336 |
+
- Try different output formats
|
| 337 |
+
|
| 338 |
+
4. **Voice File Issues**
|
| 339 |
+
- Delete and let system redownload voice files
|
| 340 |
+
- Check `voices/` directory permissions
|
| 341 |
+
- Verify voice file integrity
|
| 342 |
+
- Try using a different voice
|
| 343 |
+
|
| 344 |
+
5. **Web Interface Issues**
|
| 345 |
+
- Check port 7860 availability
|
| 346 |
+
- Try different browser
|
| 347 |
+
- Clear browser cache
|
| 348 |
+
- Check network firewall settings
|
| 349 |
+
|
| 350 |
+
For any other issues:
|
| 351 |
+
1. Check the console output for error messages
|
| 352 |
+
2. Verify all prerequisites are installed
|
| 353 |
+
3. Ensure virtual environment is activated
|
| 354 |
+
4. Check system resource usage
|
| 355 |
+
5. Try reinstalling dependencies
|
| 356 |
+
|
| 357 |
+
## Contributing
|
| 358 |
+
|
| 359 |
+
Feel free to contribute by:
|
| 360 |
+
1. Opening issues for bugs or feature requests
|
| 361 |
+
2. Submitting pull requests with improvements
|
| 362 |
+
3. Helping with documentation
|
| 363 |
+
4. Testing different voices and reporting issues
|
| 364 |
+
5. Suggesting new features or optimizations
|
| 365 |
+
6. Testing on different platforms and reporting results
|
| 366 |
+
|
| 367 |
+
## License
|
| 368 |
+
|
| 369 |
+
Apache 2.0 - See LICENSE file for details
|
gradio_interface.py
ADDED
|
@@ -0,0 +1,536 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Kokoro-TTS Local Generator
|
| 3 |
+
-------------------------
|
| 4 |
+
A Gradio interface for the Kokoro-TTS-Local text-to-speech system.
|
| 5 |
+
Supports multiple voices and audio formats, with cross-platform compatibility.
|
| 6 |
+
|
| 7 |
+
Key Features:
|
| 8 |
+
- Multiple voice models support (54 voices across 8 languages)
|
| 9 |
+
- Real-time generation with progress logging
|
| 10 |
+
- WAV, MP3, and AAC output formats
|
| 11 |
+
- Network sharing capabilities
|
| 12 |
+
- Cross-platform compatibility (Windows, macOS, Linux)
|
| 13 |
+
|
| 14 |
+
Dependencies:
|
| 15 |
+
- kokoro: Official Kokoro TTS library
|
| 16 |
+
- gradio: Web interface framework
|
| 17 |
+
- soundfile: Audio file handling
|
| 18 |
+
- pydub: Audio format conversion
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import gradio as gr
|
| 22 |
+
import os
|
| 23 |
+
import sys
|
| 24 |
+
import platform
|
| 25 |
+
from datetime import datetime
|
| 26 |
+
import shutil
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
import soundfile as sf
|
| 29 |
+
from pydub import AudioSegment
|
| 30 |
+
import torch
|
| 31 |
+
import numpy as np
|
| 32 |
+
from typing import Union, List, Optional, Tuple, Dict, Any
|
| 33 |
+
from models import (
|
| 34 |
+
list_available_voices, build_model,
|
| 35 |
+
generate_speech, download_voice_files
|
| 36 |
+
)
|
| 37 |
+
from kokoro import KPipeline
|
| 38 |
+
import speed_dial
|
| 39 |
+
|
| 40 |
+
# Define path type for consistent handling
|
| 41 |
+
PathLike = Union[str, Path]
|
| 42 |
+
|
| 43 |
+
# Configuration validation
|
| 44 |
+
def validate_sample_rate(rate: int) -> int:
    """Return *rate* if it is a supported sample rate, otherwise fall back to 24 kHz.

    Args:
        rate: Requested sample rate in Hz.

    Returns:
        The validated rate, or 24000 when the request is not in the known-good set.
    """
    valid_rates = [16000, 22050, 24000, 44100, 48000]
    if rate in valid_rates:
        return rate
    # Unsupported rate: warn and substitute the known-safe default.
    print(f"Warning: Unusual sample rate {rate}. Valid rates are {valid_rates}")
    return 24000
|
| 51 |
+
|
| 52 |
+
# Global configuration
|
| 53 |
+
CONFIG_FILE = Path("tts_config.json") # Stores user preferences and paths
|
| 54 |
+
DEFAULT_OUTPUT_DIR = Path("outputs") # Directory for generated audio files
|
| 55 |
+
SAMPLE_RATE = validate_sample_rate(24000) # Validated sample rate
|
| 56 |
+
|
| 57 |
+
# Initialize model globally
|
| 58 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 59 |
+
model = None
|
| 60 |
+
|
| 61 |
+
LANG_MAP = {
|
| 62 |
+
"af_": "a", "am_": "a",
|
| 63 |
+
"bf_": "b", "bm_": "b",
|
| 64 |
+
"jf_": "j", "jm_": "j",
|
| 65 |
+
"zf_": "z", "zm_": "z",
|
| 66 |
+
"ef_": "e", "em_": "e",
|
| 67 |
+
"ff_": "f",
|
| 68 |
+
"hf_": "h", "hm_": "h",
|
| 69 |
+
"if_": "i", "im_": "i",
|
| 70 |
+
"pf_": "p", "pm_": "p",
|
| 71 |
+
}
|
| 72 |
+
pipelines = {}
|
| 73 |
+
|
| 74 |
+
def get_available_voices():
    """Return the list of installed voice names, fetching voice files if needed.

    Building the model has the side effect of downloading voices; if the first
    listing comes back empty, one explicit re-download is attempted. Returns an
    empty list on any failure.
    """
    global model
    try:
        if model is None:
            print("Initializing model and downloading voices...")
            model = build_model(None, device)

        found = list_available_voices()
        if not found:
            # First listing produced nothing — retry the download once.
            print("No voices found after initialization. Attempting to download...")
            download_voice_files()
            found = list_available_voices()

        print("Available voices:", found)
        return found
    except Exception as e:
        print(f"Error getting voices: {e}")
        return []
|
| 94 |
+
|
| 95 |
+
def get_pipeline_for_voice(voice_name: str) -> KPipeline:
    """Map a voice name to its language pipeline, creating the pipeline on first use.

    The first three characters of the voice name (e.g. ``af_``, ``jm_``) select
    the language code via LANG_MAP; unknown prefixes default to American
    English ('a'). Pipelines are cached in the module-level ``pipelines`` dict.
    """
    code = LANG_MAP.get(voice_name[:3].lower(), "a")
    cached = pipelines.get(code)
    if cached is None:
        print(f"[INFO] Creating pipeline for lang_code='{code}'")
        cached = KPipeline(lang_code=code, model=True)
        pipelines[code] = cached
    return cached
|
| 105 |
+
|
| 106 |
+
def convert_audio(input_path: PathLike, output_path: PathLike, format: str) -> Optional[PathLike]:
    """Convert audio to specified format.

    Args:
        input_path: Path to input WAV audio file
        output_path: Path to output audio file
        format: Output format ('wav', 'mp3', or 'aac')

    Returns:
        Path to the converted file (for 'wav', the input path is returned
        unchanged since the source is already WAV), or None on error.
    """
    try:
        # Normalize paths
        input_path = Path(input_path).absolute()
        output_path = Path(output_path).absolute()

        # Validate input file
        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # For WAV format, just return the input path — no re-encode needed
        if format.lower() == "wav":
            return input_path

        # Create output directory if it doesn't exist
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Load source audio (pydub requires FFmpeg for the exports below)
        audio = AudioSegment.from_wav(str(input_path))

        # Select proper format and options
        if format.lower() == "mp3":
            audio.export(str(output_path), format="mp3", bitrate="192k")
        elif format.lower() == "aac":
            # FIX: FFmpeg has no muxer named "aac"; a raw AAC stream must be
            # written with the ADTS container, so pass format="adts" and
            # select the aac codec explicitly.
            audio.export(str(output_path), format="adts", codec="aac", bitrate="192k")
        else:
            raise ValueError(f"Unsupported format: {format}")

        # Verify file was created and is non-empty
        if not output_path.exists() or output_path.stat().st_size == 0:
            raise IOError(f"Failed to create {format} file")

        return output_path

    except (IOError, FileNotFoundError, ValueError) as e:
        # Expected failure modes: report and signal failure to the caller
        print(f"Error converting audio: {type(e).__name__}: {e}")
        return None
    except Exception as e:
        # Anything else (e.g. FFmpeg missing) — dump a traceback for debugging
        print(f"Unexpected error converting audio: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return None
|
| 158 |
+
|
| 159 |
+
def generate_tts_with_logs(voice_name: str, text: str, format: str, speed: float = 1.0) -> Optional[PathLike]:
    """Generate TTS audio with progress logging.

    Args:
        voice_name: Name of the voice to use (file stem under ``voices/``)
        text: Text to convert to speech (truncated past 5000 characters)
        format: Output format ('wav', 'mp3', 'aac')
        speed: Playback speed multiplier passed through to the generator

    Returns:
        Path to generated audio file or None on error
    """
    global model

    try:
        # Lazily initialize the shared model on first use
        if model is None:
            print("Initializing model...")
            model = build_model(None, device)

        # Create output directory
        DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        # Validate input text
        if not text or not text.strip():
            raise ValueError("Text input cannot be empty")

        # Limit extremely long texts to prevent memory issues
        MAX_CHARS = 5000
        if len(text) > MAX_CHARS:
            print(f"Warning: Text exceeds {MAX_CHARS} characters. Truncating to prevent memory issues.")
            text = text[:MAX_CHARS] + "..."

        # Generate a unique, timestamped base filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = f"tts_{timestamp}"
        wav_path = DEFAULT_OUTPUT_DIR / f"{base_name}.wav"

        # Generate speech
        print(f"\nGenerating speech for: '{text}'")
        print(f"Using voice: {voice_name}")

        # Validate voice path using Path for consistent handling
        voice_path = Path("voices").absolute() / f"{voice_name}.pt"
        if not voice_path.exists():
            raise FileNotFoundError(f"Voice file not found: {voice_path}")

        try:
            # Known language prefixes go through a language-specific pipeline;
            # anything else falls back to calling the model directly.
            # NOTE(review): this assumes `model` is callable with the same
            # signature as a KPipeline — confirm against build_model's return.
            if voice_name.startswith(tuple(LANG_MAP.keys())):
                pipeline = get_pipeline_for_voice(voice_name)
                generator = pipeline(text, voice=voice_path, speed=speed, split_pattern=r'\n+')
            else:
                generator = model(text, voice=voice_path, speed=speed, split_pattern=r'\n+')

            all_audio = []
            max_segments = 100  # Safety limit for very long texts
            segment_count = 0

            # Each yielded item is (graphemes, phonemes, audio) per text segment
            for gs, ps, audio in generator:
                segment_count += 1
                if segment_count > max_segments:
                    print(f"Warning: Reached maximum segment limit ({max_segments})")
                    break

                if audio is not None:
                    # Normalize numpy output to a float torch tensor
                    if isinstance(audio, np.ndarray):
                        audio = torch.from_numpy(audio).float()
                    all_audio.append(audio)
                    print(f"Generated segment: {gs}")
                    if ps:  # Only print phonemes if available
                        print(f"Phonemes: {ps}")

            if not all_audio:
                raise Exception("No audio generated")
        except Exception as e:
            # Re-wrap with context so the outer handler reports the stage
            raise Exception(f"Error in speech generation: {e}")

        # Combine audio segments and save
        if not all_audio:
            raise Exception("No audio segments were generated")

        # Handle single segment case without concatenation
        if len(all_audio) == 1:
            final_audio = all_audio[0]
        else:
            try:
                final_audio = torch.cat(all_audio, dim=0)
            except RuntimeError as e:
                raise Exception(f"Failed to concatenate audio segments: {e}")

        # Save audio file
        # NOTE(review): .numpy() requires a CPU tensor — assumes the generator
        # yields CPU audio even when the model runs on CUDA; confirm.
        try:
            sf.write(wav_path, final_audio.numpy(), SAMPLE_RATE)
        except Exception as e:
            raise Exception(f"Failed to save audio file: {e}")

        # Convert to requested format if needed
        if format.lower() != "wav":
            output_path = DEFAULT_OUTPUT_DIR / f"{base_name}.{format.lower()}"
            return convert_audio(wav_path, output_path, format.lower())

        return wav_path

    except Exception as e:
        print(f"Error generating speech: {e}")
        import traceback
        traceback.print_exc()
        return None
|
| 266 |
+
|
| 267 |
+
def create_interface(server_name="0.0.0.0", server_port=7860):
    """Create and launch the Gradio interface.

    Args:
        server_name: Interface bind address (0.0.0.0 listens on all interfaces).
        server_port: HTTP port for the Gradio server.

    Returns early (None) if no voices are available. Blocks on launch.
    """

    # Get available voices; without any voices the UI would be unusable
    voices = get_available_voices()
    if not voices:
        print("No voices found! Please check the voices directory.")
        return

    # Get speed dial presets (saved voice/text/format/speed combinations)
    preset_names = speed_dial.get_preset_names()

    # Create interface
    with gr.Blocks(title="Kokoro TTS Generator") as interface:
        gr.Markdown("# Kokoro TTS Generator")

        with gr.Row():
            with gr.Column(scale=2):
                # Main TTS controls
                voice = gr.Dropdown(
                    choices=voices,
                    value=voices[0] if voices else None,
                    label="Voice"
                )
                text = gr.Textbox(
                    lines=3,
                    placeholder="Enter text to convert to speech...",
                    label="Text"
                )
                with gr.Row():
                    format = gr.Radio(
                        choices=["wav", "mp3", "aac"],
                        value="wav",
                        label="Output Format"
                    )
                    speed = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speed"
                    )
                generate = gr.Button("Generate Speech")

            with gr.Column(scale=1):
                # Speed dial section
                gr.Markdown("## Speed Dial")
                preset_dropdown = gr.Dropdown(
                    choices=preset_names,
                    value=preset_names[0] if preset_names else None,
                    label="Saved Presets",
                    interactive=True
                )
                preset_name = gr.Textbox(
                    placeholder="Enter preset name...",
                    label="New Preset Name"
                )
                with gr.Row():
                    load_preset = gr.Button("Load")
                    save_preset = gr.Button("Save Current")
                    delete_preset = gr.Button("Delete")

        # Output section
        output = gr.Audio(label="Generated Audio")

        # Function to load a preset: fills the four main controls
        def load_preset_fn(preset_name):
            if not preset_name:
                return None, None, None, None

            preset = speed_dial.get_preset(preset_name)
            if not preset:
                return None, None, None, None

            return preset["voice"], preset["text"], preset["format"], preset["speed"]

        # Function to save a preset; returns a dropdown update
        def save_preset_fn(name, voice, text, format, speed):
            if not name or not voice or not text:
                return gr.update(value="Please provide a name, voice, and text")

            success = speed_dial.save_preset(name, voice, text, format, speed)

            # Update the dropdown with the new preset list
            preset_names = speed_dial.get_preset_names()

            if success:
                return gr.update(choices=preset_names, value=name)
            else:
                return gr.update(choices=preset_names)

        # Function to delete a preset; returns a dropdown update
        def delete_preset_fn(name):
            if not name:
                return gr.update(value="Please select a preset to delete")

            success = speed_dial.delete_preset(name)

            # Update the dropdown with the new preset list
            preset_names = speed_dial.get_preset_names()

            if success:
                return gr.update(choices=preset_names, value=None)
            else:
                return gr.update(choices=preset_names)

        # Connect the buttons to their functions
        load_preset.click(
            fn=load_preset_fn,
            inputs=preset_dropdown,
            outputs=[voice, text, format, speed]
        )

        save_preset.click(
            fn=save_preset_fn,
            inputs=[preset_name, voice, text, format, speed],
            outputs=preset_dropdown
        )

        delete_preset.click(
            fn=delete_preset_fn,
            inputs=preset_dropdown,
            outputs=preset_dropdown
        )

        # Connect the generate button to the TTS backend
        generate.click(
            fn=generate_tts_with_logs,
            inputs=[voice, text, format, speed],
            outputs=output
        )

    # Launch interface
    # NOTE(review): share=True creates a public gradio.live tunnel — confirm
    # this exposure is intended for all deployments.
    interface.launch(
        server_name=server_name,
        server_port=server_port,
        share=True
    )
|
| 405 |
+
|
| 406 |
+
def cleanup_resources():
    """Properly clean up resources when the application exits.

    Best-effort teardown: releases voice tensors, drops model attributes that
    may hold (CUDA) tensors, clears the CUDA cache, restores monkey patches,
    and forces a garbage-collection pass. Every step is wrapped so a failure
    in one never blocks the rest. Safe to call more than once (model is None
    after the first successful run).
    """
    global model

    try:
        print("Cleaning up resources...")

        # Clean up model resources
        if model is not None:
            print("Releasing model resources...")

            # Clear voice dictionary to release memory
            if hasattr(model, 'voices') and model.voices is not None:
                try:
                    voice_count = len(model.voices)
                    for voice_name in list(model.voices.keys()):
                        try:
                            # Release each voice explicitly
                            model.voices[voice_name] = None
                        except:
                            pass
                    model.voices.clear()
                    print(f"Cleared {voice_count} voice references")
                except Exception as ve:
                    print(f"Error clearing voices: {type(ve).__name__}: {ve}")

            # Clear model attributes that might hold tensors
            for attr_name in dir(model):
                if not attr_name.startswith('__') and hasattr(model, attr_name):
                    try:
                        attr = getattr(model, attr_name)
                        # Handle specific tensor attributes
                        if isinstance(attr, torch.Tensor):
                            if attr.is_cuda:
                                print(f"Releasing CUDA tensor: {attr_name}")
                            setattr(model, attr_name, None)
                        elif hasattr(attr, 'to'):  # Module or Tensor-like object
                            setattr(model, attr_name, None)
                    except:
                        # setattr can fail on properties/slots — ignore
                        pass

            # Delete model reference (del removes the global binding; the
            # following assignment re-creates it as None)
            try:
                del model
                model = None
                print("Model reference deleted")
            except Exception as me:
                print(f"Error deleting model: {type(me).__name__}: {me}")

        # Clear CUDA memory explicitly
        if torch.cuda.is_available():
            try:
                # Get initial memory usage (reporting only — failures ignored)
                try:
                    initial = torch.cuda.memory_allocated()
                    initial_mb = initial / (1024 * 1024)
                    print(f"CUDA memory before cleanup: {initial_mb:.2f} MB")
                except:
                    pass

                # Free memory
                print("Clearing CUDA cache...")
                torch.cuda.empty_cache()

                # Force synchronization
                try:
                    torch.cuda.synchronize()
                except:
                    pass

                # Get final memory usage
                # NOTE: relies on `initial` from the earlier try block; if that
                # failed, the NameError is swallowed by this except.
                try:
                    final = torch.cuda.memory_allocated()
                    final_mb = final / (1024 * 1024)
                    freed_mb = (initial - final) / (1024 * 1024)
                    print(f"CUDA memory after cleanup: {final_mb:.2f} MB (freed {freed_mb:.2f} MB)")
                except:
                    pass
            except Exception as ce:
                print(f"Error clearing CUDA memory: {type(ce).__name__}: {ce}")

        # Restore original functions that models.py monkey-patched
        try:
            from models import _cleanup_monkey_patches
            _cleanup_monkey_patches()
            print("Monkey patches restored")
        except Exception as pe:
            print(f"Error restoring monkey patches: {type(pe).__name__}: {pe}")

        # Final garbage collection
        try:
            import gc
            collected = gc.collect()
            print(f"Garbage collection completed: {collected} objects collected")
        except Exception as gce:
            print(f"Error during garbage collection: {type(gce).__name__}: {gce}")

        print("Cleanup completed")

    except Exception as e:
        print(f"Error during cleanup: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
|
| 509 |
+
|
| 510 |
+
# Register cleanup for normal exit
import atexit
atexit.register(cleanup_resources)

# Register cleanup for signals
import signal
import sys

def signal_handler(signum, frame):
    """Run the shared cleanup path on termination signals, then exit cleanly."""
    print(f"\nReceived signal {signum}, shutting down...")
    cleanup_resources()
    sys.exit(0)

# Register for common signals (Ctrl-C and polite kill)
for sig in [signal.SIGINT, signal.SIGTERM]:
    try:
        signal.signal(sig, signal_handler)
    except (ValueError, AttributeError):
        # Some signals might not be available on all platforms, and
        # signal.signal raises ValueError when called off the main thread.
        pass
|
| 530 |
+
|
| 531 |
+
if __name__ == "__main__":
    try:
        create_interface()
    finally:
        # Ensure cleanup even if Gradio encounters an error.
        # (cleanup_resources is also registered with atexit; it tolerates
        # being called more than once since `model` is None after the first run.)
        cleanup_resources()
|
models.py
ADDED
|
@@ -0,0 +1,651 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Models module for Kokoro TTS Local"""
|
| 2 |
+
from typing import Optional, Tuple, List
|
| 3 |
+
import torch
|
| 4 |
+
from kokoro import KPipeline
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import codecs
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import numpy as np
|
| 10 |
+
import shutil
|
| 11 |
+
import threading
|
| 12 |
+
|
| 13 |
+
# Set environment variables for proper encoding
|
| 14 |
+
os.environ["PYTHONIOENCODING"] = "utf-8"
|
| 15 |
+
# Disable symlinks warning
|
| 16 |
+
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
|
| 17 |
+
|
| 18 |
+
# Setup for safer monkey-patching
|
| 19 |
+
import atexit
|
| 20 |
+
import signal
|
| 21 |
+
import sys
|
| 22 |
+
|
| 23 |
+
# Track whether patches have been applied
|
| 24 |
+
_patches_applied = {
|
| 25 |
+
'json_load': False,
|
| 26 |
+
'load_voice': False
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
def _cleanup_monkey_patches():
    """Restore original functions that were monkey-patched.

    Each restoration is independently wrapped so a failure in one patch
    never prevents restoring the other.
    """
    try:
        # NOTE: the short-circuit on _patches_applied['json_load'] matters —
        # _original_json_load only exists once patch_json_load() has run.
        if _patches_applied['json_load'] and _original_json_load is not None:
            restore_json_load()
            _patches_applied['json_load'] = False
            print("Restored original json.load function")
    except Exception as e:
        print(f"Warning: Error restoring json.load: {e}")

    try:
        if _patches_applied['load_voice']:
            restore_original_load_voice()
            _patches_applied['load_voice'] = False
            print("Restored original KPipeline.load_voice function")
    except Exception as e:
        print(f"Warning: Error restoring KPipeline.load_voice: {e}")
|
| 46 |
+
|
| 47 |
+
# Register cleanup for normal exit
|
| 48 |
+
atexit.register(_cleanup_monkey_patches)
|
| 49 |
+
|
| 50 |
+
# Register cleanup for signals
|
| 51 |
+
for sig in [signal.SIGINT, signal.SIGTERM]:
|
| 52 |
+
try:
|
| 53 |
+
signal.signal(sig, lambda signum, frame: (
|
| 54 |
+
print(f"\nReceived signal {signum}, cleaning up..."),
|
| 55 |
+
_cleanup_monkey_patches(),
|
| 56 |
+
sys.exit(1)
|
| 57 |
+
))
|
| 58 |
+
except (ValueError, AttributeError):
|
| 59 |
+
# Some signals might not be available on all platforms
|
| 60 |
+
pass
|
| 61 |
+
|
| 62 |
+
# List of available voice files (54 voices across 8 languages)
|
| 63 |
+
VOICE_FILES = [
|
| 64 |
+
# American English Female voices (11 voices)
|
| 65 |
+
"af_heart.pt", "af_alloy.pt", "af_aoede.pt", "af_bella.pt", "af_jessica.pt",
|
| 66 |
+
"af_kore.pt", "af_nicole.pt", "af_nova.pt", "af_river.pt", "af_sarah.pt", "af_sky.pt",
|
| 67 |
+
|
| 68 |
+
# American English Male voices (9 voices)
|
| 69 |
+
"am_adam.pt", "am_echo.pt", "am_eric.pt", "am_fenrir.pt", "am_liam.pt",
|
| 70 |
+
"am_michael.pt", "am_onyx.pt", "am_puck.pt", "am_santa.pt",
|
| 71 |
+
|
| 72 |
+
# British English Female voices (4 voices)
|
| 73 |
+
"bf_alice.pt", "bf_emma.pt", "bf_isabella.pt", "bf_lily.pt",
|
| 74 |
+
|
| 75 |
+
# British English Male voices (4 voices)
|
| 76 |
+
"bm_daniel.pt", "bm_fable.pt", "bm_george.pt", "bm_lewis.pt",
|
| 77 |
+
|
| 78 |
+
# Japanese voices (5 voices)
|
| 79 |
+
"jf_alpha.pt", "jf_gongitsune.pt", "jf_nezumi.pt", "jf_tebukuro.pt", "jm_kumo.pt",
|
| 80 |
+
|
| 81 |
+
# Mandarin Chinese voices (8 voices)
|
| 82 |
+
"zf_xiaobei.pt", "zf_xiaoni.pt", "zf_xiaoxiao.pt", "zf_xiaoyi.pt",
|
| 83 |
+
"zm_yunjian.pt", "zm_yunxi.pt", "zm_yunxia.pt", "zm_yunyang.pt",
|
| 84 |
+
|
| 85 |
+
# Spanish voices (3 voices)
|
| 86 |
+
"ef_dora.pt", "em_alex.pt", "em_santa.pt",
|
| 87 |
+
|
| 88 |
+
# French voices (1 voice)
|
| 89 |
+
"ff_siwis.pt",
|
| 90 |
+
|
| 91 |
+
# Hindi voices (4 voices)
|
| 92 |
+
"hf_alpha.pt", "hf_beta.pt", "hm_omega.pt", "hm_psi.pt",
|
| 93 |
+
|
| 94 |
+
# Italian voices (2 voices)
|
| 95 |
+
"if_sara.pt", "im_nicola.pt",
|
| 96 |
+
|
| 97 |
+
# Brazilian Portuguese voices (3 voices)
|
| 98 |
+
"pf_dora.pt", "pm_alex.pt", "pm_santa.pt"
|
| 99 |
+
]
|
| 100 |
+
|
| 101 |
+
# Language code mapping for different languages
|
| 102 |
+
LANGUAGE_CODES = {
|
| 103 |
+
'a': 'American English',
|
| 104 |
+
'b': 'British English',
|
| 105 |
+
'j': 'Japanese',
|
| 106 |
+
'z': 'Mandarin Chinese',
|
| 107 |
+
'e': 'Spanish',
|
| 108 |
+
'f': 'French',
|
| 109 |
+
'h': 'Hindi',
|
| 110 |
+
'i': 'Italian',
|
| 111 |
+
'p': 'Brazilian Portuguese'
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
# Patch KPipeline's load_voice method to use weights_only=False
|
| 115 |
+
original_load_voice = KPipeline.load_voice
|
| 116 |
+
|
| 117 |
+
def patched_load_voice(self, voice_path):
|
| 118 |
+
"""Load voice model with weights_only=False for compatibility"""
|
| 119 |
+
if not os.path.exists(voice_path):
|
| 120 |
+
raise FileNotFoundError(f"Voice file not found: {voice_path}")
|
| 121 |
+
voice_name = Path(voice_path).stem
|
| 122 |
+
try:
|
| 123 |
+
voice_model = torch.load(voice_path, weights_only=False)
|
| 124 |
+
if voice_model is None:
|
| 125 |
+
raise ValueError(f"Failed to load voice model from {voice_path}")
|
| 126 |
+
# Ensure device is set
|
| 127 |
+
if not hasattr(self, 'device'):
|
| 128 |
+
self.device = 'cpu'
|
| 129 |
+
# Move model to device and store in voices dictionary
|
| 130 |
+
self.voices[voice_name] = voice_model.to(self.device)
|
| 131 |
+
return self.voices[voice_name]
|
| 132 |
+
except Exception as e:
|
| 133 |
+
print(f"Error loading voice {voice_name}: {e}")
|
| 134 |
+
raise
|
| 135 |
+
|
| 136 |
+
# Apply the patch
|
| 137 |
+
KPipeline.load_voice = patched_load_voice
|
| 138 |
+
_patches_applied['load_voice'] = True
|
| 139 |
+
|
| 140 |
+
# Store original function for restoration if needed
|
| 141 |
+
def restore_original_load_voice():
|
| 142 |
+
global _patches_applied
|
| 143 |
+
if _patches_applied['load_voice']:
|
| 144 |
+
KPipeline.load_voice = original_load_voice
|
| 145 |
+
_patches_applied['load_voice'] = False
|
| 146 |
+
|
| 147 |
+
def patch_json_load():
|
| 148 |
+
"""Patch json.load to handle UTF-8 encoded files with special characters"""
|
| 149 |
+
global _patches_applied, _original_json_load
|
| 150 |
+
original_load = json.load
|
| 151 |
+
_original_json_load = original_load # Store for restoration
|
| 152 |
+
|
| 153 |
+
def custom_load(fp, *args, **kwargs):
|
| 154 |
+
try:
|
| 155 |
+
# Try reading with UTF-8 encoding
|
| 156 |
+
if hasattr(fp, 'buffer'):
|
| 157 |
+
content = fp.buffer.read().decode('utf-8')
|
| 158 |
+
else:
|
| 159 |
+
content = fp.read()
|
| 160 |
+
try:
|
| 161 |
+
return json.loads(content)
|
| 162 |
+
except json.JSONDecodeError as e:
|
| 163 |
+
print(f"JSON parsing error: {e}")
|
| 164 |
+
raise
|
| 165 |
+
except UnicodeDecodeError:
|
| 166 |
+
# If UTF-8 fails, try with utf-8-sig for files with BOM
|
| 167 |
+
fp.seek(0)
|
| 168 |
+
content = fp.read()
|
| 169 |
+
if isinstance(content, bytes):
|
| 170 |
+
content = content.decode('utf-8-sig', errors='replace')
|
| 171 |
+
try:
|
| 172 |
+
return json.loads(content)
|
| 173 |
+
except json.JSONDecodeError as e:
|
| 174 |
+
print(f"JSON parsing error: {e}")
|
| 175 |
+
raise
|
| 176 |
+
|
| 177 |
+
json.load = custom_load
|
| 178 |
+
_patches_applied['json_load'] = True
|
| 179 |
+
return original_load # Return original for restoration
|
| 180 |
+
|
| 181 |
+
# Store the original load function for potential restoration
|
| 182 |
+
_original_json_load = None
|
| 183 |
+
|
| 184 |
+
def restore_json_load():
|
| 185 |
+
"""Restore the original json.load function"""
|
| 186 |
+
global _original_json_load, _patches_applied
|
| 187 |
+
if _original_json_load is not None and _patches_applied['json_load']:
|
| 188 |
+
json.load = _original_json_load
|
| 189 |
+
_original_json_load = None
|
| 190 |
+
_patches_applied['json_load'] = False
|
| 191 |
+
|
| 192 |
+
def load_config(config_path: str) -> dict:
|
| 193 |
+
"""Load configuration file with proper encoding handling"""
|
| 194 |
+
try:
|
| 195 |
+
with codecs.open(config_path, 'r', encoding='utf-8') as f:
|
| 196 |
+
return json.load(f)
|
| 197 |
+
except UnicodeDecodeError:
|
| 198 |
+
# Fallback to utf-8-sig if regular utf-8 fails
|
| 199 |
+
with codecs.open(config_path, 'r', encoding='utf-8-sig') as f:
|
| 200 |
+
return json.load(f)
|
| 201 |
+
|
| 202 |
+
# Initialize espeak-ng
|
| 203 |
+
phonemizer_available = False # Global flag to track if phonemizer is working
|
| 204 |
+
try:
|
| 205 |
+
from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
| 206 |
+
from phonemizer import phonemize
|
| 207 |
+
import espeakng_loader
|
| 208 |
+
|
| 209 |
+
# Make library available first
|
| 210 |
+
library_path = espeakng_loader.get_library_path()
|
| 211 |
+
data_path = espeakng_loader.get_data_path()
|
| 212 |
+
espeakng_loader.make_library_available()
|
| 213 |
+
|
| 214 |
+
# Set up espeak-ng paths
|
| 215 |
+
EspeakWrapper.library_path = library_path
|
| 216 |
+
EspeakWrapper.data_path = data_path
|
| 217 |
+
|
| 218 |
+
# Verify espeak-ng is working
|
| 219 |
+
try:
|
| 220 |
+
test_phonemes = phonemize('test', language='en-us')
|
| 221 |
+
if test_phonemes:
|
| 222 |
+
phonemizer_available = True
|
| 223 |
+
print("Phonemizer successfully initialized")
|
| 224 |
+
else:
|
| 225 |
+
print("Note: Phonemization returned empty result")
|
| 226 |
+
print("TTS will work, but phoneme visualization will be disabled")
|
| 227 |
+
except Exception as e:
|
| 228 |
+
# Continue without espeak functionality
|
| 229 |
+
print(f"Note: Phonemizer not available: {e}")
|
| 230 |
+
print("TTS will work, but phoneme visualization will be disabled")
|
| 231 |
+
|
| 232 |
+
except ImportError as e:
|
| 233 |
+
print(f"Note: Phonemizer packages not installed: {e}")
|
| 234 |
+
print("TTS will work, but phoneme visualization will be disabled")
|
| 235 |
+
# Rather than automatically installing packages, inform the user
|
| 236 |
+
print("If you want phoneme visualization, manually install required packages:")
|
| 237 |
+
print("pip install espeakng-loader phonemizer-fork")
|
| 238 |
+
|
| 239 |
+
# Initialize pipeline globally with thread safety
|
| 240 |
+
_pipeline = None
|
| 241 |
+
_pipeline_lock = threading.RLock() # Reentrant lock for thread safety
|
| 242 |
+
|
| 243 |
+
def download_voice_files(voice_files=None, repo_version="main", required_count=1):
|
| 244 |
+
"""Download voice files from Hugging Face.
|
| 245 |
+
|
| 246 |
+
Args:
|
| 247 |
+
voice_files: Optional list of voice files to download. If None, download all VOICE_FILES.
|
| 248 |
+
repo_version: Version/tag of the repository to use (default: "main")
|
| 249 |
+
required_count: Minimum number of voices required (default: 1)
|
| 250 |
+
|
| 251 |
+
Returns:
|
| 252 |
+
List of successfully downloaded voice files
|
| 253 |
+
|
| 254 |
+
Raises:
|
| 255 |
+
ValueError: If fewer than required_count voices could be downloaded
|
| 256 |
+
"""
|
| 257 |
+
# Use absolute path for voices directory
|
| 258 |
+
voices_dir = Path(os.path.abspath("voices"))
|
| 259 |
+
voices_dir.mkdir(exist_ok=True)
|
| 260 |
+
|
| 261 |
+
# Import here to avoid startup dependency
|
| 262 |
+
from huggingface_hub import hf_hub_download
|
| 263 |
+
downloaded_voices = []
|
| 264 |
+
failed_voices = []
|
| 265 |
+
|
| 266 |
+
# If specific voice files are requested, use those. Otherwise use all.
|
| 267 |
+
files_to_download = voice_files if voice_files is not None else VOICE_FILES
|
| 268 |
+
total_files = len(files_to_download)
|
| 269 |
+
|
| 270 |
+
print(f"\nDownloading voice files... ({total_files} total files)")
|
| 271 |
+
|
| 272 |
+
# Check for existing voice files first
|
| 273 |
+
existing_files = []
|
| 274 |
+
for voice_file in files_to_download:
|
| 275 |
+
voice_path = voices_dir / voice_file
|
| 276 |
+
if voice_path.exists():
|
| 277 |
+
print(f"Voice file {voice_file} already exists")
|
| 278 |
+
downloaded_voices.append(voice_file)
|
| 279 |
+
existing_files.append(voice_file)
|
| 280 |
+
|
| 281 |
+
# Remove existing files from the download list
|
| 282 |
+
files_to_download = [f for f in files_to_download if f not in existing_files]
|
| 283 |
+
if not files_to_download and downloaded_voices:
|
| 284 |
+
print(f"All required voice files already exist ({len(downloaded_voices)} files)")
|
| 285 |
+
return downloaded_voices
|
| 286 |
+
|
| 287 |
+
# Proceed with downloading missing files
|
| 288 |
+
retry_count = 3
|
| 289 |
+
try:
|
| 290 |
+
import tempfile
|
| 291 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 292 |
+
for voice_file in files_to_download:
|
| 293 |
+
# Full path where the voice file should be
|
| 294 |
+
voice_path = voices_dir / voice_file
|
| 295 |
+
|
| 296 |
+
# Try with retries
|
| 297 |
+
for attempt in range(retry_count):
|
| 298 |
+
try:
|
| 299 |
+
print(f"Downloading {voice_file}... (attempt {attempt+1}/{retry_count})")
|
| 300 |
+
# Download to a temporary location first
|
| 301 |
+
temp_path = hf_hub_download(
|
| 302 |
+
repo_id="hexgrad/Kokoro-82M",
|
| 303 |
+
filename=f"voices/{voice_file}",
|
| 304 |
+
local_dir=temp_dir,
|
| 305 |
+
force_download=True,
|
| 306 |
+
revision=repo_version
|
| 307 |
+
)
|
| 308 |
+
|
| 309 |
+
# Move the file to the correct location
|
| 310 |
+
os.makedirs(os.path.dirname(str(voice_path)), exist_ok=True)
|
| 311 |
+
shutil.copy2(temp_path, str(voice_path)) # Use copy2 instead of move
|
| 312 |
+
|
| 313 |
+
# Verify file integrity
|
| 314 |
+
if os.path.getsize(str(voice_path)) > 0:
|
| 315 |
+
downloaded_voices.append(voice_file)
|
| 316 |
+
print(f"Successfully downloaded {voice_file}")
|
| 317 |
+
break # Success, exit retry loop
|
| 318 |
+
else:
|
| 319 |
+
print(f"Warning: Downloaded file {voice_file} has zero size, retrying...")
|
| 320 |
+
os.remove(str(voice_path)) # Remove invalid file
|
| 321 |
+
if attempt == retry_count - 1:
|
| 322 |
+
failed_voices.append(voice_file)
|
| 323 |
+
except (IOError, OSError, ValueError, FileNotFoundError, ConnectionError) as e:
|
| 324 |
+
print(f"Warning: Failed to download {voice_file} (attempt {attempt+1}): {e}")
|
| 325 |
+
if attempt == retry_count - 1:
|
| 326 |
+
failed_voices.append(voice_file)
|
| 327 |
+
print(f"Error: Failed all {retry_count} attempts to download {voice_file}")
|
| 328 |
+
except Exception as e:
|
| 329 |
+
print(f"Error during voice download process: {e}")
|
| 330 |
+
import traceback
|
| 331 |
+
traceback.print_exc()
|
| 332 |
+
|
| 333 |
+
# Report results
|
| 334 |
+
if failed_voices:
|
| 335 |
+
print(f"Warning: Failed to download {len(failed_voices)} voice files: {', '.join(failed_voices)}")
|
| 336 |
+
|
| 337 |
+
if not downloaded_voices:
|
| 338 |
+
error_msg = "No voice files could be downloaded. Please check your internet connection."
|
| 339 |
+
print(f"Error: {error_msg}")
|
| 340 |
+
raise ValueError(error_msg)
|
| 341 |
+
elif len(downloaded_voices) < required_count:
|
| 342 |
+
error_msg = f"Only {len(downloaded_voices)} voice files could be downloaded, but {required_count} were required."
|
| 343 |
+
print(f"Error: {error_msg}")
|
| 344 |
+
raise ValueError(error_msg)
|
| 345 |
+
else:
|
| 346 |
+
print(f"Successfully processed {len(downloaded_voices)} voice files")
|
| 347 |
+
|
| 348 |
+
return downloaded_voices
|
| 349 |
+
|
| 350 |
+
def build_model(model_path: str, device: str, repo_version: str = "main") -> KPipeline:
|
| 351 |
+
"""Build and return the Kokoro pipeline with proper encoding configuration
|
| 352 |
+
|
| 353 |
+
Args:
|
| 354 |
+
model_path: Path to the model file or None to use default
|
| 355 |
+
device: Device to use ('cuda' or 'cpu')
|
| 356 |
+
repo_version: Version/tag of the repository to use (default: "main")
|
| 357 |
+
|
| 358 |
+
Returns:
|
| 359 |
+
Initialized KPipeline instance
|
| 360 |
+
"""
|
| 361 |
+
global _pipeline, _pipeline_lock
|
| 362 |
+
|
| 363 |
+
# Use a lock for thread safety
|
| 364 |
+
with _pipeline_lock:
|
| 365 |
+
# Double-check pattern to avoid race conditions
|
| 366 |
+
if _pipeline is not None:
|
| 367 |
+
return _pipeline
|
| 368 |
+
|
| 369 |
+
try:
|
| 370 |
+
# Patch json loading before initializing pipeline
|
| 371 |
+
patch_json_load()
|
| 372 |
+
|
| 373 |
+
# Download model if it doesn't exist
|
| 374 |
+
if model_path is None:
|
| 375 |
+
model_path = 'kokoro-v1_0.pth'
|
| 376 |
+
|
| 377 |
+
model_path = os.path.abspath(model_path)
|
| 378 |
+
if not os.path.exists(model_path):
|
| 379 |
+
print(f"Downloading model file {model_path}...")
|
| 380 |
+
try:
|
| 381 |
+
from huggingface_hub import hf_hub_download
|
| 382 |
+
model_path = hf_hub_download(
|
| 383 |
+
repo_id="hexgrad/Kokoro-82M",
|
| 384 |
+
filename="kokoro-v1_0.pth",
|
| 385 |
+
local_dir=".",
|
| 386 |
+
force_download=True,
|
| 387 |
+
revision=repo_version
|
| 388 |
+
)
|
| 389 |
+
print(f"Model downloaded to {model_path}")
|
| 390 |
+
except Exception as e:
|
| 391 |
+
print(f"Error downloading model: {e}")
|
| 392 |
+
raise ValueError(f"Could not download model: {e}") from e
|
| 393 |
+
|
| 394 |
+
# Download config if it doesn't exist
|
| 395 |
+
config_path = os.path.abspath("config.json")
|
| 396 |
+
if not os.path.exists(config_path):
|
| 397 |
+
print("Downloading config file...")
|
| 398 |
+
try:
|
| 399 |
+
config_path = hf_hub_download(
|
| 400 |
+
repo_id="hexgrad/Kokoro-82M",
|
| 401 |
+
filename="config.json",
|
| 402 |
+
local_dir=".",
|
| 403 |
+
force_download=True,
|
| 404 |
+
revision=repo_version
|
| 405 |
+
)
|
| 406 |
+
print(f"Config downloaded to {config_path}")
|
| 407 |
+
except Exception as e:
|
| 408 |
+
print(f"Error downloading config: {e}")
|
| 409 |
+
raise ValueError(f"Could not download config: {e}") from e
|
| 410 |
+
|
| 411 |
+
# Download voice files - require at least one voice
|
| 412 |
+
try:
|
| 413 |
+
downloaded_voices = download_voice_files(repo_version=repo_version, required_count=1)
|
| 414 |
+
except ValueError as e:
|
| 415 |
+
print(f"Error: Voice files download failed: {e}")
|
| 416 |
+
raise ValueError("Voice files download failed") from e
|
| 417 |
+
|
| 418 |
+
# Validate language code
|
| 419 |
+
lang_code = 'a' # Default to 'a' for American English
|
| 420 |
+
supported_codes = list(LANGUAGE_CODES.keys())
|
| 421 |
+
if lang_code not in supported_codes:
|
| 422 |
+
print(f"Warning: Unsupported language code '{lang_code}'. Using 'a' (American English).")
|
| 423 |
+
print(f"Supported language codes: {', '.join(supported_codes)}")
|
| 424 |
+
lang_code = 'a'
|
| 425 |
+
|
| 426 |
+
# Initialize pipeline with validated language code
|
| 427 |
+
pipeline_instance = KPipeline(lang_code=lang_code)
|
| 428 |
+
if pipeline_instance is None:
|
| 429 |
+
raise ValueError("Failed to initialize KPipeline - pipeline is None")
|
| 430 |
+
|
| 431 |
+
# Store device parameter for reference in other operations
|
| 432 |
+
pipeline_instance.device = device
|
| 433 |
+
|
| 434 |
+
# Initialize voices dictionary if it doesn't exist
|
| 435 |
+
if not hasattr(pipeline_instance, 'voices'):
|
| 436 |
+
pipeline_instance.voices = {}
|
| 437 |
+
|
| 438 |
+
# Try to load the first available voice with improved error handling
|
| 439 |
+
voice_loaded = False
|
| 440 |
+
for voice_file in downloaded_voices:
|
| 441 |
+
voice_path = os.path.abspath(os.path.join("voices", voice_file))
|
| 442 |
+
if os.path.exists(voice_path):
|
| 443 |
+
try:
|
| 444 |
+
pipeline_instance.load_voice(voice_path)
|
| 445 |
+
print(f"Successfully loaded voice: {voice_file}")
|
| 446 |
+
voice_loaded = True
|
| 447 |
+
break # Successfully loaded a voice
|
| 448 |
+
except Exception as e:
|
| 449 |
+
print(f"Warning: Failed to load voice {voice_file}: {e}")
|
| 450 |
+
continue
|
| 451 |
+
|
| 452 |
+
if not voice_loaded:
|
| 453 |
+
print("Warning: Could not load any voice models")
|
| 454 |
+
|
| 455 |
+
# Set the global _pipeline only after successful initialization
|
| 456 |
+
_pipeline = pipeline_instance
|
| 457 |
+
|
| 458 |
+
except Exception as e:
|
| 459 |
+
print(f"Error initializing pipeline: {e}")
|
| 460 |
+
# Restore original json.load on error
|
| 461 |
+
restore_json_load()
|
| 462 |
+
raise
|
| 463 |
+
|
| 464 |
+
return _pipeline
|
| 465 |
+
|
| 466 |
+
def list_available_voices() -> List[str]:
|
| 467 |
+
"""List all available voice models"""
|
| 468 |
+
# Always use absolute path for consistency
|
| 469 |
+
voices_dir = Path(os.path.abspath("voices"))
|
| 470 |
+
|
| 471 |
+
# Create voices directory if it doesn't exist
|
| 472 |
+
if not voices_dir.exists():
|
| 473 |
+
print(f"Creating voices directory at {voices_dir}")
|
| 474 |
+
voices_dir.mkdir(exist_ok=True)
|
| 475 |
+
return []
|
| 476 |
+
|
| 477 |
+
# Get all .pt files in the voices directory
|
| 478 |
+
voice_files = list(voices_dir.glob("*.pt"))
|
| 479 |
+
|
| 480 |
+
# If we found voice files, return them
|
| 481 |
+
if voice_files:
|
| 482 |
+
return [f.stem for f in sorted(voice_files, key=lambda f: f.stem.lower())]
|
| 483 |
+
|
| 484 |
+
# If no voice files in standard location, check if we need to do a one-time migration
|
| 485 |
+
# This is legacy support for older installations
|
| 486 |
+
alt_voices_path = Path(".") / "voices"
|
| 487 |
+
if alt_voices_path.exists() and alt_voices_path.is_dir() and alt_voices_path != voices_dir:
|
| 488 |
+
print(f"Checking alternative voice location: {alt_voices_path.absolute()}")
|
| 489 |
+
alt_voice_files = list(alt_voices_path.glob("*.pt"))
|
| 490 |
+
|
| 491 |
+
if alt_voice_files:
|
| 492 |
+
print(f"Found {len(alt_voice_files)} voice files in alternate location")
|
| 493 |
+
print("Moving files to the standard voices directory...")
|
| 494 |
+
|
| 495 |
+
# Process files in a batch for efficiency
|
| 496 |
+
files_moved = 0
|
| 497 |
+
for voice_file in alt_voice_files:
|
| 498 |
+
target_path = voices_dir / voice_file.name
|
| 499 |
+
if not target_path.exists():
|
| 500 |
+
try:
|
| 501 |
+
# Use copy2 to preserve metadata, then remove original if successful
|
| 502 |
+
shutil.copy2(str(voice_file), str(target_path))
|
| 503 |
+
files_moved += 1
|
| 504 |
+
except (OSError, IOError) as e:
|
| 505 |
+
print(f"Error copying {voice_file.name}: {e}")
|
| 506 |
+
|
| 507 |
+
if files_moved > 0:
|
| 508 |
+
print(f"Successfully moved {files_moved} voice files")
|
| 509 |
+
return [f.stem for f in sorted(voices_dir.glob("*.pt"), key=lambda f: f.stem.lower())]
|
| 510 |
+
|
| 511 |
+
print("No voice files found. Please run the application again to download voices.")
|
| 512 |
+
return []
|
| 513 |
+
|
| 514 |
+
def get_language_code_from_voice(voice_name: str) -> str:
|
| 515 |
+
"""Get the appropriate language code from a voice name
|
| 516 |
+
|
| 517 |
+
Args:
|
| 518 |
+
voice_name: Name of the voice (e.g., 'af_bella', 'jf_alpha')
|
| 519 |
+
|
| 520 |
+
Returns:
|
| 521 |
+
Language code for the voice
|
| 522 |
+
"""
|
| 523 |
+
# Extract prefix from voice name
|
| 524 |
+
prefix = voice_name[:2] if len(voice_name) >= 2 else 'af'
|
| 525 |
+
|
| 526 |
+
# Map voice prefixes to language codes
|
| 527 |
+
prefix_to_lang = {
|
| 528 |
+
'af': 'a', 'am': 'a', # American English
|
| 529 |
+
'bf': 'b', 'bm': 'b', # British English
|
| 530 |
+
'jf': 'j', 'jm': 'j', # Japanese
|
| 531 |
+
'zf': 'z', 'zm': 'z', # Mandarin Chinese
|
| 532 |
+
'ef': 'e', 'em': 'e', # Spanish
|
| 533 |
+
'ff': 'f', 'fm': 'f', # French
|
| 534 |
+
'hf': 'h', 'hm': 'h', # Hindi
|
| 535 |
+
'if': 'i', 'im': 'i', # Italian
|
| 536 |
+
'pf': 'p', 'pm': 'p', # Brazilian Portuguese
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
return prefix_to_lang.get(prefix, 'a') # Default to American English
|
| 540 |
+
|
| 541 |
+
def load_voice(voice_name: str, device: str) -> torch.Tensor:
|
| 542 |
+
"""Load a voice model in a thread-safe manner
|
| 543 |
+
|
| 544 |
+
Args:
|
| 545 |
+
voice_name: Name of the voice to load (with or without .pt extension)
|
| 546 |
+
device: Device to use ('cuda' or 'cpu')
|
| 547 |
+
|
| 548 |
+
Returns:
|
| 549 |
+
Loaded voice model tensor
|
| 550 |
+
|
| 551 |
+
Raises:
|
| 552 |
+
ValueError: If voice file not found or loading fails
|
| 553 |
+
"""
|
| 554 |
+
pipeline = build_model(None, device)
|
| 555 |
+
|
| 556 |
+
# Format voice path correctly - strip .pt if it was included
|
| 557 |
+
voice_name = voice_name.replace('.pt', '')
|
| 558 |
+
voice_path = os.path.abspath(os.path.join("voices", f"{voice_name}.pt"))
|
| 559 |
+
|
| 560 |
+
if not os.path.exists(voice_path):
|
| 561 |
+
raise ValueError(f"Voice file not found: {voice_path}")
|
| 562 |
+
|
| 563 |
+
# Use a lock to ensure thread safety when loading voices
|
| 564 |
+
with _pipeline_lock:
|
| 565 |
+
# Check if voice is already loaded
|
| 566 |
+
if hasattr(pipeline, 'voices') and voice_name in pipeline.voices:
|
| 567 |
+
return pipeline.voices[voice_name]
|
| 568 |
+
|
| 569 |
+
# Load voice if not already loaded
|
| 570 |
+
return pipeline.load_voice(voice_path)
|
| 571 |
+
|
| 572 |
+
def generate_speech(
|
| 573 |
+
model: KPipeline,
|
| 574 |
+
text: str,
|
| 575 |
+
voice: str,
|
| 576 |
+
lang: str = 'a',
|
| 577 |
+
device: str = 'cpu',
|
| 578 |
+
speed: float = 1.0
|
| 579 |
+
) -> Tuple[Optional[torch.Tensor], Optional[str]]:
|
| 580 |
+
"""Generate speech using the Kokoro pipeline in a thread-safe manner
|
| 581 |
+
|
| 582 |
+
Args:
|
| 583 |
+
model: KPipeline instance
|
| 584 |
+
text: Text to synthesize
|
| 585 |
+
voice: Voice name (e.g. 'af_bella')
|
| 586 |
+
lang: Language code ('a' for American English, 'b' for British English)
|
| 587 |
+
device: Device to use ('cuda' or 'cpu')
|
| 588 |
+
speed: Speech speed multiplier (default: 1.0)
|
| 589 |
+
|
| 590 |
+
Returns:
|
| 591 |
+
Tuple of (audio tensor, phonemes string) or (None, None) on error
|
| 592 |
+
"""
|
| 593 |
+
global _pipeline_lock
|
| 594 |
+
|
| 595 |
+
try:
|
| 596 |
+
if model is None:
|
| 597 |
+
raise ValueError("Model is None - pipeline not properly initialized")
|
| 598 |
+
|
| 599 |
+
# Format voice name and path
|
| 600 |
+
voice_name = voice.replace('.pt', '')
|
| 601 |
+
voice_path = os.path.abspath(os.path.join("voices", f"{voice_name}.pt"))
|
| 602 |
+
|
| 603 |
+
# Check if voice file exists
|
| 604 |
+
if not os.path.exists(voice_path):
|
| 605 |
+
raise ValueError(f"Voice file not found: {voice_path}")
|
| 606 |
+
|
| 607 |
+
# Thread-safe initialization of model properties and voice loading
|
| 608 |
+
with _pipeline_lock:
|
| 609 |
+
# Initialize voices dictionary if it doesn't exist
|
| 610 |
+
if not hasattr(model, 'voices'):
|
| 611 |
+
model.voices = {}
|
| 612 |
+
|
| 613 |
+
# Ensure device is set
|
| 614 |
+
if not hasattr(model, 'device'):
|
| 615 |
+
model.device = device
|
| 616 |
+
|
| 617 |
+
# Ensure voice is loaded before generating
|
| 618 |
+
if voice_name not in model.voices:
|
| 619 |
+
print(f"Loading voice {voice_name}...")
|
| 620 |
+
try:
|
| 621 |
+
model.load_voice(voice_path)
|
| 622 |
+
if voice_name not in model.voices:
|
| 623 |
+
raise ValueError("Voice load succeeded but voice not in model.voices dictionary")
|
| 624 |
+
except Exception as e:
|
| 625 |
+
raise ValueError(f"Failed to load voice {voice_name}: {e}")
|
| 626 |
+
|
| 627 |
+
# Generate speech (outside the lock for better concurrency)
|
| 628 |
+
print(f"Generating speech with device: {model.device}")
|
| 629 |
+
generator = model(
|
| 630 |
+
text,
|
| 631 |
+
voice=voice_path,
|
| 632 |
+
speed=speed,
|
| 633 |
+
split_pattern=r'\n+'
|
| 634 |
+
)
|
| 635 |
+
|
| 636 |
+
# Get first generated segment and convert numpy array to tensor if needed
|
| 637 |
+
for gs, ps, audio in generator:
|
| 638 |
+
if audio is not None:
|
| 639 |
+
if isinstance(audio, np.ndarray):
|
| 640 |
+
audio = torch.from_numpy(audio).float()
|
| 641 |
+
return audio, ps
|
| 642 |
+
|
| 643 |
+
return None, None
|
| 644 |
+
except (ValueError, FileNotFoundError, RuntimeError, KeyError, AttributeError, TypeError) as e:
|
| 645 |
+
print(f"Error generating speech: {e}")
|
| 646 |
+
return None, None
|
| 647 |
+
except Exception as e:
|
| 648 |
+
print(f"Unexpected error during speech generation: {e}")
|
| 649 |
+
import traceback
|
| 650 |
+
traceback.print_exc()
|
| 651 |
+
return None, None
|
outputs/tts_20250608_125559.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad3317e6fef86203f76bc9bd0d47267575594a30d76af064a0427d868c8d04f3
|
| 3 |
+
size 4414125
|
outputs/tts_20250608_125559.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71e8cd896e8f7c5df4bbf721945771d44fe947375dab70ea3f34bc17f3c42017
|
| 3 |
+
size 10590044
|
outputs/tts_20250608_125703.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e950dc144d38fa862cf1306cfa602d7f1a3181d1ab9130dab192c82a2b76fe6e
|
| 3 |
+
size 4677165
|
outputs/tts_20250608_125703.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7a99e011575460595ef14ae32b698775b264da96ca8ba96a29d4f8139cc7ad3
|
| 3 |
+
size 11221244
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
kokoro>=0.9.2 # Official Kokoro TTS library (v1.0 model support)
|
| 2 |
+
misaki # G2P library for Kokoro (multi-language support)
|
| 3 |
+
torch>=2.0.0 # PyTorch for model inference (for GPU support, see README.md for CUDA-specific installation)
|
| 4 |
+
soundfile>=0.12.1 # Audio file handling
|
| 5 |
+
huggingface-hub>=0.16.0 # Model downloads from Hugging Face
|
| 6 |
+
gradio>=4.0.0 # Web interface
|
| 7 |
+
pydub>=0.25.1 # For audio format conversion
|
| 8 |
+
espeakng-loader>=0.1.0 # For loading espeak-ng library
|
| 9 |
+
phonemizer-fork>=3.2.1 # For phoneme generation
|
| 10 |
+
wheel>=0.38.0 # For building packages
|
| 11 |
+
setuptools>=65.0.0 # For installing packages
|
| 12 |
+
num2words>=0.5.12 # For number to word conversion
|
| 13 |
+
spacy>=3.4.0 # For text processing
|
speed_dial.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speed Dial Module for Kokoro-TTS-Local
|
| 3 |
+
--------------------------------------
|
| 4 |
+
Manages speed dial presets for quick access to frequently used voice and text combinations.
|
| 5 |
+
|
| 6 |
+
This module provides functions to:
|
| 7 |
+
- Load speed dial presets from a JSON file
|
| 8 |
+
- Save new presets to the JSON file
|
| 9 |
+
- Delete presets from the JSON file
|
| 10 |
+
- Validate preset data
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Dict, List, Optional, Any
|
| 17 |
+
|
| 18 |
+
# Define the path for the speed dial presets file
|
| 19 |
+
SPEED_DIAL_FILE = Path("speed_dial.json")
|
| 20 |
+
|
| 21 |
+
def load_presets() -> Dict[str, Dict[str, Any]]:
|
| 22 |
+
"""
|
| 23 |
+
Load speed dial presets from the JSON file.
|
| 24 |
+
|
| 25 |
+
Returns:
|
| 26 |
+
Dictionary of presets where keys are preset names and values are preset data
|
| 27 |
+
"""
|
| 28 |
+
if not SPEED_DIAL_FILE.exists():
|
| 29 |
+
# If file doesn't exist, return an empty dictionary
|
| 30 |
+
return {}
|
| 31 |
+
|
| 32 |
+
try:
|
| 33 |
+
with open(SPEED_DIAL_FILE, 'r', encoding='utf-8') as f:
|
| 34 |
+
presets = json.load(f)
|
| 35 |
+
|
| 36 |
+
# Validate the loaded presets
|
| 37 |
+
validated_presets = {}
|
| 38 |
+
for name, preset in presets.items():
|
| 39 |
+
if validate_preset(preset):
|
| 40 |
+
validated_presets[name] = preset
|
| 41 |
+
|
| 42 |
+
return validated_presets
|
| 43 |
+
except (json.JSONDecodeError, IOError) as e:
|
| 44 |
+
print(f"Error loading speed dial presets: {e}")
|
| 45 |
+
return {}
|
| 46 |
+
|
| 47 |
+
def save_preset(name: str, voice: str, text: str, format: str = "wav", speed: float = 1.0) -> bool:
|
| 48 |
+
"""
|
| 49 |
+
Save a new speed dial preset.
|
| 50 |
+
|
| 51 |
+
Args:
|
| 52 |
+
name: Name of the preset
|
| 53 |
+
voice: Voice to use
|
| 54 |
+
text: Text to convert to speech
|
| 55 |
+
format: Output format (default: "wav")
|
| 56 |
+
speed: Speech speed (default: 1.0)
|
| 57 |
+
|
| 58 |
+
Returns:
|
| 59 |
+
True if successful, False otherwise
|
| 60 |
+
"""
|
| 61 |
+
# Create preset data
|
| 62 |
+
preset = {
|
| 63 |
+
"voice": voice,
|
| 64 |
+
"text": text,
|
| 65 |
+
"format": format,
|
| 66 |
+
"speed": speed
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
# Validate preset data
|
| 70 |
+
if not validate_preset(preset):
|
| 71 |
+
return False
|
| 72 |
+
|
| 73 |
+
# Load existing presets
|
| 74 |
+
presets = load_presets()
|
| 75 |
+
|
| 76 |
+
# Add or update the preset
|
| 77 |
+
presets[name] = preset
|
| 78 |
+
|
| 79 |
+
# Save presets to file
|
| 80 |
+
try:
|
| 81 |
+
with open(SPEED_DIAL_FILE, 'w', encoding='utf-8') as f:
|
| 82 |
+
json.dump(presets, f, indent=2, ensure_ascii=False)
|
| 83 |
+
return True
|
| 84 |
+
except IOError as e:
|
| 85 |
+
print(f"Error saving speed dial preset: {e}")
|
| 86 |
+
return False
|
| 87 |
+
|
| 88 |
+
def delete_preset(name: str) -> bool:
|
| 89 |
+
"""
|
| 90 |
+
Delete a speed dial preset.
|
| 91 |
+
|
| 92 |
+
Args:
|
| 93 |
+
name: Name of the preset to delete
|
| 94 |
+
|
| 95 |
+
Returns:
|
| 96 |
+
True if successful, False otherwise
|
| 97 |
+
"""
|
| 98 |
+
# Load existing presets
|
| 99 |
+
presets = load_presets()
|
| 100 |
+
|
| 101 |
+
# Check if preset exists
|
| 102 |
+
if name not in presets:
|
| 103 |
+
return False
|
| 104 |
+
|
| 105 |
+
# Remove the preset
|
| 106 |
+
del presets[name]
|
| 107 |
+
|
| 108 |
+
# Save presets to file
|
| 109 |
+
try:
|
| 110 |
+
with open(SPEED_DIAL_FILE, 'w', encoding='utf-8') as f:
|
| 111 |
+
json.dump(presets, f, indent=2, ensure_ascii=False)
|
| 112 |
+
return True
|
| 113 |
+
except IOError as e:
|
| 114 |
+
print(f"Error deleting speed dial preset: {e}")
|
| 115 |
+
return False
|
| 116 |
+
|
| 117 |
+
def validate_preset(preset: Dict[str, Any]) -> bool:
|
| 118 |
+
"""
|
| 119 |
+
Validate a preset's data structure.
|
| 120 |
+
|
| 121 |
+
Args:
|
| 122 |
+
preset: Preset data to validate
|
| 123 |
+
|
| 124 |
+
Returns:
|
| 125 |
+
True if valid, False otherwise
|
| 126 |
+
"""
|
| 127 |
+
# Check required fields
|
| 128 |
+
required_fields = ["voice", "text"]
|
| 129 |
+
for field in required_fields:
|
| 130 |
+
if field not in preset:
|
| 131 |
+
print(f"Preset missing required field: {field}")
|
| 132 |
+
return False
|
| 133 |
+
|
| 134 |
+
# Check field types
|
| 135 |
+
if not isinstance(preset.get("voice"), str):
|
| 136 |
+
print("Preset voice must be a string")
|
| 137 |
+
return False
|
| 138 |
+
|
| 139 |
+
if not isinstance(preset.get("text"), str):
|
| 140 |
+
print("Preset text must be a string")
|
| 141 |
+
return False
|
| 142 |
+
|
| 143 |
+
# Optional fields with defaults
|
| 144 |
+
if "format" not in preset:
|
| 145 |
+
preset["format"] = "wav"
|
| 146 |
+
elif not isinstance(preset["format"], str):
|
| 147 |
+
print("Preset format must be a string")
|
| 148 |
+
return False
|
| 149 |
+
|
| 150 |
+
if "speed" not in preset:
|
| 151 |
+
preset["speed"] = 1.0
|
| 152 |
+
elif not isinstance(preset["speed"], (int, float)):
|
| 153 |
+
print("Preset speed must be a number")
|
| 154 |
+
return False
|
| 155 |
+
|
| 156 |
+
return True
|
| 157 |
+
|
| 158 |
+
def get_preset_names() -> List[str]:
|
| 159 |
+
"""
|
| 160 |
+
Get a list of all preset names.
|
| 161 |
+
|
| 162 |
+
Returns:
|
| 163 |
+
List of preset names
|
| 164 |
+
"""
|
| 165 |
+
presets = load_presets()
|
| 166 |
+
return list(presets.keys())
|
| 167 |
+
|
| 168 |
+
def get_preset(name: str) -> Optional[Dict[str, Any]]:
    """
    Get a specific preset by name.

    Args:
        name: Name of the preset to get

    Returns:
        Preset data or None if not found
    """
    # dict.get returns None for unknown names, matching the contract.
    return load_presets().get(name)
|
test.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Quick sanity check: report whether PyTorch can see a CUDA-capable GPU.
import torch
print(torch.cuda.is_available())
|
tts_demo.py
ADDED
|
@@ -0,0 +1,447 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from typing import Optional, Tuple, List, Union
|
| 3 |
+
from models import build_model, generate_speech, list_available_voices
|
| 4 |
+
from tqdm.auto import tqdm
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import numpy as np
|
| 8 |
+
import time
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
# Define path type for consistent handling
# Accepted anywhere a filesystem path is expected (str or pathlib.Path).
PathLike = Union[str, Path]
|
| 14 |
+
|
| 15 |
+
# Constants with validation
|
| 16 |
+
def validate_sample_rate(rate: int) -> int:
    """Validate sample rate is within acceptable range"""
    valid_rates = [16000, 22050, 24000, 44100, 48000]
    # Accept a known-good rate unchanged; otherwise warn and fall back.
    if rate in valid_rates:
        return rate
    print(f"Warning: Unusual sample rate {rate}. Valid rates are {valid_rates}")
    return 24000  # Default to safe value
|
| 23 |
+
|
| 24 |
+
def validate_language(lang: str) -> str:
    """Validate language code"""
    # Import here to avoid circular imports
    from models import LANGUAGE_CODES

    valid_langs = list(LANGUAGE_CODES.keys())
    # Accept a supported code unchanged; otherwise warn and fall back.
    if lang in valid_langs:
        return lang
    print(f"Warning: Invalid language code '{lang}'. Using 'a' (American English).")
    print(f"Supported language codes: {', '.join(valid_langs)}")
    return 'a'  # Default to American English
|
| 34 |
+
|
| 35 |
+
# Define and validate constants
SAMPLE_RATE = validate_sample_rate(24000)  # Hz; checked against the known-good rates
DEFAULT_MODEL_PATH = Path('kokoro-v1_0.pth').absolute()
DEFAULT_OUTPUT_FILE = Path('output.wav').absolute()
DEFAULT_LANGUAGE = validate_language('a')  # 'a' for American English, 'b' for British English
DEFAULT_TEXT = "Hello, welcome to this text-to-speech test."

# Ensure output directory exists
DEFAULT_OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

# Configure tqdm for better Windows console support
# (0 disables tqdm's background monitor thread)
tqdm.monitor_interval = 0
|
| 47 |
+
|
| 48 |
+
def print_menu():
    """Print the main menu options and return the user's raw choice."""
    menu_lines = (
        "\n=== Kokoro TTS Menu ===",
        "1. List available voices",
        "2. Generate speech",
        "3. Exit",
    )
    for line in menu_lines:
        print(line)
    return input("Select an option (1-3): ").strip()
|
| 55 |
+
|
| 56 |
+
def select_voice(voices: List[str]) -> str:
    """Interactive voice selection."""
    print("\nAvailable voices:")
    for index, name in enumerate(voices, 1):
        print(f"{index}. {name}")

    # Loop until we get an empty input (default) or a valid 1-based index.
    while True:
        raw = input("\nSelect a voice number (or press Enter for default 'af_bella'): ").strip()
        if not raw:
            return "af_bella"
        try:
            selection = int(raw)
        except ValueError:
            print("Please enter a valid number.")
            continue
        if 1 <= selection <= len(voices):
            return voices[selection - 1]
        print("Invalid choice. Please try again.")
|
| 73 |
+
|
| 74 |
+
def get_text_input() -> str:
    """Get text input from user."""
    print("\nEnter the text you want to convert to speech")
    print("(or press Enter for default text)")
    entered = input("> ").strip()
    # Empty input means "use the built-in sample sentence".
    if entered:
        return entered
    return DEFAULT_TEXT
|
| 80 |
+
|
| 81 |
+
def get_speed() -> float:
    """Get speech speed from user."""
    # Loop until we get an empty input (default 1.0) or a value in range.
    while True:
        raw = input("\nEnter speech speed (0.5-2.0, default 1.0): ").strip()
        if not raw:
            return 1.0
        try:
            value = float(raw)
        except ValueError:
            print("Please enter a valid number.")
            continue
        if 0.5 <= value <= 2.0:
            return value
        print("Speed must be between 0.5 and 2.0")
|
| 94 |
+
|
| 95 |
+
def save_audio_with_retry(audio_data: np.ndarray, sample_rate: int, output_path: PathLike, max_retries: int = 3, retry_delay: float = 1.0) -> bool:
    """
    Attempt to save audio data to file with retry logic.

    Writes to a temporary sibling file first, then renames it over the
    target, retrying on IOError/PermissionError (e.g. when a media
    player holds the output file open on Windows).

    Args:
        audio_data: Audio data as numpy array
        sample_rate: Sample rate in Hz
        output_path: Path to save the audio file
        max_retries: Maximum number of retry attempts
        retry_delay: Delay between retries in seconds

    Returns:
        True if successful, False otherwise
    """
    # Convert and normalize path to Path object
    output_path = Path(output_path).absolute()

    # Create parent directory if it doesn't exist
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Try to remove the file if it exists to avoid "file in use" issues
    try:
        if output_path.exists():
            print(f"Removing existing file: {output_path}")
            output_path.unlink()
    except Exception as e:
        # Best-effort: a locked file is reported but we still try to save below.
        print(f"Warning: Could not remove existing file: {e}")
        print("This might indicate the file is in use by another program.")

    for attempt in range(max_retries):
        try:
            # Validate audio data before saving
            if audio_data is None or len(audio_data) == 0:
                raise ValueError("Empty audio data")

            # Check write permissions for the directory
            if not os.access(str(output_path.parent), os.W_OK):
                raise PermissionError(f"No write permission for directory: {output_path.parent}")

            # Try to use a temporary file first, then rename it
            temp_path = output_path.with_name(f"temp_{output_path.name}")

            # Save audio file to temporary location
            print(f"Saving audio to temporary file: {temp_path}")
            sf.write(str(temp_path), audio_data, sample_rate)

            # If successful, rename to final location
            # NOTE(review): unlink-then-rename is not atomic — there is a brief
            # window with no output file; os.replace() would be atomic. TODO confirm.
            if temp_path.exists():
                # Remove target file if it exists
                if output_path.exists():
                    output_path.unlink()
                # Rename temp file to target file
                temp_path.rename(output_path)
                print(f"Successfully renamed temporary file to: {output_path}")

            return True

        except (IOError, PermissionError) as e:
            # Expected failure mode: file locked by another process. Retry.
            if attempt < max_retries - 1:
                print(f"\nFailed to save audio (attempt {attempt + 1}/{max_retries}): {e}")
                print("The output file might be in use by another program (e.g., media player).")
                print(f"Please close any programs that might be using '{output_path}'")
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"\nError: Could not save audio after {max_retries} attempts: {e}")
                print(f"Please ensure '{output_path}' is not open in any other program and try again.")
                print(f"You might need to restart your computer if the file remains locked.")
                return False
        except Exception as e:
            # Unexpected failures also get the retry treatment.
            print(f"\nUnexpected error saving audio: {type(e).__name__}: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                return False
        finally:
            # Clean up temp file if it exists and we failed
            # (on success the temp file was already renamed away, so this is a no-op)
            try:
                temp_path = output_path.with_name(f"temp_{output_path.name}")
                if temp_path.exists():
                    temp_path.unlink()
            except Exception:
                pass

    # NOTE(review): only reachable if the loop exits without returning,
    # which the branches above should prevent; kept as a safety net.
    return False
|
| 181 |
+
|
| 182 |
+
def main() -> None:
    """Interactive Kokoro TTS console: build the model, then loop on a
    menu (list voices / generate speech / exit), saving generated audio
    to DEFAULT_OUTPUT_FILE. All resources are released in the finally
    block, even on error."""
    try:
        # Set up device safely
        try:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        except (RuntimeError, AttributeError, ImportError) as e:
            print(f"CUDA initialization error: {e}. Using CPU instead.")
            device = 'cpu'  # Fallback if CUDA check fails
        print(f"Using device: {device}")

        # Build model
        print("\nInitializing model...")
        with tqdm(total=1, desc="Building model") as pbar:
            model = build_model(DEFAULT_MODEL_PATH, device)
            pbar.update(1)

        # Cache for voices to avoid redundant calls
        voices_cache = None

        while True:
            choice = print_menu()

            if choice == "1":
                # List voices
                voices_cache = list_available_voices()
                print("\nAvailable voices:")
                for voice in voices_cache:
                    print(f"- {voice}")

            elif choice == "2":
                # Generate speech
                # Use cached voices if available
                if voices_cache is None:
                    voices_cache = list_available_voices()

                if not voices_cache:
                    print("No voices found! Please check the voices directory.")
                    continue

                # Get user inputs
                voice = select_voice(voices_cache)
                text = get_text_input()

                # Validate text (don't allow extremely long inputs)
                if len(text) > 10000:  # Reasonable limit for text length
                    print("Text is too long. Please enter a shorter text.")
                    continue

                speed = get_speed()

                print(f"\nGenerating speech for: '{text}'")
                print(f"Using voice: {voice}")
                print(f"Speed: {speed}x")

                # Generate speech
                all_audio = []
                # Use Path object for consistent path handling
                voice_path = Path("voices").absolute() / f"{voice}.pt"

                # Verify voice file exists
                if not voice_path.exists():
                    print(f"Error: Voice file not found: {voice_path}")
                    continue

                # Set a timeout for generation with per-segment timeout
                max_gen_time = 300  # 5 minutes max total
                max_segment_time = 60  # 60 seconds max per segment
                start_time = time.time()
                segment_start_time = start_time

                try:
                    # Setup watchdog timer for overall process
                    import threading
                    generation_complete = False

                    # NOTE(review): the watchdog only prints a warning — it
                    # cannot actually interrupt the generator loop below.
                    def watchdog_timer():
                        if not generation_complete:
                            print("\nWatchdog: Generation taking too long, process will be cancelled")
                            # Can't directly interrupt generator, but this will inform user

                    # Start watchdog timer
                    watchdog = threading.Timer(max_gen_time, watchdog_timer)
                    watchdog.daemon = True  # Don't prevent program exit
                    watchdog.start()

                    # Initialize generator
                    try:
                        generator = model(text, voice=voice_path, speed=speed, split_pattern=r'\n+')
                    except (ValueError, TypeError, RuntimeError) as e:
                        print(f"Error initializing speech generator: {e}")
                        watchdog.cancel()
                        continue
                    except Exception as e:
                        print(f"Unexpected error initializing generator: {type(e).__name__}: {e}")
                        watchdog.cancel()
                        continue

                    # Process segments
                    # (generator yields (grapheme segment, phonemes, audio) triples)
                    with tqdm(desc="Generating speech") as pbar:
                        for gs, ps, audio in generator:
                            # Check overall timeout
                            current_time = time.time()
                            if current_time - start_time > max_gen_time:
                                print("\nWarning: Total generation time exceeded limit, stopping")
                                break

                            # Check per-segment timeout
                            segment_elapsed = current_time - segment_start_time
                            if segment_elapsed > max_segment_time:
                                print(f"\nWarning: Segment took too long ({segment_elapsed:.1f}s), stopping")
                                break

                            # Reset segment timer
                            segment_start_time = current_time

                            # Process audio if available
                            if audio is not None:
                                # Only convert if it's a numpy array, not if already tensor
                                audio_tensor = audio if isinstance(audio, torch.Tensor) else torch.from_numpy(audio).float()

                                all_audio.append(audio_tensor)
                                print(f"\nGenerated segment: {gs}")
                                if ps:  # Only print phonemes if available
                                    print(f"Phonemes: {ps}")
                                pbar.update(1)

                    # Mark generation as complete (for watchdog)
                    generation_complete = True
                    watchdog.cancel()

                except ValueError as e:
                    print(f"Value error during speech generation: {e}")
                except RuntimeError as e:
                    print(f"Runtime error during speech generation: {e}")
                    # If CUDA out of memory, provide more helpful message
                    if "CUDA out of memory" in str(e):
                        print("CUDA out of memory error - try using a shorter text or switching to CPU")
                except KeyError as e:
                    print(f"Key error during speech generation: {e}")
                    print("This might be caused by a missing voice configuration")
                except FileNotFoundError as e:
                    print(f"File not found: {e}")
                except Exception as e:
                    print(f"Unexpected error during speech generation: {type(e).__name__}: {e}")
                    import traceback
                    traceback.print_exc()

                # Save audio
                if all_audio:
                    try:
                        # Handle single segment case without concatenation
                        if len(all_audio) == 1:
                            final_audio = all_audio[0]
                        else:
                            try:
                                final_audio = torch.cat(all_audio, dim=0)
                            except RuntimeError as e:
                                print(f"Error concatenating audio segments: {e}")
                                continue

                        # Use consistent Path object
                        output_path = Path(DEFAULT_OUTPUT_FILE)
                        if save_audio_with_retry(final_audio.numpy(), SAMPLE_RATE, output_path):
                            print(f"\nAudio saved to {output_path}")
                            # Play a system beep to indicate completion
                            try:
                                print('\a')  # ASCII bell - should make a sound on most systems
                            except:
                                pass
                        else:
                            print("Failed to save audio file")
                    except Exception as e:
                        print(f"Error processing audio: {type(e).__name__}: {e}")
                else:
                    print("Error: Failed to generate audio")

            elif choice == "3":
                print("\nGoodbye!")
                break

            else:
                print("\nInvalid choice. Please try again.")

    except Exception as e:
        print(f"Error in main: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Comprehensive cleanup with error handling
        try:
            print("\nPerforming cleanup...")

            # Ensure model is properly released
            # ('model' may be unbound if build_model raised, hence the locals() check)
            if 'model' in locals() and model is not None:
                print("Cleaning up model resources...")
                # First clear any references to voice models
                if hasattr(model, 'voices'):
                    try:
                        voices_count = len(model.voices)
                        model.voices.clear()
                        print(f"Cleared {voices_count} voice references")
                    except Exception as voice_error:
                        print(f"Error clearing voice references: {voice_error}")

                # Clear any other model attributes that might hold references
                try:
                    for attr in list(model.__dict__.keys()):
                        if hasattr(model, attr) and not attr.startswith('__'):
                            try:
                                delattr(model, attr)
                            except:
                                pass
                except Exception as attr_error:
                    print(f"Error clearing model attributes: {attr_error}")

                # Then delete the model
                # NOTE(review): 'model = None' right after 'del model' re-binds the
                # name, making the del a no-op in effect; harmless but redundant.
                try:
                    del model
                    model = None
                    print("Model reference deleted")
                except Exception as del_error:
                    print(f"Error deleting model: {del_error}")

            # Clean up voice cache
            if 'voices_cache' in locals() and voices_cache is not None:
                try:
                    voices_cache.clear()
                    voices_cache = None
                    print("Voice cache cleared")
                except Exception as cache_error:
                    print(f"Error clearing voice cache: {cache_error}")

            # Clean up any CUDA resources
            if torch.cuda.is_available():
                try:
                    print("Cleaning up CUDA resources...")
                    torch.cuda.empty_cache()
                    print("CUDA cache emptied")
                except Exception as cuda_error:
                    print(f"Error clearing CUDA cache: {cuda_error}")

            # Make sure patched functions are restored
            try:
                from models import _cleanup_monkey_patches
                _cleanup_monkey_patches()
                print("Monkey patches restored")
            except Exception as patch_error:
                print(f"Error restoring monkey patches: {patch_error}")

            # Final garbage collection
            try:
                import gc
                gc.collect()
                print("Garbage collection completed")
            except Exception as gc_error:
                print(f"Error during garbage collection: {gc_error}")

            print("Cleanup completed")

        except Exception as e:
            print(f"Error during cleanup: {type(e).__name__}: {e}")
            import traceback
            traceback.print_exc()
|
| 445 |
+
|
| 446 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|