File size: 1,566 Bytes
0558aa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import gradio as gr
import numpy as np

from nemo.collections.tts.modules.magpietts_inference.utils import ModelLoadConfig, load_magpie_model

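'''
Setup notes:
  - If gradio is not already installed, run: pip install --no-cache-dir gradio
  - export PYTHONPATH=$PYTHONPATH:/workspace/NeMo
  - pip install kaldialign
  - pip install git+https://github.com/sarulab-speech/UTMOSv2.git@<tag>
    (the package spec after "sarulab-speech/" was garbled to "[email protected]"
    in this copy; UTMOSv2 is the presumed target -- confirm the repository
    and tag before installing)
  - Place this file in the root directory of NeMo.
'''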

# Path handed to ModelLoadConfig as `nemo_file` in setup_model(); edit this to
# point at the checkpoint available on the machine running the demo.
CHECKPOINT_PATH = "/checkpoints/results/ML_MagpieTTS/CE-Removed_GRPO_Magpie_TTS_ML_V1.nemo"
# Value handed to ModelLoadConfig as `codecmodel_path` in setup_model().
# NOTE(review): looks like a hosted model identifier rather than a local
# path -- confirm how the loader resolves it.
CODEC_MODEL_PATH = "nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps"



def setup_model():
    """Load the MagpieTTS model described by the module-level path constants.

    Returns:
        The model object produced by ``load_magpie_model``, after ``eval()``
        and ``cuda()`` have been called on it.
    """
    load_config = ModelLoadConfig(
        nemo_file=CHECKPOINT_PATH,
        codecmodel_path=CODEC_MODEL_PATH,
        legacy_codebooks=False,
        legacy_text_conditioning=False,
        hparams_from_wandb=None,
    )

    # The loader returns a pair; only its first element is needed here.
    tts_model, _ignored = load_magpie_model(load_config)

    # Switch to inference mode first, then move to the GPU.
    eval_model = tts_model.eval()
    eval_model.cuda()
    return tts_model


def main():
    """Load the MagpieTTS model and launch a Gradio text-to-speech demo.

    The UI exposes two text boxes (text to synthesize and language, with
    "en" pre-filled) and plays back the synthesized audio.
    """
    model = setup_model()
    # Same value the language text box is pre-filled with below.
    default_language = "en"

    def demo_tts(input_text, language):
        """Synthesize ``input_text`` and return ``(sample_rate, waveform)``.

        Args:
            input_text: Text entered by the user.
            language: Language value forwarded to ``model.do_tts``; a blank
                value falls back to the UI default.

        Returns:
            A ``(model.sample_rate, numpy_array)`` tuple, the form consumed by
            the "audio" output component.

        Raises:
            gr.Error: If ``input_text`` is empty or only whitespace.
        """
        if not input_text or not input_text.strip():
            # Show a readable message in the UI instead of handing an empty
            # prompt to the model.
            raise gr.Error("Please enter some text to synthesize.")

        # A cleared or whitespace-padded language box should not reach the
        # model verbatim; fall back to the value the UI starts with.
        lang = (language or "").strip() or default_language

        audio, audio_len = model.do_tts(input_text, language=lang, apply_TN=True)
        # Keep the first item and trim it to its reported length.
        # NOTE(review): assumes audio is (batch, samples) and audio_len is
        # per-item sample counts -- confirm against do_tts.
        # detach() is a no-op unless the tensor tracks gradients, in which
        # case .numpy() would otherwise raise.
        audio_np = audio[0, :audio_len[0]].detach().cpu().numpy()
        return model.sample_rate, audio_np

    demo = gr.Interface(
        fn=demo_tts,
        inputs=[
            gr.Textbox(label="Text to synthesize"),
            gr.Textbox(label="Language", value=default_language),
        ],
        outputs="audio",
        title="Text to Speech MagpieTTS Demo",
    )
    # NOTE(review): binding to 0.0.0.0 together with share=True makes the demo
    # reachable from other hosts and through a public Gradio link; keep this
    # only if that exposure is intended.
    demo.launch(server_name="0.0.0.0", server_port=6007, share=True)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()