Spaces:

vyles
/

GPT-SoVITS-V2-NIIMI_SORA

Running

App Files Files Community

AkitoP committed on Sep 8, 2024

Commit

2372084

verified ·

1 Parent(s): adf6347

Update GPT_SoVITS/text/japanese.py

Browse files

Files changed (1) hide show

GPT_SoVITS/text/japanese.py +223 -227

GPT_SoVITS/text/japanese.py CHANGED Viewed

@@ -1,227 +1,223 @@
-# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
-import re
-import pyopenjtalk
-import os
-import hashlib
-from text.symbols2 import symbols
-current_file_path = os.path.dirname(__file__)
-def get_hash(fp: str) -> str:
-    hash_md5 = hashlib.md5()
-    with open(fp, "rb") as f:
-        for chunk in iter(lambda: f.read(4096), b""):
-            hash_md5.update(chunk)
-    return hash_md5.hexdigest()
-USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
-USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
-USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
-# 如果没有用户词典，就生成一个；如果有，就检查md5，如果不一样，就重新生成
-if os.path.exists(USERDIC_CSV_PATH):
-    print("userdict exists")
-    if not os.path.exists(USERDIC_BIN_PATH) or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r",encoding='utf-8').read():
-        pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
-        with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f:
-            f.write(get_hash(USERDIC_CSV_PATH))
-if os.path.exists(USERDIC_BIN_PATH):
-    pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
-from text.symbols import punctuation
-# Regular expression matching Japanese without punctuation marks:
-_japanese_characters = re.compile(
-    r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
-)
-# Regular expression matching non-Japanese characters or punctuation marks:
-_japanese_marks = re.compile(
-    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
-)
-# List of (symbol, Japanese) pairs for marks:
-_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("％", "パーセント")]]
-# List of (consonant, sokuon) pairs:
-_real_sokuon = [
-    (re.compile("%s" % x[0]), x[1])
-    for x in [
-        (r"Q([↑↓]*[kg])", r"k#\1"),
-        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
-        (r"Q([↑↓]*[sʃ])", r"s\1"),
-        (r"Q([↑↓]*[pb])", r"p#\1"),
-    ]
-]
-# List of (consonant, hatsuon) pairs:
-_real_hatsuon = [
-    (re.compile("%s" % x[0]), x[1])
-    for x in [
-        (r"N([↑↓]*[pbm])", r"m\1"),
-        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
-        (r"N([↑↓]*[tdn])", r"n\1"),
-        (r"N([↑↓]*[kg])", r"ŋ\1"),
-    ]
-]
-def post_replace_ph(ph):
-    rep_map = {
-        "：": ",",
-        "；": ",",
-        "，": ",",
-        "。": ".",
-        "！": "!",
-        "？": "?",
-        "\n": ".",
-        "·": ",",
-        "、": ",",
-        "...": "…",
-    }
-    if ph in rep_map.keys():
-        ph = rep_map[ph]
-    if ph in symbols:
-        return ph
-    if ph not in symbols:
-        ph = "UNK"
-    return ph
-def replace_consecutive_punctuation(text):
-    punctuations = ''.join(re.escape(p) for p in punctuation)
-    pattern = f'([{punctuations}])([{punctuations}])+'
-    result = re.sub(pattern, r'\1', text)
-    return result
-def symbols_to_japanese(text):
-    for regex, replacement in _symbols_to_japanese:
-        text = re.sub(regex, replacement, text)
-    return text
-def preprocess_jap(text, with_prosody=False):
-    """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
-    text = symbols_to_japanese(text)
-    sentences = re.split(_japanese_marks, text)
-    marks = re.findall(_japanese_marks, text)
-    text = []
-    for i, sentence in enumerate(sentences):
-        if re.match(_japanese_characters, sentence):
-            if with_prosody:
-                text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
-            else:
-                p = pyopenjtalk.g2p(sentence)
-                text += p.split(" ")
-        if i < len(marks):
-            if marks[i] == " ":# 防止意外的UNK
-                continue
-            text += [marks[i].replace(" ", "")]
-    return text
-def text_normalize(text):
-    # todo: jap text normalize
-    # 避免重复标点引起的参考泄露
-    text = replace_consecutive_punctuation(text)
-    text = "".join([i.lower() for i in text])
-    return text
-# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
-def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
-    """Extract phoneme + prosody symbol sequence from input full-context labels.
-    The algorithm is based on `Prosodic features control by symbols as input of
-    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
-    Args:
-        text (str): Input text.
-        drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
-    Returns:
-        List[str]: List of phoneme + prosody symbols.
-    Examples:
-        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
-        >>> pyopenjtalk_g2p_prosody("こんにちは。")
-        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
-    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
-        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
-    """
-    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
-    N = len(labels)
-    phones = []
-    for n in range(N):
-        lab_curr = labels[n]
-        # current phoneme
-        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
-        # deal unvoiced vowels as normal vowels
-        if drop_unvoiced_vowels and p3 in "AEIOU":
-            p3 = p3.lower()
-        # deal with sil at the beginning and the end of text
-        if p3 == "sil":
-            assert n == 0 or n == N - 1
-            if n == 0:
-                phones.append("^")
-            elif n == N - 1:
-                # check question form or not
-                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
-                if e3 == 0:
-                    phones.append("$")
-                elif e3 == 1:
-                    phones.append("?")
-            continue
-        elif p3 == "pau":
-            phones.append("_")
-            continue
-        else:
-            phones.append(p3)
-        # accent type and position info (forward or backward)
-        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
-        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
-        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
-        # number of mora in accent phrase
-        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
-        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
-        # accent phrase border
-        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
-            phones.append("#")
-        # pitch falling
-        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
-            phones.append("]")
-        # pitch rising
-        elif a2 == 1 and a2_next == 2:
-            phones.append("[")
-    return phones
-# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
-def _numeric_feature_by_regex(regex, s):
-    match = re.search(regex, s)
-    if match is None:
-        return -50
-    return int(match.group(1))
-def g2p(norm_text, with_prosody=True):
-    norm_text = text_normalize(norm_text)
-    phones = preprocess_jap(norm_text, with_prosody)
-    phones = [post_replace_ph(i) for i in phones]
-    # todo: implement tones and word2ph
-    return phones
-if __name__ == "__main__":
-    phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね！")
-    print(phones)

+# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
+import re
+import pyopenjtalk
+import os
+import hashlib
+from text.symbols2 import symbols
+current_file_path = os.path.dirname(__file__)
+def get_hash(fp: str) -> str:
+    hash_md5 = hashlib.md5()
+    with open(fp, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
+USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
+USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
+# Load the prebuilt user dictionary if it exists; a failure to load is non-fatal and only prints a message.
+try:
+    if os.path.exists(USERDIC_BIN_PATH):
+        pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
+except:
+    print("FAIL TO USE USERDICT")
+from text.symbols import punctuation
+# Regular expression matching Japanese without punctuation marks:
+_japanese_characters = re.compile(
+    r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
+)
+# Regular expression matching non-Japanese characters or punctuation marks:
+_japanese_marks = re.compile(
+    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
+)
+# List of (symbol, Japanese) pairs for marks:
+_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("％", "パーセント")]]
+# List of (consonant, sokuon) pairs:
+_real_sokuon = [
+    (re.compile("%s" % x[0]), x[1])
+    for x in [
+        (r"Q([↑↓]*[kg])", r"k#\1"),
+        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
+        (r"Q([↑↓]*[sʃ])", r"s\1"),
+        (r"Q([↑↓]*[pb])", r"p#\1"),
+    ]
+]
+# List of (consonant, hatsuon) pairs:
+_real_hatsuon = [
+    (re.compile("%s" % x[0]), x[1])
+    for x in [
+        (r"N([↑↓]*[pbm])", r"m\1"),
+        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
+        (r"N([↑↓]*[tdn])", r"n\1"),
+        (r"N([↑↓]*[kg])", r"ŋ\1"),
+    ]
+]
+def post_replace_ph(ph):
+    rep_map = {
+        "：": ",",
+        "；": ",",
+        "，": ",",
+        "。": ".",
+        "！": "!",
+        "？": "?",
+        "\n": ".",
+        "·": ",",
+        "、": ",",
+        "...": "…",
+    }
+    if ph in rep_map.keys():
+        ph = rep_map[ph]
+    if ph in symbols:
+        return ph
+    if ph not in symbols:
+        ph = "UNK"
+    return ph
+def replace_consecutive_punctuation(text):
+    punctuations = ''.join(re.escape(p) for p in punctuation)
+    pattern = f'([{punctuations}])([{punctuations}])+'
+    result = re.sub(pattern, r'\1', text)
+    return result
+def symbols_to_japanese(text):
+    for regex, replacement in _symbols_to_japanese:
+        text = re.sub(regex, replacement, text)
+    return text
+def preprocess_jap(text, with_prosody=False):
+    """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
+    text = symbols_to_japanese(text)
+    sentences = re.split(_japanese_marks, text)
+    marks = re.findall(_japanese_marks, text)
+    text = []
+    for i, sentence in enumerate(sentences):
+        if re.match(_japanese_characters, sentence):
+            if with_prosody:
+                text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
+            else:
+                p = pyopenjtalk.g2p(sentence)
+                text += p.split(" ")
+        if i < len(marks):
+            if marks[i] == " ":# 防止意外的UNK
+                continue
+            text += [marks[i].replace(" ", "")]
+    return text
+def text_normalize(text):
+    # todo: jap text normalize
+    # 避免重复标点引起的参考泄露
+    text = replace_consecutive_punctuation(text)
+    text = "".join([i.lower() for i in text])
+    return text
+# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
+    """Extract phoneme + prosody symbol sequence from input full-context labels.
+    The algorithm is based on `Prosodic features control by symbols as input of
+    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
+    Args:
+        text (str): Input text.
+        drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
+    Returns:
+        List[str]: List of phoneme + prosody symbols.
+    Examples:
+        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
+        >>> pyopenjtalk_g2p_prosody("こんにちは。")
+        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
+    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
+        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
+    """
+    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
+    N = len(labels)
+    phones = []
+    for n in range(N):
+        lab_curr = labels[n]
+        # current phoneme
+        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
+        # deal unvoiced vowels as normal vowels
+        if drop_unvoiced_vowels and p3 in "AEIOU":
+            p3 = p3.lower()
+        # deal with sil at the beginning and the end of text
+        if p3 == "sil":
+            assert n == 0 or n == N - 1
+            if n == 0:
+                phones.append("^")
+            elif n == N - 1:
+                # check question form or not
+                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
+                if e3 == 0:
+                    phones.append("$")
+                elif e3 == 1:
+                    phones.append("?")
+            continue
+        elif p3 == "pau":
+            phones.append("_")
+            continue
+        else:
+            phones.append(p3)
+        # accent type and position info (forward or backward)
+        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
+        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
+        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
+        # number of mora in accent phrase
+        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
+        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
+        # accent phrase border
+        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
+            phones.append("#")
+        # pitch falling
+        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
+            phones.append("]")
+        # pitch rising
+        elif a2 == 1 and a2_next == 2:
+            phones.append("[")
+    return phones
+# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+def _numeric_feature_by_regex(regex, s):
+    match = re.search(regex, s)
+    if match is None:
+        return -50
+    return int(match.group(1))
+def g2p(norm_text, with_prosody=True):
+    norm_text = text_normalize(norm_text)
+    phones = preprocess_jap(norm_text, with_prosody)
+    phones = [post_replace_ph(i) for i in phones]
+    # todo: implement tones and word2ph
+    return phones
+if __name__ == "__main__":
+    phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね！")
+    print(phones)