Spaces:
Sleeping
Sleeping
Merge branch 'SingingSDS:main' into main
Browse files- requirements.txt +1 -0
- server.py +4 -3
- test_performance.py +3 -3
- util.py +31 -8
requirements.txt
CHANGED
|
@@ -13,4 +13,5 @@ basic-pitch[onnx]
|
|
| 13 |
audiobox_aesthetics
|
| 14 |
transformers
|
| 15 |
s3prl
|
|
|
|
| 16 |
git+https://github.com/sea-turt1e/kanjiconv
|
|
|
|
| 13 |
audiobox_aesthetics
|
| 14 |
transformers
|
| 15 |
s3prl
|
| 16 |
+
zhconv
|
| 17 |
git+https://github.com/sea-turt1e/kanjiconv
|
server.py
CHANGED
|
@@ -12,6 +12,7 @@ import jiwer
|
|
| 12 |
import librosa
|
| 13 |
from svs_utils import load_song_database, estimate_sentence_length
|
| 14 |
from svs_eval import singmos_warmup, singmos_evaluation
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
asr_pipeline = pipeline(
|
|
@@ -144,13 +145,13 @@ def on_click_metrics():
|
|
| 144 |
# OWSM ctc + PER
|
| 145 |
y, sr = librosa.load("tmp/response.wav", sr=16000)
|
| 146 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
| 147 |
-
|
| 148 |
|
| 149 |
with open(f"tmp/llm.txt", "r") as f:
|
| 150 |
ref = f.read().replace(' ', '')
|
| 151 |
|
| 152 |
-
|
| 153 |
-
per = jiwer.wer(" ".join(
|
| 154 |
|
| 155 |
audio = librosa.load(f"tmp/response.wav", sr=sample_rate)[0]
|
| 156 |
singmos = singmos_evaluation(
|
|
|
|
| 12 |
import librosa
|
| 13 |
from svs_utils import load_song_database, estimate_sentence_length
|
| 14 |
from svs_eval import singmos_warmup, singmos_evaluation
|
| 15 |
+
from util import get_pinyin
|
| 16 |
|
| 17 |
|
| 18 |
asr_pipeline = pipeline(
|
|
|
|
| 145 |
# OWSM ctc + PER
|
| 146 |
y, sr = librosa.load("tmp/response.wav", sr=16000)
|
| 147 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
| 148 |
+
hyp_pinyin = get_pinyin(asr_result)
|
| 149 |
|
| 150 |
with open(f"tmp/llm.txt", "r") as f:
|
| 151 |
ref = f.read().replace(' ', '')
|
| 152 |
|
| 153 |
+
ref_pinyin = get_pinyin(ref)
|
| 154 |
+
per = jiwer.wer(" ".join(ref_pinyin), " ".join(hyp_pinyin))
|
| 155 |
|
| 156 |
audio = librosa.load(f"tmp/response.wav", sr=sample_rate)[0]
|
| 157 |
singmos = singmos_evaluation(
|
test_performance.py
CHANGED
|
@@ -118,10 +118,10 @@ def on_click_metrics(audio_path, ref):
|
|
| 118 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
| 119 |
|
| 120 |
# Espnet embeded g2p, but sometimes it will mispronunce polyphonic characters
|
| 121 |
-
|
| 122 |
|
| 123 |
-
|
| 124 |
-
per = jiwer.wer(
|
| 125 |
|
| 126 |
audio = librosa.load(audio_path, sr=22050)[0]
|
| 127 |
singmos = singmos_evaluation(
|
|
|
|
| 118 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
| 119 |
|
| 120 |
# Espnet embeded g2p, but sometimes it will mispronunce polyphonic characters
|
| 121 |
+
hyp_pinyin = pypinyin_g2p_phone_without_prosody(asr_result)
|
| 122 |
|
| 123 |
+
ref_pinyin = pypinyin_g2p_phone_without_prosody(ref)
|
| 124 |
+
per = jiwer.wer(ref_pinyin, hyp_pinyin)
|
| 125 |
|
| 126 |
audio = librosa.load(audio_path, sr=22050)[0]
|
| 127 |
singmos = singmos_evaluation(
|
util.py
CHANGED
|
@@ -5,7 +5,8 @@ from typing import List
|
|
| 5 |
import re
|
| 6 |
|
| 7 |
from resource.pinyin_dict import PINYIN_DICT
|
| 8 |
-
from pypinyin import
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def preprocess_input(src_str, seg_syb=" "):
|
|
@@ -77,14 +78,36 @@ def get_tokenizer(model, lang):
|
|
| 77 |
raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
|
| 78 |
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
def get_pinyin(texts):
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
text_list = []
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
else:
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return text_list
|
|
|
|
| 5 |
import re
|
| 6 |
|
| 7 |
from resource.pinyin_dict import PINYIN_DICT
|
| 8 |
+
from pypinyin import pinyin, Style
|
| 9 |
+
from zhconv import convert
|
| 10 |
|
| 11 |
|
| 12 |
def preprocess_input(src_str, seg_syb=" "):
|
|
|
|
| 78 |
raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now")
|
| 79 |
|
| 80 |
|
| 81 |
+
def is_chinese(char):
    """Return True when *char* sorts inside the CJK Unified Ideographs range.

    Uses lexicographic string comparison on purpose: a multi-character,
    non-Chinese block compares outside the range and yields False rather
    than raising (callers pass whole text runs, not only single chars).
    """
    return not (char < '\u4e00' or char > '\u9fff')
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def is_special(block):
    """Return True if *block* contains any notation token: '-', 'AP', or 'SP'.

    These tokens mark rests / aspiration / silence in the lyric notation and
    must be preserved separately from ordinary text.
    """
    for token in ('-', 'AP', 'SP'):
        if token in block:
            return True
    return False
|
| 87 |
+
|
| 88 |
+
|
| 89 |
def get_pinyin(texts):
    """Convert the Chinese characters in *texts* to pinyin syllables.

    The input is first normalized via ``preprocess_input`` (no segment
    separator), then split into blocks: each Chinese character becomes its
    own block, while every maximal run of non-Chinese text stays one block.
    Chinese blocks are mapped (after traditional-to-simplified conversion)
    to plain-style pinyin; non-Chinese blocks containing notation tokens
    ('-', 'AP', 'SP') are reduced to just those tokens; any other block is
    kept verbatim.  Returns a flat list in original order.
    """
    texts = preprocess_input(texts, seg_syb="")

    # One block per Chinese character; non-Chinese text stays as a single run.
    segment_re = re.compile(r'[\u4e00-\u9fff]|[^\u4e00-\u9fff]+')
    blocks = segment_re.findall(texts)

    # Collect the Chinese characters, normalize to simplified Chinese, and
    # look up one plain-style pinyin reading per character.
    simplified = convert(''.join(b for b in blocks if is_chinese(b)), 'zh-cn')
    readings = [entry[0] for entry in pinyin(simplified, style=Style.NORMAL)]

    special_re = re.compile(r"-|AP|SP")
    text_list = []
    next_reading = 0
    for block in blocks:
        if is_chinese(block):
            # Readings were produced in the same order the characters appear.
            text_list.append(readings[next_reading])
            next_reading += 1
        elif is_special(block):
            text_list.extend(special_re.findall(block))
        else:
            text_list.append(block)
    return text_list
|