ms180 committed on
Commit
7a3402f
·
1 Parent(s): 51968da

Move initialization process to init_per()

Browse files
Files changed (1) hide show
  1. evaluation/svs_eval.py +14 -11
evaluation/svs_eval.py CHANGED
@@ -4,16 +4,9 @@ import numpy as np
4
  import torch
5
  import uuid
6
  from pathlib import Path
7
- from transformers import pipeline
8
- import jiwer
9
 
10
  # ----------- Initialization -----------
11
 
12
- asr_pipeline = pipeline(
13
- "automatic-speech-recognition",
14
- model="openai/whisper-large-v3-turbo"
15
- )
16
-
17
  def init_singmos():
18
  print("[Init] Loading SingMOS...")
19
  return torch.hub.load(
@@ -29,7 +22,17 @@ def init_basic_pitch():
29
 
30
 
31
  def init_per():
32
- return None # TODO: implement PER evaluation
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  def init_audiobox_aesthetics():
@@ -103,15 +106,15 @@ def pypinyin_g2p_phone_without_prosody(text):
103
  return phones
104
 
105
 
106
- def eval_per(audio_path, reference_text, model=None):
107
  audio_array, sr = librosa.load(audio_path, sr=16000)
108
- asr_result = asr_pipeline(
109
  audio_array,
110
  generate_kwargs={"language": "mandarin"}
111
  )['text']
112
  hyp_pinyin = pypinyin_g2p_phone_without_prosody(asr_result)
113
  ref_pinyin = pypinyin_g2p_phone_without_prosody(reference_text)
114
- per = jiwer.wer(ref_pinyin, hyp_pinyin)
115
  return {"per": per}
116
 
117
 
 
4
  import torch
5
  import uuid
6
  from pathlib import Path
 
 
7
 
8
  # ----------- Initialization -----------
9
 
 
 
 
 
 
10
  def init_singmos():
11
  print("[Init] Loading SingMOS...")
12
  return torch.hub.load(
 
22
 
23
 
24
  def init_per():
25
+ print("[Init] Loading PER...")
26
+ from transformers import pipeline
27
+ import jiwer
28
+ asr_pipeline = pipeline(
29
+ "automatic-speech-recognition",
30
+ model="openai/whisper-large-v3-turbo"
31
+ )
32
+ return {
33
+ "asr_pipeline": asr_pipeline,
34
+ "jiwer": jiwer,
35
+ }
36
 
37
 
38
  def init_audiobox_aesthetics():
 
106
  return phones
107
 
108
 
109
def eval_per(audio_path, reference_text, evaluator=None):
    """Score a clip's phone error rate against its reference text.

    The audio is transcribed with the ASR pipeline; the transcript and the
    reference text are each converted with
    ``pypinyin_g2p_phone_without_prosody``; the two results are then compared
    with ``jiwer.wer``.

    Args:
        audio_path: Path of the audio file, loaded via ``librosa.load`` with
            ``sr=16000``.
        reference_text: Ground-truth text for the clip.
        evaluator: Mapping with the keys ``"asr_pipeline"`` and ``"jiwer"``,
            as returned by ``init_per()``. The ``None`` default is kept only
            for signature compatibility and is rejected.

    Returns:
        dict: ``{"per": <value returned by jiwer.wer>}``.

    Raises:
        TypeError: If ``evaluator`` is ``None``.
    """
    # Fail fast with a clear message instead of the opaque
    # "'NoneType' object is not subscriptable" that would otherwise surface
    # only after the audio file had already been loaded.
    if evaluator is None:
        raise TypeError(
            "eval_per() requires an evaluator; pass the dict returned by init_per()"
        )

    # The sample rate returned by librosa is not needed afterwards.
    audio_array, _ = librosa.load(audio_path, sr=16000)

    asr_result = evaluator["asr_pipeline"](
        audio_array,
        generate_kwargs={"language": "mandarin"},
    )["text"]

    hyp_pinyin = pypinyin_g2p_phone_without_prosody(asr_result)
    ref_pinyin = pypinyin_g2p_phone_without_prosody(reference_text)

    # NOTE(review): jiwer.wer interprets list inputs as lists of sentences;
    # confirm the g2p helper's return type yields the intended phone-level
    # comparison.
    per = evaluator["jiwer"].wer(ref_pinyin, hyp_pinyin)
    return {"per": per}
119
 
120