remove audio fusion for first chunk
Browse files
- modeling_minicpmo.py +11 -3
modeling_minicpmo.py
CHANGED
|
@@ -1730,8 +1730,11 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|
| 1730 |
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
| 1731 |
|
| 1732 |
else:
|
| 1733 |
-
prev_wav = wav_np
|
| 1734 |
-
|
|
|
|
|
|
|
|
|
|
| 1735 |
if outputs.finished:
|
| 1736 |
logger.debug("Generation finished.")
|
| 1737 |
eos_lab = True
|
|
@@ -1828,6 +1831,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|
| 1828 |
prev_text_len = len(gen_text_raw)
|
| 1829 |
yield OmniOutput(text=cur_text, audio_wav=wav_y, sampling_rate=sr)
|
| 1830 |
else:
|
|
|
|
| 1831 |
prev_wav = wav_np
|
| 1832 |
else:
|
| 1833 |
# smooth wav
|
|
@@ -1839,7 +1843,11 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|
| 1839 |
prev_text_len = len(gen_text_raw)
|
| 1840 |
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
| 1841 |
else:
|
| 1842 |
-
prev_wav = wav_np
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1843 |
|
| 1844 |
if outputs.finished:
|
| 1845 |
logger.debug("Generation finished.")
|
|
|
|
| 1730 |
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
| 1731 |
|
| 1732 |
else:
|
| 1733 |
+
prev_wav = wav_np[-512 * 4:]
|
| 1734 |
+
wav_np = wav_np[:-512 * 4]
|
| 1735 |
+
cur_text = gen_text_raw[prev_text_len:]
|
| 1736 |
+
prev_text_len = len(gen_text_raw)
|
| 1737 |
+
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
| 1738 |
if outputs.finished:
|
| 1739 |
logger.debug("Generation finished.")
|
| 1740 |
eos_lab = True
|
|
|
|
| 1831 |
prev_text_len = len(gen_text_raw)
|
| 1832 |
yield OmniOutput(text=cur_text, audio_wav=wav_y, sampling_rate=sr)
|
| 1833 |
else:
|
| 1834 |
+
|
| 1835 |
prev_wav = wav_np
|
| 1836 |
else:
|
| 1837 |
# smooth wav
|
|
|
|
| 1843 |
prev_text_len = len(gen_text_raw)
|
| 1844 |
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
| 1845 |
else:
|
| 1846 |
+
prev_wav = wav_np[-512 * 4:]
|
| 1847 |
+
wav_np = wav_np[:-512 * 4]
|
| 1848 |
+
cur_text = gen_text_raw[prev_text_len:]
|
| 1849 |
+
prev_text_len = len(gen_text_raw)
|
| 1850 |
+
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
| 1851 |
|
| 1852 |
if outputs.finished:
|
| 1853 |
logger.debug("Generation finished.")
|