Release `get_audio_placeholder` interface in processing (#24)
release (7616f5094f7995e8f126d75e97493ef7ff2fc767)
Co-authored-by: Zhihui He <[email protected]>
- processing_minicpmo.py +26 -26
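The diff below promotes the nested `get_audio_placeholder` helper to a public method on `MiniCPMOProcessor`, so callers can build the audio placeholder string directly without running full feature extraction. A minimal usage sketch, assuming the processor is loaded with `AutoProcessor` and `trust_remote_code=True`; the checkpoint id is a placeholder for illustration and is not part of this commit:

```python
from transformers import AutoProcessor

# Placeholder repo id for illustration; substitute the actual MiniCPM-o checkpoint.
processor = AutoProcessor.from_pretrained("openbmb/MiniCPM-o-2_6", trust_remote_code=True)

# audio_lens is the number of raw samples, as in the call site below (len(a)).
placeholder = processor.get_audio_placeholder(
    audio_lens=16000 * 5,  # 5 s of 16 kHz audio
    chunk_input=True,
    chunk_length=1,        # seconds of audio per streaming chunk
)
# A string of "<unk>" tokens wrapped in the tokenizer's audio start/end markers,
# one block per chunk when chunk_input is True.
print(placeholder)
```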
processing_minicpmo.py
CHANGED
@@ -102,6 +102,31 @@ class MiniCPMOProcessor(ProcessorMixin):
 
         return MiniCPMOBatchFeature(data={**model_inputs})
 
+    def get_audio_placeholder(self, audio_lens, chunk_input, chunk_length):
+        pool_step = 2
+        feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)
+
+        feature_lens = (feature_lens - 1) // 2 + 1
+        output_lens = (feature_lens - pool_step) // pool_step + 1
+
+        if chunk_input:
+            fbank_feat_in_chunk = int(chunk_length * 100)
+            cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
+            audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
+            num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
+
+            place_holders = ""
+            total_unk_len = 0
+            for _ in range(num_audio_chunks):
+                unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
+                place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
+                total_unk_len += unk_len
+            audio_placeholder = place_holders
+        else:
+            audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
+
+        return audio_placeholder
+
     def audio_feature_extract(
         self,
         audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
@@ -111,31 +136,6 @@ class MiniCPMOProcessor(ProcessorMixin):
         chunk_length: Optional[int] = 1,
         **kwargs,
     ):
-        def get_audio_placeholder(audio_lens, chunk_input):
-            pool_step = 2
-            feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)
-
-            feature_lens = (feature_lens - 1) // 2 + 1
-            output_lens = (feature_lens - pool_step) // pool_step + 1
-
-            if chunk_input:
-                fbank_feat_in_chunk = int(chunk_length * 100)
-                cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
-                audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
-                num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
-
-                place_holders = ""
-                total_unk_len = 0
-                for _ in range(num_audio_chunks):
-                    unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
-                    place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
-                    total_unk_len += unk_len
-                audio_placeholder = place_holders
-            else:
-                audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
-
-            return audio_placeholder
-
         if isinstance(audios, np.ndarray):
             audios_list = [[audios]]
         elif isinstance(audios[0], np.ndarray):
@@ -156,7 +156,7 @@ class MiniCPMOProcessor(ProcessorMixin):
         # audio placeholder not dependent on audio_parts
         for audios in audios_list:
            if audios:
-                audio_ph_list.append([get_audio_placeholder(len(a), chunk_input) for a in audios])
+                audio_ph_list.append([self.get_audio_placeholder(len(a), chunk_input, chunk_length) for a in audios])
             else:
                 audio_ph_list.append([])
 
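For reference, a standalone sketch of the placeholder-length arithmetic used in the method above, assuming a Whisper-style feature extractor with `hop_length = 160` (10 ms frames at 16 kHz); in the actual processor this value is read from `self.feature_extractor.hop_length`:

```python
import math

def placeholder_token_count(audio_lens, chunk_input=True, chunk_length=1,
                            hop_length=160, pool_step=2):
    """Mirror get_audio_placeholder's math: return (<unk> count, number of chunks).

    hop_length=160 is an assumption (Whisper-style 10 ms frames at 16 kHz).
    """
    feature_lens = math.ceil(audio_lens / hop_length)            # mel frames
    feature_lens = (feature_lens - 1) // 2 + 1                   # conv downsample, stride 2
    output_lens = (feature_lens - pool_step) // pool_step + 1    # pooling, stride 2

    if not chunk_input:
        return output_lens, 1  # single placeholder block covering all embeddings

    fbank_feat_in_chunk = int(chunk_length * 100)                # 100 mel frames per second
    cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
    audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
    num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
    return output_lens, num_audio_chunks

# 5 s of 16 kHz audio: 80000 samples -> 500 mel frames -> 250 conv frames
# -> 125 audio embeddings, split into 5 chunks of 25 when chunk_length=1.
print(placeholder_token_count(80000))  # (125, 5)
```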