Improve sentence tokenization handling - MosesSentenceSplitter now works with list input
src/whisper_streaming/online_asr.py
```diff
@@ -263,11 +263,26 @@ class OnlineASRProcessor:
 
     def words_to_sentences(self, tokens: List[ASRToken]) -> List[Sentence]:
         """
-        Converts a list of tokens to a list of Sentence objects
+        Converts a list of tokens to a list of Sentence objects using the provided
         sentence tokenizer.
         """
+        if not tokens:
+            return []
+
         full_text = " ".join(token.text for token in tokens)
-
+
+        if self.tokenize:
+            try:
+                sentence_texts = self.tokenize(full_text)
+            except Exception as e:
+                # Some tokenizers (e.g., MosesSentenceSplitter) expect a list input.
+                try:
+                    sentence_texts = self.tokenize([full_text])
+                except Exception as e2:
+                    raise ValueError("Tokenization failed") from e2
+        else:
+            sentence_texts = [full_text]
+
         sentences: List[Sentence] = []
         token_index = 0
         for sent_text in sentence_texts:
```
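The fallback order matters here: the tokenizer is first called with a plain string and, only if that raises, retried with a single-element list, because splitters like MosesSentenceSplitter expect a list of paragraphs rather than a bare string (per the comment in the diff). Below is a minimal, self-contained sketch of that retry logic; `ListOnlySplitter` is a hypothetical stand-in for such list-input tokenizers, not the real mosestokenizer class.

```python
from typing import Callable, List, Optional


class ListOnlySplitter:
    """Hypothetical stand-in for list-input tokenizers such as
    MosesSentenceSplitter: it rejects bare strings."""

    def __call__(self, paragraphs: List[str]) -> List[str]:
        if not isinstance(paragraphs, list):
            raise TypeError("expected a list of paragraphs")
        sentences: List[str] = []
        for paragraph in paragraphs:
            # Naive period-based splitting, purely illustrative.
            sentences.extend(s.strip() + "." for s in paragraph.split(".") if s.strip())
        return sentences


def split_sentences(tokenize: Optional[Callable], full_text: str) -> List[str]:
    """Mirrors the diff's fallback: plain string first, then [full_text]."""
    if not tokenize:
        return [full_text]
    try:
        return tokenize(full_text)
    except Exception:
        try:
            return tokenize([full_text])
        except Exception as e2:
            raise ValueError("Tokenization failed") from e2


print(split_sentences(ListOnlySplitter(), "Hello world. How are you."))
# -> ['Hello world.', 'How are you.']
```

The chained `raise ... from e2` preserves the original traceback, so a genuine tokenizer failure is still diagnosable even though both calls are wrapped in broad `except Exception` handlers.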
```diff
@@ -276,7 +291,7 @@ class OnlineASRProcessor:
                 continue
             sent_tokens = []
             accumulated = ""
-            # Accumulate tokens until roughly matching the sentence text.
+            # Accumulate tokens until roughly matching the length of the sentence text.
             while token_index < len(tokens) and len(accumulated) < len(sent_text):
                 token = tokens[token_index]
                 accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
```
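The comment change above is cosmetic, but the loop it describes carries the real logic: tokens are consumed greedily until their concatenation is at least as long as the current sentence text, which maps each sentence produced by the tokenizer back onto the token stream. A self-contained sketch of that alignment follows; the `ASRToken` and `Sentence` shapes used here (start/end/text) are simplified assumptions for illustration, not the repo's actual classes.

```python
from dataclasses import dataclass
from typing import List


@dataclass
class ASRToken:
    # Assumed shape for illustration: a timestamped word.
    start: float
    end: float
    text: str


@dataclass
class Sentence:
    start: float
    end: float
    text: str


def words_to_sentences(tokens: List[ASRToken], sentence_texts: List[str]) -> List[Sentence]:
    sentences: List[Sentence] = []
    token_index = 0
    for sent_text in sentence_texts:
        sent_text = sent_text.strip()
        if not sent_text:
            continue
        sent_tokens: List[ASRToken] = []
        accumulated = ""
        # Accumulate tokens until roughly matching the length of the sentence text.
        while token_index < len(tokens) and len(accumulated) < len(sent_text):
            token = tokens[token_index]
            accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
            sent_tokens.append(token)
            token_index += 1
        if sent_tokens:
            # Sentence timestamps come from the first and last aligned token.
            sentences.append(Sentence(sent_tokens[0].start, sent_tokens[-1].end, accumulated))
    return sentences


tokens = [ASRToken(0.0, 0.4, "Hello"), ASRToken(0.4, 0.9, "world."),
          ASRToken(1.0, 1.3, "Bye.")]
print(words_to_sentences(tokens, ["Hello world.", "Bye."]))
# -> [Sentence(start=0.0, end=0.9, text='Hello world.'),
#     Sentence(start=1.0, end=1.3, text='Bye.')]
```

Because the match is by character length rather than exact text, the alignment tolerates small whitespace or punctuation differences introduced by the sentence splitter.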
```diff
@@ -290,7 +305,6 @@ class OnlineASRProcessor:
             )
             sentences.append(sentence)
         return sentences
-
     def finish(self) -> Transcript:
         """
         Flush the remaining transcript when processing ends.
```