ruslanmv committed
Commit e1e4a12 · 1 Parent(s): c3f9f3a

Files changed (2):
  1. app.py +64 -117
  2. requirements.txt +2 -3
app.py CHANGED
@@ -3,7 +3,6 @@
 # ===================================================================================
 from __future__ import annotations
 import os
-import sys
 import base64
 import struct
 import textwrap
@@ -15,23 +14,31 @@ from typing import List, Dict, Tuple, Generator
 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 os.environ.setdefault("COQUI_TOS_AGREED", "1")
-os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")  # truly disable analytics
-os.environ.setdefault("TORCHAUDIO_USE_FFMPEG", "0")  # avoid torchaudio ffmpeg path entirely
+os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")
 
-# --- .env early (HF_TOKEN / SECRET_TOKEN) ---
+# --- Prefer torchaudio sox_io/soundfile backend (avoid FFmpeg/torio bug) ---
+try:
+    import torchaudio
+    _backend_set = False
+    for _cand in ("sox_io", "soundfile"):
+        try:
+            torchaudio.set_audio_backend(_cand)
+            _backend_set = True
+            break
+        except Exception:
+            pass
+    if not _backend_set:
+        os.environ["TORCHAUDIO_USE_FFMPEG"] = "0"
+except Exception:
+    torchaudio = None
+
+# --- Load .env early (HF_TOKEN / SECRET_TOKEN) ---
 from dotenv import load_dotenv
 load_dotenv()
 
-# --- NumPy sanity (Torch 2.2.2 expects NumPy 1.x in your stack) ---
-import numpy as _np
-if int(_np.__version__.split(".", 1)[0]) >= 2:
-    raise RuntimeError(
-        f"Detected numpy=={_np.__version__}. Please ensure numpy<2 (e.g., 1.26.4) for this Space."
-    )
-
-# --- Hugging Face Spaces & ZeroGPU (import BEFORE CUDA libs) ---
+# --- Hugging Face Spaces & ZeroGPU ---
 try:
-    import spaces  # Required for ZeroGPU on HF
+    import spaces
 except Exception:
     class _SpacesShim:
         def GPU(self, *args, **kwargs):
@@ -42,20 +49,17 @@ except Exception:
 
 import gradio as gr
 
-# --- Core ML & Data Libraries (after spaces import) ---
+# --- Core ML & Data Libraries ---
 import torch
 import numpy as np
 from huggingface_hub import HfApi, hf_hub_download
 from llama_cpp import Llama
 
-# --- Audio decoding (we'll use ffmpeg-python to avoid torchaudio/torio) ---
-import ffmpeg
-
 # --- TTS Libraries ---
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.manage import ModelManager
-import TTS.tts.models.xtts as xtts_module  # for monkey-patching load_audio
+from TTS.utils.generic_utils import get_user_data_dir
 
 # --- Text & Audio Processing ---
 import nltk
@@ -67,15 +71,12 @@ import noisereduce as nr
 # 2) GLOBALS & HELPERS
 # ===================================================================================
 
-# NLTK data
 nltk.download("punkt", quiet=True)
 
-# Cached models & latents
 tts_model: Xtts | None = None
 llm_model: Llama | None = None
 voice_latents: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
 
-# Config
 HF_TOKEN = os.environ.get("HF_TOKEN")
 api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
 repo_id = "ruslanmv/ai-story-server"
@@ -83,7 +84,6 @@ SECRET_TOKEN = os.getenv("SECRET_TOKEN", "secret")
 SENTENCE_SPLIT_LENGTH = 250
 LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]
 
-# System prompts and roles
 default_system_message = (
     "You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
     "Use narrative style only, without lists or complex words. Type numbers as words (e.g., 'ten')."
@@ -96,7 +96,6 @@ ROLE_PROMPTS["Pirate"] = (
     "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
 )
 
-# ---------- small utils ----------
 def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
     if pcm_data.startswith(b"RIFF"):
         return pcm_data
@@ -129,44 +128,11 @@ def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], sy
     prompt += f"<|user|>\n{message}</s><|assistant|>"
     return prompt
 
-# ---------- robust audio decode (24k mono via ffmpeg) ----------
-def _decode_audio_ffmpeg_to_24k_mono(path: str) -> Tuple[np.ndarray, int]:
-    """Return float32 waveform in [-1,1], 24 kHz mono."""
-    try:
-        out, _ = (
-            ffmpeg
-            .input(path)
-            .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=24000)
-            .run(capture_stdout=True, capture_stderr=True, cmd="ffmpeg")
-        )
-        pcm = np.frombuffer(out, dtype=np.int16)
-        if pcm.size == 0:
-            raise RuntimeError("ffmpeg produced empty audio.")
-        wav = (pcm.astype(np.float32) / 32767.0).copy()
-        return wav, 24000
-    except ffmpeg.Error as e:
-        raise RuntimeError(f"ffmpeg decode failed: {e.stderr.decode(errors='ignore') if e.stderr else e}") from e
-
-# ---------- monkey-patch XTTS internal loader to avoid torchaudio/torio ----------
-def _patched_load_audio(audiopath: str, load_sr: int):
-    wav, sr = _decode_audio_ffmpeg_to_24k_mono(audiopath)
-    # XTTS expects (audio, sr) and will handle truncation/conditioning windows.
-    return wav, sr
-
-xtts_module.load_audio = _patched_load_audio  # <- critical fix
-
-# ---------- where Coqui caches models (avoid get_user_data_dir import) ----------
-def _coqui_cache_dir() -> str:
-    # Matches what TTS uses on Linux: ~/.local/share/tts
-    return os.path.join(os.path.expanduser("~"), ".local", "share", "tts")
-
 # ===================================================================================
 # 3) PRECACHE & MODEL LOADERS (RUN BEFORE FIRST INFERENCE)
 # ===================================================================================
 
 def precache_assets() -> None:
-    """Download voice WAVs, XTTS weights, and Zephyr GGUF to local cache before any inference."""
-    # Voices
     print("Pre-caching voice files...")
     file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
     base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
@@ -182,31 +148,27 @@ def precache_assets() -> None:
         except Exception as e:
             print(f"Failed to download {name}: {e}")
 
-    # XTTS model files
     print("Pre-caching XTTS v2 model files...")
     ModelManager().download_model("tts_models/multilingual/multi-dataset/xtts_v2")
 
-    # LLM GGUF
     print("Pre-caching Zephyr GGUF...")
     try:
         hf_hub_download(
             repo_id="TheBloke/zephyr-7B-beta-GGUF",
-            filename="zephyr-7b-beta.Q5_K_M.gguf",
-            force_download=False
+            filename="zephyr-7b-beta.Q5_K_M.gguf"
         )
     except Exception as e:
         print(f"Warning: GGUF pre-cache error: {e}")
 
 def _load_xtts(device: str) -> Xtts:
-    """Load XTTS from the local cache. Use checkpoint_dir to avoid None path bugs."""
     print("Loading Coqui XTTS V2 model (CPU first)...")
     model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-    ModelManager().download_model(model_name)  # idempotent
-    model_dir = os.path.join(_coqui_cache_dir(), model_name.replace("/", "--"))
+    model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 
     cfg = XttsConfig()
     cfg.load_json(os.path.join(model_dir, "config.json"))
     model = Xtts.init_from_config(cfg)
+
     model.load_checkpoint(
         cfg,
         checkpoint_dir=model_dir,
@@ -218,7 +180,6 @@ def _load_xtts(device: str) -> Xtts:
     return model
 
 def _load_llama() -> Llama:
-    """Load Llama (Zephyr GGUF) on CPU so it's ready immediately."""
     print("Loading LLM (Zephyr GGUF) on CPU...")
     zephyr_model_path = hf_hub_download(
         repo_id="TheBloke/zephyr-7B-beta-GGUF",
@@ -226,7 +187,7 @@ def _load_llama() -> Llama:
     )
     llm = Llama(
         model_path=zephyr_model_path,
-        n_gpu_layers=0,  # CPU by default
+        n_gpu_layers=0,
         n_ctx=4096,
         n_batch=512,
         verbose=False
@@ -235,16 +196,14 @@ def _load_llama() -> Llama:
     return llm
 
 def init_models_and_latents() -> None:
-    """Preload TTS and LLM on CPU and compute voice latents once (using patched audio loader)."""
     global tts_model, llm_model, voice_latents
 
     if tts_model is None:
-        tts_model = _load_xtts(device="cpu")  # keep on CPU at startup
+        tts_model = _load_xtts(device="cpu")
 
     if llm_model is None:
         llm_model = _load_llama()
 
-    # Pre-compute latents once (CPU OK); uses patched loader (ffmpeg) under the hood
     if not voice_latents:
        print("Computing voice conditioning latents...")
        for role, filename in [
@@ -259,14 +218,13 @@ def init_models_and_latents() -> None:
         )
     print("Voice latents ready.")
 
-# Ensure we close Llama cleanly to avoid __del__ issues at interpreter shutdown
 def _close_llm():
     global llm_model
-    try:
-        if llm_model is not None:
+    if llm_model is not None:
+        try:
             llm_model.close()
-    except Exception:
-        pass
+        except Exception:
+            pass
 atexit.register(_close_llm)
 
 # ===================================================================================
@@ -311,8 +269,8 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
     except RuntimeError as e:
         print(f"Error during TTS inference: {e}")
         if "device-side assert" in str(e) and api:
+            gr.Warning("Critical GPU error. Attempting to restart the Space...")
             try:
-                gr.Warning("Critical GPU error. Attempting to restart the Space...")
                 api.restart_space(repo_id=repo_id)
             except Exception:
                 pass
@@ -321,68 +279,57 @@
 # 5) ZERO-GPU ENTRYPOINT
 # ===================================================================================
 
-@spaces.GPU(duration=120)  # Request GPU for up to 120s; adjust as needed
+@spaces.GPU(duration=120)
 def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
     if secret_token_input != SECRET_TOKEN:
         raise gr.Error("Invalid secret token provided.")
     if not input_text:
         return []
 
-    # Ensure models/latents exist
     if tts_model is None or llm_model is None or not voice_latents:
-        init_models_and_latents()
+        raise gr.Error("Models not initialized. Please restart the Space.")
 
-    # Move XTTS to CUDA for this call if GPU is available; otherwise stay on CPU
     try:
         if torch.cuda.is_available():
             tts_model.to("cuda")
         else:
             tts_model.to("cpu")
-    except Exception:
-        tts_model.to("cpu")
 
-    # Generate story text (LLM on CPU)
-    history: List[Tuple[str, str | None]] = [(input_text, None)]
-    full_story_text = "".join(
-        generate_text_stream(llm_model, history[-1][0], history[:-1], system_message_text=ROLE_PROMPTS[chatbot_role])
-    ).strip()
-    if not full_story_text:
-        return []
+        history: List[Tuple[str, str | None]] = [(input_text, None)]
+        full_story_text = "".join(
+            generate_text_stream(llm_model, history[-1][0], history[:-1], system_message_text=ROLE_PROMPTS[chatbot_role])
+        ).strip()
+        if not full_story_text:
+            return []
 
-    # Split into TTS-friendly sentences
-    sentences = split_sentences(full_story_text, SENTENCE_SPLIT_LENGTH)
-    lang = langid.classify(sentences[0])[0] if sentences else "en"
+        sentences = split_sentences(full_story_text, SENTENCE_SPLIT_LENGTH)
+        lang = langid.classify(sentences[0])[0] if sentences else "en"
 
-    results: List[Dict[str, str]] = []
-    for sentence in sentences:
-        if not any(c.isalnum() for c in sentence):
-            continue
+        results: List[Dict[str, str]] = []
+        for sentence in sentences:
+            if not any(c.isalnum() for c in sentence):
+                continue
 
-        audio_chunks = generate_audio_stream(tts_model, sentence, lang, voice_latents[chatbot_role])
-        pcm_data = b"".join(chunk for chunk in audio_chunks if chunk)
+            audio_chunks = generate_audio_stream(tts_model, sentence, lang, voice_latents[chatbot_role])
+            pcm_data = b"".join(chunk for chunk in audio_chunks if chunk)
 
-        # Optional noise reduction (best-effort)
-        try:
-            data_s16 = np.frombuffer(pcm_data, dtype=np.int16)
-            if data_s16.size > 0:
-                float_data = data_s16.astype(np.float32) / 32767.0
-                reduced = nr.reduce_noise(y=float_data, sr=24000)
-                final_pcm = (reduced * 32767).astype(np.int16).tobytes()
-            else:
-                final_pcm = pcm_data
-        except Exception:
-            final_pcm = pcm_data
+            try:
+                data_s16 = np.frombuffer(pcm_data, dtype=np.int16)
+                if data_s16.size > 0:
+                    float_data = data_s16.astype(np.float32) / 32767.0
+                    reduced = nr.reduce_noise(y=float_data, sr=24000)
+                    final_pcm = (reduced * 32767).astype(np.int16).tobytes()
+                else:
+                    final_pcm = pcm_data
+            except Exception:
+                final_pcm = pcm_data
 
-        b64_wav = base64.b64encode(pcm_to_wav(final_pcm)).decode("utf-8")
-        results.append({"text": sentence, "audio": b64_wav})
+            b64_wav = base64.b64encode(pcm_to_wav(final_pcm)).decode("utf-8")
+            results.append({"text": sentence, "audio": b64_wav})
 
-    # Release GPU immediately
-    try:
+        return results
+    finally:
         tts_model.to("cpu")
-    except Exception:
-        pass
-
-    return results
 
 # ===================================================================================
 # 6) STARTUP: PRECACHE & UI
@@ -404,9 +351,9 @@ def build_ui() -> gr.Interface:
 
 if __name__ == "__main__":
     print("===== Startup: pre-cache assets and preload models =====")
-    precache_assets()  # 1) download everything to disk
-    init_models_and_latents()  # 2) load models on CPU + compute voice latents (via patched loader)
+    precache_assets()
+    init_models_and_latents()
     print("Models and assets ready. Launching UI...")
 
     demo = build_ui()
-    demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
+    demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
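Note on pcm_to_wav: the hunks above show only its RIFF early-return, not the header construction itself. For reference, a minimal sketch of what such a helper typically does — packing the canonical 44-byte RIFF/WAVE header with struct around raw 16-bit PCM, using the same defaults (24 kHz, mono, 16-bit) — is shown below; pcm_to_wav_sketch is a hypothetical stand-in, since the repo's actual body is not part of this diff:

import struct

def pcm_to_wav_sketch(pcm: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
    # Canonical 44-byte RIFF/WAVE header for uncompressed PCM.
    byte_rate = sample_rate * channels * bit_depth // 8
    block_align = channels * bit_depth // 8
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF", 36 + len(pcm), b"WAVE",
        b"fmt ", 16, 1, channels, sample_rate, byte_rate, block_align, bit_depth,
        b"data", len(pcm),
    )
    return header + pcm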
 
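The _SpacesShim fallback is truncated in the hunks above, right after def GPU(self, *args, **kwargs). A plausible completion — an illustrative assumption, not the repo's code — is a no-op decorator that accepts both the bare @spaces.GPU form and the parameterized @spaces.GPU(duration=120) form used by the entrypoint:

class _SpacesShim:
    def GPU(self, *args, **kwargs):
        # Bare form: @spaces.GPU applied directly to a function.
        if args and callable(args[0]):
            return args[0]
        # Parameterized form: @spaces.GPU(duration=120) returns a decorator.
        def _wrap(fn):
            return fn
        return _wrap

spaces = _SpacesShim()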
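Since generate_story_and_speech returns a list of {"text": ..., "audio": ...} dicts in which each audio value is a base64-encoded WAV (via pcm_to_wav), a caller can persist the clips roughly as follows; save_story_audio is a hypothetical client-side helper, and results stands for the list returned by the endpoint:

import base64

def save_story_audio(results, out_prefix: str = "sentence") -> None:
    # Each item pairs one sentence of the story with its WAV audio.
    for i, item in enumerate(results):
        path = f"{out_prefix}_{i:03d}.wav"
        with open(path, "wb") as f:
            f.write(base64.b64decode(item["audio"]))
        print(f"{path}: {item['text'][:60]}")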
requirements.txt CHANGED
@@ -9,7 +9,7 @@ requests
 numpy
 pandas==1.5.3
 
-# TTS
+# TTS (legacy)
 TTS @ git+https://github.com/coqui-ai/[email protected]
 pydantic==2.5.3
 
@@ -17,7 +17,6 @@ pydantic==2.5.3
 llama-cpp-python==0.2.79
 
 # Audio & Text
-soundfile
 noisereduce==3.0.3
 pydub
 langid
@@ -27,4 +26,4 @@ ffmpeg-python
 
 # Japanese Text (optional)
 mecab-python3==1.0.9
-unidic-lite==1.0.8
+unidic-lite==1.0.8
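One interaction worth noting: this commit drops soundfile from requirements.txt, while app.py still tries the torchaudio "soundfile" backend before falling back, so that backend only engages if soundfile arrives transitively (e.g., via the TTS package). A quick sanity check — a hypothetical snippet, not part of the repo — can confirm what actually resolves in the built image:

import importlib.util

# ffmpeg is the import name of the ffmpeg-python package kept in requirements.
for pkg in ("torchaudio", "soundfile", "ffmpeg"):
    found = importlib.util.find_spec(pkg) is not None
    print(f"{pkg}: {'available' if found else 'missing'}")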
 