thecollabagepatch committed
Commit e0bae41 · 1 Parent(s): c29a250

sometimes a claude yolo pt2

Files changed (1):
jam_worker.py  +36 -23
jam_worker.py CHANGED
@@ -413,13 +413,13 @@ class JamWorker(threading.Thread):
 
     def _append_model_chunk_and_spool(self, wav: au.Waveform) -> None:
         """
-        REWRITTEN: Robust audio processing with silence detection and health monitoring.
+        REWRITTEN: Robust audio processing that rejects silent chunks entirely.
 
         Strategy:
         1. Validate input chunk for silence/issues
-        2. Use simpler crossfading that handles silence gracefully
-        3. Update model stream with health checks
-        4. Convert to target SR and append to spool
+        2. REJECT silent chunks - don't add them to spool or model stream
+        3. Use healthy crossfading only between good audio
+        4. Aggressive recovery when silence detected
         """
         # Unpack model-rate samples
         s = wav.samples.astype(np.float32, copy=False)
@@ -429,8 +429,9 @@
         if n_samps == 0:
             return
 
-        # Health check on new chunk
+        # Health check on new chunk - use stricter threshold
         is_healthy = self._check_model_health(s)
+        is_very_quiet = _is_silent(s, threshold_db=-50.0) # stricter than default -60
 
         # Get crossfade params
         try:
@@ -439,13 +440,29 @@
             xfade_s = 0.0
         xfade_n = int(round(max(0.0, xfade_s) * float(self._model_sr)))
 
-        print(f"[model] chunk len={n_samps} rms={_dbg_rms_dbfs_model(s):+.1f} dBFS healthy={is_healthy}")
+        print(f"[model] chunk len={n_samps} rms={_dbg_rms_dbfs_model(s):+.1f} dBFS healthy={is_healthy} quiet={is_very_quiet}")
+
+        # --- REJECT PROBLEMATIC CHUNKS ---
+        if not is_healthy or is_very_quiet:
+            print(f"[REJECT] Discarding unhealthy/quiet chunk - not adding to spool or model stream")
+
+            # Trigger recovery immediately on first bad chunk
+            if self._silence_streak >= 1:
+                self._recover_from_silence()
+
+            # Don't process this chunk at all - return early
+            return
+
+        # Reset silence streak on good chunk
+        if self._silence_streak > 0:
+            print(f"✅ Audio resumed after {self._silence_streak} rejected chunks")
+            self._silence_streak = 0
 
         # Helper: resample to target SR
         def to_target(y: np.ndarray) -> np.ndarray:
             return y if self._rs is None else self._rs.process(y, final=False)
 
-        # --- SIMPLIFIED CROSSFADE LOGIC ---
+        # --- SIMPLIFIED CROSSFADE LOGIC (only for healthy audio) ---
 
         if self._model_stream is None:
             # First chunk - no crossfading needed
@@ -455,13 +472,14 @@
             # No crossfade configured or chunk too short - simple append
             self._model_stream = np.concatenate([self._model_stream, s], axis=0)
 
-        elif _is_silent(self._model_stream[-xfade_n:]) or _is_silent(s[:xfade_n]):
-            # One side is silent - don't crossfade, just append
-            print(f"[crossfade] Skipping crossfade due to silence")
-            self._model_stream = np.concatenate([self._model_stream, s], axis=0)
+        elif _is_silent(self._model_stream[-xfade_n:], threshold_db=-50.0):
+            # Previous tail is quiet - don't crossfade, just replace
+            print(f"[crossfade] Replacing quiet tail with new audio")
+            # Remove quiet tail and append new chunk
+            self._model_stream = np.concatenate([self._model_stream[:-xfade_n], s], axis=0)
 
         else:
-            # Normal crossfade between non-silent audio
+            # Normal crossfade between healthy audio
             tail = self._model_stream[-xfade_n:]
             head = s[:xfade_n]
             body = s[xfade_n:] if n_samps > xfade_n else np.zeros((0, s.shape[1]), dtype=np.float32)
@@ -482,9 +500,9 @@
                 body
             ], axis=0)
 
-        # --- CONVERT AND APPEND TO SPOOL ---
+        # --- CONVERT AND APPEND TO SPOOL (only healthy audio reaches here) ---
 
-        # Take the new audio from this iteration (avoid reprocessing old audio)
+        # Take the new audio from this iteration
         if xfade_n > 0 and n_samps >= xfade_n:
             # Normal case: body after crossfade region
             new_audio = s[xfade_n:] if n_samps > xfade_n else s
@@ -499,15 +517,10 @@
         self._spool = np.concatenate([self._spool, target_audio], axis=0) if self._spool.size else target_audio
         self._spool_written += target_audio.shape[0]
 
-        # --- HEALTH MONITORING ---
-
-        if not is_healthy:
-            if self._silence_streak >= 3: # After 3 silent chunks, try to recover
-                self._recover_from_silence()
-        else:
-            # Save current context as "good" backup
-            if hasattr(self.state, 'context_tokens') and self.state.context_tokens is not None:
-                self._last_good_context_tokens = np.copy(self.state.context_tokens)
+        # --- SAVE GOOD CONTEXT ---
+        # Only save context from healthy chunks
+        if hasattr(self.state, 'context_tokens') and self.state.context_tokens is not None:
+            self._last_good_context_tokens = np.copy(self.state.context_tokens)
 
         # Trim model stream to reasonable length (keep ~30 seconds)
         max_model_samples = int(30.0 * self._model_sr)
 
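Note: two of the helpers this function leans on, _is_silent and _dbg_rms_dbfs_model, live elsewhere in jam_worker.py and are not part of this diff. Judging from the call sites (a threshold_db keyword, a default of -60 dBFS implied by the "stricter than default -60" comment, and a signed dBFS figure in the debug print), they are plausibly simple RMS level checks. A minimal sketch under those assumptions, not the file's actual code:

import numpy as np

def _is_silent(x: np.ndarray, threshold_db: float = -60.0) -> bool:
    """True when the buffer's RMS level falls below threshold_db dBFS (full scale = 1.0)."""
    if x.size == 0:
        return True
    rms = float(np.sqrt(np.mean(np.square(x, dtype=np.float64))))
    return rms <= 0.0 or 20.0 * np.log10(rms) < threshold_db

def _dbg_rms_dbfs_model(x: np.ndarray) -> float:
    """RMS level in dBFS for the debug print; floored to avoid log10(0)."""
    rms = float(np.sqrt(np.mean(np.square(x, dtype=np.float64)))) if x.size else 0.0
    return 20.0 * np.log10(max(rms, 1e-12))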
 
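_check_model_health is also out of frame. One wrinkle worth flagging: the reject branch only calls _recover_from_silence() once self._silence_streak >= 1, so something, most plausibly the health check itself, has to increment the streak on failure or that recovery gate never fires. A hypothetical sketch that makes the assumption explicit (the NaN/Inf screening and dead-output check are guesses, not confirmed by this diff):

import numpy as np

# Assumed to be a JamWorker method; the streak increment is an inference from
# the reject branch above, not code visible in this commit.
def _check_model_health(self, s: np.ndarray) -> bool:
    """Flag non-finite or effectively dead model output; count consecutive failures."""
    ok = s.size > 0 and bool(np.all(np.isfinite(s))) and float(np.max(np.abs(s))) > 1e-6
    if not ok:
        self._silence_streak += 1  # must grow somewhere, or the >= 1 recovery gate never fires
    return ok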
 
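The sample mixing between the tail/head/body setup and the closing np.concatenate (old lines 468-481 of the file) falls between hunks and is unchanged by this commit, so the fade curve itself is not visible here. For orientation, an equal-power crossfade over the xfade_n window would look roughly like this sketch; the project's actual curve may differ:

import numpy as np

def equal_power_crossfade(tail: np.ndarray, head: np.ndarray) -> np.ndarray:
    """Blend two equal-length (n, channels) buffers at roughly constant loudness."""
    n = tail.shape[0]
    t = np.linspace(0.0, 1.0, n, dtype=np.float32)[:, None]
    fade_out = np.cos(0.5 * np.pi * t)  # 1 -> 0 across the window
    fade_in = np.sin(0.5 * np.pi * t)   # 0 -> 1 across the window
    return tail * fade_out + head * fade_in  # gains satisfy out**2 + in**2 == 1

Slotted into the else-branch above, the stream update would then read something like:
self._model_stream = np.concatenate([self._model_stream[:-xfade_n], equal_power_crossfade(tail, head), body], axis=0)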
 
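Finally, _recover_from_silence is referenced but not shown. Since the function now snapshots self.state.context_tokens into _last_good_context_tokens only after a chunk survives the health gate, a plausible, purely hypothetical counterpart restores that snapshot and clears the streak:

import numpy as np

# Hypothetical reconstruction of a helper defined elsewhere in jam_worker.py.
def _recover_from_silence(self) -> None:
    """Roll generation back to the last context snapshot taken from a healthy chunk."""
    if getattr(self, "_last_good_context_tokens", None) is not None:
        self.state.context_tokens = np.copy(self._last_good_context_tokens)
        print("[recover] Restored last good context tokens")
    self._silence_streak = 0

Net effect of the commit: silent or corrupt chunks never reach the spool or the crossfade path, and recovery now triggers on the first rejected chunk instead of after three.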