Gapeleon committed on
Commit
a2e831a
·
verified ·
1 Parent(s): 41a1b5c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -0
app.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ NeuCodec Test - Gradio App
4
+ Equivalent to nemo and snac test spaces, but for NeuCodec used in NeuTTS-Air models.
5
+ Allows testing encode/decode cycles with the neuphonic/neucodec model.
6
+ """
7
+
8
+ import gradio as gr
9
+ import torch
10
+ import librosa
11
+ import numpy as np
12
+ import traceback
13
+ import time
14
+
15
# Attempt to import NeuCodec.
# The package is a hard requirement: without it the app cannot do anything
# useful, so the failure is reported on stdout and then re-raised with a
# clearer message (the original exception is kept as the cause).
try:
    from neucodec import NeuCodec, DistillNeuCodec
    print("NeuCodec modules imported successfully.")
except ImportError as e:
    print(f"Error importing NeuCodec: {e}")
    raise ImportError("Could not import NeuCodec. Make sure 'neucodec' is installed correctly.") from e
22
+
23
# --- Configuration ---
# Sample rate (Hz) the input is resampled to before encoding.
TARGET_SR = 16000  # NeuCodec operates at 16kHz for encoding
# Sample rate (Hz) attached to the decoded audio handed back to the UI.
OUTPUT_SR = 24000  # NeuCodec outputs at 24kHz
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "neuphonic/neucodec"  # Options: neuphonic/neucodec, neuphonic/distill-neucodec
print(f"Using device: {DEVICE}")
29
+
30
# --- Load Model (Load once globally) ---
# The model is loaded a single time at import so every request reuses it.
# Loading is deliberately best-effort: on failure the error and traceback are
# printed and `neucodec` stays None, which the rest of the file checks for
# before processing audio or launching the interface.
neucodec = None
try:
    print(f"Loading NeuCodec model: {MODEL_NAME}...")
    start_time = time.time()

    # The distilled checkpoint is loaded through its own class; every other
    # model name goes through the standard NeuCodec class.
    if MODEL_NAME == "neuphonic/distill-neucodec":
        neucodec = DistillNeuCodec.from_pretrained(MODEL_NAME)
    else:
        neucodec = NeuCodec.from_pretrained(MODEL_NAME)

    neucodec = neucodec.to(DEVICE)
    neucodec.eval()  # Set model to evaluation mode
    end_time = time.time()
    print(f"NeuCodec loaded successfully to {DEVICE}. Time taken: {end_time - start_time:.2f} seconds.")
except Exception as e:
    print(f"FATAL: Error loading NeuCodec: {e}")
    print(traceback.format_exc())
48
+
49
+ # --- Main Processing Function ---
50
def _reconstruction_metrics(reference, reconstructed):
    """Compare a reference signal with its reconstruction.

    Only the overlapping prefix of the two 1-D arrays is compared, because
    the decoded signal may be slightly longer or shorter than the reference.

    Returns:
        A ``(compared_samples, mse, snr_db)`` tuple. ``mse`` and ``snr_db``
        are ``None`` when there are no overlapping samples. ``snr_db`` is
        also ``None`` when the reference is silent or the reconstruction is
        exact, since the ratio is undefined in both cases.
    """
    compared = min(len(reference), len(reconstructed))
    if compared == 0:
        return 0, None, None

    ref = reference[:compared]
    rec = reconstructed[:compared]
    mse = np.mean((ref - rec) ** 2)
    signal_power = np.mean(ref ** 2)

    snr_db = None
    # Guard both powers: a zero numerator would take log10(0) and a zero
    # denominator would divide by zero.
    if mse > 0 and signal_power > 0:
        snr_db = 10 * np.log10(signal_power / mse)
    return compared, mse, snr_db


def process_audio(audio_filepath):
    """
    Loads, resamples, encodes, decodes audio using NeuCodec, and returns results.

    Args:
        audio_filepath: Path of the uploaded audio file, or None when nothing
            was uploaded.

    Returns:
        A 4-tuple ``(original, resampled, reconstructed, log_text)``. The
        first three items are ``(sample_rate, waveform)`` pairs for playback
        (original rate, TARGET_SR and OUTPUT_SR respectively) and are all
        ``None`` when the model is unavailable, no file was given, or any
        step failed. ``log_text`` is the newline-joined processing log; on
        failure it carries the error type, message and traceback.
    """
    if neucodec is None:
        return None, None, None, "Error: NeuCodec could not be loaded. Cannot process audio."

    if audio_filepath is None:
        return None, None, None, "Please upload an audio file."

    logs = ["--- Starting Audio Processing with NeuCodec ---"]
    try:
        # 1. Load Audio
        logs.append(f"Loading audio file: {audio_filepath}")
        load_start = time.time()

        # Load original audio (for playback reference) at its native rate,
        # keeping all channels so a multi-channel input can be reported.
        original_waveform, original_sr = librosa.load(audio_filepath, sr=None, mono=False)
        logs.append(f"Audio loaded. Original SR: {original_sr} Hz, Shape: {original_waveform.shape}")

        # Convert to mono if stereo
        if original_waveform.ndim > 1:
            logs.append(f"Warning: Input audio has {original_waveform.shape[0]} channels. Converting to mono.")
            original_waveform = librosa.to_mono(original_waveform)

        load_end = time.time()
        logs.append(f"Loading time: {load_end - load_start:.2f}s")

        # Fail early with a clear message instead of an obscure downstream
        # error from the resampler or the codec.
        if original_waveform.size == 0:
            raise ValueError("Input audio contains no samples.")

        audio_duration = len(original_waveform) / original_sr

        # --- Prepare Original for Playback ---
        original_audio_playback = (original_sr, original_waveform)
        logs.append("Prepared original audio for playback.")

        # 2. Resample to 16kHz for encoding (NeuCodec expects 16kHz input)
        resample_start = time.time()
        logs.append(f"Resampling waveform to {TARGET_SR} Hz for encoding...")
        waveform_16k = librosa.resample(original_waveform, orig_sr=original_sr, target_sr=TARGET_SR)
        logs.append(f"Resampling complete. New Shape: {waveform_16k.shape}")
        resample_end = time.time()
        logs.append(f"Resampling time: {resample_end - resample_start:.2f}s")

        # --- Prepare 16kHz version for Playback ---
        resampled_audio_playback = (TARGET_SR, waveform_16k)
        logs.append("Prepared 16kHz audio for playback.")

        # 3. Prepare for NeuCodec Encoding
        # NeuCodec expects [batch, channels, samples] format
        waveform_tensor = torch.from_numpy(waveform_16k).float().unsqueeze(0).unsqueeze(0)  # [1, 1, samples]
        waveform_tensor = waveform_tensor.to(DEVICE)

        logs.append(f"Waveform prepared for encoding. Shape: {waveform_tensor.shape}, Device: {DEVICE}")

        # 4. Encode Audio using NeuCodec
        logs.append("Encoding audio with NeuCodec...")
        encode_start = time.time()
        with torch.no_grad():
            # NOTE(review): the tensor is handed over on the CPU even though
            # it was just moved to DEVICE; presumably encode_code wants CPU
            # input and does its own device placement - confirm against the
            # neucodec API before removing either transfer.
            encoded_codes = neucodec.encode_code(audio_or_path=waveform_tensor.cpu())
        encode_end = time.time()

        if encoded_codes is None:
            log_msg = "Encoding failed: encoded_codes is None"
            logs.append(log_msg)
            raise ValueError(log_msg)

        logs.append(f"Encoding complete. Time: {encode_end - encode_start:.2f}s")
        logs.append(f"Encoded codes shape: {encoded_codes.shape}")
        logs.append(f"Encoded codes device: {encoded_codes.device}")

        # Log some statistics about the codes
        logs.append(f"Code sequence length: {encoded_codes.shape[-1]}")
        logs.append(f"Code range: [{encoded_codes.min().item():.0f}, {encoded_codes.max().item():.0f}]")

        # Calculate compression ratio
        original_samples = waveform_16k.shape[0]
        code_elements = encoded_codes.numel()
        compression_ratio = original_samples / code_elements if code_elements > 0 else 0
        logs.append(f"Compression ratio: ~{compression_ratio:.1f}:1 ({original_samples} samples -> {code_elements} codes)")

        # 5. Decode the Codes using NeuCodec
        logs.append("Decoding the generated codes with NeuCodec...")
        decode_start = time.time()
        with torch.no_grad():
            reconstructed_waveform = neucodec.decode_code(encoded_codes)
        decode_end = time.time()
        logs.append(f"Decoding complete. Reconstructed waveform shape: {reconstructed_waveform.shape}, Device: {reconstructed_waveform.device}. Time: {decode_end - decode_start:.2f}s")

        # 6. Prepare Reconstructed Audio for Playback
        # Output is at 24kHz. Move to CPU, remove batch and channel dims, convert to NumPy.
        reconstructed_audio_np = reconstructed_waveform.cpu().squeeze().numpy()
        logs.append(f"Reconstructed audio prepared for playback at {OUTPUT_SR} Hz. Shape: {reconstructed_audio_np.shape}")
        reconstructed_audio_playback = (OUTPUT_SR, reconstructed_audio_np)

        # 7. Calculate quality metrics
        # For comparison, we need to resample original to 24kHz to match reconstructed output
        logs.append("Calculating quality metrics...")
        original_24k = librosa.resample(original_waveform, orig_sr=original_sr, target_sr=OUTPUT_SR)

        # Handle length differences (common with codecs): only the shared
        # prefix of the two signals is compared.
        compared, mse, snr_db = _reconstruction_metrics(original_24k, reconstructed_audio_np)

        if len(original_24k) != len(reconstructed_audio_np):
            logs.append(f"Audio length difference: Original {len(original_24k)} samples, Reconstructed {len(reconstructed_audio_np)} samples")

        if mse is None:
            logs.append("Quality metrics skipped: no overlapping samples to compare.")
        else:
            logs.append(f"MSE (first {compared} samples at 24kHz): {mse:.6f}")
            if snr_db is not None:
                logs.append(f"SNR: {snr_db:.2f} dB")
            else:
                logs.append("SNR: undefined (silent reference or exact reconstruction)")

        logs.append("\n--- Audio Processing Completed Successfully ---")

        # Summary statistics
        total_time = (load_end - load_start) + (resample_end - resample_start) + (encode_end - encode_start) + (decode_end - decode_start)
        logs.append(f"Total processing time: {total_time:.2f}s")
        logs.append(f"Audio duration: {audio_duration:.2f}s")
        # A coarse clock can report zero elapsed time; skip the ratio then
        # rather than turning a successful run into a ZeroDivisionError.
        if total_time > 0:
            logs.append(f"Real-time factor: {audio_duration / total_time:.2f}x")

        return original_audio_playback, resampled_audio_playback, reconstructed_audio_playback, "\n".join(logs)

    except Exception as e:
        # UI boundary: surface every failure in the log box instead of
        # crashing the request.
        logs.append("\n--- An Error Occurred ---")
        logs.append(f"Error Type: {type(e).__name__}")
        logs.append(f"Error Details: {e}")
        logs.append("\n--- Traceback ---")
        logs.append(traceback.format_exc())
        return None, None, None, "\n".join(logs)
183
+
184
# --- Gradio Interface ---
# Markdown shown under the page title; passed to gr.Interface as `description`.
DESCRIPTION = """
This app demonstrates the **NeuCodec** model (`neuphonic/neucodec`) used in NeuTTS-Air.

**How it works:**
1. Upload an audio file (wav, mp3, flac, etc.).
2. The audio will be automatically resampled to 16kHz for encoding.
3. The 16kHz audio is encoded into discrete codes by NeuCodec.
4. These codes are then decoded back into 24kHz audio by NeuCodec.
5. You can listen to the original, the 16kHz version, and the final reconstructed 24kHz audio.

**Technical details:**
- Input sample rate: 16kHz (for encoding)
- Output sample rate: 24kHz (after decoding)
- Architecture: 50Hz neural audio codec with single codebook
- Hop length: 480 samples

**Note:** If the input is stereo, it will be converted to mono.
"""
203
+
204
# Single-function UI: one uploaded file in, three audio players plus the
# processing log out. The output order must match the 4-tuple returned by
# process_audio.
iface = gr.Interface(
    fn=process_audio,
    # "filepath" makes Gradio hand process_audio a path on disk.
    inputs=gr.Audio(type="filepath", label="Upload Audio File"),
    outputs=[
        gr.Audio(label="Original Audio"),
        gr.Audio(label="16kHz Audio (Input to NeuCodec)"),
        gr.Audio(label="Reconstructed Audio (24kHz Output from NeuCodec)"),
        gr.Textbox(label="Log Output", lines=20),
    ],
    title="NeuCodec Demo (16kHz -> 24kHz)",
    description=DESCRIPTION,
    examples=[
        # TODO
        # ["examples/example1.wav"],
    ],
    cache_examples=False,
)
221
+
222
if __name__ == "__main__":
    # Refuse to start the UI when the model failed to load at import time.
    if neucodec is None:
        print("Cannot launch Gradio interface because NeuCodec failed to load.")
    else:
        print("Launching Gradio Interface...")
        print(f"Model: {MODEL_NAME}")
        print(f"Input sample rate: {TARGET_SR} Hz")
        print(f"Output sample rate: {OUTPUT_SR} Hz")
        print(f"Device: {DEVICE}")
        # NOTE(review): share=True asks Gradio for a public share link;
        # confirm this is wanted wherever the app is hosted.
        iface.launch(share=True)