modify loss
Files changed:
- __pycache__/inference.cpython-311.pyc +0 -0
- inference.py +4 -4
- modules/__pycache__/loss.cpython-311.pyc +0 -0
- modules/loss.py +26 -102
__pycache__/inference.cpython-311.pyc CHANGED
Binary files a/__pycache__/inference.cpython-311.pyc and b/__pycache__/inference.cpython-311.pyc differ
inference.py CHANGED

@@ -68,7 +68,7 @@ class MasteringStyleTransfer:
         return output_audio, predicted_params

     def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
-        fit_embedding = torch.nn.Parameter(initial_reference_feature)
+        fit_embedding = torch.nn.Parameter(initial_reference_feature, requires_grad=True)
         optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])

         min_loss = float('inf')
@@ -97,9 +97,7 @@ class MasteringStyleTransfer:
                target = reference_tensor
            else:
                target = ito_config['clap_text_prompt']
-            print(f'ito_config clap_distance_fn: {ito_config["clap_distance_fn"]}')
             total_loss = self.clap_loss(output_audio, target, self.args.sample_rate, distance_fn=ito_config['clap_distance_fn'])
-            print(f'total_loss: {total_loss}')

            if total_loss < min_loss:
                min_loss = total_loss.item()
@@ -122,6 +120,9 @@ class MasteringStyleTransfer:
            total_loss.backward()
            optimizer.step()

+            gc.collect()
+            torch.cuda.empty_cache()
+
         return all_results, min_loss_step

     def preprocess_audio(self, audio, target_sample_rate=44100, normalize=False):
@@ -290,7 +291,6 @@ class MasteringStyleTransfer:

         return "\n".join(output)

-
 def reload_weights(model, ckpt_path, device):
     checkpoint = torch.load(ckpt_path, map_location=device)

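Taken together, the inference.py changes make the reference embedding an explicit optimization variable (requires_grad=True), drop the debug prints, and release memory after every optimization step. The sketch below illustrates that inference-time-optimization pattern; it is an illustration only, assuming a generic model(input, embedding) callable and a 'num_steps' key in ito_config, neither of which is shown in this diff:

import gc
import torch

def optimize_reference_embedding(model, clap_loss, input_audio, target, ito_config, initial_embedding, sample_rate):
    # The embedding is the only trainable tensor; the mastering model itself stays fixed.
    fit_embedding = torch.nn.Parameter(initial_embedding.clone(), requires_grad=True)
    optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])

    min_loss, min_loss_step = float('inf'), 0
    for step in range(ito_config['num_steps']):           # 'num_steps' is assumed, not shown in the diff
        optimizer.zero_grad()
        output_audio = model(input_audio, fit_embedding)   # forward pass conditioned on the current embedding
        total_loss = clap_loss(output_audio, target, sample_rate,
                               distance_fn=ito_config['clap_distance_fn'])
        if total_loss.item() < min_loss:
            min_loss, min_loss_step = total_loss.item(), step
        total_loss.backward()
        optimizer.step()
        # As in the commit: reclaim Python garbage and cached CUDA memory after every step.
        gc.collect()
        torch.cuda.empty_cache()
    return fit_embedding.detach(), min_loss_step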
modules/__pycache__/loss.cpython-311.pyc CHANGED
Binary files a/modules/__pycache__/loss.cpython-311.pyc and b/modules/__pycache__/loss.cpython-311.pyc differ
modules/loss.py CHANGED

@@ -185,35 +185,26 @@ class CLAPFeatureLoss(nn.Module):
         self.target_sample_rate = 48000 # CLAP expects 48kHz audio
         self.model = laion_clap.CLAP_Module(enable_fusion=False)
         self.model.load_ckpt() # download the default pretrained checkpoint
-
-        # Freeze the CLAP model parameters
-        for param in self.model.parameters():
-            param.requires_grad = False
+        self.model.eval()

-    def forward(self, input_audio, target, sample_rate, distance_fn='cosine'):
+    def forward(self, input_audio, target, sample_rate, distance_fn='cosine'):
         # Process input audio
-
-        input_audio = self.preprocess_audio(input_audio, sample_rate)
-
-        with torch.enable_grad():
-            input_embed = self.model.get_audio_embedding_from_data(x=input_audio, use_tensor=True)
+        input_embed = self.process_audio(input_audio, sample_rate)

         # Process target (audio or text)
-
-
-
-
-
-
-        else:
-            raise ValueError("Target must be either audio tensor or text (string or list of strings)")
+        if isinstance(target, torch.Tensor):
+            target_embed = self.process_audio(target, sample_rate)
+        elif isinstance(target, str) or (isinstance(target, list) and isinstance(target[0], str)):
+            target_embed = self.process_text(target)
+        else:
+            raise ValueError("Target must be either audio tensor or text (string or list of strings)")

         # Compute loss using the specified distance function
         loss = self.compute_distance(input_embed, target_embed, distance_fn)

         return loss

-    def preprocess_audio(self, audio, sample_rate):
+    def process_audio(self, audio, sample_rate):
         # Ensure input is in the correct shape (N, C, T)
         if audio.dim() == 2:
             audio = audio.unsqueeze(1)
@@ -221,15 +212,22 @@ class CLAPFeatureLoss(nn.Module):
         # Convert to mono if stereo
         if audio.shape[1] > 1:
             audio = audio.mean(dim=1, keepdim=True)
-
         # Resample if necessary
         if sample_rate != self.target_sample_rate:
             audio = self.resample(audio, sample_rate)
-
-
-
-
-        return
+        audio = audio.squeeze(1)
+
+        # Get CLAP embeddings
+        embed = self.model.get_audio_embedding_from_data(x=audio, use_tensor=True)
+        return embed
+
+    def process_text(self, text):
+        # Get CLAP embeddings for text
+        # ensure input is a list of strings
+        if not isinstance(text, list):
+            text = [text]
+        embed = self.model.get_text_embedding(text, use_tensor=True)
+        return embed

     def compute_distance(self, x, y, distance_fn):
         if distance_fn == 'mse':
@@ -241,86 +239,12 @@ class CLAPFeatureLoss(nn.Module):
         else:
             raise ValueError(f"Unsupported distance function: {distance_fn}")

-    def quantize(self, audio):
-        audio = audio.squeeze(1) # Remove channel dimension
-        audio = torch.clamp(audio, -1.0, 1.0)
-        audio = (audio * 32767.0).to(torch.int16).to(torch.float32) / 32767.0
-        return audio
-
-    def resample(self, audio, orig_sample_rate):
+    def resample(self, audio, input_sample_rate):
         resampler = torchaudio.transforms.Resample(
-            orig_freq=orig_sample_rate, new_freq=self.target_sample_rate
+            orig_freq=input_sample_rate, new_freq=self.target_sample_rate
         ).to(audio.device)
         return resampler(audio)
-
-    # def forward(self, input_audio, target, sample_rate, distance_fn='cosine'):
-    #     # Process input audio
-    #     input_embed = self.process_audio(input_audio, sample_rate)
-
-    #     # Process target (audio or text)
-    #     if isinstance(target, torch.Tensor):
-    #         target_embed = self.process_audio(target, sample_rate)
-    #     elif isinstance(target, str) or (isinstance(target, list) and isinstance(target[0], str)):
-    #         target_embed = self.process_text(target)
-    #     else:
-    #         raise ValueError("Target must be either audio tensor or text (string or list of strings)")
-
-    #     # Compute loss using the specified distance function
-    #     loss = self.compute_distance(input_embed, target_embed, distance_fn)
-
-    #     return loss
-
-    # def process_audio(self, audio, sample_rate):
-    #     # Ensure input is in the correct shape (N, C, T)
-    #     if audio.dim() == 2:
-    #         audio = audio.unsqueeze(1)
-
-    #     # Convert to mono if stereo
-    #     if audio.shape[1] > 1:
-    #         audio = audio.mean(dim=1, keepdim=True)
-
-    #     # Resample if necessary
-    #     if sample_rate != self.target_sample_rate:
-    #         audio = self.resample(audio, sample_rate)
-
-    #     # Quantize audio data
-    #     audio = self.quantize(audio)
-
-    #     # Get CLAP embeddings
-    #     with torch.no_grad():
-    #         embed = self.model.get_audio_embedding_from_data(x=audio, use_tensor=True)
-    #     return embed
-
-    # def process_text(self, text):
-    #     # Get CLAP embeddings for text
-    #     # ensure input is a list of strings
-    #     if not isinstance(text, list):
-    #         text = [text]
-    #     with torch.no_grad():
-    #         embed = self.model.get_text_embedding(text, use_tensor=True)
-    #     return embed
-
-    # def compute_distance(self, x, y, distance_fn):
-    #     if distance_fn == 'mse':
-    #         return F.mse_loss(x, y)
-    #     elif distance_fn == 'l1':
-    #         return F.l1_loss(x, y)
-    #     elif distance_fn == 'cosine':
-    #         return 1 - F.cosine_similarity(x, y).mean()
-    #     else:
-    #         raise ValueError(f"Unsupported distance function: {distance_fn}")
-
-    # def quantize(self, audio):
-    #     audio = audio.squeeze(1) # Remove channel dimension
-    #     audio = torch.clamp(audio, -1.0, 1.0)
-    #     audio = (audio * 32767.0).to(torch.int16).to(torch.float32) / 32767.0
-    #     return audio
-
-    # def resample(self, audio, input_sample_rate):
-    #     resampler = torchaudio.transforms.Resample(
-    #         orig_freq=input_sample_rate, new_freq=self.target_sample_rate
-    #     ).to(audio.device)
-    #     return resampler(audio)
+


 """
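On the loss side, the commit activates the previously commented-out CLAPFeatureLoss implementation: the explicit parameter-freezing loop is replaced by self.model.eval(), the int16 quantization step is dropped, and the torch.no_grad()/torch.enable_grad() wrappers around the CLAP calls are removed, so gradients can propagate from the embedding distance back to the input audio. A minimal usage sketch, assuming CLAPFeatureLoss takes no required constructor arguments (its __init__ signature is not shown here) and using placeholder tensors and a placeholder text prompt:

import torch
from modules.loss import CLAPFeatureLoss  # module path as in this repository

clap_loss = CLAPFeatureLoss()              # loads the default pretrained LAION-CLAP checkpoint

sample_rate = 44100
audio = torch.randn(1, 2, sample_rate, requires_grad=True)   # placeholder (N, C, T) batch, 1 s of stereo
reference = torch.randn(1, 2, sample_rate)                    # placeholder audio target

# Audio target: both signals are downmixed to mono and resampled to 48 kHz internally.
loss_a = clap_loss(audio, reference, sample_rate, distance_fn='cosine')

# Text target: a prompt (or list of prompts) is embedded with the CLAP text encoder instead.
loss_t = clap_loss(audio, "a loud, warm master with punchy drums", sample_rate, distance_fn='cosine')

# 'mse' and 'l1' are the other distance functions handled by compute_distance.
loss_t.backward()  # gradients can reach `audio` since no torch.no_grad() wraps the CLAP calls anymore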