Spaces:

Flux9665
/

MassivelyMultilingualTTS

Running on T4

App Files Files

Flux9665 committed on May 15

Commit

5f0da2f

verified ·

1 Parent(s): 1d10354

overwrite some pitch values at the start and end to make it sound more lively

Browse files

Files changed (1) hide show

Modules/ToucanTTS/InferenceToucanTTS.py +32 -22

Modules/ToucanTTS/InferenceToucanTTS.py CHANGED Viewed

@@ -219,32 +219,42 @@ class ToucanTTS(torch.nn.Module):
         encoded_texts, _ = self.encoder(text_tensors, text_masks, utterance_embedding=utterance_embedding, lang_ids=lang_ids)
         # predicting pitch, energy and durations
-        reduced_pitch_space = torchfunc.dropout(self.pitch_latent_reduction(encoded_texts), p=0.1).transpose(1, 2)
         pitch_predictions = self.pitch_predictor(mu=reduced_pitch_space,
                                                  mask=text_masks.float(),
-                                                 n_timesteps=10,
                                                  temperature=prosody_creativity,
                                                  c=utterance_embedding) if gold_pitch is None else gold_pitch
         pitch_predictions = _scale_variance(pitch_predictions, pitch_variance_scale)
         embedded_pitch_curve = self.pitch_embed(pitch_predictions).transpose(1, 2)
-        reduced_energy_space = torchfunc.dropout(self.energy_latent_reduction(encoded_texts + embedded_pitch_curve), p=0.1).transpose(1, 2)
         energy_predictions = self.energy_predictor(mu=reduced_energy_space,
                                                    mask=text_masks.float(),
-                                                   n_timesteps=10,
                                                    temperature=prosody_creativity,
                                                    c=utterance_embedding) if gold_energy is None else gold_energy
         energy_predictions = _scale_variance(energy_predictions, energy_variance_scale)
         embedded_energy_curve = self.energy_embed(energy_predictions).transpose(1, 2)
-        reduced_duration_space = torchfunc.dropout(self.duration_latent_reduction(encoded_texts + embedded_pitch_curve + embedded_energy_curve), p=0.1).transpose(1, 2)
         predicted_durations = torch.clamp(torch.ceil(self.duration_predictor(mu=reduced_duration_space,
                                                                              mask=text_masks.float(),
-                                                                             n_timesteps=10,
                                                                              temperature=prosody_creativity,
-                                                                             c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations
         # modifying the predictions with control parameters
         for phoneme_index, phoneme_vector in enumerate(text_tensors.squeeze(0)):
             if phoneme_vector[get_feature_to_index_lookup()["word-boundary"]] == 1:
                 predicted_durations[0][phoneme_index] = 0
@@ -267,8 +277,8 @@ class ToucanTTS(torch.nn.Module):
         refined_codec_frames = self.flow_matching_decoder(mu=preliminary_spectrogram.transpose(1, 2),
                                                           mask=make_non_pad_mask([len(decoded_speech[0])], device=decoded_speech.device).unsqueeze(-2),
-                                                          n_timesteps=15,
-                                                          temperature=0.1,  # low temperature, so the model follows the specified prosody curves better.
                                                           c=None).transpose(1, 2)
         return refined_codec_frames, predicted_durations.squeeze(), pitch_predictions.squeeze(), energy_predictions.squeeze()
@@ -326,19 +336,19 @@ class ToucanTTS(torch.nn.Module):
             lang_id = lang_id.to(text.device)
         outs, \
-        predicted_durations, \
-        pitch_predictions, \
-        energy_predictions = self._forward(text.unsqueeze(0),
-                                           text_length,
-                                           gold_durations=durations,
-                                           gold_pitch=pitch,
-                                           gold_energy=energy,
-                                           utterance_embedding=utterance_embedding.unsqueeze(0) if utterance_embedding is not None else None, lang_ids=lang_id,
-                                           duration_scaling_factor=duration_scaling_factor,
-                                           pitch_variance_scale=pitch_variance_scale,
-                                           energy_variance_scale=energy_variance_scale,
-                                           pause_duration_scaling_factor=pause_duration_scaling_factor,
-                                           prosody_creativity=prosody_creativity)
         if return_duration_pitch_energy:
             return outs.squeeze().transpose(0, 1), predicted_durations, pitch_predictions, energy_predictions

         encoded_texts, _ = self.encoder(text_tensors, text_masks, utterance_embedding=utterance_embedding, lang_ids=lang_ids)
         # predicting pitch, energy and durations
+        reduced_pitch_space = self.pitch_latent_reduction(encoded_texts).transpose(1, 2)
         pitch_predictions = self.pitch_predictor(mu=reduced_pitch_space,
                                                  mask=text_masks.float(),
+                                                 n_timesteps=20,
                                                  temperature=prosody_creativity,
                                                  c=utterance_embedding) if gold_pitch is None else gold_pitch
+        # Because of the way we are processing the data, the last few elements of a sequence will always receive an unnaturally low pitch value. To fix this, we overwrite the last two elements with the third-to-last one; the first element is likewise replaced by its successor.
+        pitch_predictions[0][0][0] = pitch_predictions[0][0][1]
+        pitch_predictions[0][0][-1] = pitch_predictions[0][0][-3]
+        pitch_predictions[0][0][-2] = pitch_predictions[0][0][-3]
         pitch_predictions = _scale_variance(pitch_predictions, pitch_variance_scale)
         embedded_pitch_curve = self.pitch_embed(pitch_predictions).transpose(1, 2)
+        reduced_energy_space = self.energy_latent_reduction(encoded_texts + embedded_pitch_curve).transpose(1, 2)
         energy_predictions = self.energy_predictor(mu=reduced_energy_space,
                                                    mask=text_masks.float(),
+                                                   n_timesteps=20,
                                                    temperature=prosody_creativity,
                                                    c=utterance_embedding) if gold_energy is None else gold_energy
+        # Because of the way we are processing the data, the last few elements of a sequence will always receive an unnaturally low energy value. To fix this, we overwrite the last two elements with the third-to-last one; the first element is likewise replaced by its successor.
+        energy_predictions[0][0][0] = energy_predictions[0][0][1]
+        energy_predictions[0][0][-1] = energy_predictions[0][0][-3]
+        energy_predictions[0][0][-2] = energy_predictions[0][0][-3]
         energy_predictions = _scale_variance(energy_predictions, energy_variance_scale)
         embedded_energy_curve = self.energy_embed(energy_predictions).transpose(1, 2)
+        reduced_duration_space = self.duration_latent_reduction(encoded_texts + embedded_pitch_curve + embedded_energy_curve).transpose(1, 2)
         predicted_durations = torch.clamp(torch.ceil(self.duration_predictor(mu=reduced_duration_space,
                                                                              mask=text_masks.float(),
+                                                                             n_timesteps=20,
                                                                              temperature=prosody_creativity,
+                                                                             c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)
         # modifying the predictions with control parameters
+        predicted_durations[0][0] = 1 # if the initial pause is too long, we get artifacts. This is once more a dirty hack.
         for phoneme_index, phoneme_vector in enumerate(text_tensors.squeeze(0)):
             if phoneme_vector[get_feature_to_index_lookup()["word-boundary"]] == 1:
                 predicted_durations[0][phoneme_index] = 0
         refined_codec_frames = self.flow_matching_decoder(mu=preliminary_spectrogram.transpose(1, 2),
                                                           mask=make_non_pad_mask([len(decoded_speech[0])], device=decoded_speech.device).unsqueeze(-2),
+                                                          n_timesteps=30,
+                                                          temperature=0.2,  # low temperature, so the model follows the specified prosody curves better.
                                                           c=None).transpose(1, 2)
         return refined_codec_frames, predicted_durations.squeeze(), pitch_predictions.squeeze(), energy_predictions.squeeze()
             lang_id = lang_id.to(text.device)
         outs, \
+            predicted_durations, \
+            pitch_predictions, \
+            energy_predictions = self._forward(text.unsqueeze(0),
+                                               text_length,
+                                               gold_durations=durations,
+                                               gold_pitch=pitch,
+                                               gold_energy=energy,
+                                               utterance_embedding=utterance_embedding.unsqueeze(0) if utterance_embedding is not None else None, lang_ids=lang_id,
+                                               duration_scaling_factor=duration_scaling_factor,
+                                               pitch_variance_scale=pitch_variance_scale,
+                                               energy_variance_scale=energy_variance_scale,
+                                               pause_duration_scaling_factor=pause_duration_scaling_factor,
+                                               prosody_creativity=prosody_creativity)
         if return_duration_pitch_energy:
             return outs.squeeze().transpose(0, 1), predicted_durations, pitch_predictions, energy_predictions