| 
							 | 
						import io | 
					
					
						
						| 
							 | 
						import json | 
					
					
						
						| 
							 | 
						import os | 
					
					
						
						| 
							 | 
						import wave | 
					
					
						
						| 
							 | 
						from dataclasses import dataclass | 
					
					
						
						| 
							 | 
						from pathlib import Path | 
					
					
						
						| 
							 | 
						from typing import List, Mapping, Optional, Sequence, Union | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						import numpy as np | 
					
					
						
						| 
							 | 
						import onnxruntime | 
					
					
						
						| 
							 | 
						from espeak_phonemizer import Phonemizer | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
# Marker symbol placed at the front of every phoneme sequence before id lookup.
_BOS = "^"
# Marker symbol whose ids are appended once, after the last phoneme.
_EOS = "$"
# Marker symbol whose ids are inserted after each phoneme in the sequence.
_PAD = "_"
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@dataclass | 
					
					
						
						| 
							 | 
						class PiperConfig: | 
					
					
						
						| 
							 | 
						    num_symbols: int | 
					
					
						
						| 
							 | 
						    num_speakers: int | 
					
					
						
						| 
							 | 
						    sample_rate: int | 
					
					
						
						| 
							 | 
						    espeak_voice: str | 
					
					
						
						| 
							 | 
						    length_scale: float | 
					
					
						
						| 
							 | 
						    noise_scale: float | 
					
					
						
						| 
							 | 
						    noise_w: float | 
					
					
						
						| 
							 | 
						    phoneme_id_map: Mapping[str, Sequence[int]] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						class Piper: | 
					
					
						
						| 
							 | 
						    def __init__( | 
					
					
						
						| 
							 | 
						        self, | 
					
					
						
						| 
							 | 
						        model_path: Union[str, Path], | 
					
					
						
						| 
							 | 
						        config_path: Optional[Union[str, Path]] = None, | 
					
					
						
						| 
							 | 
						        use_cuda: bool = False, | 
					
					
						
						| 
							 | 
						    ): | 
					
					
						
						| 
							 | 
						        if config_path is None: | 
					
					
						
						| 
							 | 
						            config_path = f"{model_path}.json" | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        self.config = load_config(config_path) | 
					
					
						
						| 
							 | 
						        self.phonemizer = Phonemizer(self.config.espeak_voice) | 
					
					
						
						| 
							 | 
						        self.onnx_options = onnxruntime.SessionOptions() | 
					
					
						
						| 
							 | 
						        self.onnx_options.intra_op_num_threads = os.cpu_count() - 1 | 
					
					
						
						| 
							 | 
						        self.model = onnxruntime.InferenceSession( | 
					
					
						
						| 
							 | 
						            str(model_path), | 
					
					
						
						| 
							 | 
						            sess_options=self.onnx_options, | 
					
					
						
						| 
							 | 
						            providers=["CPUExecutionProvider"] | 
					
					
						
						| 
							 | 
						            if not use_cuda | 
					
					
						
						| 
							 | 
						            else ["CUDAExecutionProvider"], | 
					
					
						
						| 
							 | 
						        ) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def synthesize( | 
					
					
						
						| 
							 | 
						        self, | 
					
					
						
						| 
							 | 
						        text: str, | 
					
					
						
						| 
							 | 
						        speaker_id: Optional[int] = None, | 
					
					
						
						| 
							 | 
						        length_scale: Optional[float] = None, | 
					
					
						
						| 
							 | 
						        noise_scale: Optional[float] = None, | 
					
					
						
						| 
							 | 
						        noise_w: Optional[float] = None, | 
					
					
						
						| 
							 | 
						    ) -> bytes: | 
					
					
						
						| 
							 | 
						        """Synthesize WAV audio from text.""" | 
					
					
						
						| 
							 | 
						        if length_scale is None: | 
					
					
						
						| 
							 | 
						            length_scale = self.config.length_scale | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if noise_scale is None: | 
					
					
						
						| 
							 | 
						            noise_scale = self.config.noise_scale | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if noise_w is None: | 
					
					
						
						| 
							 | 
						            noise_w = self.config.noise_w | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        phonemes_str = self.phonemizer.phonemize(text, keep_clause_breakers=True) | 
					
					
						
						| 
							 | 
						        phonemes = [_BOS] + list(phonemes_str) | 
					
					
						
						| 
							 | 
						        phoneme_ids: List[int] = [] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        for phoneme in phonemes: | 
					
					
						
						| 
							 | 
						            phoneme_ids.extend(self.config.phoneme_id_map[phoneme]) | 
					
					
						
						| 
							 | 
						            phoneme_ids.extend(self.config.phoneme_id_map[_PAD]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        phoneme_ids.extend(self.config.phoneme_id_map[_EOS]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0) | 
					
					
						
						| 
							 | 
						        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64) | 
					
					
						
						| 
							 | 
						        scales = np.array( | 
					
					
						
						| 
							 | 
						            [noise_scale, length_scale, noise_w], | 
					
					
						
						| 
							 | 
						            dtype=np.float32, | 
					
					
						
						| 
							 | 
						        ) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if (self.config.num_speakers > 1) and (speaker_id is not None): | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            speaker_id = 0 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        sid = None | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if speaker_id is not None: | 
					
					
						
						| 
							 | 
						            sid = np.array([speaker_id], dtype=np.int64) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        audio = self.model.run( | 
					
					
						
						| 
							 | 
						            None, | 
					
					
						
						| 
							 | 
						            { | 
					
					
						
						| 
							 | 
						                "input": phoneme_ids_array, | 
					
					
						
						| 
							 | 
						                "input_lengths": phoneme_ids_lengths, | 
					
					
						
						| 
							 | 
						                "scales": scales, | 
					
					
						
						| 
							 | 
						                "sid": sid, | 
					
					
						
						| 
							 | 
						            }, | 
					
					
						
						| 
							 | 
						        )[0].squeeze((0, 1)) | 
					
					
						
						| 
							 | 
						        audio = audio_float_to_int16(audio.squeeze()) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        with io.BytesIO() as wav_io: | 
					
					
						
						| 
							 | 
						            wav_file: wave.Wave_write = wave.open(wav_io, "wb") | 
					
					
						
						| 
							 | 
						            with wav_file: | 
					
					
						
						| 
							 | 
						                wav_file.setframerate(self.config.sample_rate) | 
					
					
						
						| 
							 | 
						                wav_file.setsampwidth(2) | 
					
					
						
						| 
							 | 
						                wav_file.setnchannels(1) | 
					
					
						
						| 
							 | 
						                wav_file.writeframes(audio.tobytes()) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            return wav_io.getvalue() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def load_config(config_path: Union[str, Path]) -> PiperConfig: | 
					
					
						
						| 
							 | 
						    with open(config_path, "r", encoding="utf-8") as config_file: | 
					
					
						
						| 
							 | 
						        config_dict = json.load(config_file) | 
					
					
						
						| 
							 | 
						        inference = config_dict.get("inference", {}) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return PiperConfig( | 
					
					
						
						| 
							 | 
						            num_symbols=config_dict["num_symbols"], | 
					
					
						
						| 
							 | 
						            num_speakers=config_dict["num_speakers"], | 
					
					
						
						| 
							 | 
						            sample_rate=config_dict["audio"]["sample_rate"], | 
					
					
						
						| 
							 | 
						            espeak_voice=config_dict["espeak"]["voice"], | 
					
					
						
						| 
							 | 
						            noise_scale=inference.get("noise_scale", 0.667), | 
					
					
						
						| 
							 | 
						            length_scale=inference.get("length_scale", 1.0), | 
					
					
						
						| 
							 | 
						            noise_w=inference.get("noise_w", 0.8), | 
					
					
						
						| 
							 | 
						            phoneme_id_map=config_dict["phoneme_id_map"], | 
					
					
						
						| 
							 | 
						        ) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def audio_float_to_int16( | 
					
					
						
						| 
							 | 
						    audio: np.ndarray, max_wav_value: float = 32767.0 | 
					
					
						
						| 
							 | 
						) -> np.ndarray: | 
					
					
						
						| 
							 | 
						    """Normalize audio and convert to int16 range""" | 
					
					
						
						| 
							 | 
						    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio)))) | 
					
					
						
						| 
							 | 
						    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value) | 
					
					
						
						| 
							 | 
						    audio_norm = audio_norm.astype("int16") | 
					
					
						
						| 
							 | 
						    return audio_norm | 
					
					
						
						| 
							 | 
						
 |