Update openvoice/se_extractor.py
Browse files- openvoice/se_extractor.py +153 -153
    	
        openvoice/se_extractor.py
    CHANGED
    
    | @@ -1,153 +1,153 @@ | |
| 1 | 
            -
            import os
         | 
| 2 | 
            -
            import glob
         | 
| 3 | 
            -
            import torch
         | 
| 4 | 
            -
            import hashlib
         | 
| 5 | 
            -
            import librosa
         | 
| 6 | 
            -
            import base64
         | 
| 7 | 
            -
            from glob import glob
         | 
| 8 | 
            -
            import numpy as np
         | 
| 9 | 
            -
            from pydub import AudioSegment
         | 
| 10 | 
            -
            from faster_whisper import WhisperModel
         | 
| 11 | 
            -
            import hashlib
         | 
| 12 | 
            -
            import base64
         | 
| 13 | 
            -
            import librosa
         | 
| 14 | 
            -
            from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
         | 
| 15 | 
            -
             | 
| 16 | 
            -
            model_size = "medium"
         | 
| 17 | 
            -
            # Run on GPU with FP16
         | 
| 18 | 
            -
            model = None
         | 
| 19 | 
            -
            def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
         | 
| 20 | 
            -
                global model
         | 
| 21 | 
            -
                if model is None:
         | 
| 22 | 
            -
                    model = WhisperModel(model_size, device=" | 
| 23 | 
            -
                audio = AudioSegment.from_file(audio_path)
         | 
| 24 | 
            -
                max_len = len(audio)
         | 
| 25 | 
            -
             | 
| 26 | 
            -
                target_folder = os.path.join(target_dir, audio_name)
         | 
| 27 | 
            -
                
         | 
| 28 | 
            -
                segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
         | 
| 29 | 
            -
                segments = list(segments)    
         | 
| 30 | 
            -
             | 
| 31 | 
            -
                # create directory
         | 
| 32 | 
            -
                os.makedirs(target_folder, exist_ok=True)
         | 
| 33 | 
            -
                wavs_folder = os.path.join(target_folder, 'wavs')
         | 
| 34 | 
            -
                os.makedirs(wavs_folder, exist_ok=True)
         | 
| 35 | 
            -
             | 
| 36 | 
            -
                # segments
         | 
| 37 | 
            -
                s_ind = 0
         | 
| 38 | 
            -
                start_time = None
         | 
| 39 | 
            -
                
         | 
| 40 | 
            -
                for k, w in enumerate(segments):
         | 
| 41 | 
            -
                    # process with the time
         | 
| 42 | 
            -
                    if k == 0:
         | 
| 43 | 
            -
                        start_time = max(0, w.start)
         | 
| 44 | 
            -
             | 
| 45 | 
            -
                    end_time = w.end
         | 
| 46 | 
            -
             | 
| 47 | 
            -
                    # calculate confidence
         | 
| 48 | 
            -
                    if len(w.words) > 0:
         | 
| 49 | 
            -
                        confidence = sum([s.probability for s in w.words]) / len(w.words)
         | 
| 50 | 
            -
                    else:
         | 
| 51 | 
            -
                        confidence = 0.
         | 
| 52 | 
            -
                    # clean text
         | 
| 53 | 
            -
                    text = w.text.replace('...', '')
         | 
| 54 | 
            -
             | 
| 55 | 
            -
                    # left 0.08s for each audios
         | 
| 56 | 
            -
                    audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
         | 
| 57 | 
            -
             | 
| 58 | 
            -
                    # segment file name
         | 
| 59 | 
            -
                    fname = f"{audio_name}_seg{s_ind}.wav"
         | 
| 60 | 
            -
             | 
| 61 | 
            -
                    # filter out the segment shorter than 1.5s and longer than 20s
         | 
| 62 | 
            -
                    save = audio_seg.duration_seconds > 1.5 and \
         | 
| 63 | 
            -
                            audio_seg.duration_seconds < 20. and \
         | 
| 64 | 
            -
                            len(text) >= 2 and len(text) < 200 
         | 
| 65 | 
            -
             | 
| 66 | 
            -
                    if save:
         | 
| 67 | 
            -
                        output_file = os.path.join(wavs_folder, fname)
         | 
| 68 | 
            -
                        audio_seg.export(output_file, format='wav')
         | 
| 69 | 
            -
             | 
| 70 | 
            -
                    if k < len(segments) - 1:
         | 
| 71 | 
            -
                        start_time = max(0, segments[k+1].start - 0.08)
         | 
| 72 | 
            -
             | 
| 73 | 
            -
                    s_ind = s_ind + 1
         | 
| 74 | 
            -
                return wavs_folder
         | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
            def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
         | 
| 78 | 
            -
                SAMPLE_RATE = 16000
         | 
| 79 | 
            -
                audio_vad = get_audio_tensor(audio_path)
         | 
| 80 | 
            -
                segments = get_vad_segments(
         | 
| 81 | 
            -
                    audio_vad,
         | 
| 82 | 
            -
                    output_sample=True,
         | 
| 83 | 
            -
                    min_speech_duration=0.1,
         | 
| 84 | 
            -
                    min_silence_duration=1,
         | 
| 85 | 
            -
                    method="silero",
         | 
| 86 | 
            -
                )
         | 
| 87 | 
            -
                segments = [(seg["start"], seg["end"]) for seg in segments]
         | 
| 88 | 
            -
                segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
         | 
| 89 | 
            -
                print(segments)
         | 
| 90 | 
            -
                audio_active = AudioSegment.silent(duration=0)
         | 
| 91 | 
            -
                audio = AudioSegment.from_file(audio_path)
         | 
| 92 | 
            -
             | 
| 93 | 
            -
                for start_time, end_time in segments:
         | 
| 94 | 
            -
                    audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
         | 
| 95 | 
            -
                
         | 
| 96 | 
            -
                audio_dur = audio_active.duration_seconds
         | 
| 97 | 
            -
                print(f'after vad: dur = {audio_dur}')
         | 
| 98 | 
            -
                target_folder = os.path.join(target_dir, audio_name)
         | 
| 99 | 
            -
                wavs_folder = os.path.join(target_folder, 'wavs')
         | 
| 100 | 
            -
                os.makedirs(wavs_folder, exist_ok=True)
         | 
| 101 | 
            -
                start_time = 0.
         | 
| 102 | 
            -
                count = 0
         | 
| 103 | 
            -
                num_splits = int(np.round(audio_dur / split_seconds))
         | 
| 104 | 
            -
                assert num_splits > 0, 'input audio is too short'
         | 
| 105 | 
            -
                interval = audio_dur / num_splits
         | 
| 106 | 
            -
             | 
| 107 | 
            -
                for i in range(num_splits):
         | 
| 108 | 
            -
                    end_time = min(start_time + interval, audio_dur)
         | 
| 109 | 
            -
                    if i == num_splits - 1:
         | 
| 110 | 
            -
                        end_time = audio_dur
         | 
| 111 | 
            -
                    output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
         | 
| 112 | 
            -
                    audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
         | 
| 113 | 
            -
                    audio_seg.export(output_file, format='wav')
         | 
| 114 | 
            -
                    start_time = end_time
         | 
| 115 | 
            -
                    count += 1
         | 
| 116 | 
            -
                return wavs_folder
         | 
| 117 | 
            -
             | 
| 118 | 
            -
            def hash_numpy_array(audio_path):
         | 
| 119 | 
            -
                array, _ = librosa.load(audio_path, sr=None, mono=True)
         | 
| 120 | 
            -
                # Convert the array to bytes
         | 
| 121 | 
            -
                array_bytes = array.tobytes()
         | 
| 122 | 
            -
                # Calculate the hash of the array bytes
         | 
| 123 | 
            -
                hash_object = hashlib.sha256(array_bytes)
         | 
| 124 | 
            -
                hash_value = hash_object.digest()
         | 
| 125 | 
            -
                # Convert the hash value to base64
         | 
| 126 | 
            -
                base64_value = base64.b64encode(hash_value)
         | 
| 127 | 
            -
                return base64_value.decode('utf-8')[:16].replace('/', '_^')
         | 
| 128 | 
            -
             | 
| 129 | 
            -
            def get_se(audio_path, vc_model, target_dir='processed', vad=True):
         | 
| 130 | 
            -
                device = vc_model.device
         | 
| 131 | 
            -
                version = vc_model.version
         | 
| 132 | 
            -
                print("OpenVoice version:", version)
         | 
| 133 | 
            -
             | 
| 134 | 
            -
                audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
         | 
| 135 | 
            -
                se_path = os.path.join(target_dir, audio_name, 'se.pth')
         | 
| 136 | 
            -
             | 
| 137 | 
            -
                # if os.path.isfile(se_path):
         | 
| 138 | 
            -
                #     se = torch.load(se_path).to(device)
         | 
| 139 | 
            -
                #     return se, audio_name
         | 
| 140 | 
            -
                # if os.path.isdir(audio_path):
         | 
| 141 | 
            -
                #     wavs_folder = audio_path
         | 
| 142 | 
            -
                
         | 
| 143 | 
            -
                if vad:
         | 
| 144 | 
            -
                    wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
         | 
| 145 | 
            -
                else:
         | 
| 146 | 
            -
                    wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
         | 
| 147 | 
            -
                
         | 
| 148 | 
            -
                audio_segs = glob(f'{wavs_folder}/*.wav')
         | 
| 149 | 
            -
                if len(audio_segs) == 0:
         | 
| 150 | 
            -
                    raise NotImplementedError('No audio segments found!')
         | 
| 151 | 
            -
                
         | 
| 152 | 
            -
                return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
         | 
| 153 | 
            -
             | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            import glob
         | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import hashlib
         | 
| 5 | 
            +
            import librosa
         | 
| 6 | 
            +
            import base64
         | 
| 7 | 
            +
            from glob import glob
         | 
| 8 | 
            +
            import numpy as np
         | 
| 9 | 
            +
            from pydub import AudioSegment
         | 
| 10 | 
            +
            from faster_whisper import WhisperModel
         | 
| 11 | 
            +
            import hashlib
         | 
| 12 | 
            +
            import base64
         | 
| 13 | 
            +
            import librosa
         | 
| 14 | 
            +
            from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            model_size = "medium"
            # Lazily created WhisperModel shared by every call to split_audio_whisper().
            # It is loaded on CPU with int8 compute on first use and then reused.
            model = None


            def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
                """Cut an audio file into one WAV clip per Whisper transcription segment.

                The file is transcribed with the module-level faster-whisper model and
                every returned segment is sliced out of the audio. Clips are written to
                ``<target_dir>/<audio_name>/wavs/<audio_name>_seg<k>.wav`` where ``k`` is
                the position of the segment in the transcription. A clip is only written
                when it lasts more than 1.5 s and less than 20 s and its cleaned text has
                at least 2 and fewer than 200 characters, so the numbering of the saved
                files may contain gaps.

                Args:
                    audio_path: Path of the audio file to transcribe and split.
                    audio_name: Name used for the output sub-folder and clip prefix.
                    target_dir: Root directory under which the output folder is created.

                Returns:
                    Path of the ``wavs`` folder. The folder is created even when no
                    segment passes the filter.
                """
                global model
                if model is None:
                    model = WhisperModel(model_size, device="cpu", compute_type="int8")

                audio = AudioSegment.from_file(audio_path)
                # pydub lengths and slice indices are expressed in milliseconds.
                max_len = len(audio)

                segments, _ = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
                # Materialise the result so the next segment can be looked up by index.
                segments = list(segments)

                # makedirs also creates the intermediate <target_dir>/<audio_name> folder.
                wavs_folder = os.path.join(target_dir, audio_name, 'wavs')
                os.makedirs(wavs_folder, exist_ok=True)

                start_time = None
                for k, w in enumerate(segments):
                    if k == 0:
                        start_time = max(0, w.start)
                    end_time = w.end

                    # Drop ellipses before measuring the text length.
                    text = w.text.replace('...', '')

                    # Keep 80 ms of audio after the segment end, clamped to the file length.
                    audio_seg = audio[int(start_time * 1000):min(max_len, int(end_time * 1000) + 80)]

                    # Only keep clips longer than 1.5 s and shorter than 20 s whose
                    # text is neither (almost) empty nor very long.
                    keep = 1.5 < audio_seg.duration_seconds < 20. and 2 <= len(text) < 200
                    if keep:
                        output_file = os.path.join(wavs_folder, f"{audio_name}_seg{k}.wav")
                        audio_seg.export(output_file, format='wav')

                    if k < len(segments) - 1:
                        # The next clip starts 0.08 s before its segment begins.
                        start_time = max(0, segments[k + 1].start - 0.08)

                return wavs_folder
         | 
| 75 | 
            +
             | 
| 76 | 
            +
             | 
| 77 | 
            +
            def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
                """Remove non-speech audio with VAD and split the rest into even chunks.

                Speech regions reported by ``get_vad_segments`` are concatenated into one
                continuous track. That track is divided into
                ``round(duration / split_seconds)`` chunks of equal length, each written
                to ``<target_dir>/<audio_name>/wavs/<audio_name>_seg<i>.wav``.

                Args:
                    audio_path: Path of the audio file to process.
                    audio_name: Name used for the output sub-folder and chunk prefix.
                    target_dir: Root directory under which the output folder is created.
                    split_seconds: Target chunk length in seconds. The actual length is
                        the speech duration divided by the rounded number of chunks.

                Returns:
                    Path of the ``wavs`` folder containing the exported chunks.

                Raises:
                    AssertionError: If the detected speech is too short to form even one
                        chunk, i.e. it rounds to zero chunks of ``split_seconds``.
                """
                # VAD boundaries are returned as sample offsets (output_sample=True) and
                # converted to seconds with this rate.
                # NOTE(review): assumes get_audio_tensor yields 16 kHz audio - confirm.
                SAMPLE_RATE = 16000
                audio_vad = get_audio_tensor(audio_path)
                segments = get_vad_segments(
                    audio_vad,
                    output_sample=True,
                    min_speech_duration=0.1,
                    min_silence_duration=1,
                    method="silero",
                )
                segments = [
                    (float(seg["start"]) / SAMPLE_RATE, float(seg["end"]) / SAMPLE_RATE)
                    for seg in segments
                ]
                print(segments)

                # Concatenate only the speech regions; pydub slices in milliseconds.
                audio_active = AudioSegment.silent(duration=0)
                audio = AudioSegment.from_file(audio_path)
                for start_time, end_time in segments:
                    audio_active += audio[int(start_time * 1000):int(end_time * 1000)]

                audio_dur = audio_active.duration_seconds
                print(f'after vad: dur = {audio_dur}')

                wavs_folder = os.path.join(target_dir, audio_name, 'wavs')
                os.makedirs(wavs_folder, exist_ok=True)

                num_splits = int(np.round(audio_dur / split_seconds))
                if num_splits <= 0:
                    # Raised explicitly instead of through `assert` so the check still
                    # runs under `python -O`; exception type and message are unchanged.
                    raise AssertionError('input audio is too short')
                interval = audio_dur / num_splits

                start_time = 0.
                for i in range(num_splits):
                    if i == num_splits - 1:
                        # The last chunk absorbs any rounding remainder.
                        end_time = audio_dur
                    else:
                        end_time = min(start_time + interval, audio_dur)
                    output_file = f"{wavs_folder}/{audio_name}_seg{i}.wav"
                    audio_seg = audio_active[int(start_time * 1000):int(end_time * 1000)]
                    audio_seg.export(output_file, format='wav')
                    start_time = end_time

                return wavs_folder
         | 
| 117 | 
            +
             | 
| 118 | 
            +
            def hash_numpy_array(audio_path):
                """Return a short content fingerprint of the audio file at ``audio_path``.

                The file is decoded with librosa at its native sample rate and mixed down
                to mono. The raw bytes of the resulting sample array are hashed with
                SHA-256, and the digest is base64-encoded and cut to its first 16
                characters. Every ``/`` in that prefix is replaced by ``_^``, so the
                returned string may be longer than 16 characters.
                """
                samples, _ = librosa.load(audio_path, sr=None, mono=True)
                digest = hashlib.sha256(samples.tobytes()).digest()
                encoded = base64.b64encode(digest).decode('utf-8')
                # '/' would act as a path separator when the value is used in a folder name.
                return encoded[:16].replace('/', '_^')
         | 
| 128 | 
            +
             | 
| 129 | 
            +
            def get_se(audio_path, vc_model, target_dir='processed', vad=True):
                """Split a reference audio file and extract its speaker embedding.

                The audio is first cut into WAV segments, either with VAD-based splitting
                or with Whisper-based splitting. All resulting segments are then passed to
                ``vc_model.extract_se`` together with the path
                ``<target_dir>/<audio_name>/se.pth``.

                Args:
                    audio_path: Path of the reference audio file.
                    vc_model: Object providing a ``version`` attribute and an
                        ``extract_se(paths, se_save_path=...)`` method.
                    target_dir: Root directory for the segments and the ``se.pth`` path.
                    vad: Use ``split_audio_vad`` when true, ``split_audio_whisper``
                        otherwise.

                Returns:
                    A tuple ``(se, audio_name)``, where ``se`` is whatever
                    ``vc_model.extract_se`` returns and ``audio_name`` is the generated
                    folder name ``<file stem>_<version>_<content hash>``.

                Raises:
                    NotImplementedError: If splitting produced no ``.wav`` segments.
                """
                version = vc_model.version
                print("OpenVoice version:", version)

                # The name combines the file stem, the model version and a content hash.
                stem = os.path.basename(audio_path).rsplit('.', 1)[0]
                audio_name = f"{stem}_{version}_{hash_numpy_array(audio_path)}"
                se_path = os.path.join(target_dir, audio_name, 'se.pth')

                # NOTE: an existing se.pth is not reused here; the segments are split and
                # the embedding is extracted again on every call.
                splitter = split_audio_vad if vad else split_audio_whisper
                wavs_folder = splitter(audio_path, target_dir=target_dir, audio_name=audio_name)

                audio_segs = glob(f'{wavs_folder}/*.wav')
                if not audio_segs:
                    raise NotImplementedError('No audio segments found!')

                return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
         | 
| 153 | 
            +
             | 
