import argparse
from typing import Tuple

import torch
import torchaudio


def load_and_preprocess_audio(
    audio_path: str,
    target_sample_rate: int = 44100,
    target_channels: int = 1,
    verbose: bool = True,
) -> torch.Tensor:
    """Load an audio file and convert it to the target sample rate and channel count."""
    # Load the audio file
    waveform, orig_sample_rate = torchaudio.load(audio_path)

    if verbose:
        print(f"Loaded audio: {audio_path}")
        print(f"  Original shape: {waveform.shape}")
        print(f"  Original sample rate: {orig_sample_rate} Hz")
        print(f"  Duration: {waveform.shape[1] / orig_sample_rate:.2f} seconds")
        print(f"  Target sample rate: {target_sample_rate} Hz")
        print(f"  Target channels: {target_channels}")

    # Resample if the source rate differs from the target rate
    if orig_sample_rate != target_sample_rate:
        if verbose:
            print(f"  Resampling: {orig_sample_rate} Hz -> {target_sample_rate} Hz")
        resampler = torchaudio.transforms.Resample(orig_sample_rate, target_sample_rate)
        waveform = resampler(waveform)
        if verbose:
            print(f"  After resampling shape: {waveform.shape}")
            print(f"  New duration: {waveform.shape[1] / target_sample_rate:.2f} seconds")

    # Channel conversion
    current_channels = waveform.shape[0]
    if current_channels > target_channels:
        if verbose:
            print(f"  Downmixing: {current_channels} channels -> {target_channels} channel(s)")
            print("  Using mean averaging for downmixing")
        assert target_channels == 1, "Downmixing only supported to mono"
        waveform = waveform.mean(dim=0, keepdim=True)
        if verbose:
            print(f"  After downmixing shape: {waveform.shape}")
    elif current_channels < target_channels:
        if verbose:
            print(f"  Upmixing: {current_channels} channel(s) -> {target_channels} channels")
            print("  Repeating single channel data")
        assert waveform.shape[0] == 1, "Upmixing only supported from mono"
        waveform = waveform.repeat(target_channels, 1)
        if verbose:
            print(f"  After upmixing shape: {waveform.shape}")
    else:
        if verbose:
            print(f"  No channel conversion needed (already {target_channels} channels)")

    # Final info
    if verbose:
        print(f"  Final shape: {waveform.shape}")
        print(f"  Final sample rate: {target_sample_rate} Hz")
        print(f"  Final duration: {waveform.shape[1] / target_sample_rate:.2f} seconds")
        print("-" * 50)

    return waveform


def align_audio_length(
    audio1: torch.Tensor,
    audio2: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Truncate both waveforms to the length of the shorter one."""
    min_length = min(audio1.shape[1], audio2.shape[1])
    audio1_aligned = audio1[:, :min_length]
    audio2_aligned = audio2[:, :min_length]
    return audio1_aligned, audio2_aligned


def subtract_audio_files(
    audio1_path: str,
    audio2_path: str,
    output_path: str,
    target_sample_rate: int = 44100,
    target_channels: int = 1,
    normalize_output: bool = True,
    verbose: bool = True,
):
    """
    Load two audio files, subtract the second from the first, and save the result.

    Args:
        audio1_path: Path to the first audio file
        audio2_path: Path to the second audio file
        output_path: Path for the output audio file
        target_sample_rate: Target sample rate in Hz
        target_channels: Target number of channels
        normalize_output: Whether to peak-normalize the output
        verbose: Whether to print progress details
    """
    # Load and preprocess both audio files
    audio1 = load_and_preprocess_audio(
        audio1_path, target_sample_rate, target_channels, verbose
    )
    audio2 = load_and_preprocess_audio(
        audio2_path, target_sample_rate, target_channels, verbose
    )

    # Align lengths by truncating to the shorter file
    audio1, audio2 = align_audio_length(audio1, audio2)

    if verbose:
        print(f"Audio 1 shape after alignment: {audio1.shape}")
        print(f"Audio 2 shape after alignment: {audio2.shape}")

    # Subtract the waveforms element-wise
    result_audio = audio1 - audio2

    if verbose:
        print(f"Result audio shape: {result_audio.shape}")
        print(f"Result audio range: [{result_audio.min():.4f}, {result_audio.max():.4f}]")

    # Optional: peak-normalize the output to [-1, 1]
    if normalize_output:
        max_val = torch.max(torch.abs(result_audio))
        if max_val > 0:
            result_audio = result_audio / max_val
            if verbose:
                print(f"Normalized result audio range: "
                      f"[{result_audio.min():.4f}, {result_audio.max():.4f}]")

    # Save the result
    torchaudio.save(output_path, result_audio, target_sample_rate)

    if verbose:
        print(f"Result saved to: {output_path}")
        print(f"Sample rate: {target_sample_rate} Hz")
        print(f"Channels: {result_audio.shape[0]}")
        print(f"Duration: {result_audio.shape[1] / target_sample_rate:.2f} seconds")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Subtract two audio files and save the result")
    parser.add_argument("audio1", help="First audio file (e.g., a.wav)")
    parser.add_argument("audio2", help="Second audio file (e.g., b.wav)")
    parser.add_argument("output", help="Output audio file (e.g., result.wav)")
    parser.add_argument("--sample_rate", type=int, default=44100, help="Target sample rate (default: 44100)")
    parser.add_argument("--channels", type=int, default=1, help="Target number of channels (default: 1)")
    parser.add_argument("--no_normalize", action="store_true", help="Disable output normalization")

    args = parser.parse_args()

    subtract_audio_files(
        audio1_path=args.audio1,
        audio2_path=args.audio2,
        output_path=args.output,
        target_sample_rate=args.sample_rate,
        target_channels=args.channels,
        normalize_output=not args.no_normalize,
        verbose=True,
    )