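"""Subtract one audio file from another with torchaudio.

Loads two audio files, converts them to a common sample rate and channel
count, trims them to the same length, computes the sample-wise difference,
and saves the result.
"""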
import argparse
from typing import Tuple

import torch
import torchaudio

def load_and_preprocess_audio(
    audio_path: str,
    target_sample_rate: int = 44100,
    target_channels: int = 1,
    verbose: bool = True
) -> torch.Tensor:
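    """Load an audio file and convert it to the target sample rate and channel count."""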
    # Load the audio file
    waveform, orig_sample_rate = torchaudio.load(audio_path)
    if verbose:
        print(f"Loaded audio: {audio_path}")
        print(f" Original shape: {waveform.shape}")
        print(f" Original sample rate: {orig_sample_rate} Hz")
        print(f" Duration: {waveform.shape[1] / orig_sample_rate:.2f} seconds")
        print(f" Target sample rate: {target_sample_rate} Hz")
        print(f" Target channels: {target_channels}")
    # Resample if the sample rates differ
    if orig_sample_rate != target_sample_rate:
        if verbose:
            print(f" Resampling: {orig_sample_rate} Hz -> {target_sample_rate} Hz")
        resampler = torchaudio.transforms.Resample(
            orig_sample_rate, target_sample_rate
        )
        waveform = resampler(waveform)
        if verbose:
            print(f" After resampling shape: {waveform.shape}")
            print(f" New duration: {waveform.shape[1] / target_sample_rate:.2f} seconds")
    # Channel conversion
    current_channels = waveform.shape[0]
    if current_channels > target_channels:
        if verbose:
            print(f" Downmixing: {current_channels} channels -> {target_channels} channel(s)")
            print(" Using mean averaging for downmixing")
        assert target_channels == 1, "Downmixing only supported to mono"
        waveform = waveform.mean(dim=0, keepdim=True)
        if verbose:
            print(f" After downmixing shape: {waveform.shape}")
    elif current_channels < target_channels:
        if verbose:
            print(f" Upmixing: {current_channels} channel(s) -> {target_channels} channels")
            print(" Repeating single channel data")
        assert waveform.shape[0] == 1, "Upmixing only supported from mono"
        waveform = waveform.repeat(target_channels, 1)
        if verbose:
            print(f" After upmixing shape: {waveform.shape}")
    else:
        if verbose:
            print(f" No channel conversion needed (already {target_channels} channels)")
    # Final summary
    if verbose:
        print(f" Final shape: {waveform.shape}")
        print(f" Final sample rate: {target_sample_rate} Hz")
        print(f" Final duration: {waveform.shape[1] / target_sample_rate:.2f} seconds")
        print("-" * 50)
    return waveform
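
# Example (hypothetical file name):
#   mono_16k = load_and_preprocess_audio("speech.wav", target_sample_rate=16000)
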
def align_audio_length(
    audio1: torch.Tensor,
    audio2: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
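    """Truncate both signals to the shorter of the two lengths so they can be subtracted sample by sample."""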
    min_length = min(audio1.shape[1], audio2.shape[1])
    audio1_aligned = audio1[:, :min_length]
    audio2_aligned = audio2[:, :min_length]
    return audio1_aligned, audio2_aligned

def subtract_audio_files(
    audio1_path: str,
    audio2_path: str,
    output_path: str,
    target_sample_rate: int = 44100,
    target_channels: int = 1,
    normalize_output: bool = True,
    verbose: bool = True
):
    """
    Load two audio files, subtract the second from the first, and save the result.

    Args:
        audio1_path: Path to the first audio file.
        audio2_path: Path to the second audio file.
        output_path: Path for the output audio file.
        target_sample_rate: Target sample rate.
        target_channels: Target number of channels.
        normalize_output: Whether to peak-normalize the output.
        verbose: Whether to print progress details.
    """
    # Load and preprocess both audio files
    audio1 = load_and_preprocess_audio(
        audio1_path, target_sample_rate, target_channels, verbose
    )
    audio2 = load_and_preprocess_audio(
        audio2_path, target_sample_rate, target_channels, verbose
    )
    # Align the audio lengths
    audio1, audio2 = align_audio_length(audio1, audio2)
    if verbose:
        print(f"Audio 1 shape after alignment: {audio1.shape}")
        print(f"Audio 2 shape after alignment: {audio2.shape}")
    # Subtract the second signal from the first, sample by sample
    result_audio = audio1 - audio2
    if verbose:
        print(f"Result audio shape: {result_audio.shape}")
        print(f"Result audio range: [{result_audio.min():.4f}, {result_audio.max():.4f}]")
    # Optional: peak-normalize the output to [-1, 1]
    if normalize_output:
        max_val = torch.max(torch.abs(result_audio))
        if max_val > 0:
            result_audio = result_audio / max_val
        if verbose:
            print(f"Normalized result audio range: [{result_audio.min():.4f}, {result_audio.max():.4f}]")
    # Save the result
    torchaudio.save(output_path, result_audio, target_sample_rate)
    if verbose:
        print(f"Result saved to: {output_path}")
        print(f"Sample rate: {target_sample_rate} Hz")
        print(f"Channels: {result_audio.shape[0]}")
        print(f"Duration: {result_audio.shape[1] / target_sample_rate:.2f} seconds")
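
# Example (hypothetical file names):
#   subtract_audio_files("mix.wav", "accompaniment.wav", "difference.wav")
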
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Subtract two audio files and save the result")
    parser.add_argument("audio1", help="First audio file (e.g., a.wav)")
    parser.add_argument("audio2", help="Second audio file (e.g., b.wav)")
    parser.add_argument("output", help="Output audio file (e.g., result.wav)")
    parser.add_argument("--sample_rate", type=int, default=44100,
                        help="Target sample rate (default: 44100)")
    parser.add_argument("--channels", type=int, default=1,
                        help="Target number of channels (default: 1)")
    parser.add_argument("--no_normalize", action="store_true",
                        help="Disable output normalization")
    args = parser.parse_args()
    subtract_audio_files(
        audio1_path=args.audio1,
        audio2_path=args.audio2,
        output_path=args.output,
        target_sample_rate=args.sample_rate,
        target_channels=args.channels,
        normalize_output=not args.no_normalize,
        verbose=True
    )
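
# Example invocation (hypothetical script and file names):
#   python subtract_audio.py a.wav b.wav result.wav --sample_rate 16000 --no_normalize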