# Spaces: chenxie95 / Language-Audio-Banquet
# Sleeping
# File size: 6,034 Bytes
# d572f56

import argparse
import torch
import torchaudio
from typing import Tuple
import numpy as np

def load_and_preprocess_audio(
    audio_path: str,
    target_sample_rate: int = 44100,
    target_channels: int = 1,
    verbose: bool = True
) -> torch.Tensor:
    """Load an audio file and convert it to the requested rate and channel count.

    The file is read with ``torchaudio.load``. If its sample rate differs from
    ``target_sample_rate`` it is resampled with
    ``torchaudio.transforms.Resample``. The channel count (first dimension of
    the waveform) is then adjusted: multi-channel audio is averaged down to
    mono, and mono audio is repeated up to ``target_channels``.

    Args:
        audio_path: Path of the audio file to load.
        target_sample_rate: Sample rate, in Hz, of the returned waveform.
        target_channels: Number of channels of the returned waveform.
        verbose: When true, print shape, sample rate and duration at each step.

    Returns:
        The processed waveform; dimension 0 is channels, dimension 1 is samples.

    Raises:
        ValueError: If the file has more channels than ``target_channels`` and
            ``target_channels`` is not 1, or if it has fewer channels than
            ``target_channels`` and is not mono.
    """
    # Load the audio.
    waveform, orig_sample_rate = torchaudio.load(audio_path)

    if verbose:
        print(f"Loaded audio: {audio_path}")
        print(f"  Original shape: {waveform.shape}")
        print(f"  Original sample rate: {orig_sample_rate} Hz")
        print(f"  Duration: {waveform.shape[1] / orig_sample_rate:.2f} seconds")
        print(f"  Target sample rate: {target_sample_rate} Hz")
        print(f"  Target channels: {target_channels}")

    # Resample only when the rates differ.
    if orig_sample_rate != target_sample_rate:
        if verbose:
            print(f"  Resampling: {orig_sample_rate} Hz -> {target_sample_rate} Hz")

        resampler = torchaudio.transforms.Resample(
            orig_sample_rate, target_sample_rate
        )
        waveform = resampler(waveform)

        if verbose:
            print(f"  After resampling shape: {waveform.shape}")
            print(f"  New duration: {waveform.shape[1] / target_sample_rate:.2f} seconds")

    # Channel conversion.
    current_channels = waveform.shape[0]

    if current_channels > target_channels:
        if verbose:
            print(f"  Downmixing: {current_channels} channels -> {target_channels} channel(s)")
            print("  Using mean averaging for downmixing")

        # Raised explicitly (not asserted) so the check survives ``python -O``;
        # otherwise an unsupported target would silently come back as mono.
        if target_channels != 1:
            raise ValueError("Downmixing only supported to mono")
        waveform = waveform.mean(dim=0, keepdim=True)

        if verbose:
            print(f"  After downmixing shape: {waveform.shape}")

    elif current_channels < target_channels:
        if verbose:
            print(f"  Upmixing: {current_channels} channel(s) -> {target_channels} channels")
            print("  Repeating single channel data")

        # Repeating a multi-channel waveform would yield
        # current_channels * target_channels channels, so only mono is accepted.
        if current_channels != 1:
            raise ValueError("Upmixing only supported from mono")
        waveform = waveform.repeat(target_channels, 1)

        if verbose:
            print(f"  After upmixing shape: {waveform.shape}")

    else:
        if verbose:
            print(f"  No channel conversion needed (already {target_channels} channels)")

    # Final summary.
    if verbose:
        print(f"  Final shape: {waveform.shape}")
        print(f"  Final sample rate: {target_sample_rate} Hz")
        print(f"  Final duration: {waveform.shape[1] / target_sample_rate:.2f} seconds")
        print("-" * 50)

    return waveform

def align_audio_length(
    audio1: torch.Tensor,
    audio2: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Trim two waveforms to the shorter one's length along dimension 1.

    Args:
        audio1: First waveform; dimension 1 is the one that gets trimmed.
        audio2: Second waveform, trimmed the same way.

    Returns:
        A ``(audio1, audio2)`` tuple in which both tensors keep only their
        leading samples, up to the smaller of the two dimension-1 sizes.
    """
    len_first, len_second = audio1.shape[1], audio2.shape[1]
    common = len_first if len_first <= len_second else len_second
    return audio1[:, :common], audio2[:, :common]

def subtract_audio_files(
    audio1_path: str,
    audio2_path: str,
    output_path: str,
    target_sample_rate: int = 44100,
    target_channels: int = 1,
    normalize_output: bool = True,
    verbose: bool = True
):
    """Read two audio files, subtract the second from the first, and save the result.

    Both inputs go through ``load_and_preprocess_audio`` with the same target
    sample rate and channel count, and are then trimmed to a common length
    with ``align_audio_length`` before being subtracted.

    Args:
        audio1_path: Path of the first audio file.
        audio2_path: Path of the second audio file (subtracted from the first).
        output_path: Path the resulting audio is written to.
        target_sample_rate: Sample rate used for both inputs and the output.
        target_channels: Channel count used for both inputs.
        normalize_output: Whether to scale the result by its peak absolute
            value; skipped when that peak is zero.
        verbose: Whether to print progress details.
    """
    # Load both inputs in argument order with identical preprocessing settings.
    minuend, subtrahend = (
        load_and_preprocess_audio(path, target_sample_rate, target_channels, verbose)
        for path in (audio1_path, audio2_path)
    )

    # Trim both waveforms to the same number of samples.
    minuend, subtrahend = align_audio_length(minuend, subtrahend)

    if verbose:
        print(f"Audio 1 shape after alignment: {minuend.shape}")
        print(f"Audio 2 shape after alignment: {subtrahend.shape}")

    # Sample-wise difference.
    difference = minuend - subtrahend

    if verbose:
        print(f"Result audio shape: {difference.shape}")
        print(f"Result audio range: [{difference.min():.4f}, {difference.max():.4f}]")

    # Optional peak normalization; the peak is only computed when requested.
    peak = difference.abs().max() if normalize_output else None
    if peak is not None and peak > 0:
        difference = difference / peak
        if verbose:
            print(f"Normalized result audio range: [{difference.min():.4f}, {difference.max():.4f}]")

    # Write the result.
    torchaudio.save(output_path, difference, target_sample_rate)

    if verbose:
        print(f"Result saved to: {output_path}")
        print(f"Sample rate: {target_sample_rate} Hz")
        print(f"Channels: {difference.shape[0]}")
        print(f"Duration: {difference.shape[1] / target_sample_rate:.2f} seconds")

def _positive_int(text: str) -> int:
    """argparse ``type`` callable: parse *text* as an integer that must be >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def main(argv=None) -> None:
    """Parse command-line arguments and run ``subtract_audio_files``.

    Args:
        argv: Argument strings to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Non-positive ``--sample_rate`` / ``--channels`` values are rejected by
    argparse with a usage error (exit status 2) instead of failing later
    inside the audio processing.
    """
    parser = argparse.ArgumentParser(description="Subtract two audio files and save the result")
    parser.add_argument("audio1", help="First audio file (e.g., a.wav)")
    parser.add_argument("audio2", help="Second audio file (e.g., b.wav)")
    parser.add_argument("output", help="Output audio file (e.g., result.wav)")
    parser.add_argument("--sample_rate", type=_positive_int, default=44100,
                        help="Target sample rate (default: 44100)")
    parser.add_argument("--channels", type=_positive_int, default=1,
                        help="Target number of channels (default: 1)")
    parser.add_argument("--no_normalize", action="store_true",
                        help="Disable output normalization")
    parser.add_argument("--quiet", action="store_true",
                        help="Suppress progress output")

    args = parser.parse_args(argv)

    subtract_audio_files(
        audio1_path=args.audio1,
        audio2_path=args.audio2,
        output_path=args.output,
        target_sample_rate=args.sample_rate,
        target_channels=args.channels,
        normalize_output=not args.no_normalize,
        verbose=not args.quiet,
    )


if __name__ == "__main__":
    main()