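"""Subtract one audio file from another with torchaudio.

Loads two audio files, converts them to a common sample rate and channel
count, trims them to the same length, computes the sample-wise difference,
and saves the result.
"""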
import argparse
from typing import Tuple

import torch
import torchaudio

def load_and_preprocess_audio(
    audio_path: str,
    target_sample_rate: int = 44100,
    target_channels: int = 1,
    verbose: bool = True
) -> torch.Tensor:
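    """Load an audio file and convert it to the target sample rate and channel count."""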
    # Load the audio file
    waveform, orig_sample_rate = torchaudio.load(audio_path)
    if verbose:
        print(f"Loaded audio: {audio_path}")
        print(f" Original shape: {waveform.shape}")
        print(f" Original sample rate: {orig_sample_rate} Hz")
        print(f" Duration: {waveform.shape[1] / orig_sample_rate:.2f} seconds")
        print(f" Target sample rate: {target_sample_rate} Hz")
        print(f" Target channels: {target_channels}")
    # Resample if the sample rates differ
    if orig_sample_rate != target_sample_rate:
        if verbose:
            print(f" Resampling: {orig_sample_rate} Hz -> {target_sample_rate} Hz")
        resampler = torchaudio.transforms.Resample(
            orig_sample_rate, target_sample_rate
        )
        waveform = resampler(waveform)
        if verbose:
            print(f" After resampling shape: {waveform.shape}")
            print(f" New duration: {waveform.shape[1] / target_sample_rate:.2f} seconds")
    # Channel conversion
    current_channels = waveform.shape[0]
    if current_channels > target_channels:
        if verbose:
            print(f" Downmixing: {current_channels} channels -> {target_channels} channel(s)")
            print(" Using mean averaging for downmixing")
        assert target_channels == 1, "Downmixing only supported to mono"
        waveform = waveform.mean(dim=0, keepdim=True)
        if verbose:
            print(f" After downmixing shape: {waveform.shape}")
    elif current_channels < target_channels:
        if verbose:
            print(f" Upmixing: {current_channels} channel(s) -> {target_channels} channels")
            print(" Repeating single channel data")
        assert waveform.shape[0] == 1, "Upmixing only supported from mono"
        waveform = waveform.repeat(target_channels, 1)
        if verbose:
            print(f" After upmixing shape: {waveform.shape}")
    else:
        if verbose:
            print(f" No channel conversion needed (already {target_channels} channels)")
    # Final summary
    if verbose:
        print(f" Final shape: {waveform.shape}")
        print(f" Final sample rate: {target_sample_rate} Hz")
        print(f" Final duration: {waveform.shape[1] / target_sample_rate:.2f} seconds")
        print("-" * 50)
    return waveform
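
# Example (hypothetical file name):
#   mono_16k = load_and_preprocess_audio("speech.wav", target_sample_rate=16000)
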
def align_audio_length(
    audio1: torch.Tensor,
    audio2: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
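    """Truncate both signals to the shorter of the two lengths so they can be subtracted sample by sample."""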
    min_length = min(audio1.shape[1], audio2.shape[1])
    audio1_aligned = audio1[:, :min_length]
    audio2_aligned = audio2[:, :min_length]
    return audio1_aligned, audio2_aligned

def subtract_audio_files(
    audio1_path: str,
    audio2_path: str,
    output_path: str,
    target_sample_rate: int = 44100,
    target_channels: int = 1,
    normalize_output: bool = True,
    verbose: bool = True
):
    """
    Load two audio files, subtract the second from the first, and save the result.

    Args:
        audio1_path: Path to the first audio file.
        audio2_path: Path to the second audio file.
        output_path: Path for the output audio file.
        target_sample_rate: Target sample rate.
        target_channels: Target number of channels.
        normalize_output: Whether to peak-normalize the output.
        verbose: Whether to print progress details.
    """
    # Load and preprocess both audio files
    audio1 = load_and_preprocess_audio(
        audio1_path, target_sample_rate, target_channels, verbose
    )
    audio2 = load_and_preprocess_audio(
        audio2_path, target_sample_rate, target_channels, verbose
    )
    # Align the audio lengths
    audio1, audio2 = align_audio_length(audio1, audio2)
    if verbose:
        print(f"Audio 1 shape after alignment: {audio1.shape}")
        print(f"Audio 2 shape after alignment: {audio2.shape}")
    # Subtract the second signal from the first, sample by sample
    result_audio = audio1 - audio2
    if verbose:
        print(f"Result audio shape: {result_audio.shape}")
        print(f"Result audio range: [{result_audio.min():.4f}, {result_audio.max():.4f}]")
    # Optional: peak-normalize the output to [-1, 1]
    if normalize_output:
        max_val = torch.max(torch.abs(result_audio))
        if max_val > 0:
            result_audio = result_audio / max_val
        if verbose:
            print(f"Normalized result audio range: [{result_audio.min():.4f}, {result_audio.max():.4f}]")
    # Save the result
    torchaudio.save(output_path, result_audio, target_sample_rate)
    if verbose:
        print(f"Result saved to: {output_path}")
        print(f"Sample rate: {target_sample_rate} Hz")
        print(f"Channels: {result_audio.shape[0]}")
        print(f"Duration: {result_audio.shape[1] / target_sample_rate:.2f} seconds")
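
# Example (hypothetical file names):
#   subtract_audio_files("mix.wav", "accompaniment.wav", "difference.wav")
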
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Subtract two audio files and save the result")
    parser.add_argument("audio1", help="First audio file (e.g., a.wav)")
    parser.add_argument("audio2", help="Second audio file (e.g., b.wav)")
    parser.add_argument("output", help="Output audio file (e.g., result.wav)")
    parser.add_argument("--sample_rate", type=int, default=44100,
                        help="Target sample rate (default: 44100)")
    parser.add_argument("--channels", type=int, default=1,
                        help="Target number of channels (default: 1)")
    parser.add_argument("--no_normalize", action="store_true",
                        help="Disable output normalization")
    args = parser.parse_args()
    subtract_audio_files(
        audio1_path=args.audio1,
        audio2_path=args.audio2,
        output_path=args.output,
        target_sample_rate=args.sample_rate,
        target_channels=args.channels,
        normalize_output=not args.no_normalize,
        verbose=True
    )
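
# Example invocation (hypothetical script and file names):
#   python subtract_audio.py a.wav b.wav result.wav --sample_rate 16000 --no_normalize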