# EG/plugins/user/speech_recognition_synthesis/utils/audio_utils.py
"""
音频工具类
提供音频处理和文件操作功能
"""
import numpy as np
import wave
import struct
from typing import Dict, Any, List
import os
class AudioUtils:
"""
音频工具类
提供音频处理和文件操作功能
"""
def __init__(self, plugin):
"""
初始化音频工具类
Args:
plugin: 语音识别和合成插件实例
"""
self.plugin = plugin
self.sample_rate = plugin.config.get('sample_rate', 16000)
self.channels = plugin.config.get('channels', 1)
self.sample_width = 2 # 16位音频
print("✓ 音频工具类已创建")

    def save_audio_file(self, filename: str, audio_data: bytes) -> bool:
        """
        Save raw audio data to a file.

        Args:
            filename: file name
            audio_data: audio data

        Returns:
            Whether the save succeeded.
        """
        try:
            # Make sure the target directory exists
            os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
            # Write the audio data to the file
            with open(filename, 'wb') as f:
                f.write(audio_data)
            print(f"✓ Audio file saved: {filename}")
            return True
        except Exception as e:
            print(f"✗ Failed to save audio file: {e}")
            import traceback
            traceback.print_exc()
            return False

    def load_audio_file(self, filename: str) -> bytes:
        """
        Load raw audio data from a file.

        Args:
            filename: file name

        Returns:
            The audio data (empty bytes on failure).
        """
        try:
            with open(filename, 'rb') as f:
                audio_data = f.read()
            print(f"✓ Audio file loaded: {filename}")
            return audio_data
        except Exception as e:
            print(f"✗ Failed to load audio file: {e}")
            import traceback
            traceback.print_exc()
            return b''

    def save_wav_file(self, filename: str, audio_data: np.ndarray, sample_rate: int = None) -> bool:
        """
        Save a WAV audio file.

        Args:
            filename: file name
            audio_data: audio data as a numpy array of floats in [-1.0, 1.0]
            sample_rate: sample rate

        Returns:
            Whether the save succeeded.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Make sure the target directory exists
            os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
            # Keep the audio data within the legal range
            audio_data = np.clip(audio_data, -1.0, 1.0)
            # Convert to 16-bit integers
            audio_int16 = (audio_data * 32767).astype(np.int16)
            # Write out as a WAV file
            with wave.open(filename, 'wb') as wav_file:
                wav_file.setnchannels(self.channels)
                wav_file.setsampwidth(self.sample_width)  # 16 bits = 2 bytes
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(audio_int16.tobytes())
            print(f"✓ WAV file saved: {filename}")
            return True
        except Exception as e:
            print(f"✗ Failed to save WAV file: {e}")
            import traceback
            traceback.print_exc()
            return False
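
    # Usage sketch (hypothetical, not from the original source; assumes a
    # plugin object whose `config` is a plain dict). Floats in [-1, 1] are
    # scaled by 32767 on save and divided by 32768.0 on load, so a round
    # trip is lossy only at the quantization level:
    #
    #   utils = AudioUtils(plugin)
    #   tone = utils.generate_sine_wave(440.0, 1.0)   # 1 s, 440 Hz
    #   utils.save_wav_file("out/tone.wav", tone)
    #   info = utils.load_wav_file("out/tone.wav")
    #   assert abs(info['audio_data'][10] - tone[10]) < 1e-3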

    def load_wav_file(self, filename: str) -> Dict[str, Any]:
        """
        Load a WAV audio file.

        Args:
            filename: file name

        Returns:
            A dict with the audio data and its parameters (empty on failure).
        """
        try:
            with wave.open(filename, 'rb') as wav_file:
                # Read the audio parameters
                channels = wav_file.getnchannels()
                sample_width = wav_file.getsampwidth()
                sample_rate = wav_file.getframerate()
                frames = wav_file.getnframes()
                # Read the audio data
                audio_data = wav_file.readframes(frames)
            # Convert to a float32 numpy array in [-1.0, 1.0]
            if sample_width == 1:
                # 8-bit audio (unsigned)
                audio_array = np.frombuffer(audio_data, dtype=np.uint8)
                audio_array = (audio_array.astype(np.float32) - 128) / 128.0
            elif sample_width == 2:
                # 16-bit audio
                audio_array = np.frombuffer(audio_data, dtype=np.int16)
                audio_array = audio_array.astype(np.float32) / 32768.0
            elif sample_width == 4:
                # 32-bit audio
                audio_array = np.frombuffer(audio_data, dtype=np.int32)
                audio_array = audio_array.astype(np.float32) / 2147483648.0
            else:
                print(f"✗ Unsupported sample width: {sample_width}")
                return {}
            print(f"✓ WAV file loaded: {filename}")
            return {
                'audio_data': audio_array,
                'sample_rate': sample_rate,
                'channels': channels,
                'sample_width': sample_width,
                'frames': frames
            }
        except Exception as e:
            print(f"✗ Failed to load WAV file: {e}")
            import traceback
            traceback.print_exc()
            return {}

    def convert_audio_format(self, audio_data: np.ndarray, from_format: str, to_format: str) -> np.ndarray:
        """
        Convert the audio format.

        Args:
            audio_data: audio data
            from_format: source format
            to_format: target format

        Returns:
            The converted audio data.
        """
        try:
            # Simplified placeholder; a real project would need proper
            # format conversion here
            print(f"✓ Audio format converted: {from_format} -> {to_format}")
            return audio_data
        except Exception as e:
            print(f"✗ Audio format conversion failed: {e}")
            return audio_data

    def resample_audio(self, audio_data: np.ndarray, from_rate: int, to_rate: int) -> np.ndarray:
        """
        Resample audio data.

        Args:
            audio_data: audio data
            from_rate: source sample rate
            to_rate: target sample rate

        Returns:
            The resampled audio data.
        """
        try:
            if from_rate == to_rate:
                return audio_data
            # Compute the resampling ratio
            ratio = to_rate / from_rate
            new_length = int(len(audio_data) * ratio)
            # Simple linear interpolation (a real project should use a
            # higher-quality resampling algorithm)
            if new_length > 0:
                indices = np.linspace(0, len(audio_data) - 1, new_length)
                resampled_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
                print(f"✓ Audio resampled: {from_rate}Hz -> {to_rate}Hz")
                return resampled_data
            else:
                return np.array([], dtype=np.float32)
        except Exception as e:
            print(f"✗ Audio resampling failed: {e}")
            return audio_data
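
    # Worked example (illustrative numbers, not from the original source):
    # going from 8 kHz to 16 kHz doubles the sample count; np.interp reads
    # the signal at positions np.linspace(0, n - 1, 2 * n):
    #
    #   x = np.array([0.0, 1.0, 0.0, -1.0])
    #   y = utils.resample_audio(x, 8000, 16000)   # len(y) == 8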

    def apply_gain(self, audio_data: np.ndarray, gain_db: float) -> np.ndarray:
        """
        Apply gain to audio data.

        Args:
            audio_data: audio data
            gain_db: gain in decibels

        Returns:
            The audio data with gain applied.
        """
        try:
            # Convert decibels to linear gain
            gain_linear = 10 ** (gain_db / 20.0)
            amplified_data = audio_data * gain_linear
            # Prevent clipping
            amplified_data = np.clip(amplified_data, -1.0, 1.0)
            print(f"✓ Audio gain applied: {gain_db}dB")
            return amplified_data
        except Exception as e:
            print(f"✗ Failed to apply audio gain: {e}")
            return audio_data
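
    # The dB-to-linear conversion above is gain = 10 ** (dB / 20), so
    # (numbers for illustration) +6 dB scales the amplitude by about 1.995
    # and -20 dB by exactly 0.1:
    #
    #   utils.apply_gain(x, 6.0)    # x * 1.995..., clipped to [-1, 1]
    #   utils.apply_gain(x, -20.0)  # x * 0.1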

    def normalize_audio(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Normalize audio data.

        Args:
            audio_data: audio data

        Returns:
            The normalized audio data.
        """
        try:
            if len(audio_data) == 0:
                return audio_data
            # Find the peak amplitude
            max_amplitude = np.max(np.abs(audio_data))
            if max_amplitude > 0:
                normalized_data = audio_data / max_amplitude
                print("✓ Audio normalized")
                return normalized_data
            else:
                return audio_data
        except Exception as e:
            print(f"✗ Audio normalization failed: {e}")
            return audio_data

    def apply_fade_in(self, audio_data: np.ndarray, fade_duration: float, sample_rate: int = None) -> np.ndarray:
        """
        Apply a fade-in effect.

        Args:
            audio_data: audio data
            fade_duration: fade-in duration in seconds
            sample_rate: sample rate

        Returns:
            The audio data with the fade-in applied.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            fade_samples = int(fade_duration * sample_rate)
            if fade_samples <= 0 or fade_samples >= len(audio_data):
                return audio_data
            # Build the fade-in ramp
            fade_curve = np.linspace(0, 1, fade_samples)
            # Apply the fade-in
            faded_data = audio_data.copy()
            faded_data[:fade_samples] *= fade_curve
            print(f"✓ Fade-in applied: {fade_duration}s")
            return faded_data
        except Exception as e:
            print(f"✗ Failed to apply fade-in: {e}")
            return audio_data

    def apply_fade_out(self, audio_data: np.ndarray, fade_duration: float, sample_rate: int = None) -> np.ndarray:
        """
        Apply a fade-out effect.

        Args:
            audio_data: audio data
            fade_duration: fade-out duration in seconds
            sample_rate: sample rate

        Returns:
            The audio data with the fade-out applied.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            fade_samples = int(fade_duration * sample_rate)
            if fade_samples <= 0 or fade_samples >= len(audio_data):
                return audio_data
            # Build the fade-out ramp
            fade_curve = np.linspace(1, 0, fade_samples)
            # Apply the fade-out
            faded_data = audio_data.copy()
            faded_data[-fade_samples:] *= fade_curve
            print(f"✓ Fade-out applied: {fade_duration}s")
            return faded_data
        except Exception as e:
            print(f"✗ Failed to apply fade-out: {e}")
            return audio_data

    def concatenate_audio(self, audio_segments: List[np.ndarray]) -> np.ndarray:
        """
        Concatenate multiple audio segments.

        Args:
            audio_segments: list of audio segments

        Returns:
            The concatenated audio data.
        """
        try:
            if not audio_segments:
                return np.array([], dtype=np.float32)
            # Join all segments end to end
            concatenated = np.concatenate(audio_segments)
            print(f"✓ Concatenated {len(audio_segments)} audio segments")
            return concatenated
        except Exception as e:
            print(f"✗ Failed to concatenate audio segments: {e}")
            return np.array([], dtype=np.float32)

    def split_audio(self, audio_data: np.ndarray, segment_duration: float, sample_rate: int = None) -> List[np.ndarray]:
        """
        Split audio data into segments.

        Args:
            audio_data: audio data
            segment_duration: segment duration in seconds
            sample_rate: sample rate

        Returns:
            A list of audio segments.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            segment_samples = int(segment_duration * sample_rate)
            if segment_samples <= 0:
                return [audio_data]
            # Slice into fixed-size chunks (the last one may be shorter)
            segments = []
            for i in range(0, len(audio_data), segment_samples):
                segment = audio_data[i:i + segment_samples]
                segments.append(segment)
            print(f"✓ Audio split into {len(segments)} segments")
            return segments
        except Exception as e:
            print(f"✗ Failed to split audio: {e}")
            return [audio_data]

    def calculate_audio_level(self, audio_data: np.ndarray) -> float:
        """
        Calculate the audio level (RMS).

        Args:
            audio_data: audio data

        Returns:
            The audio level (0.0-1.0).
        """
        try:
            if len(audio_data) == 0:
                return 0.0
            # Root-mean-square of the samples
            rms = np.sqrt(np.mean(audio_data ** 2))
            return float(rms)
        except Exception as e:
            print(f"✗ Failed to calculate audio level: {e}")
            return 0.0
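
    # Sanity check (illustrative): a full-scale sine wave has an RMS of
    # 1/sqrt(2) ~= 0.707, while silence is 0:
    #
    #   utils.calculate_audio_level(np.sin(np.linspace(0, 200 * np.pi, 16000)))
    #   # -> ~0.707
    #   utils.calculate_audio_level(np.zeros(16000))   # -> 0.0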

    def detect_silence(self, audio_data: np.ndarray, threshold: float = 0.01,
                       min_silence_duration: float = 0.1, sample_rate: int = None) -> List[Dict[str, float]]:
        """
        Detect silent segments in the audio.

        Args:
            audio_data: audio data
            threshold: silence threshold
            min_silence_duration: minimum silence duration in seconds
            sample_rate: sample rate

        Returns:
            A list of silent segments: [{'start': start_time, 'end': end_time}]
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            min_silence_samples = int(min_silence_duration * sample_rate)
            silence_segments = []
            # Simplified silence detection by absolute amplitude
            is_silent = np.abs(audio_data) < threshold
            in_silence = False
            silence_start = 0
            for i in range(len(is_silent)):
                if is_silent[i] and not in_silence:
                    # A silent segment starts
                    in_silence = True
                    silence_start = i
                elif not is_silent[i] and in_silence:
                    # The silent segment ends
                    in_silence = False
                    silence_duration = i - silence_start
                    if silence_duration >= min_silence_samples:
                        silence_segments.append({
                            'start': silence_start / sample_rate,
                            'end': i / sample_rate
                        })
            # Handle a trailing silent segment
            if in_silence:
                silence_duration = len(audio_data) - silence_start
                if silence_duration >= min_silence_samples:
                    silence_segments.append({
                        'start': silence_start / sample_rate,
                        'end': len(audio_data) / sample_rate
                    })
            print(f"✓ Detected {len(silence_segments)} silent segments")
            return silence_segments
        except Exception as e:
            print(f"✗ Silence detection failed: {e}")
            return []

    def remove_silence(self, audio_data: np.ndarray, threshold: float = 0.01,
                       min_silence_duration: float = 0.1, sample_rate: int = None) -> np.ndarray:
        """
        Remove silent segments from the audio.

        Args:
            audio_data: audio data
            threshold: silence threshold
            min_silence_duration: minimum silence duration in seconds
            sample_rate: sample rate

        Returns:
            The audio data with silence removed.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Find the silent segments first
            silence_segments = self.detect_silence(audio_data, threshold, min_silence_duration, sample_rate)
            if not silence_segments:
                return audio_data
            # Cut them out back to front, so earlier indices stay valid
            cleaned_audio = audio_data.copy()
            for segment in reversed(silence_segments):
                start_sample = int(segment['start'] * sample_rate)
                end_sample = int(segment['end'] * sample_rate)
                cleaned_audio = np.concatenate([
                    cleaned_audio[:start_sample],
                    cleaned_audio[end_sample:]
                ])
            print(f"✓ Removed {len(silence_segments)} silent segments")
            return cleaned_audio
        except Exception as e:
            print(f"✗ Failed to remove silence: {e}")
            return audio_data

    def apply_high_pass_filter(self, audio_data: np.ndarray, cutoff_freq: float,
                               sample_rate: int = None) -> np.ndarray:
        """
        Apply a high-pass filter.

        Args:
            audio_data: audio data
            cutoff_freq: cutoff frequency in Hz
            sample_rate: sample rate

        Returns:
            The filtered audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Simplified first-order RC high-pass; a real project should
            # use a proper filter design
            rc = 1.0 / (2 * np.pi * cutoff_freq)
            dt = 1.0 / sample_rate
            alpha = rc / (rc + dt)
            filtered_data = np.zeros_like(audio_data)
            for i in range(1, len(audio_data)):
                filtered_data[i] = alpha * (filtered_data[i - 1] + audio_data[i] - audio_data[i - 1])
            print(f"✓ High-pass filter applied: {cutoff_freq}Hz")
            return filtered_data
        except Exception as e:
            print(f"✗ Failed to apply high-pass filter: {e}")
            return audio_data
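
    # The recurrence above is the discrete first-order RC high-pass
    # y[i] = alpha * (y[i-1] + x[i] - x[i-1]) with alpha = RC / (RC + dt).
    # For illustration, at sample_rate = 16000 and cutoff_freq = 100 Hz:
    # RC = 1 / (2*pi*100) ~= 1.59e-3, dt = 6.25e-5, so alpha ~= 0.962.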

    def apply_low_pass_filter(self, audio_data: np.ndarray, cutoff_freq: float,
                              sample_rate: int = None) -> np.ndarray:
        """
        Apply a low-pass filter.

        Args:
            audio_data: audio data
            cutoff_freq: cutoff frequency in Hz
            sample_rate: sample rate

        Returns:
            The filtered audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Simplified first-order RC low-pass:
            # y[i] = alpha * x[i] + (1 - alpha) * y[i-1], alpha = dt / (RC + dt)
            rc = 1.0 / (2 * np.pi * cutoff_freq)
            dt = 1.0 / sample_rate
            alpha = dt / (rc + dt)
            filtered_data = np.zeros_like(audio_data)
            filtered_data[0] = audio_data[0]
            for i in range(1, len(audio_data)):
                filtered_data[i] = alpha * audio_data[i] + (1 - alpha) * filtered_data[i - 1]
            print(f"✓ Low-pass filter applied: {cutoff_freq}Hz")
            return filtered_data
        except Exception as e:
            print(f"✗ Failed to apply low-pass filter: {e}")
            return audio_data

    def generate_sine_wave(self, frequency: float, duration: float,
                           amplitude: float = 1.0, sample_rate: int = None) -> np.ndarray:
        """
        Generate a sine-wave signal.

        Args:
            frequency: frequency in Hz
            duration: duration in seconds
            amplitude: amplitude (0.0-1.0)
            sample_rate: sample rate

        Returns:
            The sine-wave audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Build the time axis
            t = np.linspace(0, duration, int(sample_rate * duration), False)
            # Generate the sine wave
            sine_wave = amplitude * np.sin(2 * np.pi * frequency * t)
            print(f"✓ Sine wave generated: {frequency}Hz, {duration}s")
            return sine_wave
        except Exception as e:
            print(f"✗ Failed to generate sine wave: {e}")
            return np.array([], dtype=np.float32)

    def generate_white_noise(self, duration: float, amplitude: float = 0.1,
                             sample_rate: int = None) -> np.ndarray:
        """
        Generate white noise.

        Args:
            duration: duration in seconds
            amplitude: amplitude (0.0-1.0)
            sample_rate: sample rate

        Returns:
            The white-noise audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Number of samples to generate
            num_samples = int(sample_rate * duration)
            # Uniform white noise in [-amplitude, amplitude]
            white_noise = amplitude * (np.random.random(num_samples) * 2 - 1)
            print(f"✓ White noise generated: {duration}s")
            return white_noise
        except Exception as e:
            print(f"✗ Failed to generate white noise: {e}")
            return np.array([], dtype=np.float32)

    def mix_audio_signals(self, audio1: np.ndarray, audio2: np.ndarray,
                          ratio: float = 0.5) -> np.ndarray:
        """
        Mix two audio signals.

        Args:
            audio1: first audio signal
            audio2: second audio signal
            ratio: mix ratio (0.0-1.0; 0 is all audio1, 1 is all audio2)

        Returns:
            The mixed audio signal.
        """
        try:
            # Pad the shorter signal so both have the same length
            max_length = max(len(audio1), len(audio2))
            if len(audio1) < max_length:
                audio1 = np.pad(audio1, (0, max_length - len(audio1)))
            if len(audio2) < max_length:
                audio2 = np.pad(audio2, (0, max_length - len(audio2)))
            # Crossfade mix
            mixed_audio = (1 - ratio) * audio1 + ratio * audio2
            # Prevent clipping
            mixed_audio = np.clip(mixed_audio, -1.0, 1.0)
            print("✓ Audio signals mixed")
            return mixed_audio
        except Exception as e:
            print(f"✗ Failed to mix audio signals: {e}")
            # Return the longer of the two signals
            return audio1 if len(audio1) >= len(audio2) else audio2

    def calculate_audio_duration(self, audio_data: np.ndarray, sample_rate: int = None) -> float:
        """
        Calculate the audio duration.

        Args:
            audio_data: audio data
            sample_rate: sample rate

        Returns:
            The duration in seconds.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            if len(audio_data) == 0:
                return 0.0
            duration = len(audio_data) / sample_rate
            return duration
        except Exception as e:
            print(f"✗ Failed to calculate audio duration: {e}")
            return 0.0

    def convert_to_mono(self, audio_data: np.ndarray, channels: int = None) -> np.ndarray:
        """
        Convert audio to mono.

        Args:
            audio_data: audio data
            channels: channel count

        Returns:
            The mono audio data.
        """
        try:
            if channels is None:
                channels = self.channels
            if channels == 1:
                return audio_data
            # The samples are assumed to be interleaved
            if len(audio_data) % channels != 0:
                print("✗ Audio length does not match the channel count")
                return audio_data
            # Reshape to (frames, channels) and average across channels
            reshaped = audio_data.reshape(-1, channels)
            mono_audio = np.mean(reshaped, axis=1)
            print("✓ Audio converted to mono")
            return mono_audio
        except Exception as e:
            print(f"✗ Failed to convert to mono: {e}")
            return audio_data

    def convert_to_stereo(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Convert to stereo (duplicate the mono signal into both channels).

        Args:
            audio_data: mono audio data

        Returns:
            The stereo audio data (interleaved).
        """
        try:
            # Copy the mono samples into both channels, interleaved L/R
            stereo_audio = np.tile(audio_data.reshape(-1, 1), (1, 2))
            stereo_audio = stereo_audio.flatten()
            print("✓ Audio converted to stereo")
            return stereo_audio
        except Exception as e:
            print(f"✗ Failed to convert to stereo: {e}")
            return audio_data

    def apply_compressor(self, audio_data: np.ndarray, threshold: float = -20.0,
                         ratio: float = 4.0, attack: float = 0.01, release: float = 0.1) -> np.ndarray:
        """
        Apply a compressor effect.

        Args:
            audio_data: audio data
            threshold: threshold in dB
            ratio: compression ratio
            attack: attack time in seconds
            release: release time in seconds

        Returns:
            The compressed audio data.
        """
        try:
            # Simplified compressor; the envelope follower uses a fixed
            # 0.99 decay, so the attack/release arguments are not used here
            threshold_linear = 10 ** (threshold / 20.0)
            # Compute the per-sample gain reduction
            gain_reduction = np.zeros_like(audio_data)
            envelope = 0.0
            for i in range(len(audio_data)):
                # Track the signal envelope
                envelope = max(abs(audio_data[i]), envelope * 0.99)
                # Reduce gain only above the threshold
                if envelope > threshold_linear:
                    gain_db = 20 * np.log10(envelope / threshold_linear)
                    reduction_db = gain_db - gain_db / ratio
                    gain_reduction[i] = 10 ** (-reduction_db / 20.0)
                else:
                    gain_reduction[i] = 1.0
            # Apply the gain reduction
            compressed_audio = audio_data * gain_reduction
            print("✓ Compressor applied")
            return compressed_audio
        except Exception as e:
            print(f"✗ Failed to apply compressor: {e}")
            return audio_data
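
    # Worked example (illustrative): with threshold = -20 dB and ratio = 4,
    # a signal whose envelope sits at -8 dB is 12 dB over the threshold.
    # The compressor keeps 1/ratio of the overshoot (12 / 4 = 3 dB), so it
    # reduces the gain by 12 - 3 = 9 dB, i.e. a linear gain of
    # 10 ** (-9 / 20) ~= 0.355.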

    def apply_limiter(self, audio_data: np.ndarray, threshold: float = -0.1) -> np.ndarray:
        """
        Apply a limiter effect.

        Args:
            audio_data: audio data
            threshold: threshold in dB relative to full scale

        Returns:
            The limited audio data.
        """
        try:
            # Convert the threshold to a linear value
            threshold_linear = 10 ** (threshold / 20.0)
            # Hard-clip everything beyond the threshold
            limited_audio = np.clip(audio_data, -threshold_linear, threshold_linear)
            print(f"✓ Limiter applied: {threshold}dB")
            return limited_audio
        except Exception as e:
            print(f"✗ Failed to apply limiter: {e}")
            return audio_data

    def apply_expander(self, audio_data: np.ndarray, threshold: float = -40.0,
                       ratio: float = 2.0, attack: float = 0.01, release: float = 0.1) -> np.ndarray:
        """
        Apply an expander effect.

        Args:
            audio_data: audio data
            threshold: threshold in dB
            ratio: expansion ratio
            attack: attack time in seconds
            release: release time in seconds

        Returns:
            The expanded audio data.
        """
        try:
            # Simplified expander; like the compressor above, the
            # attack/release arguments are not used by the fixed-decay
            # envelope follower
            threshold_linear = 10 ** (threshold / 20.0)
            # Compute the per-sample gain
            gain_expansion = np.zeros_like(audio_data)
            envelope = 0.0
            for i in range(len(audio_data)):
                # Track the signal envelope
                envelope = max(abs(audio_data[i]), envelope * 0.99)
                # Attenuate only below the threshold
                if envelope < threshold_linear:
                    # The epsilon guards against log10(0) for a zero envelope
                    gain_db = 20 * np.log10(max(envelope, 1e-10) / threshold_linear)
                    expansion_db = gain_db * (1 - 1 / ratio)
                    gain_expansion[i] = 10 ** (expansion_db / 20.0)
                else:
                    gain_expansion[i] = 1.0
            # Apply the expansion gain
            expanded_audio = audio_data * gain_expansion
            print("✓ Expander applied")
            return expanded_audio
        except Exception as e:
            print(f"✗ Failed to apply expander: {e}")
            return audio_data

    def apply_deesser(self, audio_data: np.ndarray, threshold: float = -30.0,
                      frequency: float = 5000.0, sample_rate: int = None) -> np.ndarray:
        """
        Apply a de-esser effect.

        Args:
            audio_data: audio data
            threshold: threshold in dB
            frequency: processing frequency in Hz
            sample_rate: sample rate

        Returns:
            The processed audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Simplified de-esser; a real implementation would use proper
            # frequency-domain processing
            # High-pass to isolate the high-frequency (sibilant) content
            filtered_audio = self.apply_high_pass_filter(audio_data, frequency, sample_rate)
            # Measure the high-frequency energy
            threshold_linear = 10 ** (threshold / 20.0)
            high_energy = np.abs(filtered_audio)
            # Reduce gain wherever the energy exceeds the threshold
            reduction_factor = np.ones_like(high_energy)
            mask = high_energy > threshold_linear
            reduction_factor[mask] = threshold_linear / high_energy[mask]
            # Apply the reduction to the full-band signal
            deessed_audio = audio_data * reduction_factor
            print(f"✓ De-esser applied: {frequency}Hz")
            return deessed_audio
        except Exception as e:
            print(f"✗ Failed to apply de-esser: {e}")
            return audio_data

    def apply_noise_gate(self, audio_data: np.ndarray, threshold: float = -40.0,
                         attack: float = 0.001, release: float = 0.01, sample_rate: int = None) -> np.ndarray:
        """
        Apply a noise-gate effect.

        Args:
            audio_data: audio data
            threshold: threshold in dB
            attack: attack time in seconds
            release: release time in seconds
            sample_rate: sample rate

        Returns:
            The processed audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            threshold_linear = 10 ** (threshold / 20.0)
            # Per-sample envelope
            envelope = np.abs(audio_data)
            # Simplified noise gate: open where the envelope exceeds the threshold
            gate_open = envelope > threshold_linear
            smoothed_gate = np.zeros_like(gate_open, dtype=np.float32)
            # Smooth the gate with attack/release time constants
            attack_coeff = np.exp(-1.0 / (sample_rate * attack))
            release_coeff = np.exp(-1.0 / (sample_rate * release))
            gate_state = 0.0
            for i in range(len(gate_open)):
                if gate_open[i]:
                    gate_state = attack_coeff * gate_state + (1 - attack_coeff)
                else:
                    gate_state = release_coeff * gate_state
                smoothed_gate[i] = gate_state
            # Apply the smoothed gate
            gated_audio = audio_data * smoothed_gate
            print("✓ Noise gate applied")
            return gated_audio
        except Exception as e:
            print(f"✗ Failed to apply noise gate: {e}")
            return audio_data

    def apply_eq_filter(self, audio_data: np.ndarray, frequencies: List[float],
                        gains: List[float], sample_rate: int = None) -> np.ndarray:
        """
        Apply an equalizer filter.

        Args:
            audio_data: audio data
            frequencies: list of band frequencies in Hz
            gains: list of band gains in dB
            sample_rate: sample rate

        Returns:
            The filtered audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Very simplified equalizer
            eq_audio = audio_data.copy()
            # Filter each band in turn
            for freq, gain in zip(frequencies, gains):
                if gain > 0:
                    # Emphasize the band with a low-pass above it
                    if freq < sample_rate / 4:
                        eq_audio = self.apply_low_pass_filter(eq_audio, freq * 2, sample_rate)
                elif gain < 0:
                    # Attenuate the band with a high-pass below it
                    if freq > 100:
                        eq_audio = self.apply_high_pass_filter(eq_audio, freq / 2, sample_rate)
            # Apply the average gain overall
            total_gain = 10 ** (sum(gains) / len(gains) / 20.0)
            eq_audio = eq_audio * total_gain
            eq_audio = np.clip(eq_audio, -1.0, 1.0)
            print(f"✓ Equalizer applied: {len(frequencies)} bands")
            return eq_audio
        except Exception as e:
            print(f"✗ Failed to apply equalizer: {e}")
            return audio_data

    def apply_reverb(self, audio_data: np.ndarray, room_size: float = 0.5,
                     damping: float = 0.5, wet_level: float = 0.33,
                     sample_rate: int = None) -> np.ndarray:
        """
        Apply a reverb effect.

        Args:
            audio_data: audio data
            room_size: room size (0.0-1.0)
            damping: damping (0.0-1.0)
            wet_level: wet-signal level (0.0-1.0)
            sample_rate: sample rate

        Returns:
            The audio data with reverb added.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Simplified reverb built on a single shared delay line
            # (the room_size argument is not used by this simplified version)
            max_delay = int(sample_rate * 0.1)  # 100 ms maximum delay
            delay_line = np.zeros(max_delay)
            delay_index = 0
            # Four taps with staggered delays and decays
            delay_times = [int(sample_rate * 0.013), int(sample_rate * 0.023),
                           int(sample_rate * 0.037), int(sample_rate * 0.043)]
            decay_factors = [0.7, 0.6, 0.5, 0.4]
            # Run the reverb sample by sample
            reverb_audio = np.zeros_like(audio_data)
            for i in range(len(audio_data)):
                input_sample = audio_data[i]
                # Read the delayed taps
                delayed_samples = []
                for j, delay_time in enumerate(delay_times):
                    read_index = (delay_index - delay_time) % max_delay
                    delayed_samples.append(delay_line[read_index] * decay_factors[j])
                # Feed part of the taps back into the delay line
                feedback = sum(delayed_samples) * damping
                delay_line[delay_index] = input_sample + feedback
                # Advance the write position
                delay_index = (delay_index + 1) % max_delay
                # Mix dry and wet signals
                dry_sample = input_sample * (1 - wet_level)
                wet_sample = sum(delayed_samples) * wet_level
                reverb_audio[i] = dry_sample + wet_sample
            # Prevent clipping
            reverb_audio = np.clip(reverb_audio, -1.0, 1.0)
            print("✓ Reverb applied")
            return reverb_audio
        except Exception as e:
            print(f"✗ Failed to apply reverb: {e}")
            return audio_data

    def apply_chorus(self, audio_data: np.ndarray, rate: float = 1.0,
                     depth: float = 0.5, mix: float = 0.5,
                     sample_rate: int = None) -> np.ndarray:
        """
        Apply a chorus effect.

        Args:
            audio_data: audio data
            rate: LFO rate in Hz
            depth: depth (0.0-1.0)
            mix: mix ratio (0.0-1.0)
            sample_rate: sample rate

        Returns:
            The audio data with chorus added.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Delay buffer for the modulated voice
            delay_buffer = np.zeros(int(sample_rate * 0.05))  # 50 ms buffer
            write_index = 0
            # Chorus parameters
            lfo_phase = 0.0
            delay_base = 0.005  # 5 ms base delay
            # Run the chorus sample by sample
            chorus_audio = np.zeros_like(audio_data)
            for i in range(len(audio_data)):
                # Advance the LFO phase
                lfo_phase += 2 * np.pi * rate / sample_rate
                if lfo_phase > 2 * np.pi:
                    lfo_phase -= 2 * np.pi
                # Modulate the delay time (clamped at zero so large depths
                # cannot produce a negative delay)
                lfo_value = np.sin(lfo_phase)
                delay_time = max(delay_base + depth * 0.01 * lfo_value, 0.0)
                delay_samples = delay_time * sample_rate
                # Write the current sample
                delay_buffer[write_index] = audio_data[i]
                # Locate the read position
                read_index = write_index - int(delay_samples)
                if read_index < 0:
                    read_index += len(delay_buffer)
                # Fetch the delayed sample with linear interpolation
                frac = delay_samples - int(delay_samples)
                delayed_sample = (delay_buffer[read_index] * (1 - frac) +
                                  delay_buffer[(read_index + 1) % len(delay_buffer)] * frac)
                # Mix dry and wet signals
                chorus_audio[i] = audio_data[i] * (1 - mix) + delayed_sample * mix
                # Advance the write position
                write_index = (write_index + 1) % len(delay_buffer)
            print("✓ Chorus applied")
            return chorus_audio
        except Exception as e:
            print(f"✗ Failed to apply chorus: {e}")
            return audio_data

    def apply_flanger(self, audio_data: np.ndarray, rate: float = 0.5,
                      depth: float = 0.002, feedback: float = 0.3,
                      mix: float = 0.5, sample_rate: int = None) -> np.ndarray:
        """
        Apply a flanger effect.

        Args:
            audio_data: audio data
            rate: LFO rate in Hz
            depth: depth in seconds
            feedback: feedback (0.0-1.0)
            mix: mix ratio (0.0-1.0)
            sample_rate: sample rate

        Returns:
            The audio data with flanging added.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Delay buffer for the swept voice
            max_delay = int(sample_rate * 0.01)  # 10 ms maximum delay
            delay_buffer = np.zeros(max_delay)
            write_index = 0
            last_output = 0.0
            # LFO state
            lfo_phase = 0.0
            # Run the flanger sample by sample
            flanger_audio = np.zeros_like(audio_data)
            for i in range(len(audio_data)):
                # Advance the LFO phase
                lfo_phase += 2 * np.pi * rate / sample_rate
                if lfo_phase > 2 * np.pi:
                    lfo_phase -= 2 * np.pi
                # Modulate the delay time
                lfo_value = np.sin(lfo_phase)
                delay_time = depth * lfo_value
                delay_samples = abs(delay_time) * sample_rate
                # Write the current sample (with feedback)
                delay_buffer[write_index] = audio_data[i] + last_output * feedback
                # Locate the read position
                read_index = write_index - int(delay_samples)
                if read_index < 0:
                    read_index += len(delay_buffer)
                # Fetch the delayed sample with linear interpolation
                frac = delay_samples - int(delay_samples)
                delayed_sample = (delay_buffer[read_index] * (1 - frac) +
                                  delay_buffer[(read_index + 1) % len(delay_buffer)] * frac)
                # Mix dry and wet signals
                flanger_audio[i] = audio_data[i] * (1 - mix) + delayed_sample * mix
                last_output = flanger_audio[i]
                # Advance the write position
                write_index = (write_index + 1) % len(delay_buffer)
            print("✓ Flanger applied")
            return flanger_audio
        except Exception as e:
            print(f"✗ Failed to apply flanger: {e}")
            return audio_data

    def apply_distortion(self, audio_data: np.ndarray, drive: float = 0.5,
                         tone: float = 0.5) -> np.ndarray:
        """
        Apply a distortion effect.

        Args:
            audio_data: audio data
            drive: drive (0.0-1.0)
            tone: tone (0.0-1.0)

        Returns:
            The audio data with distortion added.
        """
        try:
            # Pre-gain
            drive_factor = 1.0 + drive * 9.0  # 1x to 10x gain
            processed = audio_data * drive_factor
            # Soft-clipping distortion
            processed = np.tanh(processed)
            # Tone control: simple first-order high-pass,
            # y[i] = alpha * (y[i-1] + x[i] - x[i-1])
            if tone != 0.5:
                rc = 1.0 / (2 * np.pi * (200 + tone * 2000))  # 200 Hz to 2200 Hz
                dt = 1.0 / self.sample_rate
                alpha = rc / (rc + dt)
                x = processed.copy()  # keep the unfiltered input samples
                for i in range(1, len(processed)):
                    processed[i] = alpha * (processed[i - 1] + x[i] - x[i - 1])
            print("✓ Distortion applied")
            return processed
        except Exception as e:
            print(f"✗ Failed to apply distortion: {e}")
            return audio_data

    def apply_pitch_shift(self, audio_data: np.ndarray, semitones: float = 0.0,
                          sample_rate: int = None) -> np.ndarray:
        """
        Apply a pitch-shift effect.

        Args:
            audio_data: audio data
            semitones: number of semitones (positive shifts up, negative down)
            sample_rate: sample rate

        Returns:
            The pitch-shifted audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Pitch ratio: one octave per 12 semitones
            pitch_factor = 2 ** (semitones / 12.0)
            if pitch_factor == 1.0:
                return audio_data
            # Simplified pitch shift by resampling with linear interpolation
            # (this also changes the duration)
            output_length = int(len(audio_data) / pitch_factor)
            if output_length <= 0:
                return np.array([], dtype=np.float32)
            processed = np.zeros(output_length)
            # Resample
            for i in range(output_length):
                # Position in the source signal
                src_pos = i * pitch_factor
                src_index = int(src_pos)
                frac = src_pos - src_index
                # Linear interpolation between neighboring samples
                if src_index < len(audio_data) - 1:
                    processed[i] = audio_data[src_index] * (1 - frac) + \
                                   audio_data[src_index + 1] * frac
                else:
                    processed[i] = audio_data[min(src_index, len(audio_data) - 1)]
            print(f"✓ Pitch shift applied: {semitones} semitones")
            return processed
        except Exception as e:
            print(f"✗ Failed to apply pitch shift: {e}")
            return audio_data
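
    # The pitch factor follows the equal-tempered relation 2 ** (n / 12)
    # (numbers for illustration): +12 semitones doubles the frequency and
    # halves the output length; +7 semitones (a fifth) gives
    # 2 ** (7 / 12) ~= 1.498.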

    def apply_time_stretch(self, audio_data: np.ndarray, factor: float = 1.0,
                           sample_rate: int = None) -> np.ndarray:
        """
        Apply a time-stretch effect.

        Args:
            audio_data: audio data
            factor: stretch factor (>1.0 slows down, <1.0 speeds up)
            sample_rate: sample rate

        Returns:
            The time-stretched audio data.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            if factor == 1.0:
                return audio_data
            # Simplified time stretch by resampling; unlike a true
            # pitch-preserving stretch (e.g. a phase vocoder), this naive
            # approach also shifts the pitch
            new_length = int(len(audio_data) * factor)
            if new_length <= 0:
                return np.array([], dtype=np.float32)
            # Stretch with linear interpolation
            indices = np.linspace(0, len(audio_data) - 1, new_length)
            stretched_audio = np.interp(indices, np.arange(len(audio_data)), audio_data)
            print(f"✓ Time stretch applied: {factor}x")
            return stretched_audio
        except Exception as e:
            print(f"✗ Failed to apply time stretch: {e}")
            return audio_data

    def apply_vibrato(self, audio_data: np.ndarray, rate: float = 5.0,
                      depth: float = 0.5, sample_rate: int = None) -> np.ndarray:
        """
        Apply a vibrato effect.

        Args:
            audio_data: audio data
            rate: LFO rate in Hz
            depth: depth (0.0-1.0)
            sample_rate: sample rate

        Returns:
            The audio data with vibrato added.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            # Vibrato via pitch shifting
            lfo_phase = 0.0
            vibrato_audio = np.zeros_like(audio_data)
            for i in range(len(audio_data)):
                # Advance the LFO phase
                lfo_phase += 2 * np.pi * rate / sample_rate
                if lfo_phase > 2 * np.pi:
                    lfo_phase -= 2 * np.pi
                # Pitch deviation of up to ±2 semitones
                lfo_value = np.sin(lfo_phase)
                semitones = depth * 2 * lfo_value
                # Simplification: apply a single pitch shift to the whole
                # signal; a real implementation would modulate the pitch
                # sample by sample
                if i == 0:  # computed only once as an example
                    vibrato_audio = self.apply_pitch_shift(audio_data, semitones, sample_rate)
            if len(vibrato_audio) == 0:
                vibrato_audio = audio_data  # fall back to the original on error
            print("✓ Vibrato applied")
            return vibrato_audio
        except Exception as e:
            print(f"✗ Failed to apply vibrato: {e}")
            return audio_data

    def get_audio_info(self, audio_data: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
        """
        Get information about the audio.

        Args:
            audio_data: audio data
            sample_rate: sample rate

        Returns:
            A dict of audio information.
        """
        try:
            if sample_rate is None:
                sample_rate = self.sample_rate
            if len(audio_data) == 0:
                return {
                    'duration': 0.0,
                    'samples': 0,
                    'sample_rate': sample_rate,
                    'channels': self.channels,
                    'peak_level': 0.0,
                    'rms_level': 0.0
                }
            duration = len(audio_data) / sample_rate
            peak_level = float(np.max(np.abs(audio_data)))
            rms_level = float(np.sqrt(np.mean(audio_data ** 2)))
            return {
                'duration': duration,
                'samples': len(audio_data),
                'sample_rate': sample_rate,
                'channels': self.channels,
                'peak_level': peak_level,
                'rms_level': rms_level
            }
        except Exception as e:
            print(f"✗ Failed to get audio info: {e}")
            return {}
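
    # Minimal end-to-end sketch (hypothetical; `_StubPlugin` stands in for
    # the speech plugin instance this class normally receives):
    #
    #   class _StubPlugin:
    #       config = {'sample_rate': 16000, 'channels': 1}
    #
    #   utils = AudioUtils(_StubPlugin())
    #   voice = utils.load_wav_file("input.wav")['audio_data']
    #   voice = utils.remove_silence(voice, threshold=0.02)
    #   voice = utils.normalize_audio(voice)
    #   voice = utils.apply_fade_in(utils.apply_fade_out(voice, 0.05), 0.05)
    #   utils.save_wav_file("cleaned.wav", voice)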