# Traditional audio processing
pip install numpy scipy matplotlib librosa soundfile pyaudio

# AI / deep-learning audio processing (recommended)
pip install torch torchaudio torchvision
pip install transformers datasets
pip install speechbrain
pip install audiolm-pytorch audiocraft
pip install onnxruntime onnx

# Real-time audio processing
pip install sounddevice noisereduce
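Before running any of the examples below, it is worth a quick sanity check that PyTorch, torchaudio, and GPU acceleration were installed correctly; a minimal snippet using only standard library calls:

import torch
import torchaudio

# Environment sanity check: report installed versions and
# whether CUDA acceleration is available.
print(f"torch {torch.__version__}, torchaudio {torchaudio.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")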
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration


class AIWhisperRecognizer:
    """
    AI speech recognizer built on the OpenAI Whisper model
    for high-accuracy speech-to-text.
    """

    def __init__(self, model_size="medium", device="cuda"):
        """
        Load the Whisper model.

        Args:
            model_size: model size ("tiny", "base", "small", "medium", "large")
            device: target device ("cuda", "cpu", "mps")
        """
        if device == "cuda" and not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        self.model_size = model_size
        print(f"Loading Whisper {model_size} on {self.device}...")

        self.processor = WhisperProcessor.from_pretrained(
            f"openai/whisper-{model_size}"
        )
        self.model = WhisperForConditionalGeneration.from_pretrained(
            f"openai/whisper-{model_size}"
        )
        self.model.to(self.device)
        self.model.config.forced_decoder_ids = None

    def transcribe(self, audio_path, language="zh"):
        """
        Transcribe an audio file.

        Args:
            audio_path: path to the audio file
            language: language code ("zh" for Chinese, "en" for English)

        Returns:
            dict: transcription result (text, language, model name)
        """
        # Load audio and mix down to mono if needed
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample to the 16 kHz rate Whisper expects
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Preprocess into log-mel input features
        input_features = self.processor(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features

        # Inference
        self.model.eval()
        with torch.no_grad():
            predicted_ids = self.model.generate(
                input_features.to(self.device),
                language=language
            )

        # Decode token ids back to text
        transcription = self.processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]

        return {
            "text": transcription,
            "language": language,
            "model": f"whisper-{self.model_size}"
        }


# Usage example
if __name__ == "__main__":
    recognizer = AIWhisperRecognizer(model_size="medium", device="cuda")

    # Transcribe an audio file
    result = recognizer.transcribe("speech.wav", language="zh")
    print(f"Transcription: {result['text']}")
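Note that Whisper's feature extractor pads or truncates its input to a 30-second window, so the class above effectively transcribes only the first 30 seconds of a file. A minimal sketch of chunked long-audio transcription (my addition, not part of the original example; naive fixed-boundary cuts can split words, so production code would overlap chunks or use voice activity detection):

import torch
import torchaudio

def transcribe_long(recognizer, audio_path, language="zh", chunk_seconds=30):
    """Transcribe audio longer than Whisper's 30 s window, chunk by chunk."""
    waveform, sr = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # mix down to mono
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    samples = waveform.squeeze()
    step = chunk_seconds * 16000
    texts = []
    for start in range(0, samples.numel(), step):
        chunk = samples[start:start + step]
        features = recognizer.processor(
            chunk.numpy(), sampling_rate=16000, return_tensors="pt"
        ).input_features
        with torch.no_grad():
            ids = recognizer.model.generate(
                features.to(recognizer.device), language=language
            )
        texts.append(recognizer.processor.batch_decode(
            ids, skip_special_tokens=True
        )[0])
    return " ".join(texts)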
import numpy as np
import torch
import torch.nn as nn
import torchaudio


class Denoiser(nn.Module):
    """
    Lightweight speech-denoising model: a time-domain
    convolutional encoder-decoder for real-time enhancement.
    """

    def __init__(self):
        super(Denoiser, self).__init__()

        # Encoder: progressively downsamples the waveform
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 32, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.Conv1d(32, 64, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.Conv1d(64, 128, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.Conv1d(128, 256, 8, stride=4, padding=2),
        )

        # Decoder: mirrors the encoder back to waveform length
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(256, 128, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.ConvTranspose1d(128, 64, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.ConvTranspose1d(64, 32, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.ConvTranspose1d(32, 1, 8, stride=4, padding=2),
            nn.Tanh()
        )

    def forward(self, x):
        """
        Forward pass.

        Args:
            x: input audio tensor [batch, 1, samples]

        Returns:
            enhanced audio tensor of the same shape
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


class RealTimeNoiseSuppressor:
    """
    Real-time speech noise suppressor,
    optimized for streaming chunk-by-chunk processing.
    """

    def __init__(self, model_path=None, sample_rate=16000):
        """
        Initialize the suppressor.

        Args:
            model_path: path to pretrained weights; without it the model
                        is randomly initialized and must be trained first
            sample_rate: audio sample rate
        """
        self.sample_rate = sample_rate
        self.model = Denoiser()
        if model_path:
            self.model.load_state_dict(torch.load(model_path))
        self.model.eval()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Buffer reserved for streaming overlap (2 seconds)
        self.buffer = torch.zeros(1, 1, 16000 * 2)

    def process_chunk(self, audio_chunk):
        """
        Process one audio chunk (real-time mode).

        Args:
            audio_chunk: 1-D numpy array or [channels, samples] tensor

        Returns:
            enhanced audio as a numpy array
        """
        if isinstance(audio_chunk, np.ndarray):
            audio_chunk = torch.from_numpy(audio_chunk).float()
        # Mix multi-channel audio down to mono
        if audio_chunk.dim() == 2 and audio_chunk.shape[0] > 1:
            audio_chunk = audio_chunk.mean(dim=0, keepdim=True)
        # Normalize shape to [batch=1, channel=1, samples]
        while audio_chunk.dim() < 3:
            audio_chunk = audio_chunk.unsqueeze(0)

        with torch.no_grad():
            enhanced = self.model(audio_chunk.to(self.device))
        return enhanced.cpu().squeeze().numpy()


# Usage example
if __name__ == "__main__":
    # Create the suppressor (untrained here; pass model_path for real use)
    suppressor = RealTimeNoiseSuppressor()

    # Process an audio file
    waveform, sr = torchaudio.load("noisy_speech.wav")
    enhanced = suppressor.process_chunk(waveform)

    # Save the result
    torchaudio.save("enhanced_speech.wav",
                    torch.from_numpy(enhanced).unsqueeze(0), sr)
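The Denoiser above is randomly initialized unless a model_path is supplied, so training is the missing step. A minimal training-loop sketch, assuming a hypothetical noisy_clean_pairs iterable that yields paired (noisy, clean) waveform batches:

import torch

def train_denoiser(model, noisy_clean_pairs, epochs=10, lr=1e-4):
    """Minimal training loop; noisy_clean_pairs (hypothetical loader)
    yields (noisy, clean) tensors of shape [batch, 1, samples]."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.L1Loss()  # time-domain L1 is a common denoising loss

    for epoch in range(epochs):
        total = 0.0
        for noisy, clean in noisy_clean_pairs:
            optimizer.zero_grad()
            enhanced = model(noisy.to(device))
            loss = loss_fn(enhanced, clean.to(device))
            loss.backward()
            optimizer.step()
            total += loss.item()
        print(f"epoch {epoch}: loss {total:.4f}")
    return model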
import torch
import torchaudio
from audiocraft.models import MusicGen


class AIMusicGenerator:
    """
    AI music generator that uses Meta's MusicGen
    to synthesize music from a text description.
    """

    def __init__(self, model_size="medium"):
        """
        Load the music generation model.

        Args:
            model_size: model size ("small", "medium", "large", "melody")
        """
        print(f"Loading MusicGen {model_size}...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # get_pretrained takes the full HF model name and a device string
        self.model = MusicGen.get_pretrained(
            f"facebook/musicgen-{model_size}", device=self.device
        )

    def generate(self, description, duration=10, tempo=None):
        """
        Generate music from a text description.

        Args:
            description: music description (in English), e.g.
                "a relaxing piano melody with soft strings"
                "electronic dance music with strong beat"
            duration: clip length in seconds
            tempo: BPM (optional; MusicGen has no explicit tempo
                   parameter, so it is appended to the text prompt)

        Returns:
            generated audio tensor [batch, channels, samples]
        """
        if tempo is not None:
            description = f"{description}, {tempo} BPM"
        self.model.set_generation_params(duration=duration)

        with torch.no_grad():
            audio = self.model.generate([description])
        return audio.cpu()

    def generate_melody_control(self, melody_audio_path, duration=10):
        """
        Generate music conditioned on a reference melody
        (requires the "melody" model variant).

        Args:
            melody_audio_path: path to the reference melody audio
            duration: clip length in seconds

        Returns:
            generated audio tensor
        """
        # Load the reference melody
        melody, sr = torchaudio.load(melody_audio_path)

        self.model.set_generation_params(duration=duration)
        with torch.no_grad():
            audio = self.model.generate_with_chroma(
                descriptions=["continue the melody style"],
                melody_wavs=melody,
                melody_sample_rate=sr
            )
        return audio.cpu()

    def save_audio(self, audio_tensor, output_path, sample_rate=32000):
        """
        Save the generated audio (MusicGen outputs 32 kHz audio).
        """
        torchaudio.save(output_path, audio_tensor.squeeze(0), sample_rate)


# Usage example
if __name__ == "__main__":
    # Initialize the generator
    generator = AIMusicGenerator(model_size="medium")

    # Generate music
    prompt = "a calm ambient electronic music with soft pads and gentle rhythm"
    audio = generator.generate(prompt, duration=15, tempo=80)

    # Save the result (play the file with any audio player)
    generator.save_audio(audio, "generated_music.wav")
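MusicGen's generate takes a list of descriptions, so several prompts can share one batched forward pass; a short sketch built on the class above (prompt strings are illustrative):

import torch
import torchaudio

# Batched generation: one call, several prompts.
generator = AIMusicGenerator(model_size="small")
generator.model.set_generation_params(duration=8)

prompts = [
    "lo-fi hip hop beat with vinyl crackle",
    "upbeat acoustic folk with guitar and claps",
]
with torch.no_grad():
    batch = generator.model.generate(prompts)  # [len(prompts), C, T]

for i, clip in enumerate(batch.cpu()):
    torchaudio.save(f"clip_{i}.wav", clip, 32000)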
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model


class AudioFeatureExtractor:
    """
    Advanced audio feature extractor that uses a pretrained
    Wav2Vec 2.0 model to produce audio embeddings.
    """

    def __init__(self, model_name="facebook/wav2vec2-base-960h"):
        """
        Initialize the feature extractor.

        Args:
            model_name: HuggingFace model name
        """
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2Model.from_pretrained(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

    def extract_features(self, audio_path):
        """
        Extract embedding features from an audio file.

        Args:
            audio_path: path to the audio file

        Returns:
            frame_embeddings: frame-level embeddings
            sentence_embedding: utterance-level embedding
        """
        # Load audio and mix down to mono if needed
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample to the 16 kHz rate the model expects
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)
            sample_rate = 16000

        # Preprocess
        input_values = self.processor(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt"
        ).input_values

        # Inference
        with torch.no_grad():
            output = self.model(input_values.to(self.device))

        # Frame-level embeddings from the last hidden layer
        audio_embeddings = output.last_hidden_state.cpu()

        # Mean-pool over time to get an utterance-level embedding
        sentence_embedding = torch.mean(audio_embeddings, dim=1)

        return {
            "frame_embeddings": audio_embeddings,
            "sentence_embedding": sentence_embedding,
            "sample_rate": sample_rate
        }

    def similarity_search(self, query_path, reference_paths):
        """
        Audio similarity search.

        Args:
            query_path: path to the query audio
            reference_paths: list of reference audio paths

        Returns:
            list of (path, similarity) sorted by similarity
        """
        # Embed the query audio
        query_features = self.extract_features(query_path)
        query_embedding = query_features["sentence_embedding"]

        similarities = []
        for ref_path in reference_paths:
            ref_features = self.extract_features(ref_path)
            ref_embedding = ref_features["sentence_embedding"]

            # Cosine similarity
            cos_sim = torch.nn.functional.cosine_similarity(
                query_embedding, ref_embedding
            ).item()
            similarities.append((ref_path, cos_sim))

        # Sort, most similar first
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities


# Usage example
if __name__ == "__main__":
    # Initialize the feature extractor
    extractor = AudioFeatureExtractor(
        model_name="facebook/wav2vec2-base-960h"
    )

    # Extract features
    features = extractor.extract_features("speech.wav")
    print(f"Frame embeddings shape: {features['frame_embeddings'].shape}")
    print(f"Sentence embedding shape: {features['sentence_embedding'].shape}")
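Frame-level embeddings are high-dimensional (768-D for the base model), so dimensionality reduction helps when inspecting them; a sketch that projects the frames to 2-D with scikit-learn's PCA and plots the trajectory (file names are placeholders):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project frame-level embeddings ([frames, 768] for the base
# model) down to 2-D so the temporal trajectory can be plotted.
extractor = AudioFeatureExtractor()
features = extractor.extract_features("speech.wav")
frames = features["frame_embeddings"].squeeze(0).numpy()

pca = PCA(n_components=2)
points = pca.fit_transform(frames)

plt.scatter(points[:, 0], points[:, 1], s=4, c=np.arange(len(points)))
plt.colorbar(label="frame index")
plt.title("Wav2Vec 2.0 frame embeddings (PCA)")
plt.savefig("embeddings_pca.png")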
import torch
import torchaudio
from demucs import pretrained
from demucs.apply import apply_model
from demucs.audio import AudioFile


class MusicSourceSeparator:
    """
    AI music source separator that uses Demucs to split a mix
    into vocals, drums, bass, and other accompaniment.
    """

    def __init__(self, model="htdemucs", device="cuda"):
        """
        Initialize the separator.

        Args:
            model: model choice
                - "htdemucs": hybrid transformer model, high quality
                - "htdemucs_ft": fine-tuned variant, better but slower
                - "hdemucs_mmi": hybrid model trained on extra data
            device: target device
        """
        if device == "cuda" and not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        print(f"Loading {model} on {self.device}...")
        self.model = pretrained.get_model(model)
        self.model.to(self.device)

    def separate(self, audio_path):
        """
        Separate a music file into stems.

        Args:
            audio_path: path to the input music file

        Returns:
            dict: separated stems keyed by source name
        """
        # Load audio at the model's sample rate and channel count
        audio = AudioFile(audio_path).read(
            streams=0,
            samplerate=self.model.samplerate,
            channels=self.model.audio_channels
        )

        # apply_model handles chunked, overlap-added inference
        with torch.no_grad():
            estimates = apply_model(
                self.model, audio.unsqueeze(0), device=self.device
            )

        # Map each estimate to its source name
        sources = self.model.sources
        return {
            source: estimates[0, i].cpu()
            for i, source in enumerate(sources)
        }

    def save_stems(self, stems, output_dir="./", sample_rate=44100):
        """
        Save the separated stems (Demucs models run at 44.1 kHz).
        """
        for name, audio in stems.items():
            output_path = f"{output_dir}/{name}.wav"
            torchaudio.save(output_path, audio, sample_rate)
            print(f"Saved: {output_path}")


# Usage example
if __name__ == "__main__":
    # Initialize the separator
    separator = MusicSourceSeparator(model="htdemucs", device="cuda")

    # Separate the mix
    stems = separator.separate("song.wav")

    # Save each stem
    separator.save_stems(stems, output_dir="separated")

    # Work with the separated stems
    vocals = stems["vocals"]
    drums = stems["drums"]
    bass = stems["bass"]
    other = stems["other"]
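A common follow-up is a karaoke-style instrumental: the stems sum approximately back to the original mix, so summing everything except "vocals" yields the backing track. A short sketch using the separator above:

import torch
import torchaudio

# Rebuild an instrumental track by summing all non-vocal stems.
separator = MusicSourceSeparator(model="htdemucs")
stems = separator.separate("song.wav")

instrumental = torch.zeros_like(stems["vocals"])
for name, stem in stems.items():
    if name != "vocals":
        instrumental += stem

torchaudio.save("instrumental.wav", instrumental, 44100)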
import torch
import onnxruntime as ort
import numpy as np


class ONNXAudioProcessor:
    """
    ONNX audio inference engine with cross-platform
    deployment and hardware-accelerated execution.
    """

    def __init__(self, model_path, providers=None):
        """
        Initialize the ONNX inference engine.

        Args:
            model_path: path to the ONNX model file
            providers: execution providers (CUDA, TensorRT, CoreML, NPU, ...)
        """
        if providers is None:
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]

        # Keep only the providers this onnxruntime build supports
        available = ort.get_available_providers()
        providers = [p for p in providers if p in available]
        print(f"Using providers: {providers}")

        self.session = ort.InferenceSession(model_path, providers=providers)

        # Read input/output names from the model
        self.input_name = self.session.get_inputs()[0].name
        self.output_names = [o.name for o in self.session.get_outputs()]

    def infer(self, input_data):
        """
        Run inference.

        Args:
            input_data: input audio tensor or numpy array

        Returns:
            model outputs
        """
        if isinstance(input_data, torch.Tensor):
            input_data = input_data.cpu().numpy()

        # Run the session
        outputs = self.session.run(self.output_names,
                                   {self.input_name: input_data})
        return outputs


class ModelConverter:
    """
    Export a PyTorch model to ONNX.
    """

    def __init__(self, torch_model):
        self.model = torch_model

    def export(self, dummy_input, output_path,
               input_names=["audio"],
               output_names=["output"],
               dynamic_axes=None):
        """
        Export the ONNX model.

        Args:
            dummy_input: example input for tracing
            output_path: output path
            input_names: input names
            output_names: output names
            dynamic_axes: dynamic axes (for variable-length input)
        """
        torch.onnx.export(
            self.model,
            dummy_input,
            output_path,
            input_names=input_names,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            opset_version=17  # ONNX opset 17
        )
        print(f"Exported to: {output_path}")


# Usage example
if __name__ == "__main__":
    # Convert a PyTorch model to ONNX
    class SimpleAudioModel(torch.nn.Module):
        def forward(self, x):
            return x * 0.5  # trivial example

    model = SimpleAudioModel()
    converter = ModelConverter(model)

    dummy_input = torch.randn(1, 16000)  # 1 second of audio
    converter.export(
        dummy_input,
        "audio_model.onnx",
        input_names=["waveform"],
        output_names=["output"],
        dynamic_axes={"waveform": {0: "batch", 1: "samples"}}
    )

    # Run inference with the ONNX model; the input must keep the
    # exported rank, i.e. [batch, samples]
    engine = ONNXAudioProcessor("audio_model.onnx")
    test_audio = np.random.randn(1, 16000).astype(np.float32)
    output = engine.infer(test_audio)
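To verify that the ONNX path actually pays off on a given machine, a minimal wall-clock comparison against eager PyTorch (a rough sketch reusing SimpleAudioModel and the exported audio_model.onnx from the example above; real numbers depend entirely on hardware and model size):

import time
import numpy as np
import torch

def benchmark(fn, x, runs=100, warmup=10):
    """Average wall-clock latency of fn(x), excluding warm-up runs."""
    for _ in range(warmup):
        fn(x)
    start = time.perf_counter()
    for _ in range(runs):
        fn(x)
    return (time.perf_counter() - start) / runs

model = SimpleAudioModel().eval()
engine = ONNXAudioProcessor("audio_model.onnx")
x_np = np.random.randn(1, 16000).astype(np.float32)
x_pt = torch.from_numpy(x_np)

with torch.no_grad():
    pt_ms = benchmark(lambda x: model(x), x_pt) * 1000
onnx_ms = benchmark(lambda x: engine.infer(x), x_np) * 1000
print(f"PyTorch: {pt_ms:.3f} ms, ONNX Runtime: {onnx_ms:.3f} ms")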