📦 Environment Setup

Install the required libraries

# Traditional audio processing
pip install numpy scipy matplotlib librosa soundfile pyaudio

# AI / deep-learning audio processing (recommended)
pip install torch torchaudio torchvision
pip install transformers datasets
pip install speechbrain
pip install audiolm-pytorch audiocraft
pip install onnxruntime onnx

# Real-time audio processing
pip install sounddevice noisereduce

Recommended environment

  • PyTorch 2.5+: GPU acceleration and quantized inference
  • Transformers: HuggingFace pretrained model hub
  • SpeechBrain: end-to-end speech processing
  • ONNX Runtime: cross-platform AI inference (see the quick check below)
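
To confirm the stack is wired up before downloading any models, a minimal sanity check:

import torch
import torchaudio
import transformers
import onnxruntime as ort

print(f"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
print(f"torchaudio {torchaudio.__version__}")
print(f"transformers {transformers.__version__}")
print(f"ONNX Runtime providers: {ort.get_available_providers()}")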

🎵 Example 1: AI Speech Recognition (Whisper)

Speech recognition with OpenAI Whisper

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

class AIWhisperRecognizer:
    """
    AI speech recognizer.
    Uses the OpenAI Whisper model for high-accuracy speech recognition.
    """

    def __init__(self, model_size="medium", device="cuda"):
        """
        Initialize the Whisper model.

        Args:
            model_size: model size ("tiny", "base", "small", "medium", "large")
            device: target device ("cuda", "cpu", "mps")
        """
        # Fall back to CPU only when CUDA was requested but is unavailable
        if device == "cuda" and not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        self.model_size = model_size
        print(f"Loading Whisper {model_size} on {self.device}...")

        self.processor = WhisperProcessor.from_pretrained(
            f"openai/whisper-{model_size}"
        )
        self.model = WhisperForConditionalGeneration.from_pretrained(
            f"openai/whisper-{model_size}"
        )
        self.model.to(self.device)
        self.model.config.forced_decoder_ids = None
        
    def transcribe(self, audio_path, language="zh"):
        """
        Transcribe an audio file.

        Args:
            audio_path: path to the audio file
            language: language code ("zh" for Chinese, "en" for English)

        Returns:
            dict: transcription result (text, language, model)
        """
        # Load audio
        waveform, sample_rate = torchaudio.load(audio_path)

        # Downmix to mono; Whisper expects a single channel
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample to 16 kHz
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Preprocess
        input_features = self.processor(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features

        # Inference
        self.model.eval()
        with torch.no_grad():
            predicted_ids = self.model.generate(
                input_features.to(self.device),
                language=language
            )

        # Decode
        transcription = self.processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]

        return {
            "text": transcription,
            "language": language,
            "model": f"whisper-{self.model_size}"
        }

# Usage example
if __name__ == "__main__":
    recognizer = AIWhisperRecognizer(model_size="medium", device="cuda")

    # Transcribe an audio file
    result = recognizer.transcribe("speech.wav", language="zh")
    print(f"Transcription: {result['text']}")
🎵 Example 2: AI Speech Enhancement (Denoising)

Real-time speech denoising with deep learning (a lightweight time-domain stand-in for GRU-based models such as RNNoise)

import numpy as np
import torch
import torch.nn as nn
import torchaudio

class Denoiser(nn.Module):
    """
    Lightweight speech-denoising model.
    Real-time denoising with a time-domain convolutional encoder-decoder.
    """

    def __init__(self):
        super(Denoiser, self).__init__()

        # Encoder: each strided Conv1d shortens the signal 4x
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 32, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.Conv1d(32, 64, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.Conv1d(64, 128, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.Conv1d(128, 256, 8, stride=4, padding=2),
        )

        # Decoder: mirrors the encoder, upsampling 4x per layer
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(256, 128, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.ConvTranspose1d(128, 64, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.ConvTranspose1d(64, 32, 8, stride=4, padding=2),
            nn.ReLU(),
            nn.ConvTranspose1d(32, 1, 8, stride=4, padding=2),
            nn.Tanh()
        )

    def forward(self, x):
        """
        Forward pass.

        Args:
            x: input audio tensor [batch, 1, samples]
               (samples should be a multiple of 256 for exact reconstruction)

        Returns:
            Enhanced audio, same shape as the input
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
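
With kernel 8, stride 4, padding 2, every encoder layer divides the length by exactly 4 and every decoder layer multiplies it by 4, so inputs whose length is a multiple of 4^4 = 256 reconstruct to the same shape. A quick check:

model = Denoiser()
x = torch.randn(1, 1, 32000)  # 2 s at 16 kHz; 32000 = 125 * 256
print(model(x).shape)         # torch.Size([1, 1, 32000])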

class RealTimeNoiseSuppressor:
    """
    Real-time speech-denoising processor.
    Supports streaming/chunked processing.
    """

    def __init__(self, model_path=None, sample_rate=16000):
        """
        Initialize the suppressor.

        Args:
            model_path: path to pretrained weights (randomly initialized if None)
            sample_rate: sampling rate
        """
        self.sample_rate = sample_rate
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = Denoiser()

        if model_path:
            self.model.load_state_dict(
                torch.load(model_path, map_location=self.device)
            )

        self.model.eval()
        self.model.to(self.device)

        # Reserved for a streaming overlap buffer (unused in this minimal sketch)
        self.buffer = torch.zeros(1, 1, sample_rate * 2)  # 2 s of context

    def process_chunk(self, audio_chunk):
        """
        Process one audio chunk (real-time mode).

        Args:
            audio_chunk: mono numpy array or tensor

        Returns:
            Enhanced audio as a 1-D numpy array
        """
        if isinstance(audio_chunk, np.ndarray):
            audio_chunk = torch.from_numpy(audio_chunk).float()
        audio_chunk = audio_chunk.reshape(1, 1, -1)  # [batch, channel, samples]

        # Pad to a multiple of 256 so encoder/decoder shapes line up
        n = audio_chunk.shape[-1]
        pad = (-n) % 256
        if pad:
            audio_chunk = torch.nn.functional.pad(audio_chunk, (0, pad))

        with torch.no_grad():
            enhanced = self.model(audio_chunk.to(self.device))

        return enhanced.cpu().squeeze().numpy()[:n]

# Usage example
if __name__ == "__main__":
    # Create the suppressor (randomly initialized here; pass model_path for real use)
    suppressor = RealTimeNoiseSuppressor()

    # Process an audio file (assumed 16 kHz; downmix to mono first)
    waveform, sr = torchaudio.load("noisy_speech.wav")
    enhanced = suppressor.process_chunk(waveform.mean(dim=0))

    # Save the result
    torchaudio.save("enhanced_speech.wav",
                    torch.from_numpy(enhanced).unsqueeze(0), sr)
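
Since sounddevice is part of the environment above, the chunk API also supports a live mic-to-speaker loop. A sketch, assuming 16 kHz mono hardware and a hypothetical "denoiser.pt" weights file; the 4096-sample blocksize is an arbitrary choice:

import sounddevice as sd

suppressor = RealTimeNoiseSuppressor(model_path="denoiser.pt")  # hypothetical weights

def callback(indata, outdata, frames, time, status):
    # indata/outdata: float32 arrays of shape [frames, channels]
    if status:
        print(status)
    enhanced = suppressor.process_chunk(indata[:, 0].copy())
    outdata[:, 0] = enhanced[:frames]

with sd.Stream(samplerate=16000, blocksize=4096, channels=1,
               dtype="float32", callback=callback):
    input("Denoising live audio; press Enter to stop...")
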
🎵 Example 3: AI Music Generation (MusicGen)

Generating music with Meta MusicGen

import torch
import torchaudio
from audiocraft.models import MusicGen

class AIMusicGenerator:
    """
    AI music generator.
    Uses Meta MusicGen to generate music from a text description.
    """

    def __init__(self, model_size="medium"):
        """
        Initialize the music generation model.

        Args:
            model_size: model size ("small", "medium", "large", "melody")
        """
        print(f"Loading MusicGen {model_size}...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # MusicGen wraps its own modules; the device is passed at load time
        self.model = MusicGen.get_pretrained(
            f"facebook/musicgen-{model_size}", device=self.device
        )
        
    def generate(self, description, duration=10, tempo=None):
        """
        Generate music from a text description.

        Args:
            description: music description (English)
                e.g. "a relaxing piano melody with soft strings"
                e.g. "electronic dance music with strong beat"
            duration: length of the clip in seconds
            tempo: BPM (optional; folded into the text prompt, since
                   set_generation_params has no tempo argument)

        Returns:
            Generated audio tensor [batch, channels, samples]
        """
        if tempo is not None:
            description = f"{description}, at {tempo} BPM"

        self.model.set_generation_params(duration=duration)

        with torch.no_grad():
            audio = self.model.generate([description])

        return audio.cpu()
    
    def generate_melody_control(self, melody_audio_path, duration=10):
        """
        Generate music conditioned on a reference melody
        (requires the "melody" checkpoint).

        Args:
            melody_audio_path: path to the reference melody audio
            duration: length of the clip in seconds

        Returns:
            Generated audio tensor
        """
        # Load the reference melody
        melody, sr = torchaudio.load(melody_audio_path)

        self.model.set_generation_params(duration=duration)

        with torch.no_grad():
            audio = self.model.generate_with_chroma(
                ["continue the melody style"],
                melody[None],  # [batch, channels, samples]
                sr
            )

        return audio.cpu()

    def save_audio(self, audio_tensor, output_path):
        """
        Save generated audio (MusicGen outputs 32 kHz by default).
        """
        torchaudio.save(output_path, audio_tensor.squeeze(0),
                        self.model.sample_rate)

# Usage example
if __name__ == "__main__":
    # Initialize the generator
    generator = AIMusicGenerator(model_size="medium")

    # Generate music
    prompt = "a calm ambient electronic music with soft pads and gentle rhythm"
    audio = generator.generate(prompt, duration=15, tempo=80)

    # Save
    generator.save_audio(audio, "generated_music.wav")
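
MusicGen also accepts several prompts in one batch, and audiocraft ships a loudness-normalizing writer of its own; a sketch reusing the generator above (the prompts are illustrative):

from audiocraft.data.audio import audio_write

prompts = [
    "lo-fi hip hop beat with vinyl crackle",
    "upbeat acoustic folk with guitar and claps",
]
generator.model.set_generation_params(duration=8)
batch = generator.model.generate(prompts)  # [batch, channels, samples]

for i, wav in enumerate(batch):
    # audio_write appends the file suffix itself
    audio_write(f"track_{i}", wav.cpu(), generator.model.sample_rate,
                strategy="loudness")
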
🎵 Example 4: AI Audio Feature Extraction (Wav2Vec 2.0)

Extracting high-level audio representations with a pretrained model

import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model

class AudioFeatureExtractor:
    """
    High-level audio feature extractor.
    Uses a pretrained Wav2Vec 2.0 model to produce audio embeddings.
    """

    def __init__(self, model_name="facebook/wav2vec2-base-960h"):
        """
        Initialize the feature extractor.

        Args:
            model_name: HuggingFace model name
        """
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2Model.from_pretrained(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()
        
    def extract_features(self, audio_path):
        """
        Extract audio feature embeddings.

        Args:
            audio_path: path to the audio file

        Returns:
            frame_embeddings: frame-level embeddings
            sentence_embedding: utterance-level embedding
        """
        # Load audio
        waveform, sample_rate = torchaudio.load(audio_path)

        # Downmix to mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample to 16 kHz
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Preprocess
        input_values = self.processor(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt"
        ).input_values

        # Inference
        with torch.no_grad():
            output = self.model(input_values.to(self.device))

        # Extract embeddings
        audio_embeddings = output.last_hidden_state.cpu()

        # Mean-pool over time for an utterance-level embedding
        sentence_embedding = torch.mean(audio_embeddings, dim=1)

        return {
            "frame_embeddings": audio_embeddings,
            "sentence_embedding": sentence_embedding,
            "sample_rate": 16000  # embeddings are computed on the resampled audio
        }
    
    def similarity_search(self, query_path, reference_paths):
        """
        Audio similarity search.

        Args:
            query_path: path to the query audio
            reference_paths: list of reference audio paths

        Returns:
            List of (path, similarity) pairs, most similar first
        """
        # Embed the query audio
        query_features = self.extract_features(query_path)
        query_embedding = query_features["sentence_embedding"]

        similarities = []
        for ref_path in reference_paths:
            ref_features = self.extract_features(ref_path)
            ref_embedding = ref_features["sentence_embedding"]

            # Cosine similarity
            cos_sim = torch.nn.functional.cosine_similarity(
                query_embedding, ref_embedding
            ).item()

            similarities.append((ref_path, cos_sim))

        # Sort by similarity, descending
        similarities.sort(key=lambda x: x[1], reverse=True)

        return similarities

# Usage example
if __name__ == "__main__":
    # Initialize the feature extractor
    extractor = AudioFeatureExtractor(
        model_name="facebook/wav2vec2-base-960h"
    )

    # Extract features
    features = extractor.extract_features("speech.wav")
    print(f"Frame embeddings shape: {features['frame_embeddings'].shape}")
    print(f"Sentence embedding shape: {features['sentence_embedding'].shape}")
🎵 Example 5: AI Audio Source Separation (Demucs)

Music source separation with Demucs

import os
import torch
import torchaudio
from demucs import pretrained
from demucs.apply import apply_model
from demucs.audio import AudioFile

class MusicSourceSeparator:
    """
    AI music source separator.
    Uses a Demucs model to split music into vocals, drums, bass, and other.
    """

    def __init__(self, model="htdemucs", device="cuda"):
        """
        Initialize the separator.

        Args:
            model: model choice
                - "htdemucs": Hybrid Transformer Demucs (default)
                - "htdemucs_ft": fine-tuned variant, more accurate but slower
                - "hdemucs_mmi": Hybrid Demucs v3
            device: target device
        """
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")

        print(f"Loading {model} on {self.device}...")
        self.model = pretrained.get_model(model)
        self.model.to(self.device)
        
    def separate(self, audio_path):
        """
        Separate the sources of a music track.

        Args:
            audio_path: path to the input music file

        Returns:
            dict: separated stems keyed by source name
        """
        # Load audio at the model's sample rate and channel count
        audio = AudioFile(audio_path).read(
            streams=0,
            samplerate=self.model.samplerate,
            channels=self.model.audio_channels
        )

        # Normalize as in the demucs reference pipeline
        ref = audio.mean(0)
        audio = (audio - ref.mean()) / ref.std()

        # Separate; apply_model handles chunking across long files
        with torch.no_grad():
            estimates = apply_model(
                self.model, audio[None], device=self.device
            )[0]
        estimates = estimates * ref.std() + ref.mean()

        # Map stems to source names
        sources = self.model.sources

        return {
            source: estimates[i].cpu()
            for i, source in enumerate(sources)
        }
    
    def save_stems(self, stems, output_dir="./", sample_rate=None):
        """
        Save the separated stems as WAV files.
        """
        os.makedirs(output_dir, exist_ok=True)
        if sample_rate is None:
            sample_rate = self.model.samplerate
        for name, audio in stems.items():
            output_path = f"{output_dir}/{name}.wav"
            torchaudio.save(output_path, audio, sample_rate)
            print(f"Saved: {output_path}")

# Usage example
if __name__ == "__main__":
    # Initialize the separator
    separator = MusicSourceSeparator(model="htdemucs", device="cuda")

    # Separate the track
    stems = separator.separate("song.wav")

    # Save the individual stems
    separator.save_stems(stems, output_dir="separated")

    # Work with the separated stems
    vocals = stems["vocals"]
    drums = stems["drums"]
    bass = stems["bass"]
    other = stems["other"]
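
A common follow-up is an instrumental (karaoke) mix, which is just the sum of the non-vocal stems:

    # Instrumental = everything except vocals
    instrumental = drums + bass + other
    torchaudio.save("separated/instrumental.wav", instrumental,
                    separator.model.samplerate)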

🎵 Example 6: ONNX Model Deployment

Cross-platform deployment of AI audio models

import torch
import onnxruntime as ort
import numpy as np

class ONNXAudioProcessor:
    """
    ONNX audio inference engine.
    Supports cross-platform deployment and hardware-accelerated inference.
    """

    def __init__(self, model_path, providers=None):
        """
        Initialize the ONNX inference session.

        Args:
            model_path: path to the ONNX model file
            providers: execution providers (CUDA, TensorRT, CoreML, NPU, ...);
                       defaults to CUDA with CPU fallback
        """
        if providers is None:
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]

        # Keep only the providers available in this build
        available = ort.get_available_providers()
        providers = [p for p in providers if p in available]

        print(f"Using providers: {providers}")
        self.session = ort.InferenceSession(model_path, providers=providers)

        # Model metadata
        self.input_name = self.session.get_inputs()[0].name
        self.output_names = [o.name for o in self.session.get_outputs()]
        
    def infer(self, input_data):
        """
        Run inference.

        Args:
            input_data: input audio tensor or numpy array

        Returns:
            Model outputs (list of numpy arrays)
        """
        if isinstance(input_data, torch.Tensor):
            input_data = input_data.numpy()

        # Run the session
        outputs = self.session.run(self.output_names, {self.input_name: input_data})

        return outputs

class ModelConverter:
    """
    Converts a PyTorch model to ONNX.
    """

    def __init__(self, torch_model):
        self.model = torch_model

    def export(self, dummy_input, output_path, input_names=["audio"],
               output_names=["output"], dynamic_axes=None):
        """
        Export an ONNX model.

        Args:
            dummy_input: example input used for tracing
            output_path: output file path
            input_names: input names
            output_names: output names
            dynamic_axes: dynamic axes (for variable-length inputs)
        """
        torch.onnx.export(
            self.model,
            dummy_input,
            output_path,
            input_names=input_names,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            opset_version=17
        )
        print(f"Exported to: {output_path}")

# Usage example
if __name__ == "__main__":
    # Convert a PyTorch model to ONNX
    class SimpleAudioModel(torch.nn.Module):
        def forward(self, x):
            return x * 0.5  # trivial example

    model = SimpleAudioModel()
    converter = ModelConverter(model)

    dummy_input = torch.randn(1, 16000)  # 1 s of audio
    converter.export(
        dummy_input,
        "audio_model.onnx",
        input_names=["waveform"],
        output_names=["output"],
        dynamic_axes={"waveform": {0: "batch", 1: "samples"}}
    )

    # Run inference with the ONNX model
    engine = ONNXAudioProcessor("audio_model.onnx")
    # Keep the [batch, samples] rank the model was exported with
    test_audio = np.random.randn(1, 16000).astype(np.float32)
    output = engine.infer(test_audio)
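
A quick way to sanity-check the deployment is to compare PyTorch and ONNX Runtime latency on the same input; a minimal sketch reusing the objects above:

    import time

    def bench(fn, runs=100):
        # Average wall-clock seconds per call, after one warm-up
        fn()
        start = time.perf_counter()
        for _ in range(runs):
            fn()
        return (time.perf_counter() - start) / runs

    torch_input = torch.from_numpy(test_audio)
    with torch.no_grad():
        t_torch = bench(lambda: model(torch_input))
    t_onnx = bench(lambda: engine.infer(test_audio))
    print(f"PyTorch: {t_torch*1e3:.3f} ms, ONNX Runtime: {t_onnx*1e3:.3f} ms")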