Initial commit for TTS project

2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions
--- a/scripts/generate/fish_speech_direct_cli.py
+++ b/scripts/generate/fish_speech_direct_cli.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""
+Fish Speech 直接命令行语音合成
+不启动外部服务器，直接使用模型进行合成
+"""
+
+import os
+import sys
+import torch
+from pathlib import Path
+
+def direct_synthesis():
+    """直接进行语音合成"""
+    print("🎊 Fish Speech 直接语音合成")
+    print("=" * 50)
+    
+    # 设置路径
+    fish_speech_dir = Path("/root/tts/fish-speech")
+    os.chdir(fish_speech_dir)
+    
+    model_path = Path("checkpoints/fish-speech-1.5/model.pth")
+    decoder_path = Path("checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth")
+    reference_audio = Path("../ben_guanquelou.wav")
+    output_file = Path("../audio_files/fish_speech_direct_output.wav")
+    output_file.parent.mkdir(exist_ok=True)
+    
+    # 检查文件
+    print("📦 检查文件...")
+    for file_path, name in [(model_path, "主模型"), (decoder_path, "解码器"), (reference_audio, "参考音频")]:
+        if file_path.exists():
+            size_mb = file_path.stat().st_size / (1024 * 1024)
+            print(f"  ✅ {name}: {file_path.name} ({size_mb:.1f}MB)")
+        else:
+            print(f"  ❌ {name}: {file_path.name} (缺失)")
+            return False
+    
+    # 文本设置
+    reference_text = "登鹳雀楼，白日依山尽，黄河入海流。欲穷千里目，更上一层楼。"
+    target_text = """我们习惯于赞美黄河之水天上来，习惯于歌颂大地的厚德载物。教科书告诉我们，河流是水循环的恩赐，大陆是漂浮在岩浆上的方舟。这是一个完美的、闭环的、温情脉脉的解释。但如果，这一切都是关于"摩擦力"的谎言呢？请试着像挤压一个注满水的海绵球一样，去想象我们脚下的这颗星球。当我们在长白山天池边，看着那并没有足够集雨面积的火山口，却日夜不息地向外喷涌出足以滋养三条大江的淡水时；当我们在巴颜卡拉山，看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时，我们是否应该问自己一个违背常识的问题：这些水，真的是从天上掉下来的吗？物理学告诉我们，毛细现象无法把水推向几千米的高原；简单的蒸发循环，也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。这背后，一定存在一个"第一推动"。它不是温柔的渗透，它是暴力的"挤压"。"""
+    
+    print(f"\n📝 参考文本: {reference_text}")
+    print(f"📝 目标文本长度: {len(target_text)} 字符")
+    
+    try:
+        # 添加到路径
+        sys.path.insert(0, str(fish_speech_dir))
+        
+        print("\n🔧 加载模型...")
+        
+        # 导入模块
+        from fish_speech.models.dac.inference import load_model as load_decoder_model
+        from fish_speech.models.text2semantic.inference import launch_thread_safe_queue
+        from fish_speech.inference_engine import TTSInferenceEngine
+        from fish_speech.utils.file import audio_to_bytes
+        from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest
+        
+        print("✅ 模块导入成功")
+        
+        # 设置设备
+        device = "cpu"
+        precision = torch.float32
+        print(f"🖥️ 使用设备: {device}")
+        
+        print("📦 加载解码器...")
+        decoder_model = load_decoder_model(
+            config_name="modded_dac_vq",
+            checkpoint_path=str(decoder_path),
+            device=device,
+        )
+        print("✅ 解码器加载成功")
+        
+        print("🧠 加载语言模型...")
+        llama_queue = launch_thread_safe_queue(
+            checkpoint_path=str(model_path),
+            device=device,
+            precision=precision,
+            compile=False,
+        )
+        print("✅ 语言模型加载成功")
+        
+        print("🎯 创建推理引擎...")
+        inference_engine = TTSInferenceEngine(
+            llama_queue=llama_queue,
+            decoder_model=decoder_model,
+            compile=False,
+            precision=precision,
+        )
+        print("✅ 推理引擎创建成功")
+        
+        print("🎤 准备参考音频...")
+        ref_audio = ServeReferenceAudio(
+            audio=audio_to_bytes(str(reference_audio)),
+            text=reference_text
+        )
+        print("✅ 参考音频准备完成")
+        
+        print("🎙️ 开始语音合成...")
+        
+        # 创建请求
+        request = ServeTTSRequest(
+            text=target_text,
+            references=[ref_audio],
+            max_new_tokens=1024,
+            chunk_length=200,
+            top_p=0.7,
+            repetition_penalty=1.2,
+            temperature=0.7,
+            format="wav",
+        )
+        
+        print("🔄 正在生成音频（可能需要几分钟）...")
+        
+        # 进行推理
+        audio_data = None
+        for result in inference_engine.inference(request):
+            if result.code == "final":
+                audio_data = result.audio
+                print("✅ 音频生成完成!")
+                break
+            elif result.code == "error":
+                print(f"❌ 推理错误: {result.message}")
+                return False
+        
+        if audio_data:
+            # 保存音频
+            with open(output_file, "wb") as f:
+                f.write(audio_data)
+            
+            print(f"💾 音频已保存: {output_file}")
+            
+            # 验证音频
+            try:
+                import torchaudio
+                waveform, sample_rate = torchaudio.load(str(output_file))
+                duration = waveform.shape[1] / sample_rate
+                
+                print(f"📊 音频信息:")
+                print(f"   文件大小: {output_file.stat().st_size:,} bytes")
+                print(f"   采样率: {sample_rate:,} Hz")
+                print(f"   音频时长: {duration:.2f} 秒")
+                
+                if duration >= 25:
+                    print("🎉 音频时长符合30秒要求!")
+                else:
+                    print(f"⚠️ 音频时长为 {duration:.2f} 秒")
+                
+                return True
+                
+            except Exception as e:
+                print(f"⚠️ 无法验证音频: {e}")
+                return True
+        
+        else:
+            print("❌ 未能生成音频数据")
+            return False
+            
+    except Exception as e:
+        print(f"❌ 语音合成失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    try:
+        success = direct_synthesis()
+        
+        if success:
+            print("\n🎊 Fish Speech 命令行语音合成成功!")
+            print("📁 输出文件: /root/tts/audio_files/fish_speech_direct_output.wav")
+            print("🔊 播放命令: aplay /root/tts/audio_files/fish_speech_direct_output.wav")
+        else:
+            print("\n💔 语音合成失败")
+            
+    except KeyboardInterrupt:
+        print("\n🛑 用户中断操作")
+    except Exception as e:
+        print(f"\n❌ 程序异常: {e}")