Initial commit for TTS project

2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions
--- a/scripts/generate/fish_speech_cli.py
+++ b/scripts/generate/fish_speech_cli.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+Fish Speech 命令行语音克隆脚本
+无需 Web UI，纯命令行控制
+"""
+
+import os
+import sys
+import subprocess
+import time
+import signal
+from pathlib import Path
+
+class FishSpeechCLI:
+    """Command-line driver for Fish Speech voice cloning (no Web UI).
+
+    The workflow implemented by :meth:`run` is:
+
+    1. verify that the model, decoder and reference audio files exist;
+    2. launch ``tools/api_server.py`` from the Fish Speech checkout and wait
+       until one of the candidate ports answers ``/health`` with HTTP 200;
+    3. call ``tools/api_client.py`` to synthesize ``target_text`` using the
+       reference audio/text pair;
+    4. look for the generated audio file and report its properties;
+    5. stop the server again.
+
+    All paths and texts are plain instance attributes and can be overridden
+    after construction.
+    """
+
+    # Ports probed (in this order) for the API server's /health endpoint.
+    _CANDIDATE_PORTS = (8080, 7860, 5000)
+
+    def __init__(self):
+        """Set up default paths/texts and create the output directory."""
+        self.fish_speech_dir = Path("/root/tts/fish-speech")
+        self.model_path = self.fish_speech_dir / "checkpoints/fish-speech-1.5/model.pth"
+        self.decoder_path = self.fish_speech_dir / "checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
+        self.reference_audio = Path("/root/tts/ben_guanquelou.wav")
+        self.output_dir = Path("/root/tts/audio_files")
+        # parents=True so a missing parent directory does not abort construction.
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Default parameters: transcript of the reference audio and the text
+        # to be synthesized in the cloned voice.
+        self.reference_text = "登鹳雀楼，白日依山尽，黄河入海流。欲穷千里目，更上一层楼。"
+        self.target_text = """我们习惯于赞美黄河之水天上来，习惯于歌颂大地的厚德载物。教科书告诉我们，河流是水循环的恩赐，大陆是漂浮在岩浆上的方舟。这是一个完美的、闭环的、温情脉脉的解释。但如果，这一切都是关于"摩擦力"的谎言呢？请试着像挤压一个注满水的海绵球一样，去想象我们脚下的这颗星球。当我们在长白山天池边，看着那并没有足够集雨面积的火山口，却日夜不息地向外喷涌出足以滋养三条大江的淡水时；当我们在巴颜卡拉山，看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时，我们是否应该问自己一个违背常识的问题：这些水，真的是从天上掉下来的吗？物理学告诉我们，毛细现象无法把水推向几千米的高原；简单的蒸发循环，也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。这背后，一定存在一个"第一推动"。它不是温柔的渗透，它是暴力的"挤压"。"""
+
+        self.server_process = None
+        # Base URL of the running API server; set by start_api_server().
+        self.server_url = None
+        # Temporary file receiving the server's stdout/stderr (see
+        # start_api_server for why pipes are not used).
+        self._server_log = None
+
+    def check_files(self):
+        """Check that all required input files exist.
+
+        Prints one status line per file and stops at the first missing one.
+
+        Returns:
+            True if the model, decoder and reference audio all exist,
+            False otherwise.
+        """
+        print("📦 检查文件...")
+
+        files = [
+            (self.model_path, "主模型"),
+            (self.decoder_path, "解码器"),
+            (self.reference_audio, "参考音频"),
+        ]
+
+        for file_path, name in files:
+            if not file_path.exists():
+                print(f"  ❌ {name}: {file_path.name} (缺失)")
+                return False
+            size_mb = file_path.stat().st_size / (1024 * 1024)
+            print(f"  ✅ {name}: {file_path.name} ({size_mb:.1f}MB)")
+
+        return True
+
+    def _is_healthy(self, base_url):
+        """Return True if ``GET {base_url}/health`` answers with HTTP 200.
+
+        Uses only the standard library so the probe works even when the
+        third-party ``requests`` package is not installed. Connection
+        failures, timeouts and non-2xx answers all count as "not healthy".
+        """
+        import http.client
+        import urllib.request
+
+        try:
+            with urllib.request.urlopen(f"{base_url}/health", timeout=2) as response:
+                return response.status == 200
+        except (OSError, http.client.HTTPException):
+            # URLError/HTTPError/timeouts are OSError subclasses; anything
+            # else (e.g. KeyboardInterrupt) is deliberately not swallowed.
+            return False
+
+    def _read_server_log(self, max_bytes=8192):
+        """Return the tail (at most ``max_bytes``) of the captured server output."""
+        log = self._server_log
+        if log is None:
+            return ""
+        size = log.seek(0, os.SEEK_END)
+        log.seek(max(0, size - max_bytes))
+        return log.read().decode("utf-8", errors="replace").strip()
+
+    def start_api_server(self):
+        """Launch the Fish Speech API server and wait until it is reachable.
+
+        On success ``self.server_url`` holds the base URL of the server.
+
+        Returns:
+            True once a candidate port answers ``/health`` with HTTP 200;
+            False if the server process exits early or does not become
+            healthy within two minutes.
+        """
+        import tempfile
+
+        print("🚀 启动 Fish Speech API 服务器...")
+
+        # Kill leftover servers from earlier runs. An argument list is used
+        # instead of a shell string; a missing pkill binary simply means
+        # there is nothing we can clean up.
+        try:
+            subprocess.run(["pkill", "-f", "api_server"], check=False)
+        except FileNotFoundError:
+            pass
+        time.sleep(2)
+
+        # Launch command
+        cmd = [
+            sys.executable, "tools/api_server.py",
+            "--llama-checkpoint-path", str(self.model_path),
+            "--decoder-checkpoint-path", str(self.decoder_path),
+            "--device", "cpu",
+        ]
+
+        print(f"执行命令: {' '.join(cmd)}")
+
+        # Capture server output in a temporary file rather than in pipes:
+        # nothing drains a pipe while the server runs, so a chatty server
+        # would block once the pipe buffer fills up.
+        if self._server_log is not None:
+            self._server_log.close()
+        self._server_log = tempfile.TemporaryFile()
+
+        # Run the server from the Fish Speech checkout via cwd= instead of
+        # changing the working directory of this whole process.
+        self.server_process = subprocess.Popen(
+            cmd,
+            cwd=str(self.fish_speech_dir),
+            stdout=self._server_log,
+            stderr=subprocess.STDOUT,
+        )
+
+        # Wait for the server to come up
+        print("⏳ 等待服务器启动...")
+        max_wait = 120  # wait at most 2 minutes (wall clock)
+        started = time.monotonic()
+        deadline = started + max_wait
+
+        while time.monotonic() < deadline:
+            if self.server_process.poll() is not None:
+                # The process already exited: report what it printed.
+                print("❌ 服务器启动失败")
+                print(f"错误: {self._read_server_log()}")
+                return False
+
+            # Probe the candidate ports
+            for port in self._CANDIDATE_PORTS:
+                base_url = f"http://127.0.0.1:{port}"
+                if self._is_healthy(base_url):
+                    print(f"✅ 服务器已启动: {base_url}")
+                    self.server_url = base_url
+                    return True
+
+            time.sleep(2)
+            print(f"  等待中... ({int(time.monotonic() - started)}s)")
+
+        print("⏰ 服务器启动超时")
+        return False
+
+    def synthesize_speech(self, output_filename="fish_speech_cli_output"):
+        """Run the Fish Speech API client to synthesize ``self.target_text``.
+
+        Args:
+            output_filename: Output file name without extension; the file is
+                requested inside ``self.output_dir`` in WAV format.
+
+        Returns:
+            True if the client exited with status 0, False otherwise
+            (including when no server URL is known yet).
+
+        Raises:
+            subprocess.TimeoutExpired: if the client runs longer than
+                10 minutes.
+        """
+        print("🎙️ 开始语音合成...")
+        print(f"📝 参考文本: {self.reference_text}")
+        print(f"📝 目标文本长度: {len(self.target_text)} 字符")
+
+        if not self.server_url:
+            # start_api_server() has not succeeded; there is nothing to call.
+            print("❌ 服务器启动失败")
+            return False
+
+        # Build the client command
+        client_cmd = [
+            sys.executable, "tools/api_client.py",
+            "--text", self.target_text,
+            "--reference_audio", str(self.reference_audio),
+            "--reference_text", self.reference_text,
+            "--output", str(self.output_dir / output_filename),
+            "--no-play",
+            "--max_new_tokens", "2048",
+            "--chunk_length", "300",
+            "--top_p", "0.8",
+            "--temperature", "0.8",
+            "--repetition_penalty", "1.1",
+            "--url", f"{self.server_url}/v1/tts",
+            "--format", "wav",
+        ]
+
+        print(f"执行命令: {' '.join(client_cmd)}")
+
+        # Run the client from the Fish Speech checkout so the relative
+        # script path resolves regardless of the caller's working directory.
+        result = subprocess.run(
+            client_cmd,
+            cwd=str(self.fish_speech_dir),
+            capture_output=True,
+            text=True,
+            timeout=600,  # 10 minute timeout
+        )
+
+        print("🎙️ 合成结果:")
+        if result.stdout:
+            print("输出:", result.stdout.strip())
+        if result.stderr:
+            print("错误:", result.stderr.strip())
+
+        return result.returncode == 0
+
+    def check_output(self, output_filename):
+        """Locate the generated audio file and print its properties.
+
+        Looks for ``<output_filename>.wav``, ``.mp3`` and ``.flac`` (in that
+        order) inside ``self.output_dir`` and inspects the first one found
+        with ``torchaudio``.
+
+        Returns:
+            ``(True, path)`` for the first existing file, even when its
+            audio metadata could not be read; ``(False, None)`` when no
+            candidate file exists.
+        """
+        output_files = [
+            self.output_dir / f"{output_filename}.wav",
+            self.output_dir / f"{output_filename}.mp3",
+            self.output_dir / f"{output_filename}.flac",
+        ]
+
+        for output_file in output_files:
+            if not output_file.exists():
+                continue
+            try:
+                import torchaudio
+                waveform, sample_rate = torchaudio.load(str(output_file))
+                duration = waveform.shape[1] / sample_rate
+
+                print("\n✅ 音频生成成功!")
+                print(f"📁 文件: {output_file}")
+                print(f"📊 大小: {output_file.stat().st_size:,} bytes")
+                print(f"🎵 时长: {duration:.2f} 秒")
+                print(f"🎵 采样率: {sample_rate:,} Hz")
+
+                # 25 s is the threshold used for the "about 30 s" target.
+                if duration >= 25:
+                    print("🎉 时长符合30秒要求!")
+                else:
+                    print(f"⚠️ 时长为 {duration:.2f} 秒")
+
+                return True, str(output_file)
+
+            except Exception as e:
+                # Best effort: the file exists, so still report success even
+                # if torchaudio is missing or cannot decode it.
+                print(f"⚠️ 读取音频失败: {e}")
+                return True, str(output_file)
+
+        print("❌ 未找到生成的音频文件")
+        return False, None
+
+    def cleanup(self):
+        """Stop the API server (if started) and release its log file."""
+        proc = self.server_process
+        if proc is not None:
+            print("🧹 停止服务器...")
+            proc.terminate()
+            try:
+                # Wait for the child so it is actually gone before returning.
+                proc.wait(timeout=10)
+            except subprocess.TimeoutExpired:
+                # Did not react to SIGTERM in time: force it.
+                proc.kill()
+                proc.wait()
+            self.server_process = None
+
+        if self._server_log is not None:
+            self._server_log.close()
+            self._server_log = None
+
+    def run(self, output_filename="fish_speech_cli_output"):
+        """Run the complete command-line synthesis workflow.
+
+        Args:
+            output_filename: Output file name without extension.
+
+        Returns:
+            True if every step succeeded and an output file was found,
+            False otherwise. The server is always stopped before returning.
+        """
+        print("🎊 Fish Speech 命令行语音克隆")
+        print("=" * 60)
+
+        try:
+            # 1. Check input files
+            if not self.check_files():
+                print("❌ 文件检查失败")
+                return False
+
+            # 2. Start the server
+            if not self.start_api_server():
+                print("❌ 服务器启动失败")
+                return False
+
+            # 3. Synthesize speech
+            if not self.synthesize_speech(output_filename):
+                print("❌ 语音合成失败")
+                return False
+
+            # 4. Check the result
+            success, output_file = self.check_output(output_filename)
+
+            if success:
+                print("\n🎉 命令行语音合成完成!")
+                print(f"📁 输出文件: {output_file}")
+                return True
+
+            print("❌ 未找到输出文件")
+            return False
+
+        except KeyboardInterrupt:
+            print("\n🛑 用户中断操作")
+            return False
+        except Exception as e:
+            # Top-level boundary: report and convert to a failure result.
+            print(f"❌ 执行失败: {e}")
+            return False
+        finally:
+            # Always stop the server, whatever happened above.
+            self.cleanup()
+
+def main():
+    """Script entry point.
+
+    Uses the first command-line argument as the output file name (without
+    extension), falling back to ``fish_speech_cli_output``, runs the full
+    synthesis workflow and prints a short summary.
+
+    Returns:
+        0 if synthesis succeeded, 1 otherwise, so the value can be used as
+        the process exit status.
+    """
+    output_filename = sys.argv[1] if len(sys.argv) > 1 else "fish_speech_cli_output"
+
+    cli = FishSpeechCLI()
+    success = cli.run(output_filename)
+
+    if success:
+        print("\n🎊 成功! 使用命令播放音频:")
+        print(f"   aplay {cli.output_dir}/{output_filename}.wav")
+        print(f"   或使用文件管理器打开: {cli.output_dir}/")
+    else:
+        print("\n💔 失败，请检查错误信息")
+
+    return 0 if success else 1
+
+if __name__ == "__main__":
+    # Propagate failure to the shell instead of always exiting with 0.
+    sys.exit(main())