#!/usr/bin/env python3
"""
Fish Speech command-line voice cloning script.

Drives the whole pipeline (file check, API server start-up, synthesis,
output verification) from the command line; no Web UI is required.
"""

import http.client
import os
import signal
import subprocess
import sys
import tempfile
import time
import urllib.request
from pathlib import Path


class FishSpeechCLI:
    """Run a Fish Speech voice-cloning job through its API server and client.

    The instance holds the model/reference paths, the texts to use, and the
    handle of the API server process it starts.
    """

    # Ports probed (in order) for a responding ``/health`` endpoint.
    HEALTH_CHECK_PORTS = (8080, 7860, 5000)

    def __init__(self):
        """Set up default paths and texts and create the output directory."""
        self.fish_speech_dir = Path("/root/tts/fish-speech")
        self.model_path = self.fish_speech_dir / "checkpoints/fish-speech-1.5/model.pth"
        self.decoder_path = self.fish_speech_dir / "checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
        self.reference_audio = Path("/root/tts/ben_guanquelou.wav")
        self.output_dir = Path("/root/tts/audio_files")
        # parents=True so a missing parent directory does not abort construction.
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Default parameters
        self.reference_text = "登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"
        self.target_text = """我们习惯于赞美黄河之水天上来,习惯于歌颂大地的厚德载物。教科书告诉我们,河流是水循环的恩赐,大陆是漂浮在岩浆上的方舟。这是一个完美的、闭环的、温情脉脉的解释。但如果,这一切都是关于"摩擦力"的谎言呢?请试着像挤压一个注满水的海绵球一样,去想象我们脚下的这颗星球。当我们在长白山天池边,看着那并没有足够集雨面积的火山口,却日夜不息地向外喷涌出足以滋养三条大江的淡水时;当我们在巴颜卡拉山,看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时,我们是否应该问自己一个违背常识的问题:这些水,真的是从天上掉下来的吗?物理学告诉我们,毛细现象无法把水推向几千米的高原;简单的蒸发循环,也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。这背后,一定存在一个"第一推动"。它不是温柔的渗透,它是暴力的"挤压"。"""

        self.server_process = None
        # Base URL of the running API server; set by start_api_server().
        self.server_url = None
        # Temporary file that captures the server's stderr.
        self._server_log = None

    def check_files(self):
        """Check that the model, decoder and reference audio files exist.

        Returns:
            True when every required file exists; False as soon as one is
            missing (files after the first missing one are not checked).
        """
        print("📦 检查文件...")

        required = [
            (self.model_path, "主模型"),
            (self.decoder_path, "解码器"),
            (self.reference_audio, "参考音频"),
        ]

        for file_path, name in required:
            if not file_path.exists():
                print(f" ❌ {name}: {file_path.name} (缺失)")
                return False
            size_mb = file_path.stat().st_size / (1024 * 1024)
            print(f" ✅ {name}: {file_path.name} ({size_mb:.1f}MB)")

        return True

    def _close_server_log(self):
        """Close the server stderr capture file, if one is open."""
        log = getattr(self, "_server_log", None)
        if log is not None:
            log.close()
        self._server_log = None

    def _read_server_log(self):
        """Return everything the server has written to stderr so far."""
        log = getattr(self, "_server_log", None)
        if log is None:
            return ""
        log.seek(0)
        return log.read()

    def _probe_server(self):
        """Return the base URL of the first port answering ``/health`` with 200.

        Returns None when no candidate port responds successfully.
        """
        for port in self.HEALTH_CHECK_PORTS:
            base_url = f"http://127.0.0.1:{port}"
            try:
                with urllib.request.urlopen(f"{base_url}/health", timeout=2) as response:
                    if response.status == 200:
                        return base_url
            except (OSError, http.client.HTTPException):
                # Connection refused, timeout, non-2xx status (HTTPError is an
                # OSError subclass) or a malformed reply: try the next port.
                continue
        return None

    def start_api_server(self):
        """Start the Fish Speech API server and wait until it is healthy.

        Returns:
            True once a health check succeeds (``self.server_url`` is then
            set); False if the process exits early or the wait times out.
        """
        print("🚀 启动 Fish Speech API 服务器...")

        # Clean up old server processes. An argv list is used instead of a
        # shell string so no intermediate shell (whose own command line would
        # match the pattern) is involved.
        try:
            subprocess.run(["pkill", "-f", "api_server"], check=False)
        except FileNotFoundError:
            # pkill is unavailable; the clean-up is best effort only.
            pass
        time.sleep(2)

        # Launch command
        cmd = [
            sys.executable, "tools/api_server.py",
            "--llama-checkpoint-path", str(self.model_path),
            "--decoder-checkpoint-path", str(self.decoder_path),
            "--device", "cpu",
        ]

        print(f"执行命令: {' '.join(cmd)}")

        # Capture stderr in a temporary file rather than a pipe: nothing reads
        # the pipe while the server runs, so a full pipe buffer would block
        # the server. stdout was never displayed, so it is discarded.
        self._close_server_log()
        self._server_log = tempfile.TemporaryFile(
            mode="w+", encoding="utf-8", errors="replace"
        )
        # Run inside the Fish Speech directory via cwd= so the relative script
        # path resolves without changing this process's working directory.
        self.server_process = subprocess.Popen(
            cmd,
            cwd=self.fish_speech_dir,
            stdout=subprocess.DEVNULL,
            stderr=self._server_log,
        )

        # Wait for the server to come up
        print("⏳ 等待服务器启动...")
        max_wait = 120  # wait at most two minutes
        wait_time = 0

        while wait_time < max_wait:
            if self.server_process.poll() is not None:
                print("❌ 服务器启动失败")
                print(f"错误: {self._read_server_log()}")
                return False

            # Probe the candidate ports
            url = self._probe_server()
            if url is not None:
                print(f"✅ 服务器已启动: {url}")
                self.server_url = url
                return True

            time.sleep(2)
            wait_time += 2
            print(f" 等待中... ({wait_time}s)")

        print("⏰ 服务器启动超时")
        return False

    def synthesize_speech(self, output_filename="fish_speech_cli_output"):
        """Run the API client to synthesize ``self.target_text``.

        Args:
            output_filename: Output file name (without extension), created
                inside ``self.output_dir``.

        Returns:
            True when the client exits with status 0; False when no server
            URL is known, the client times out, or it exits non-zero.
        """
        server_url = getattr(self, "server_url", None)
        if not server_url:
            # Without a known server the client command cannot be built.
            print("❌ 服务器未启动,无法进行语音合成")
            return False

        print("🎙️ 开始语音合成...")
        print(f"📝 参考文本: {self.reference_text}")
        print(f"📝 目标文本长度: {len(self.target_text)} 字符")

        # Build the client command
        client_cmd = [
            sys.executable, "tools/api_client.py",
            "--text", self.target_text,
            "--reference_audio", str(self.reference_audio),
            "--reference_text", self.reference_text,
            "--output", str(self.output_dir / output_filename),
            "--no-play",
            "--max_new_tokens", "2048",
            "--chunk_length", "300",
            "--top_p", "0.8",
            "--temperature", "0.8",
            "--repetition_penalty", "1.1",
            "--url", f"{server_url}/v1/tts",
            "--format", "wav",
        ]

        print(f"执行命令: {' '.join(client_cmd)}")

        # Run the client from the Fish Speech directory (relative script path)
        try:
            result = subprocess.run(
                client_cmd,
                cwd=self.fish_speech_dir,
                capture_output=True,
                text=True,
                timeout=600,  # ten-minute timeout
            )
        except subprocess.TimeoutExpired:
            print("❌ 语音合成超时")
            return False

        print("🎙️ 合成结果:")
        if result.stdout:
            print("输出:", result.stdout.strip())
        if result.stderr:
            print("错误:", result.stderr.strip())

        return result.returncode == 0

    def check_output(self, output_filename):
        """Look for the generated audio file and print its properties.

        Args:
            output_filename: Output file name without extension.

        Returns:
            ``(True, path)`` for the first existing ``.wav``/``.mp3``/``.flac``
            candidate, or ``(False, None)`` when none exists. A file that
            exists but cannot be inspected still counts as found.
        """
        candidates = [
            self.output_dir / f"{output_filename}.wav",
            self.output_dir / f"{output_filename}.mp3",
            self.output_dir / f"{output_filename}.flac",
        ]

        for output_file in candidates:
            if not output_file.exists():
                continue
            try:
                import torchaudio
                waveform, sample_rate = torchaudio.load(str(output_file))
                duration = waveform.shape[1] / sample_rate

                print(f"\n✅ 音频生成成功!")
                print(f"📁 文件: {output_file}")
                print(f"📊 大小: {output_file.stat().st_size:,} bytes")
                print(f"🎵 时长: {duration:.2f} 秒")
                print(f"🎵 采样率: {sample_rate:,} Hz")

                if duration >= 25:
                    print("🎉 时长符合30秒要求!")
                else:
                    print(f"⚠️ 时长为 {duration:.2f} 秒")
            except Exception as e:
                # Inspection is best effort (torchaudio may be missing or the
                # file unreadable); the file itself was still produced.
                print(f"⚠️ 读取音频失败: {e}")
            return True, str(output_file)

        print("❌ 未找到生成的音频文件")
        return False, None

    def cleanup(self):
        """Stop the API server (if any) and release the stderr capture file."""
        process = getattr(self, "server_process", None)
        if process:
            print("🧹 停止服务器...")
            if process.poll() is None:
                process.terminate()
                try:
                    process.wait(timeout=10)
                except subprocess.TimeoutExpired:
                    # The server ignored SIGTERM; force it down and reap it.
                    process.kill()
                    process.wait()
            self.server_process = None
        self._close_server_log()

    def run(self, output_filename="fish_speech_cli_output"):
        """Run the complete command-line synthesis workflow.

        Args:
            output_filename: Output file name without extension.

        Returns:
            True when an output file was produced, False otherwise. The
            server is always cleaned up before returning.
        """
        print("🎊 Fish Speech 命令行语音克隆")
        print("=" * 60)

        try:
            # 1. Check required files
            if not self.check_files():
                print("❌ 文件检查失败")
                return False

            # 2. Start the server
            if not self.start_api_server():
                print("❌ 服务器启动失败")
                return False

            # 3. Synthesize speech
            if not self.synthesize_speech(output_filename):
                print("❌ 语音合成失败")
                return False

            # 4. Check the result
            success, output_file = self.check_output(output_filename)
            if success:
                print(f"\n🎉 命令行语音合成完成!")
                print(f"📁 输出文件: {output_file}")
                return True

            print("❌ 未找到输出文件")
            return False

        except KeyboardInterrupt:
            print("\n🛑 用户中断操作")
            return False
        except Exception as e:
            print(f"❌ 执行失败: {e}")
            return False
        finally:
            # Clean up
            self.cleanup()


def main():
    """Entry point: run the workflow and report the outcome.

    The optional first command-line argument is the output file name
    (without extension).

    Returns:
        0 on success, 1 on failure, suitable as a process exit status.
    """
    if len(sys.argv) > 1:
        output_filename = sys.argv[1]
    else:
        output_filename = "fish_speech_cli_output"

    cli = FishSpeechCLI()
    success = cli.run(output_filename)

    if success:
        print(f"\n🎊 成功! 使用命令播放音频:")
        print(f" aplay {cli.output_dir}/{output_filename}.wav")
        print(f" 或使用文件管理器打开: {cli.output_dir}/")
    else:
        print("\n💔 失败,请检查错误信息")

    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())