Files
tts/scripts/generate/fish_speech_cli.py
2026-01-19 10:27:41 +08:00

255 lines
9.6 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Fish Speech command-line voice cloning script.
No Web UI required; everything is driven from the command line.
"""
import os
import sys
import subprocess
import time
import signal
from pathlib import Path
class FishSpeechCLI:
    """Clone a voice with Fish Speech entirely from the command line.

    The workflow is: verify the checkpoint and reference-audio files exist,
    start ``tools/api_server.py`` as a child process, wait until its
    ``/health`` endpoint answers, run ``tools/api_client.py`` to synthesize
    ``target_text`` in the voice of ``reference_audio``, and finally stop the
    server again.

    All user-facing messages are printed to stdout; methods report success
    through their boolean return value rather than by raising.
    """

    DEFAULT_FISH_SPEECH_DIR = Path("/root/tts/fish-speech")
    DEFAULT_REFERENCE_AUDIO = Path("/root/tts/ben_guanquelou.wav")
    DEFAULT_OUTPUT_DIR = Path("/root/tts/audio_files")
    CHECKPOINT_SUBDIR = "checkpoints/fish-speech-1.5"

    # Ports probed (in order) for the server's /health endpoint.
    HEALTH_PORTS = (8080, 7860, 5000)
    STARTUP_TIMEOUT = 120  # seconds to wait for the server to become healthy
    POLL_INTERVAL = 2  # seconds between health probes
    SYNTHESIS_TIMEOUT = 600  # seconds allowed for the client run
    SHUTDOWN_TIMEOUT = 10  # seconds to wait after terminate() before kill()

    DEFAULT_REFERENCE_TEXT = "登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"
    # Single-quoted pieces are used because the text itself contains double
    # quotes and ends with one; a triple-double-quoted literal cannot end in
    # a double quote without escaping.
    DEFAULT_TARGET_TEXT = (
        '我们习惯于赞美黄河之水天上来,习惯于歌颂大地的厚德载物。'
        '教科书告诉我们,河流是水循环的恩赐,大陆是漂浮在岩浆上的方舟。'
        '这是一个完美的、闭环的、温情脉脉的解释。'
        '但如果,这一切都是关于"摩擦力"的谎言呢?'
        '请试着像挤压一个注满水的海绵球一样,去想象我们脚下的这颗星球。'
        '当我们在长白山天池边,看着那并没有足够集雨面积的火山口,'
        '却日夜不息地向外喷涌出足以滋养三条大江的淡水时;'
        '当我们在巴颜卡拉山,看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时,'
        '我们是否应该问自己一个违背常识的问题:这些水,真的是从天上掉下来的吗?'
        '物理学告诉我们,毛细现象无法把水推向几千米的高原;'
        '简单的蒸发循环,也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。'
        '这背后,一定存在一个"第一推动"。'
        '它不是温柔的渗透,它是暴力的"挤压"'
    )

    def __init__(self, fish_speech_dir=None, reference_audio=None, output_dir=None):
        """Set up paths and default texts, creating the output directory.

        Args:
            fish_speech_dir: Fish Speech checkout containing ``tools/`` and
                ``checkpoints/``. Defaults to ``DEFAULT_FISH_SPEECH_DIR``.
            reference_audio: Audio file whose voice is cloned. Defaults to
                ``DEFAULT_REFERENCE_AUDIO``.
            output_dir: Directory for generated audio and the server log.
                Created (with parents) if missing. Defaults to
                ``DEFAULT_OUTPUT_DIR``.
        """
        self.fish_speech_dir = Path(fish_speech_dir or self.DEFAULT_FISH_SPEECH_DIR)
        checkpoint_dir = self.fish_speech_dir / self.CHECKPOINT_SUBDIR
        self.model_path = checkpoint_dir / "model.pth"
        self.decoder_path = checkpoint_dir / "firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
        self.reference_audio = Path(reference_audio or self.DEFAULT_REFERENCE_AUDIO)
        self.output_dir = Path(output_dir or self.DEFAULT_OUTPUT_DIR)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Default synthesis parameters.
        self.reference_text = self.DEFAULT_REFERENCE_TEXT
        self.target_text = self.DEFAULT_TARGET_TEXT

        # Populated by start_api_server(); None means "no server running".
        self.server_process = None
        self.server_url = None
        self._server_log = None
        self._server_log_path = self.output_dir / "api_server.log"

    def check_files(self):
        """Report on the required input files.

        Returns:
            True if the model, decoder and reference audio all exist; False
            as soon as the first missing file is found.
        """
        print("📦 检查文件...")
        required = (
            (self.model_path, "主模型"),
            (self.decoder_path, "解码器"),
            (self.reference_audio, "参考音频"),
        )
        for path, label in required:
            if not path.exists():
                print(f"{label}: {path.name} (缺失)")
                return False
            size_mb = path.stat().st_size / (1024 * 1024)
            print(f"{label}: {path.name} ({size_mb:.1f}MB)")
        return True

    def _kill_stale_servers(self):
        """Best-effort kill of API servers left over from earlier runs."""
        try:
            # List form (no shell): with `sh -c "pkill -f ..."` the shell's
            # own command line would match the pattern as well.
            subprocess.run(["pkill", "-f", "api_server"], check=False)
        except FileNotFoundError:
            # pkill is not installed; there is nothing to clean up with.
            pass

    def _find_healthy_server(self):
        """Return the base URL of the first port answering /health, or None."""
        import http.client
        import urllib.request

        for port in self.HEALTH_PORTS:
            base_url = f"http://127.0.0.1:{port}"
            try:
                with urllib.request.urlopen(f"{base_url}/health", timeout=2) as response:
                    if response.status == 200:
                        return base_url
            except (OSError, http.client.HTTPException):
                # Connection refused, timeout, non-2xx status (HTTPError is
                # an OSError subclass) or a non-HTTP reply: try next port.
                continue
        return None

    def _read_server_log(self):
        """Return everything the server has written to its log so far."""
        try:
            return self._server_log_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return ""

    def start_api_server(self):
        """Launch the Fish Speech API server and wait until it is healthy.

        On success ``self.server_url`` holds the server's base URL. The
        server's stdout and stderr are written to ``api_server.log`` in the
        output directory instead of unread pipes, so a verbose server cannot
        block on a full pipe buffer.

        Returns:
            True once a health probe succeeds; False if the process exits
            early or no probe succeeds within ``STARTUP_TIMEOUT`` seconds.
        """
        print("🚀 启动 Fish Speech API 服务器...")
        # Clean up old processes and give them a moment to release the port.
        self._kill_stale_servers()
        time.sleep(2)

        cmd = [
            sys.executable, "tools/api_server.py",
            "--llama-checkpoint-path", str(self.model_path),
            "--decoder-checkpoint-path", str(self.decoder_path),
            "--device", "cpu",
        ]
        print(f"执行命令: {' '.join(cmd)}")

        self._server_log = open(self._server_log_path, "w", encoding="utf-8")
        # cwd= keeps the relative "tools/..." path working without changing
        # the working directory of this process.
        self.server_process = subprocess.Popen(
            cmd,
            cwd=str(self.fish_speech_dir),
            stdout=self._server_log,
            stderr=subprocess.STDOUT,
        )

        print("⏳ 等待服务器启动...")
        started = time.monotonic()
        deadline = started + self.STARTUP_TIMEOUT
        while time.monotonic() < deadline:
            if self.server_process.poll() is not None:
                print("❌ 服务器启动失败")
                print(f"错误: {self._read_server_log()}")
                return False

            url = self._find_healthy_server()
            if url is not None:
                print(f"✅ 服务器已启动: {url}")
                self.server_url = url
                return True

            time.sleep(self.POLL_INTERVAL)
            print(f" 等待中... ({int(time.monotonic() - started)}s)")

        print("⏰ 服务器启动超时")
        return False

    def synthesize_speech(self, output_filename="fish_speech_cli_output"):
        """Run the API client to synthesize ``target_text``.

        Args:
            output_filename: Output file name without extension, created
                inside ``output_dir``.

        Returns:
            True if the client exits with status 0; False if no server URL
            is known, the client times out, or it exits non-zero.
        """
        if not self.server_url:
            print("❌ 服务器未启动")
            return False

        print("🎙️ 开始语音合成...")
        print(f"📝 参考文本: {self.reference_text}")
        print(f"📝 目标文本长度: {len(self.target_text)} 字符")

        client_cmd = [
            sys.executable, "tools/api_client.py",
            "--text", self.target_text,
            "--reference_audio", str(self.reference_audio),
            "--reference_text", self.reference_text,
            "--output", str(self.output_dir / output_filename),
            "--no-play",
            "--max_new_tokens", "2048",
            "--chunk_length", "300",
            "--top_p", "0.8",
            "--temperature", "0.8",
            "--repetition_penalty", "1.1",
            "--url", f"{self.server_url}/v1/tts",
            "--format", "wav",
        ]
        print(f"执行命令: {' '.join(client_cmd)}")

        try:
            result = subprocess.run(
                client_cmd,
                cwd=str(self.fish_speech_dir),
                capture_output=True,
                text=True,
                timeout=self.SYNTHESIS_TIMEOUT,
            )
        except subprocess.TimeoutExpired:
            print("⏰ 语音合成超时")
            return False

        print("🎙️ 合成结果:")
        if result.stdout:
            print("输出:", result.stdout.strip())
        if result.stderr:
            print("错误:", result.stderr.strip())
        return result.returncode == 0

    def check_output(self, output_filename):
        """Look for the generated audio file and print its properties.

        Args:
            output_filename: Output file name without extension.

        Returns:
            ``(True, path)`` for the first existing ``.wav``/``.mp3``/
            ``.flac`` candidate (even if its metadata cannot be read), or
            ``(False, None)`` if none exists.
        """
        candidates = [
            self.output_dir / f"{output_filename}{suffix}"
            for suffix in (".wav", ".mp3", ".flac")
        ]
        for candidate in candidates:
            if not candidate.exists():
                continue
            try:
                # Optional dependency: only needed to report duration.
                import torchaudio

                waveform, sample_rate = torchaudio.load(str(candidate))
                duration = waveform.shape[1] / sample_rate
                print("\n✅ 音频生成成功!")
                print(f"📁 文件: {candidate}")
                print(f"📊 大小: {candidate.stat().st_size:,} bytes")
                print(f"🎵 时长: {duration:.2f}")
                print(f"🎵 采样率: {sample_rate:,} Hz")
                if duration >= 25:
                    print("🎉 时长符合30秒要求!")
                else:
                    print(f"⚠️ 时长为 {duration:.2f}")
            except Exception as e:
                # The file exists, so the run still counts as a success.
                print(f"⚠️ 读取音频失败: {e}")
            return True, str(candidate)

        print("❌ 未找到生成的音频文件")
        return False, None

    def cleanup(self):
        """Stop the API server (if any) and close its log file."""
        process, self.server_process = self.server_process, None
        if process is not None:
            print("🧹 停止服务器...")
            if process.poll() is None:
                process.terminate()
                try:
                    process.wait(timeout=self.SHUTDOWN_TIMEOUT)
                except subprocess.TimeoutExpired:
                    # The server ignored SIGTERM; force it and reap it.
                    process.kill()
                    process.wait()
        if self._server_log is not None:
            self._server_log.close()
            self._server_log = None

    def run(self, output_filename="fish_speech_cli_output"):
        """Run the full pipeline: check files, start server, synthesize, verify.

        The server is always stopped afterwards, whatever the outcome.

        Args:
            output_filename: Output file name without extension.

        Returns:
            True if an output file was produced, False otherwise (including
            on Ctrl-C or any unexpected exception).
        """
        print("🎊 Fish Speech 命令行语音克隆")
        print("=" * 60)
        try:
            # 1. Check required files.
            if not self.check_files():
                print("❌ 文件检查失败")
                return False
            # 2. Start the server.
            if not self.start_api_server():
                print("❌ 服务器启动失败")
                return False
            # 3. Synthesize speech.
            if not self.synthesize_speech(output_filename):
                print("❌ 语音合成失败")
                return False
            # 4. Check the result.
            success, output_file = self.check_output(output_filename)
            if not success:
                print("❌ 未找到输出文件")
                return False
            print("\n🎉 命令行语音合成完成!")
            print(f"📁 输出文件: {output_file}")
            return True
        except KeyboardInterrupt:
            print("\n🛑 用户中断操作")
            return False
        except Exception as e:
            print(f"❌ 执行失败: {e}")
            return False
        finally:
            self.cleanup()
def main():
    """Run the voice-cloning pipeline from the command line.

    The optional first command-line argument is the output file name
    (without extension); it defaults to ``fish_speech_cli_output``.

    Returns:
        0 if synthesis succeeded, 1 otherwise, suitable for ``sys.exit``.
    """
    output_filename = sys.argv[1] if len(sys.argv) > 1 else "fish_speech_cli_output"

    cli = FishSpeechCLI()
    success = cli.run(output_filename)

    if success:
        print("\n🎊 成功! 使用命令播放音频:")
        print(f" aplay {cli.output_dir}/{output_filename}.wav")
        print(f" 或使用文件管理器打开: {cli.output_dir}/")
    else:
        print("\n💔 失败,请检查错误信息")
    return 0 if success else 1


if __name__ == "__main__":
    # Propagate failure to the shell so callers and scripts can detect it.
    sys.exit(main())