Files
tts/scripts/generate/real_fish_speech.py
2026-01-19 10:27:41 +08:00

227 lines
8.8 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
使用 Fish Speech 进行真正的语音克隆合成
"""
import os
import sys
import subprocess
import time
import requests
from pathlib import Path
def check_server_ready(url, timeout=60):
    """Poll ``{url}/health`` until it answers HTTP 200 or *timeout* elapses.

    Args:
        url: Base URL of the server (no trailing slash), e.g.
            ``http://127.0.0.1:8080``.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health request returns status code 200; False once
        the deadline has passed. With a zero or negative ``timeout`` no
        request is made and False is returned immediately.
    """
    # Use the monotonic clock so wall-clock adjustments cannot stretch or
    # shorten the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            response = requests.get(f"{url}/health", timeout=5)
            if response.status_code == 200:
                return True
        except requests.RequestException:
            # Connection refused / timed out: the server is not up yet.
            # Only network-level errors are ignored, so Ctrl-C still works.
            pass
        # Pause between attempts, but never sleep past the deadline.
        time.sleep(max(0.0, min(2.0, deadline - time.monotonic())))
    return False
def _stop_server(server_process):
    """Terminate the API server child process and reap it.

    Does nothing when no process was started or it has already exited.
    Escalates to ``kill()`` if the process ignores ``terminate()``.
    """
    if server_process is None or server_process.poll() is not None:
        return
    server_process.terminate()
    try:
        server_process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        server_process.kill()
        server_process.wait()


def _wait_for_server(server_process, ports, total_timeout=90):
    """Return the base URL of the first port whose health check succeeds.

    All candidate ports are polled round-robin until *total_timeout* seconds
    have passed, so a server that is slow to load is still detected on
    whichever port it eventually binds. Returns None when the deadline
    passes or the server process exits before becoming ready.
    """
    deadline = time.monotonic() + total_timeout
    first_round = True
    while time.monotonic() < deadline:
        if server_process.poll() is not None:
            # The server died during startup; waiting longer cannot help.
            return None
        for port in ports:
            url = f"http://127.0.0.1:{port}"
            if first_round:
                print(f"尝试端口 {port}...")
            # Short per-port probe; the outer loop provides the real timeout.
            if check_server_ready(url, timeout=1):
                return url
        first_round = False
    return None


def _report_generated_audio(output_dir):
    """Print details of the synthesized audio file, if one was produced.

    Looks for ``real_fish_speech_30s`` with a wav/mp3/flac suffix inside
    *output_dir* and inspects the first one that exists.

    Returns:
        True when an output file exists (even if its metadata could not be
        read), False when none of the candidate files exists.
    """
    candidates = [
        output_dir / "real_fish_speech_30s.wav",
        output_dir / "real_fish_speech_30s.mp3",
        output_dir / "real_fish_speech_30s.flac",
    ]
    output_file = next((path for path in candidates if path.exists()), None)
    if output_file is None:
        return False

    try:
        # Imported lazily: torchaudio is only needed for this report.
        import torchaudio

        waveform, sample_rate = torchaudio.load(str(output_file))
        duration = waveform.shape[1] / sample_rate

        print(f"\n✅ 音频文件: {output_file}")
        print(f" 文件大小: {output_file.stat().st_size:,} bytes")
        print(f" 采样率: {sample_rate:,} Hz")
        print(f" 音频时长: {duration:.2f}")

        if duration >= 25:
            print("🎉 音频长度符合30秒要求!")
        else:
            print(f"⚠️ 音频长度为 {duration:.2f}")
    except Exception as e:
        # Best effort: the file exists, so failing to read its metadata
        # must not turn a successful synthesis into a failure.
        print(f"读取音频文件失败: {e}")
        print(f"✅ 文件已保存: {output_file}")
    return True


def main():
    """Clone a voice with Fish Speech by driving its API server and client.

    Starts ``tools/api_server.py`` on CPU from the Fish Speech checkout,
    waits for it to answer health checks, runs ``tools/api_client.py`` with
    the reference audio/text and the target text, then reports on the
    generated audio file. The server process is always stopped on exit.

    Returns:
        True when synthesis succeeded and an output file was found,
        False otherwise.
    """
    print("=== Fish Speech 真实语音克隆 ===")

    # Paths
    fish_speech_dir = Path("/root/tts/fish-speech")
    reference_audio = Path("/root/tts/ben_guanquelou.wav")
    output_dir = Path("/root/tts/audio_files")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Full transcript of the reference audio
    reference_text = "登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"

    # Text to synthesize. Built from single-quoted segments because the text
    # ends with a double quote, which cannot sit directly before a closing
    # triple double quote.
    target_text = (
        '我们习惯于赞美黄河之水天上来,习惯于歌颂大地的厚德载物。'
        '教科书告诉我们,河流是水循环的恩赐,大陆是漂浮在岩浆上的方舟。'
        '这是一个完美的、闭环的、温情脉脉的解释。'
        '但如果,这一切都是关于"摩擦力"的谎言呢?'
        '请试着像挤压一个注满水的海绵球一样,去想象我们脚下的这颗星球。'
        '当我们在长白山天池边,看着那并没有足够集雨面积的火山口,却日夜不息地向外喷涌出足以滋养三条大江的淡水时;'
        '当我们在巴颜卡拉山,看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时,我们是否应该问自己一个违背常识的问题:'
        '这些水,真的是从天上掉下来的吗?'
        '物理学告诉我们,毛细现象无法把水推向几千米的高原;'
        '简单的蒸发循环,也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。'
        '这背后,一定存在一个"第一推动"。'
        '它不是温柔的渗透,它是暴力的"挤压"'
    )

    print(f"Fish Speech 目录: {fish_speech_dir}")
    print(f"参考音频: {reference_audio}")
    print(f"参考文本: {reference_text}")
    print(f"目标文本长度: {len(target_text)} 字符")

    if not reference_audio.exists():
        print("❌ 参考音频不存在")
        return False

    # The server/client scripts and checkpoint paths are relative to the
    # Fish Speech checkout.
    os.chdir(fish_speech_dir)

    # Check the model files
    model_path = Path("./checkpoints/fish-speech-1.5/model.pth")
    decoder_path = Path("./checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth")

    if not model_path.exists() or not decoder_path.exists():
        print("❌ 模型文件不完整")
        return False

    server_process = None
    server_log_path = output_dir / "fish_speech_server.log"
    try:
        # Step 1: start the API server
        print("\n🚀 启动 Fish Speech API 服务器...")

        server_cmd = [
            sys.executable, "tools/api_server.py",
            "--llama-checkpoint-path", str(model_path),
            "--decoder-checkpoint-path", str(decoder_path),
            "--device", "cpu",
        ]
        print(f"执行命令: {' '.join(server_cmd)}")

        # Send server output to a log file rather than an unread PIPE: a
        # pipe nobody drains fills up and blocks the child. The parent's
        # handle can be closed right after spawning; the child keeps its own.
        with open(server_log_path, "w", encoding="utf-8") as server_log:
            server_process = subprocess.Popen(
                server_cmd,
                stdout=server_log,
                stderr=subprocess.STDOUT,
            )

        print("等待服务器启动...")

        # Try the candidate ports
        server_url = _wait_for_server(server_process, [8080, 7860, 5000])
        if not server_url:
            print("❌ 服务器启动失败")
            print(f"服务器日志: {server_log_path}")
            return False

        print(f"✅ 服务器已启动: {server_url}")
        print("✅ 服务器准备就绪!")

        # Step 2: synthesize through the API client
        print("\n🎙️ 开始语音合成...")

        client_cmd = [
            sys.executable, "tools/api_client.py",
            "--text", target_text,
            "--reference_audio", str(reference_audio),
            "--reference_text", reference_text,
            "--output", str(output_dir / "real_fish_speech_30s"),
            "--no-play",
            "--max_new_tokens", "2048",
            "--chunk_length", "300",
            "--top_p", "0.8",
            "--temperature", "0.8",
            "--repetition_penalty", "1.1",
            "--url", f"{server_url}/v1/tts",
            "--format", "wav",
        ]
        print(f"客户端命令: {' '.join(client_cmd)}")

        client_result = subprocess.run(
            client_cmd,
            capture_output=True,
            text=True,
            timeout=600,  # 10-minute limit
        )

        print("🎙️ 合成结果:")
        if client_result.stdout:
            print("输出:", client_result.stdout)
        if client_result.stderr:
            print("错误:", client_result.stderr)

        # The server is no longer needed once the client has finished.
        _stop_server(server_process)

        if client_result.returncode != 0:
            print("❌ 语音合成失败")
            return False

        print("✅ 语音合成成功!")

        if _report_generated_audio(output_dir):
            print("\n🎊 Fish Speech 语音克隆成功完成!")
            return True

        print("❌ 未找到生成的音频文件")
        return False

    except subprocess.TimeoutExpired:
        print("⏰ 操作超时")
        return False
    except Exception as e:
        print(f"❌ 执行失败: {e}")
        return False
    finally:
        # Runs on every exit path (including Ctrl-C) so the server is never
        # left running or unreaped.
        _stop_server(server_process)
if __name__ == "__main__":
    success = main()

    if not success:
        # Automatic synthesis failed: print a manual Web UI walkthrough and
        # a summary of the preparation steps, one entry per output line.
        fallback_lines = (
            "\n💔 备用方案: 使用现有工具...",
            # Manual operation guide
            "\n📋 手动操作指南:",
            "=" * 50,
            "1. 启动 Web UI:",
            " cd /root/tts/fish-speech",
            " python tools/run_webui.py \\",
            " --llama-checkpoint-path checkpoints/fish-speech-1.5/model.pth \\",
            " --decoder-checkpoint-path checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
            "",
            "2. 在浏览器中打开 Web UI 界面",
            "3. 上传参考音频: /root/tts/ben_guanquelou.wav",
            "4. 输入参考文本: 登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
            "5. 输入目标文本你提供的354字符文本",
            "6. 点击生成并等待结果",
            "=" * 50,
            "\n📦 已完成的准备工作:",
            "✅ Fish Speech 模型已从魔搭社区下载",
            "✅ 参考音频文件已准备",
            "✅ 模型文件完整性验证通过",
            "✅ 文本内容已确认",
        )
        for line in fallback_lines:
            print(line)