Initial commit for TTS project

This commit is contained in:
Ben
2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions

View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Fish Speech 直接命令行语音合成
不启动外部服务器,直接使用模型进行合成
"""
import os
import sys
import torch
from pathlib import Path
def direct_synthesis() -> bool:
    """Synthesize speech with Fish Speech in-process and save it as a WAV file.

    The function changes the working directory to the Fish Speech checkout,
    verifies that the model, decoder and reference-audio files exist, loads
    the decoder and the language model on the CPU, runs one TTS request that
    uses the reference audio/text pair, and writes the resulting audio to
    ``../audio_files/fish_speech_direct_output.wav`` (relative to the
    checkout). The written file is then re-opened with ``torchaudio`` purely
    to report its size, sample rate and duration.

    Side effects:
        * ``os.chdir`` into the Fish Speech directory (not restored).
        * The Fish Speech directory is prepended to ``sys.path``.
        * Progress messages are printed to stdout.

    Returns:
        True when audio was produced and written to disk (a failure of the
        follow-up validation step still counts as success); False when the
        checkout or a required file is missing, inference reports an error,
        no audio is produced, or any exception is raised while loading or
        running the models.
    """
    print("🎊 Fish Speech 直接语音合成")
    print("=" * 50)

    # Locate the checkout. A missing directory is reported like any other
    # missing prerequisite instead of letting os.chdir raise.
    fish_speech_dir = Path("/root/tts/fish-speech")
    if not fish_speech_dir.is_dir():
        print(f"❌ Fish Speech 目录不存在: {fish_speech_dir}")
        return False
    os.chdir(fish_speech_dir)

    # All paths below are relative to the checkout we just changed into.
    model_path = Path("checkpoints/fish-speech-1.5/model.pth")
    decoder_path = Path(
        "checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
    )
    reference_audio = Path("../ben_guanquelou.wav")
    output_file = Path("../audio_files/fish_speech_direct_output.wav")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Verify every required input before doing any expensive model loading.
    print("📦 检查文件...")
    required_files = [
        (model_path, "主模型"),
        (decoder_path, "解码器"),
        (reference_audio, "参考音频"),
    ]
    for file_path, name in required_files:
        if file_path.exists():
            size_mb = file_path.stat().st_size / (1024 * 1024)
            print(f"{name}: {file_path.name} ({size_mb:.1f}MB)")
        else:
            print(f"{name}: {file_path.name} (缺失)")
            return False

    # Transcript of the reference audio and the text to be synthesized.
    reference_text = "登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"
    # Triple single quotes are required here: the text ends with a double
    # quote, which would otherwise merge with a closing \"\"\" delimiter and
    # leave an unterminated string behind.
    target_text = '''我们习惯于赞美黄河之水天上来,习惯于歌颂大地的厚德载物。教科书告诉我们,河流是水循环的恩赐,大陆是漂浮在岩浆上的方舟。这是一个完美的、闭环的、温情脉脉的解释。但如果,这一切都是关于"摩擦力"的谎言呢?请试着像挤压一个注满水的海绵球一样,去想象我们脚下的这颗星球。当我们在长白山天池边,看着那并没有足够集雨面积的火山口,却日夜不息地向外喷涌出足以滋养三条大江的淡水时;当我们在巴颜卡拉山,看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时,我们是否应该问自己一个违背常识的问题:这些水,真的是从天上掉下来的吗?物理学告诉我们,毛细现象无法把水推向几千米的高原;简单的蒸发循环,也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。这背后,一定存在一个"第一推动"。它不是温柔的渗透,它是暴力的"挤压"'''
    print(f"\n📝 参考文本: {reference_text}")
    print(f"📝 目标文本长度: {len(target_text)} 字符")

    try:
        # Make the checkout importable so the fish_speech package resolves.
        sys.path.insert(0, str(fish_speech_dir))
        print("\n🔧 加载模型...")

        # Imported lazily: these only resolve after the sys.path tweak above.
        from fish_speech.models.dac.inference import load_model as load_decoder_model
        from fish_speech.models.text2semantic.inference import launch_thread_safe_queue
        from fish_speech.inference_engine import TTSInferenceEngine
        from fish_speech.utils.file import audio_to_bytes
        from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest
        print("✅ 模块导入成功")

        # CPU-only inference in full precision.
        device = "cpu"
        precision = torch.float32
        print(f"🖥️ 使用设备: {device}")

        print("📦 加载解码器...")
        decoder_model = load_decoder_model(
            config_name="modded_dac_vq",
            checkpoint_path=str(decoder_path),
            device=device,
        )
        print("✅ 解码器加载成功")

        print("🧠 加载语言模型...")
        llama_queue = launch_thread_safe_queue(
            checkpoint_path=str(model_path),
            device=device,
            precision=precision,
            compile=False,
        )
        print("✅ 语言模型加载成功")

        print("🎯 创建推理引擎...")
        inference_engine = TTSInferenceEngine(
            llama_queue=llama_queue,
            decoder_model=decoder_model,
            compile=False,
            precision=precision,
        )
        print("✅ 推理引擎创建成功")

        print("🎤 准备参考音频...")
        ref_audio = ServeReferenceAudio(
            audio=audio_to_bytes(str(reference_audio)),
            text=reference_text,
        )
        print("✅ 参考音频准备完成")

        print("🎙️ 开始语音合成...")
        request = ServeTTSRequest(
            text=target_text,
            references=[ref_audio],
            max_new_tokens=1024,
            chunk_length=200,
            top_p=0.7,
            repetition_penalty=1.2,
            temperature=0.7,
            format="wav",
        )
        print("🔄 正在生成音频(可能需要几分钟)...")

        # Consume results until the engine reports the final audio or an error.
        audio_data = None
        for result in inference_engine.inference(request):
            if result.code == "final":
                audio_data = result.audio
                print("✅ 音频生成完成!")
                break
            if result.code == "error":
                # NOTE(review): the attribute carrying the failure detail may
                # be `message` or `error` depending on the Fish Speech
                # version - confirm against the installed package.
                detail = getattr(result, "message", None) or getattr(
                    result, "error", None
                )
                print(f"❌ 推理错误: {detail}")
                return False

        if not audio_data:
            print("❌ 未能生成音频数据")
            return False

        # NOTE(review): this assumes `result.audio` is a bytes-like object;
        # verify against the installed engine. A non-bytes value makes
        # f.write raise, which the outer handler reports as a failure.
        with open(output_file, "wb") as f:
            f.write(audio_data)
        print(f"💾 音频已保存: {output_file}")

        # Best-effort validation: a failure here does not undo the success,
        # because the file has already been written.
        try:
            import torchaudio

            waveform, sample_rate = torchaudio.load(str(output_file))
            duration = waveform.shape[1] / sample_rate
            print("📊 音频信息:")
            print(f" 文件大小: {output_file.stat().st_size:,} bytes")
            print(f" 采样率: {sample_rate:,} Hz")
            print(f" 音频时长: {duration:.2f}")
            if duration >= 25:
                print("🎉 音频时长符合30秒要求!")
            else:
                print(f"⚠️ 音频时长为 {duration:.2f}")
        except Exception as e:
            print(f"⚠️ 无法验证音频: {e}")
        return True
    except Exception as e:
        # Top-level boundary for model loading / inference: report the error
        # with a traceback and signal failure to the caller.
        print(f"❌ 语音合成失败: {e}")
        import traceback

        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Script entry point: run the synthesis once and translate the outcome
    # into a process exit status so shell callers and pipelines can detect
    # failure (0 = success, 1 = failure or unexpected error, 130 = Ctrl-C).
    try:
        success = direct_synthesis()
    except KeyboardInterrupt:
        print("\n🛑 用户中断操作")
        sys.exit(130)
    except Exception as e:
        print(f"\n❌ 程序异常: {e}")
        sys.exit(1)

    if not success:
        print("\n💔 语音合成失败")
        sys.exit(1)

    print("\n🎊 Fish Speech 命令行语音合成成功!")
    print("📁 输出文件: /root/tts/audio_files/fish_speech_direct_output.wav")
    print("🔊 播放命令: aplay /root/tts/audio_files/fish_speech_direct_output.wav")