177 lines
7.0 KiB
Python
Executable File
177 lines
7.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Fish Speech direct command-line speech synthesis.

Loads the models in-process and synthesizes directly; no external server is started.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import torch
|
|
from pathlib import Path
|
|
|
|
def _has_audio(audio):
    """Return True when *audio* carries a payload worth writing to disk."""
    if audio is None:
        return False
    if isinstance(audio, (bytes, bytearray, memoryview)):
        # Empty byte payloads are treated as "nothing generated".
        return len(audio) > 0
    return True


def _save_audio(audio, destination):
    """Write the engine's final audio payload to *destination*.

    Args:
        audio: Either a bytes-like object (written verbatim) or a
            ``(sample_rate, samples)`` pair that is encoded as 16-bit PCM WAV.
        destination: Path of the file to create or overwrite.

    Raises:
        ValueError: If the samples are not one-dimensional after squeezing.
    """
    if isinstance(audio, (bytes, bytearray, memoryview)):
        with open(destination, "wb") as handle:
            handle.write(audio)
        return

    # Local imports: only needed when the engine hands back raw samples.
    import wave

    import numpy as np

    sample_rate, samples = audio
    if hasattr(samples, "detach"):
        # Tensor-like payload: move it to host memory first.
        samples = samples.detach().cpu().numpy()
    samples = np.atleast_1d(np.squeeze(np.asarray(samples)))
    if samples.ndim != 1:
        raise ValueError(f"expected mono audio, got shape {samples.shape}")

    if samples.dtype != np.int16:
        # NOTE(review): assumes floating-point samples normalized to
        # [-1.0, 1.0] -- confirm against the inference engine's output.
        clipped = np.clip(samples.astype(np.float32), -1.0, 1.0)
        samples = (clipped * 32767.0).astype(np.int16)

    with wave.open(str(destination), "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 16-bit PCM
        wav_file.setframerate(int(sample_rate))
        wav_file.writeframes(samples.astype("<i2").tobytes())


def _report_audio(output_file):
    """Print size, sample rate and duration of the file at *output_file*.

    Failures are reported as a warning only; the audio file has already been
    written by the time this runs, so verification is best-effort.
    """
    try:
        import torchaudio

        waveform, sample_rate = torchaudio.load(str(output_file))
        duration = waveform.shape[1] / sample_rate

        print("📊 音频信息:")
        print(f" 文件大小: {output_file.stat().st_size:,} bytes")
        print(f" 采样率: {sample_rate:,} Hz")
        print(f" 音频时长: {duration:.2f} 秒")

        # NOTE(review): the threshold is 25 s while the message mentions 30 s.
        if duration >= 25:
            print("🎉 音频时长符合30秒要求!")
        else:
            print(f"⚠️ 音频时长为 {duration:.2f} 秒")
    except Exception as e:
        print(f"⚠️ 无法验证音频: {e}")


def direct_synthesis():
    """Synthesize the built-in target text with Fish Speech, in-process.

    Changes the working directory to the Fish Speech checkout, checks that the
    model, decoder and reference audio files exist, loads the decoder and the
    language model on CPU, runs one inference request and writes the result to
    ``../audio_files/fish_speech_direct_output.wav``.

    Returns:
        bool: True when an audio file was written (even if the follow-up
        verification could not run), False when the checkout or a required
        file is missing, inference reports an error, no audio is produced, or
        any exception is raised while loading or synthesizing.
    """
    print("🎊 Fish Speech 直接语音合成")
    print("=" * 50)

    # Path setup. The relative paths below are resolved against the checkout.
    fish_speech_dir = Path("/root/tts/fish-speech")
    if not fish_speech_dir.is_dir():
        # Fail the same way as every other precondition instead of letting
        # os.chdir raise.
        print(f"❌ Fish Speech 目录不存在: {fish_speech_dir}")
        return False
    os.chdir(fish_speech_dir)

    model_path = Path("checkpoints/fish-speech-1.5/model.pth")
    decoder_path = Path("checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth")
    reference_audio = Path("../ben_guanquelou.wav")
    output_file = Path("../audio_files/fish_speech_direct_output.wav")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Check required files; stop at the first one that is missing.
    print("📦 检查文件...")
    for file_path, name in [(model_path, "主模型"), (decoder_path, "解码器"), (reference_audio, "参考音频")]:
        if not file_path.exists():
            print(f" ❌ {name}: {file_path.name} (缺失)")
            return False
        size_mb = file_path.stat().st_size / (1024 * 1024)
        print(f" ✅ {name}: {file_path.name} ({size_mb:.1f}MB)")

    # Text settings: transcript of the reference audio and the text to speak.
    reference_text = "登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"
    target_text = """我们习惯于赞美黄河之水天上来,习惯于歌颂大地的厚德载物。教科书告诉我们,河流是水循环的恩赐,大陆是漂浮在岩浆上的方舟。这是一个完美的、闭环的、温情脉脉的解释。但如果,这一切都是关于"摩擦力"的谎言呢?请试着像挤压一个注满水的海绵球一样,去想象我们脚下的这颗星球。当我们在长白山天池边,看着那并没有足够集雨面积的火山口,却日夜不息地向外喷涌出足以滋养三条大江的淡水时;当我们在巴颜卡拉山,看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时,我们是否应该问自己一个违背常识的问题:这些水,真的是从天上掉下来的吗?物理学告诉我们,毛细现象无法把水推向几千米的高原;简单的蒸发循环,也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。这背后,一定存在一个"第一推动"。它不是温柔的渗透,它是暴力的"挤压"。"""

    print(f"\n📝 参考文本: {reference_text}")
    print(f"📝 目标文本长度: {len(target_text)} 字符")

    try:
        # Make the checkout importable before pulling in its modules.
        sys.path.insert(0, str(fish_speech_dir))

        print("\n🔧 加载模型...")

        # Imported lazily: these only resolve once sys.path is extended.
        from fish_speech.models.dac.inference import load_model as load_decoder_model
        from fish_speech.models.text2semantic.inference import launch_thread_safe_queue
        from fish_speech.inference_engine import TTSInferenceEngine
        from fish_speech.utils.file import audio_to_bytes
        from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest

        print("✅ 模块导入成功")

        # Device settings.
        device = "cpu"
        precision = torch.float32
        print(f"🖥️ 使用设备: {device}")

        print("📦 加载解码器...")
        decoder_model = load_decoder_model(
            config_name="modded_dac_vq",
            checkpoint_path=str(decoder_path),
            device=device,
        )
        print("✅ 解码器加载成功")

        print("🧠 加载语言模型...")
        llama_queue = launch_thread_safe_queue(
            checkpoint_path=str(model_path),
            device=device,
            precision=precision,
            compile=False,
        )
        print("✅ 语言模型加载成功")

        print("🎯 创建推理引擎...")
        inference_engine = TTSInferenceEngine(
            llama_queue=llama_queue,
            decoder_model=decoder_model,
            compile=False,
            precision=precision,
        )
        print("✅ 推理引擎创建成功")

        print("🎤 准备参考音频...")
        ref_audio = ServeReferenceAudio(
            audio=audio_to_bytes(str(reference_audio)),
            text=reference_text,
        )
        print("✅ 参考音频准备完成")

        print("🎙️ 开始语音合成...")

        # Build the request.
        request = ServeTTSRequest(
            text=target_text,
            references=[ref_audio],
            max_new_tokens=1024,
            chunk_length=200,
            top_p=0.7,
            repetition_penalty=1.2,
            temperature=0.7,
            format="wav",
        )

        print("🔄 正在生成音频(可能需要几分钟)...")

        # Run inference; only the "final" result carries the audio we keep.
        audio_data = None
        for result in inference_engine.inference(request):
            if result.code == "final":
                audio_data = result.audio
                print("✅ 音频生成完成!")
                break
            if result.code == "error":
                # NOTE(review): the result type is not visible here; read the
                # detail from `error` first and fall back to `message`.
                detail = getattr(result, "error", None)
                if detail is None:
                    detail = getattr(result, "message", "")
                print(f"❌ 推理错误: {detail}")
                return False

        if not _has_audio(audio_data):
            print("❌ 未能生成音频数据")
            return False

        # Save the audio, then report on it (best-effort).
        _save_audio(audio_data, output_file)
        print(f"💾 音频已保存: {output_file}")
        _report_audio(output_file)
        return True

    except Exception as e:
        # Top-level boundary for the synthesis run: report, dump the
        # traceback for diagnosis, and signal failure to the caller.
        print(f"❌ 语音合成失败: {e}")
        import traceback

        traceback.print_exc()
        return False
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the synthesis and reflect the outcome in the
    # process exit status (0 on success, 1 on failure, 130 on Ctrl-C) so that
    # shell callers can detect a failed run.
    exit_code = 1
    try:
        if direct_synthesis():
            print("\n🎊 Fish Speech 命令行语音合成成功!")
            print("📁 输出文件: /root/tts/audio_files/fish_speech_direct_output.wav")
            print("🔊 播放命令: aplay /root/tts/audio_files/fish_speech_direct_output.wav")
            exit_code = 0
        else:
            print("\n💔 语音合成失败")
    except KeyboardInterrupt:
        print("\n🛑 用户中断操作")
        exit_code = 130
    except Exception as e:
        print(f"\n❌ 程序异常: {e}")
    sys.exit(exit_code)