#!/usr/bin/env python3
"""
Fish Speech direct command-line speech synthesis.

Runs inference directly against the local model checkpoints instead of
starting an external server.
"""

import os
import sys
from pathlib import Path

import torch


def _check_required_files(files):
    """Print a size/presence report for each (path, label) pair.

    Returns True only if every file exists; prints a per-file line either way.
    """
    print("📦 检查文件...")
    for file_path, name in files:
        if file_path.exists():
            size_mb = file_path.stat().st_size / (1024 * 1024)
            print(f"   ✅ {name}: {file_path.name} ({size_mb:.1f}MB)")
        else:
            print(f"   ❌ {name}: {file_path.name} (缺失)")
            return False
    return True


def _verify_output(output_file):
    """Best-effort validation of the generated WAV file via torchaudio.

    Prints size / sample-rate / duration info. Verification failure is not
    treated as a synthesis failure, so this never raises.
    """
    try:
        # Imported lazily: verification is optional and torchaudio may be absent.
        import torchaudio

        waveform, sample_rate = torchaudio.load(str(output_file))
        duration = waveform.shape[1] / sample_rate
        print(f"📊 音频信息:")
        print(f"   文件大小: {output_file.stat().st_size:,} bytes")
        print(f"   采样率: {sample_rate:,} Hz")
        print(f"   音频时长: {duration:.2f} 秒")
        # NOTE(review): threshold is 25s although the message mentions a 30s
        # requirement — presumably a deliberate ~5s tolerance; confirm.
        if duration >= 25:
            print("🎉 音频时长符合30秒要求!")
        else:
            print(f"⚠️ 音频时长为 {duration:.2f} 秒")
    except Exception as e:
        print(f"⚠️ 无法验证音频: {e}")


def direct_synthesis():
    """Synthesize speech directly with the Fish Speech 1.5 checkpoints.

    Loads the decoder and language model on CPU, builds a TTS inference
    engine, conditions it on a reference audio/text pair, and writes the
    generated WAV to ``../audio_files/fish_speech_direct_output.wav``
    (relative to the fish-speech checkout).

    Returns:
        bool: True if an audio file was produced, False on any failure
        (missing files, inference error, unexpected exception).
    """
    print("🎊 Fish Speech 直接语音合成")
    print("=" * 50)

    # Paths are resolved relative to the fish-speech checkout, so chdir first.
    fish_speech_dir = Path("/root/tts/fish-speech")
    os.chdir(fish_speech_dir)

    model_path = Path("checkpoints/fish-speech-1.5/model.pth")
    decoder_path = Path("checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth")
    reference_audio = Path("../ben_guanquelou.wav")
    output_file = Path("../audio_files/fish_speech_direct_output.wav")
    # parents=True so a missing intermediate directory does not raise.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    if not _check_required_files(
        [(model_path, "主模型"), (decoder_path, "解码器"), (reference_audio, "参考音频")]
    ):
        return False

    # Transcript of the reference audio (voice-cloning prompt) and the text
    # to synthesize in that voice.
    reference_text = "登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"
    target_text = """我们习惯于赞美黄河之水天上来,习惯于歌颂大地的厚德载物。教科书告诉我们,河流是水循环的恩赐,大陆是漂浮在岩浆上的方舟。这是一个完美的、闭环的、温情脉脉的解释。但如果,这一切都是关于"摩擦力"的谎言呢?请试着像挤压一个注满水的海绵球一样,去想象我们脚下的这颗星球。当我们在长白山天池边,看着那并没有足够集雨面积的火山口,却日夜不息地向外喷涌出足以滋养三条大江的淡水时;当我们在巴颜卡拉山,看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时,我们是否应该问自己一个违背常识的问题:这些水,真的是从天上掉下来的吗?物理学告诉我们,毛细现象无法把水推向几千米的高原;简单的蒸发循环,也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。这背后,一定存在一个"第一推动"。它不是温柔的渗透,它是暴力的"挤压"。"""

    print(f"\n📝 参考文本: {reference_text}")
    print(f"📝 目标文本长度: {len(target_text)} 字符")

    try:
        # Make the fish-speech package importable from the checkout.
        sys.path.insert(0, str(fish_speech_dir))

        print("\n🔧 加载模型...")

        # Project-local imports; only valid after the sys.path insertion above.
        from fish_speech.models.dac.inference import load_model as load_decoder_model
        from fish_speech.models.text2semantic.inference import launch_thread_safe_queue
        from fish_speech.inference_engine import TTSInferenceEngine
        from fish_speech.utils.file import audio_to_bytes
        from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest

        print("✅ 模块导入成功")

        # CPU-only inference at float32.
        device = "cpu"
        precision = torch.float32
        print(f"🖥️ 使用设备: {device}")

        print("📦 加载解码器...")
        decoder_model = load_decoder_model(
            config_name="modded_dac_vq",
            checkpoint_path=str(decoder_path),
            device=device,
        )
        print("✅ 解码器加载成功")

        print("🧠 加载语言模型...")
        llama_queue = launch_thread_safe_queue(
            checkpoint_path=str(model_path),
            device=device,
            precision=precision,
            compile=False,
        )
        print("✅ 语言模型加载成功")

        print("🎯 创建推理引擎...")
        inference_engine = TTSInferenceEngine(
            llama_queue=llama_queue,
            decoder_model=decoder_model,
            compile=False,
            precision=precision,
        )
        print("✅ 推理引擎创建成功")

        print("🎤 准备参考音频...")
        ref_audio = ServeReferenceAudio(
            audio=audio_to_bytes(str(reference_audio)),
            text=reference_text,
        )
        print("✅ 参考音频准备完成")

        print("🎙️ 开始语音合成...")
        request = ServeTTSRequest(
            text=target_text,
            references=[ref_audio],
            max_new_tokens=1024,
            chunk_length=200,
            top_p=0.7,
            repetition_penalty=1.2,
            temperature=0.7,
            format="wav",
        )

        print("🔄 正在生成音频(可能需要几分钟)...")

        # The engine streams results; we only keep the final audio payload.
        audio_data = None
        for result in inference_engine.inference(request):
            if result.code == "final":
                audio_data = result.audio
                print("✅ 音频生成完成!")
                break
            elif result.code == "error":
                print(f"❌ 推理错误: {result.message}")
                return False

        if audio_data:
            with open(output_file, "wb") as f:
                f.write(audio_data)
            print(f"💾 音频已保存: {output_file}")
            # Verification is advisory only; synthesis already succeeded.
            _verify_output(output_file)
            return True
        else:
            print("❌ 未能生成音频数据")
            return False

    except Exception as e:
        # Top-level boundary for the whole pipeline: report and fail cleanly.
        print(f"❌ 语音合成失败: {e}")
        import traceback

        traceback.print_exc()
        return False


if __name__ == "__main__":
    try:
        success = direct_synthesis()
        if success:
            print("\n🎊 Fish Speech 命令行语音合成成功!")
            print("📁 输出文件: /root/tts/audio_files/fish_speech_direct_output.wav")
            print("🔊 播放命令: aplay /root/tts/audio_files/fish_speech_direct_output.wav")
        else:
            print("\n💔 语音合成失败")
    except KeyboardInterrupt:
        print("\n🛑 用户中断操作")
    except Exception as e:
        print(f"\n❌ 程序异常: {e}")