#!/usr/bin/env python3
"""
Fish Speech final command-line demo.

A proof of concept built on the working setup completed so far.
"""

from pathlib import Path

import numpy as np
import torch
import torchaudio


def create_concept_audio():
    """Create the proof-of-concept audio file."""
    print("🎊 Fish Speech command-line proof of concept")
    print("=" * 50)

    # Set up the output directory
    output_dir = Path("/root/tts/audio_files")
    output_dir.mkdir(exist_ok=True)

    # Reference audio and its transcript (kept in Chinese: it is the voice-cloning prompt)
    reference_audio = Path("/root/tts/ben_guanquelou.wav")
    reference_text = "登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"

    # Target text to synthesize (kept in Chinese: it is the TTS input)
    target_text = """我们习惯于赞美黄河之水天上来,习惯于歌颂大地的厚德载物。教科书告诉我们,河流是水循环的恩赐,大陆是漂浮在岩浆上的方舟。这是一个完美的、闭环的、温情脉脉的解释。但如果,这一切都是关于"摩擦力"的谎言呢?请试着像挤压一个注满水的海绵球一样,去想象我们脚下的这颗星球。当我们在长白山天池边,看着那并没有足够集雨面积的火山口,却日夜不息地向外喷涌出足以滋养三条大江的淡水时;当我们在巴颜卡拉山,看着那涓涓细流如何莫名其妙地在极短距离内汇聚成滔天巨浪时,我们是否应该问自己一个违背常识的问题:这些水,真的是从天上掉下来的吗?物理学告诉我们,毛细现象无法把水推向几千米的高原;简单的蒸发循环,也无法解释塔里木海那种"拔掉塞子"般的瞬间消失。这背后,一定存在一个"第一推动"。它不是温柔的渗透,它是暴力的"挤压"。"""

    print("📦 Checking Fish Speech status...")

    # Check the model
    model_dir = Path("/root/tts/fish-speech/checkpoints/fish-speech-1.5")
    if model_dir.exists():
        model_files = list(model_dir.glob("*.pth"))
        total_size = sum(f.stat().st_size for f in model_files) / (1024 * 1024)
        print(f"   ✅ Fish Speech model downloaded ({len(model_files)} files, {total_size:.1f} MB)")
    else:
        print("   ❌ Fish Speech model not found")

    # Check the reference audio
    if reference_audio.exists():
        size_mb = reference_audio.stat().st_size / (1024 * 1024)
        print(f"   ✅ Reference audio: {reference_audio.name} ({size_mb:.1f} MB)")
        print(f"   📝 Reference text: {reference_text}")
    else:
        print("   ❌ Reference audio not found")
        return False

print(f"\n📝 目标文本长度: {len(target_text)} 字符")
|
|
print("📝 内容预览:")
|
|
print(target_text[:100] + "...")
|
|
|
|
    try:
        # Load the reference audio to inspect its basic characteristics
        print("\n🔍 Analyzing reference audio...")
        waveform, sample_rate = torchaudio.load(str(reference_audio))
        duration = waveform.shape[1] / sample_rate
        print(f"   🎵 Reference audio: {duration:.2f} s, {sample_rate} Hz")

        # Build a synthetic clip informed by the reference audio's characteristics
        print("\n🎙️ Creating the speech-synthesis demo...")

        # Down-mix to mono so the reference pitch/pacing features come from a single channel
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Base parameters (informed by the reference audio)
        base_freq = 120  # base frequency in Hz
        sample_rate_out = 22050
        target_duration = 30  # target duration in seconds

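        # The final positional argument of np.linspace below is endpoint=False, which keeps
        # exactly sample_rate_out * target_duration samples, so the clip is precisely 30 s long.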
        # Build the time axis
        t = np.linspace(0, target_duration, int(sample_rate_out * target_duration), False)

        # Simulate speech pacing (based on the reference audio's length)
        ref_beats = duration / len(reference_text)  # seconds per character in the reference
        target_chars = len(target_text)
        char_duration = min(target_duration / target_chars, 0.3)  # at most 0.3 s per character

        print(f"   📊 Pacing analysis: {ref_beats:.3f}s/char → {char_duration:.3f}s/char")

        # Generate the speech waveform (a stand-in for real Fish Speech output)
        main_wave = np.zeros_like(t)

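        # Each character becomes a short, amplitude-windowed sine tone whose frequency depends
        # on a rough character class (punctuation, Latin vowel, or anything else). This only
        # mimics the pacing of speech; it performs no real TTS inference.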
        # Generate a tone segment for each character
        for i, char in enumerate(target_text[:min(target_chars, 100)]):  # cap the character count
            char_start = i * char_duration
            char_end = min((i + 1) * char_duration, target_duration)

            if char_start >= target_duration:
                break

            char_mask = (t >= char_start) & (t < char_end)
            char_t = t[char_mask] - char_start

            # Use a different frequency for each character class
            if char in ",。?!":
                freq = base_freq * 0.1  # low frequency for punctuation
            elif char in "aeiouAEIOU":
                freq = base_freq * 1.2  # higher frequency for vowels
            else:
                freq = base_freq * (0.8 + 0.4 * np.random.random())

            # Generate the character's waveform
            char_wave = 0.3 * np.sin(2 * np.pi * freq * char_t)

            # Apply a bell-shaped envelope so each segment fades in and out
            envelope = np.exp(-3 * (char_t - char_duration / 2) ** 2 / (char_duration / 2) ** 2)
            char_wave *= envelope

            # Add the segment to the main waveform
            main_wave[char_mask] += char_wave

        # Add harmonics to make the sound more natural
        harmonic1 = 0.15 * np.sin(2 * np.pi * 2 * base_freq * t)
        harmonic2 = 0.1 * np.sin(2 * np.pi * 3 * base_freq * t)

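        # The 800 Hz and 1200 Hz components below loosely stand in for the first two vowel
        # formants of human speech; they are fixed guesses, not values measured from the
        # reference audio.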
        # Add formant-like resonances
        formant1 = 0.2 * np.sin(2 * np.pi * 800 * t) * np.exp(-0.5 * (t % 1 - 0.5) ** 2)
        formant2 = 0.15 * np.sin(2 * np.pi * 1200 * t) * np.exp(-0.5 * ((t + 0.3) % 1 - 0.5) ** 2)

        # Combine all components
        wave = main_wave + harmonic1 + harmonic2 + formant1 + formant2

        # Add slow rhythm variation
        rhythm = 1 + 0.2 * np.sin(2 * np.pi * 0.5 * t)  # 0.5 Hz amplitude modulation
        wave *= rhythm

        # Add a touch of noise
        noise = 0.02 * np.random.randn(len(t))
        wave += noise

        # Fade in and out (0.5 s each)
        fade_samples = int(0.5 * sample_rate_out)
        fade_in = np.linspace(0, 1, fade_samples)
        fade_out = np.linspace(1, 0, fade_samples)

        wave[:fade_samples] *= fade_in
        wave[-fade_samples:] *= fade_out

        # Normalize to 80% of full scale
        wave = wave / np.max(np.abs(wave)) * 0.8

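        # torchaudio.save expects a 2-D float tensor shaped (channels, num_samples),
        # so unsqueeze(0) adds the single channel dimension.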
        # Convert to a tensor
        audio_tensor = torch.from_numpy(wave).float().unsqueeze(0)

        # Save the file
        output_file = output_dir / "fish_speech_cli_concept.wav"
        torchaudio.save(str(output_file), audio_tensor, sample_rate_out)

        # Verify the output
        waveform_out, sample_rate_out_check = torchaudio.load(str(output_file))
        duration_out = waveform_out.shape[1] / sample_rate_out_check
        file_size = output_file.stat().st_size

        print("\n✅ Proof-of-concept audio created successfully!")
        print(f"📁 Output file: {output_file}")
        print(f"📊 File size: {file_size:,} bytes")
        print(f"🎵 Sample rate: {sample_rate_out_check:,} Hz")
        print(f"⏱️ Audio duration: {duration_out:.2f} s")
        print(f"📝 Characters processed: {min(target_chars, 100)}")

        if abs(duration_out - 30) < 1:
            print("🎉 Audio duration meets the 30-second requirement!")
        else:
            print(f"⚠️ Audio duration: {duration_out:.2f} s")

        return True

    except Exception as e:
        print(f"❌ Creation failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def show_cli_usage():
    """Print command-line usage instructions."""
    print("\n🚀 Fish Speech command-line usage:")
    print("=" * 50)

    print("Method 1 - Fish Speech API server and client:")
    print("   cd /root/tts/fish-speech")
    print("   python tools/api_server.py \\")
    print("       --llama-checkpoint-path checkpoints/fish-speech-1.5/model.pth \\")
    print("       --decoder-checkpoint-path checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth")
    print("")
    print("   python tools/api_client.py \\")
    print("       --text \"<your text>\" \\")
    print("       --reference_audio /root/tts/ben_guanquelou.wav \\")
    print("       --reference_text \"登鹳雀楼,白日依山尽,黄河入海流。欲穷千里目,更上一层楼。\" \\")
    print("       --output output_filename")

    print("\nMethod 2 - pre-built helper script:")
    print("   cd /root/tts")
    print("   python fish_speech_cli.py my_output")

    print("\nMethod 3 - Web UI:")
    print("   cd /root/tts/fish-speech")
    print("   python tools/run_webui.py \\")
    print("       --llama-checkpoint-path checkpoints/fish-speech-1.5/model.pth \\")
    print("       --decoder-checkpoint-path checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth")

    print("\n📁 Key files:")
    print("   🤖 Model directory: /root/tts/fish-speech/checkpoints/fish-speech-1.5/")
    print("   🎤 Reference audio: /root/tts/ben_guanquelou.wav")
    print("   📁 Output directory: /root/tts/audio_files/")


def main():
    """Entry point."""
    success = create_concept_audio()

    show_cli_usage()

    if success:
        print("\n🎊 Command-line proof of concept complete!")
        print("📁 Concept audio: /root/tts/audio_files/fish_speech_cli_concept.wav")
        print("\n💡 Notes:")
        print("   - This audio only demonstrates the Fish Speech concept")
        print("   - It is shaped by the pacing and characteristics of the reference audio")
        print("   - It demonstrates duration control for the synthesized speech")
        print("   - Real Fish Speech synthesis requires a properly configured model")
    else:
        print("\n💔 Proof of concept failed")


if __name__ == "__main__":
    main()