Initial commit for TTS project

2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions
--- a/scripts/generation/generate_moss_ttsd_podcast.py
+++ b/scripts/generation/generate_moss_ttsd_podcast.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+MOSS-TTSD podcast generator - simplified version.
+Writes the generated audio directly to /root/tts/podcast_audios/
+"""
+
+import os
+import subprocess
+import sys
+
+# Configuration
+# OUTPUT_DIR: directory that receives the final "<output_name>.wav" file.
+# MODEL_DIR: MOSS-TTSD checkout; inference.py and the MOSS-TTSD-v0.7 folder are looked up here.
+OUTPUT_DIR = "/root/tts/podcast_audios"
+MODEL_DIR = "/root/tts/MOSS-TTSD"
+
+def generate_podcast(script_file, output_name):
+    """
+    生成播客并直接保存到 podcast_audios
+    
+    参数:
+        script_file: 对话脚本文件路径 (.txt格式，包含[S1] [S2]标签)
+        output_name: 输出文件名 (不需要.wav后缀)
+    """
+    
+    print(f"🎙️ 生成播客: {output_name}")
+    print("=" * 50)
+    
+    # 检查模型
+    if not os.path.exists(f"{MODEL_DIR}/MOSS-TTSD-v0.7"):
+        print("❌ MOSS-TTSD模型未下载")
+        return False
+    
+    # 检查脚本文件
+    if not os.path.exists(script_file):
+        print(f"❌ 脚本文件不存在: {script_file}")
+        return False
+    
+    # 创建临时JSONL文件
+    import json
+    import tempfile
+    
+    # 读取脚本
+    with open(script_file, 'r', encoding='utf-8') as f:
+        script_text = f.read().strip()
+    
+    # 创建对话数据
+    dialogue_data = {
+        "id": 1,
+        "base_path": "/root/tts/hosts",
+        "text": script_text,
+        "prompt_audio_speaker1": "ben_guanquelou.wav",
+        "prompt_text_speaker1": "白日依山尽，黄河入海流，欲穷千里目，更上一层楼。",
+        "prompt_audio_speaker2": "judy_dalingtaohua_trim.wav",
+        "prompt_text_speaker2": "大林寺桃花，人间四月芳菲尽，山寺桃花始盛开。"
+    }
+    
+    # 创建临时文件
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, encoding='utf-8') as f:
+        json.dump(dialogue_data, f, ensure_ascii=False)
+        f.write('\n')
+        temp_jsonl = f.name
+    
+    print(f"✅ 脚本加载成功: {len(script_text)} 字符")
+    
+    # 生成音频到临时位置
+    print("🎬 正在生成音频...")
+    cmd = [
+        sys.executable, f"{MODEL_DIR}/inference.py",
+        "--jsonl", temp_jsonl,
+        "--output_dir", "/tmp",
+        "--attn_implementation", "sdpa",
+        "--use_normalize",
+        "--silence_duration", "0.12",
+        "--seed", "42"
+    ]
+    
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    
+    # 删除临时JSONL文件
+    os.unlink(temp_jsonl)
+    
+    if result.returncode != 0:
+        print("❌ 音频生成失败")
+        print(result.stderr)
+        return False
+    
+    # 检查生成的音频
+    temp_audio = "/tmp/output_0.wav"
+    if not os.path.exists(temp_audio):
+        print("❌ 音频文件未生成")
+        return False
+    
+    # 复制到目标位置
+    output_path = f"{OUTPUT_DIR}/{output_name}.wav"
+    subprocess.run(["cp", temp_audio, output_path], check=True)
+    os.unlink(temp_audio)
+    
+    # 获取音频信息
+    probe_result = subprocess.run(
+        ["ffprobe", output_path, "-v", "quiet", "-show_streams"],
+        capture_output=True, text=True
+    )
+    
+    duration = "未知"
+    if probe_result.returncode == 0:
+        for line in probe_result.stdout.split('\n'):
+            if line.startswith("duration="):
+                duration = f"{float(line.split('=')[1]):.1f}秒"
+                break
+    
+    file_size = os.path.getsize(output_path) / (1024 * 1024)
+    
+    print(f"✅ 生成成功！")
+    print(f"📁 文件位置: {output_path}")
+    print(f"📊 文件大小: {file_size:.1f}MB")
+    print(f"⏱️  音频时长: {duration}")
+    print()
+    print("🎧 播放命令:")
+    print(f"   ffplay {output_path}")
+    print(f"   # 或")
+    print(f"   aplay {output_path}")
+    
+    return True
+
+def main():
+    if len(sys.argv) != 3:
+        print("用法:")
+        print(f"  {sys.argv[0]} <脚本文件> <输出名称>")
+        print()
+        print("示例:")
+        print(f"  {sys.argv[0]} chapter8_script.txt chapter8_demo")
+        print()
+        print("脚本文件格式: 纯文本，包含[S1] [S2]标签")
+        print("输出名称: 不需要加.wav后缀")
+        sys.exit(1)
+    
+    script_file = sys.argv[1]
+    output_name = sys.argv[2]
+    
+    generate_podcast(script_file, output_name)
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()