# tts/scripts/generation/generate_moss_ttsd_podcast.py

#!/usr/bin/env python3
"""
MOSS-TTSD 播客生成器 - 简化版
直接生成到 /root/tts/podcast_audios/
"""

import os
import subprocess
import sys

# Configuration.
# OUTPUT_DIR: directory that receives the finished <output_name>.wav files.
# MODEL_DIR: MOSS-TTSD checkout; inference.py and the MOSS-TTSD-v0.7 model
# directory are looked up beneath it.
OUTPUT_DIR = "/root/tts/podcast_audios"
MODEL_DIR = "/root/tts/MOSS-TTSD"

def _probe_duration(audio_path):
    """Return the audio duration as display text, or "未知" if it cannot be read.

    The duration is taken from the first ``duration=`` line that
    ``ffprobe -show_streams`` prints. Any failure (ffprobe missing, non-zero
    exit status, no duration line, non-numeric value) yields "未知" instead of
    raising, because the duration is purely informational.
    """
    try:
        probe_result = subprocess.run(
            ["ffprobe", audio_path, "-v", "quiet", "-show_streams"],
            capture_output=True, text=True
        )
    except OSError:
        # ffprobe is not installed or cannot be executed.
        return "未知"

    if probe_result.returncode != 0:
        return "未知"

    for line in probe_result.stdout.splitlines():
        if line.startswith("duration="):
            try:
                return f"{float(line.partition('=')[2]):.1f}秒"
            except ValueError:
                # The value was not a number, so there is nothing to format.
                return "未知"
    return "未知"


def generate_podcast(script_file, output_name):
    """
    Generate a podcast from a dialogue script and save it under OUTPUT_DIR.

    The script text is wrapped into a one-record JSONL file, MOSS-TTSD's
    inference.py is run on it in a subprocess, and the resulting
    ``output_0.wav`` is copied to ``OUTPUT_DIR/<output_name>.wav``. Progress
    and a short summary (path, size, duration) are printed to stdout.

    Args:
        script_file: Path to the dialogue script (.txt containing
            [S1] / [S2] speaker tags).
        output_name: Output file name without the ``.wav`` suffix.

    Returns:
        True when the audio was generated and saved; False when the model or
        the script is missing, the script is empty, inference failed, no
        audio was produced, or the result could not be written.
    """
    import json
    import shutil
    import tempfile

    print(f"🎙️ 生成播客: {output_name}")
    print("=" * 50)

    # The model must already be downloaded.
    if not os.path.exists(f"{MODEL_DIR}/MOSS-TTSD-v0.7"):
        print("❌ MOSS-TTSD模型未下载")
        return False

    # isfile (not exists) so that a directory is rejected here instead of
    # crashing in open() below.
    if not os.path.isfile(script_file):
        print(f"❌ 脚本文件不存在: {script_file}")
        return False

    with open(script_file, 'r', encoding='utf-8') as f:
        script_text = f.read().strip()

    # Nothing to synthesize; avoid launching the subprocess for nothing.
    if not script_text:
        print(f"❌ 脚本文件为空: {script_file}")
        return False

    # Single dialogue record: the script text plus the reference audio and
    # transcript for each of the two speakers.
    dialogue_data = {
        "id": 1,
        "base_path": "/root/tts/hosts",
        "text": script_text,
        "prompt_audio_speaker1": "ben_guanquelou.wav",
        "prompt_text_speaker1": "白日依山尽，黄河入海流，欲穷千里目，更上一层楼。",
        "prompt_audio_speaker2": "judy_dalingtaohua_trim.wav",
        "prompt_text_speaker2": "大林寺桃花，人间四月芳菲尽，山寺桃花始盛开。"
    }

    output_path = f"{OUTPUT_DIR}/{output_name}.wav"

    # A private working directory replaces the shared /tmp: a stale
    # output_0.wav from an earlier run can no longer be mistaken for a fresh
    # result, concurrent runs do not collide, and every temporary file is
    # removed on all exit paths, including exceptions.
    with tempfile.TemporaryDirectory(prefix="moss_ttsd_") as work_dir:
        temp_jsonl = os.path.join(work_dir, "dialogue.jsonl")
        with open(temp_jsonl, 'w', encoding='utf-8') as f:
            json.dump(dialogue_data, f, ensure_ascii=False)
            f.write('\n')

        audio_dir = os.path.join(work_dir, "audio")
        os.makedirs(audio_dir)

        print(f"✅ 脚本加载成功: {len(script_text)} 字符")

        print("🎬 正在生成音频...")
        cmd = [
            sys.executable, f"{MODEL_DIR}/inference.py",
            "--jsonl", temp_jsonl,
            "--output_dir", audio_dir,
            "--attn_implementation", "sdpa",
            "--use_normalize",
            "--silence_duration", "0.12",
            "--seed", "42"
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print("❌ 音频生成失败")
            print(result.stderr)
            return False

        # The audio for the single record is expected as output_0.wav inside
        # the output directory.
        temp_audio = os.path.join(audio_dir, "output_0.wav")
        if not os.path.exists(temp_audio):
            print("❌ 音频文件未生成")
            return False

        # Copy while the working directory still exists; create the target
        # directory on demand and report failures instead of raising.
        try:
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            shutil.copyfile(temp_audio, output_path)
        except OSError as exc:
            print(f"❌ 无法保存音频文件: {exc}")
            return False

    duration = _probe_duration(output_path)
    file_size = os.path.getsize(output_path) / (1024 * 1024)

    print(f"✅ 生成成功！")
    print(f"📁 文件位置: {output_path}")
    print(f"📊 文件大小: {file_size:.1f}MB")
    print(f"⏱️  音频时长: {duration}")
    print()
    print("🎧 播放命令:")
    print(f"   ffplay {output_path}")
    print(f"   # 或")
    print(f"   aplay {output_path}")

    return True

def main():
    """Command-line entry point: ``<script_file> <output_name>``.

    Prints usage and exits with status 1 unless exactly two arguments are
    given. Otherwise runs generate_podcast and exits with status 1 when it
    reports failure, so shell callers can detect a failed generation.
    """
    if len(sys.argv) != 3:
        print("用法:")
        print(f"  {sys.argv[0]} <脚本文件> <输出名称>")
        print()
        print("示例:")
        print(f"  {sys.argv[0]} chapter8_script.txt chapter8_demo")
        print()
        print("脚本文件格式: 纯文本，包含[S1] [S2]标签")
        print("输出名称: 不需要加.wav后缀")
        sys.exit(1)

    script_file, output_name = sys.argv[1:3]

    # Propagate failure through the exit status instead of always exiting 0.
    if not generate_podcast(script_file, output_name):
        sys.exit(1)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()