Initial commit for TTS project

2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions
--- a/scripts/generation/generate_accent_demo.py
+++ b/scripts/generation/generate_accent_demo.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""
+Generate accent demos using VoxCPM
+Supports: Indian, Russian, Singaporean, Hong Kong English accents
+"""
+
+import os
+import numpy as np
+import soundfile as sf
+from voxcpm import VoxCPM
+
+def generate_accent_demo(model, text, accent_name, output_dir="accent_demos"):
+    """Generate accent demo audio"""
+    os.makedirs(output_dir, exist_ok=True)
+    
+    # Define reference audio paths (will be created if not exist)
+    ref_audio_map = {
+        "indian": "reference_indian.wav",
+        "russian": "reference_russian.wav",
+        "singaporean": "reference_singaporean.wav",
+        "hongkong": "reference_hongkong.wav"
+    }
+    
+    # Define reference texts that demonstrate accent characteristics
+    ref_text_map = {
+        "indian": "Hello, how are you doing today? I'm from Mumbai, India. The weather here is quite warm and humid during the summer months. Would you like to try some delicious Indian cuisine with me?",
+        "russian": "Hello, how are you doing today? I'm from Moscow, Russia. The winters here are very cold, with lots of snow and ice. But the summers are beautiful and sunny. Would you like to visit the Red Square with me?",
+        "singaporean": "Hello, how are you doing today? I'm from Singapore. It's a small but vibrant city-state in Southeast Asia. We have delicious hawker food and beautiful gardens. Would you like to try some chicken rice with me?",
+        "hongkong": "Hello, how are you doing today? I'm from Hong Kong. It's a bustling metropolitan city with amazing skyline and delicious food. We have dim sum, roast goose, and many other Cantonese delicacies. Would you like to go shopping in Causeway Bay with me?"
+    }
+    
+    ref_audio = ref_audio_map.get(accent_name)
+    ref_text = ref_text_map.get(accent_name)
+    
+    if not ref_audio or not ref_text:
+        print(f"Invalid accent name: {accent_name}")
+        return
+    
+    # Check if reference audio exists (if not, we'll generate it using default voice)
+    if not os.path.exists(ref_audio):
+        print(f"Reference audio not found for {accent_name}, generating with default voice...")
+        # Generate reference audio using default voice
+        audio = model.generate(
+            text=ref_text,
+            cfg_value=2.0,
+            inference_timesteps=20
+        )
+        sf.write(ref_audio, audio, 24000)
+        print(f"Generated reference audio: {ref_audio}")
+    
+    # Generate accent demo
+    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
+    print(f"Generating {accent_name} accent demo...")
+    
+    audio = model.generate(
+        text=text,
+        prompt_wav_path=ref_audio,
+        prompt_text=ref_text,
+        cfg_value=2.0,
+        inference_timesteps=20
+    )
+    
+    sf.write(output_file, audio, 24000)
+    print(f"Generated {accent_name} accent demo: {output_file}")
+    return output_file
+
+def generate_cantonese_pinyin_demo(model, text, pinyin, output_dir="accent_demos"):
+    """Generate Cantonese pinyin demo"""
+    os.makedirs(output_dir, exist_ok=True)
+    
+    # Generate reference audio for Cantonese accent
+    ref_audio = "reference_cantonese.wav"
+    ref_text = "你好，我是张学友。很高兴认识你。我喜欢唱歌和表演。希望你喜欢我的音乐。"
+    
+    if not os.path.exists(ref_audio):
+        print("Generating Cantonese reference audio...")
+        audio = model.generate(
+            text=ref_text,
+            cfg_value=2.0,
+            inference_timesteps=20
+        )
+        sf.write(ref_audio, audio, 24000)
+        print(f"Generated Cantonese reference audio: {ref_audio}")
+    
+    # Generate Cantonese pinyin demo
+    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
+    print("Generating Cantonese pinyin demo...")
+    
+    audio = model.generate(
+        text=pinyin,
+        prompt_wav_path=ref_audio,
+        prompt_text=ref_text,
+        cfg_value=2.0,
+        inference_timesteps=20
+    )
+    
+    sf.write(output_file, audio, 24000)
+    print(f"Generated Cantonese pinyin demo: {output_file}")
+    return output_file
+
+if __name__ == "__main__":
+    # Initialize VoxCPM
+    print("Initializing VoxCPM...")
+    model = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")
+    
+    # Test sentence
+    test_text = "Hello everyone, welcome to our podcast. Today we're going to discuss various accents from around the world. I hope you enjoy this episode!"
+    
+    # Generate accent demos
+    accents = ["indian", "russian", "singaporean", "hongkong"]
+    for accent in accents:
+        generate_accent_demo(model, test_text, accent)
+    
+    # Generate Cantonese pinyin demo (Jacky Cheung)
+    cantonese_text = "张学友是香港著名歌手，被誉为歌神。他的歌声深情动人，深受歌迷喜爱。"
+    cantonese_pinyin = "{zoeng1}{hau2}{juk6} {si6} {hoeng1}{gong2} {zyu4}{ming4} {go1}{sau2}，{bei6}{jyu6} {go1}{san4}。{taa1} {dik1} {go1}{sing1} {sam1}{cing4} {dung6}{jan4}，{sam1}{sau6} {go1}{mai4} {hei2}{oi3}。"
+    generate_cantonese_pinyin_demo(model, cantonese_text, cantonese_pinyin)
+    
+    print("All demos generated successfully!")