#!/usr/bin/env python3
"""
Voice cloning test using VoxCPM
Reference audio: hosts/ben_guanquelou.wav
Reference text: 登鹳雀楼

Synthesizes every entry of TEST_TEXTS with the voice of the reference
recording and writes one WAV file per entry into OUTPUT_DIR.  The process
exits with status 0 only when every entry was generated in this run.
"""
import os
import re
import sys
import traceback

import numpy as np

try:
    import soundfile as sf
except ImportError:
    # Reported by main() with the same friendly message style used for the
    # VoxCPM import, instead of a raw traceback at start-up.
    sf = None

# Paths
WORKSPACE = "/root/tts"
REFERENCE_FILE = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
OUTPUT_DIR = os.path.join(WORKSPACE, "podcast_audios", "voice_cloning_test")
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")

# Candidate model directories under VOXCPM_DIR/models, tried in order.
MODEL_DIR_NAMES = ("openbmb__VoxCPM1.5", "VoxCPM1.5")

# Numeric tokens that are spelled out before synthesis.
_NUMBER_WORDS = {
    "2001": "two thousand and one",
    "2009": "two thousand and nine",
    "2008": "two thousand and eight",
    "70%": "seventy percent",
    "10": "ten",
}

# One pattern for all tokens.  The look-arounds make sure a token is only
# matched when it is a complete number: not glued to other digits ("2010",
# "100") and not part of a decimal or thousands-separated value ("3.10",
# "10,000").  Longer tokens come first so they win over shorter ones.
_NUMBER_PATTERN = re.compile(
    r"(?<!\d)(?<!\d[.,])(?:"
    + "|".join(
        re.escape(token)
        for token in sorted(_NUMBER_WORDS, key=len, reverse=True)
    )
    + r")(?!\d)(?![.,]\d)"
)


# Text preprocessing function (handle numbers)
def preprocess_text(text: str) -> str:
    """Convert numbers to words for better pronunciation.

    Only the tokens listed in ``_NUMBER_WORDS`` are rewritten, and only when
    they appear as standalone numbers; digits embedded in a longer number are
    left untouched.

    Args:
        text: The sentence that will be sent to the TTS model.

    Returns:
        ``text`` with every standalone known token replaced by its words.
    """
    return _NUMBER_PATTERN.sub(
        lambda match: _NUMBER_WORDS[match.group(0)], text
    )


# Test texts
TEST_TEXTS = [
    {
        "id": "test1",
        "text": "Hello, this is a voice cloning test using VoxCPM. I am speaking in English to demonstrate the voice cloning capability. The system captures my tone, rhythm, and speaking style from the reference audio.",
        "filename": "test1_intro.wav",
    },
    {
        "id": "test2",
        "text": "Between two thousand and one and two thousand and nine, China used patience to get its entry ticket to the world factory. This period was crucial for China's economic rise and global integration.",
        "filename": "test2_chapter8.wav",
    },
    {
        "id": "test3",
        "text": "The year two thousand and eight was a turning point. While the United States faced the subprime mortgage crisis, China hosted the Beijing Olympics and demonstrated its growing global influence.",
        "filename": "test3_2008.wav",
    },
]

# Reference text for voice cloning (登鹳雀楼)
REFERENCE_TEXT = "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"


def _find_model_path():
    """Return the first existing model directory, or None if none exists."""
    for name in MODEL_DIR_NAMES:
        candidate = os.path.join(VOXCPM_DIR, "models", name)
        if os.path.exists(candidate):
            return candidate
    return None


def _generate_test(model, test: dict) -> bool:
    """Synthesize one TEST_TEXTS entry and save it into OUTPUT_DIR.

    Args:
        model: An initialized VoxCPM instance.
        test: A dict with the keys ``id``, ``text`` and ``filename``.

    Returns:
        True when the audio was generated and written in this call,
        False otherwise (the error is printed, not raised).
    """
    print(f"\n🎙️ Generating test: {test['id']}")
    print(f"Text: {test['text'][:50]}...")

    # Preprocess text
    processed_text = preprocess_text(test["text"])
    print(f"Processed: {processed_text[:50]}...")

    output_file = os.path.join(OUTPUT_DIR, test["filename"])
    try:
        # Generate audio with voice cloning
        audio = model.generate(
            text=processed_text,
            prompt_wav_path=REFERENCE_FILE,  # Use reference audio for cloning
            prompt_text=REFERENCE_TEXT,  # Provide reference text
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )
        sample_rate = model.tts_model.sample_rate

        # Save audio
        sf.write(output_file, audio, sample_rate)
    except Exception as e:
        # Per-test boundary: one failing text must not abort the others.
        print(f"❌ Error generating audio: {e}")
        traceback.print_exc()
        return False

    # Verify
    if not os.path.exists(output_file):
        print("❌ Failed to save audio")
        return False

    file_size = os.path.getsize(output_file)
    duration = len(audio) / sample_rate
    print("✅ Voice cloning successful!")
    print(f" File: {output_file}")
    print(f" Size: {file_size} bytes")
    print(f" Duration: {duration:.2f} seconds")
    return True


def _print_summary(succeeded) -> None:
    """Print the final report.

    Args:
        succeeded: Ids of the tests generated in this run.  Files left over
            from earlier runs are deliberately not counted as successes.
    """
    print(f"\n{'='*70}")
    print("VOICE CLONING TEST COMPLETE")
    print(f"{'='*70}")
    print(f"Reference audio: {REFERENCE_FILE}")
    print(f"Reference text: {REFERENCE_TEXT}")
    print(f"Output directory: {OUTPUT_DIR}")
    print("\nGenerated files:")
    for test in TEST_TEXTS:
        output_file = os.path.join(OUTPUT_DIR, test["filename"])
        if test["id"] in succeeded and os.path.exists(output_file):
            size = os.path.getsize(output_file)
            print(f" - {test['filename']} ({size} bytes)")
        else:
            print(f" - {test['filename']} (FAILED)")
    print(f"\n{'='*70}")


def main() -> int:
    """Run the voice cloning test.

    Returns:
        0 when every test text was generated, 1 on a setup error or when at
        least one generation failed.
    """
    if sf is None:
        print("❌ Failed to import soundfile: module is not installed")
        return 1

    # Ensure directories exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"✅ Output directory: {OUTPUT_DIR}")

    # Check reference audio
    if not os.path.exists(REFERENCE_FILE):
        print(f"❌ Reference audio not found: {REFERENCE_FILE}")
        return 1
    print(f"✅ Reference audio: {REFERENCE_FILE}")

    # Add VoxCPM to path
    sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
    print("✅ Added VoxCPM path")

    # Import VoxCPM (only importable after the sys.path change above)
    try:
        from voxcpm.core import VoxCPM
        print("✅ VoxCPM imported successfully")
    except Exception as e:
        # Broad on purpose: importing the package can fail with more than
        # ImportError (e.g. errors raised while loading native libraries).
        print(f"❌ Failed to import VoxCPM: {e}")
        return 1

    # Model path
    model_path = _find_model_path()
    if model_path is None:
        print("❌ Model path not found")
        return 1
    print(f"✅ Model path: {model_path}")

    # Initialize VoxCPM
    print("\n🚀 Initializing VoxCPM...")
    try:
        model = VoxCPM(
            voxcpm_model_path=model_path,
            enable_denoiser=False,
            optimize=False,
        )
        print("✅ VoxCPM initialized successfully")
    except Exception as e:
        print(f"❌ VoxCPM initialization failed: {e}")
        return 1

    # Generate cloned voice
    print(f"\n{'='*70}")
    print("STARTING VOICE CLONING TEST")
    print(f"{'='*70}")
    print(f"Reference text: {REFERENCE_TEXT}")

    succeeded = {
        test["id"] for test in TEST_TEXTS if _generate_test(model, test)
    }

    # Summary
    _print_summary(succeeded)
    return 0 if len(succeeded) == len(TEST_TEXTS) else 1


if __name__ == "__main__":
    sys.exit(main())