Initial commit for TTS project

This commit is contained in:
Ben
2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions

View File

@@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""
Analyze accent verification files to check for distinct accent characteristics
"""
import os
import numpy as np
import soundfile as sf
import scipy.signal
from scipy.stats import skew, kurtosis
# Paths
# The workspace root keeps its original default but can be redirected with the
# TTS_WORKSPACE environment variable, so the script is not tied to one
# machine's /root/tts checkout.
WORKSPACE = os.environ.get("TTS_WORKSPACE", "/root/tts")
# Directory that is scanned for the accent / emotion verification WAV files.
ACCENT_DIR = os.path.join(WORKSPACE, "accent_verification")
def calculate_rms(audio_data):
    """Return the root-mean-square level of the samples.

    Every sample is squared, the squares are averaged over the whole array
    and the square root of that average is returned.
    """
    mean_square = np.mean(audio_data * audio_data)
    return np.sqrt(mean_square)
def calculate_peak_amplitude(audio_data):
    """Return the largest absolute sample value found in the signal."""
    magnitudes = np.abs(audio_data)
    return np.max(magnitudes)
def calculate_zero_crossing_rate(audio_data):
    """Return the mean absolute change of the sample sign.

    The sign of every sample (-1, 0 or +1) is differenced against its
    predecessor and the absolute differences are averaged.  A direct flip
    between -1 and +1 therefore contributes 2, so the result can be as
    large as 2.0 rather than being capped at 1.0.
    """
    signs = np.sign(audio_data)
    sign_steps = np.diff(signs)
    return np.mean(np.abs(sign_steps))
def calculate_spectral_centroid(audio_data, sample_rate):
    """Return the power-weighted mean frequency of the signal.

    The spectrogram is computed with SciPy's default settings and every
    cell is weighted by the frequency of its bin; the weighted sum is
    divided by the total spectrogram power.  A spectrogram with zero total
    power yields 0.
    """
    freq_bins, _segment_times, power = scipy.signal.spectrogram(audio_data, sample_rate)
    total_power = np.sum(power)
    if total_power == 0:
        return 0
    # Column vector of bin frequencies broadcasts across the time axis.
    weighted_power = freq_bins.reshape(-1, 1) * power
    return np.sum(weighted_power) / total_power
def calculate_skewness(audio_data):
    """Return the skewness of the sample distribution.

    Thin wrapper around ``scipy.stats.skew`` called with its default
    settings.
    """
    sample_skewness = skew(audio_data)
    return sample_skewness
def calculate_kurtosis(audio_data):
    """Return the kurtosis of the sample distribution.

    Thin wrapper around ``scipy.stats.kurtosis`` called with its default
    settings.
    """
    sample_kurtosis = kurtosis(audio_data)
    return sample_kurtosis
def analyze_audio_quality(audio_data, sample_rate, filename):
    """Measure a signal and grade it against fixed heuristic ranges.

    Args:
        audio_data: Samples handed unchanged to the ``calculate_*`` helpers.
        sample_rate: Sampling rate forwarded to the spectral centroid helper.
        filename: Accepted by the signature but not used in the computation.

    Returns:
        A dict with the keys ``rms``, ``peak``, ``zcr``,
        ``spectral_centroid``, ``skewness``, ``kurtosis`` and ``score``.
        The score adds 20 points each for RMS in [0.05, 0.3], peak <= 1.0,
        zero crossing rate in [0.05, 0.3] and centroid in [400, 3000] Hz,
        plus 10 points each for skewness in [-1, 1] and kurtosis <= 10.
    """
    report = {
        'rms': calculate_rms(audio_data),
        'peak': calculate_peak_amplitude(audio_data),
        'zcr': calculate_zero_crossing_rate(audio_data),
        'spectral_centroid': calculate_spectral_centroid(audio_data, sample_rate),
        'skewness': calculate_skewness(audio_data),
        'kurtosis': calculate_kurtosis(audio_data),
    }

    # Quality scoring: each entry is (criterion satisfied, points awarded).
    criteria = (
        (0.05 <= report['rms'] <= 0.3, 20),
        (report['peak'] <= 1.0, 20),
        (0.05 <= report['zcr'] <= 0.3, 20),
        (400 <= report['spectral_centroid'] <= 3000, 20),
        (-1 <= report['skewness'] <= 1, 10),
        (report['kurtosis'] <= 10, 10),
    )
    earned = sum(points for satisfied, points in criteria if satisfied)

    report['score'] = min(earned, 100)
    return report
def _analyze_group(files):
    """Measure every ``(filename, path)`` pair and print one report entry each.

    Returns the list of per-file stat dicts for the files that could be read
    and measured; files that fail are reported and skipped.
    """
    collected = []
    for filename, file_path in files:
        try:
            audio_data, sample_rate = sf.read(file_path)
            # sf.read returns a 2-D (frames, channels) array for
            # multi-channel files, while the metric helpers work on a 1-D
            # signal, so average the channels down to mono first.
            if audio_data.ndim > 1:
                audio_data = np.mean(audio_data, axis=1)
            duration = len(audio_data) / sample_rate
            stats = analyze_audio_quality(audio_data, sample_rate, filename)
        except Exception as e:
            # Best-effort report: one unreadable file must not abort the run.
            print(f"{filename}: Error - {e}")
            print()
            continue

        collected.append({
            'filename': filename,
            'duration': duration,
            'rms': stats['rms'],
            'zcr': stats['zcr'],
            'spectral_centroid': stats['spectral_centroid'],
            'score': stats['score'],
        })
        print(f"{filename}")
        print(f" Duration: {duration:.2f}s, RMS: {stats['rms']:.4f}, ZCR: {stats['zcr']:.4f}, Centroid: {stats['spectral_centroid']:.1f}Hz, Score: {stats['score']}/100")
        print()
    return collected


def _print_comparison(title, rows):
    """Print a filename-sorted table of the per-file metrics under *title*."""
    print(f"\n📊 {title} COMPARISON:")
    print("-" * 70)
    print("Filename | Duration | RMS | ZCR | Centroid | Score")
    print("-" * 70)
    for row in sorted(rows, key=lambda item: item['filename']):
        print(f"{row['filename']:24} | {row['duration']:8.2f} | {row['rms']:6.4f} | {row['zcr']:6.4f} | {row['spectral_centroid']:8.1f} | {row['score']:5}")


def analyze_accent_verification():
    """Report acoustic metrics for the WAV files in ``ACCENT_DIR``.

    Files whose name contains ``accent`` form the accent group; of the
    remaining files, those containing ``emotion`` form the emotion group.
    Each group gets a per-file listing and a comparison table, followed by a
    summary.  When at least two accent files were measured, the standard
    deviations of their spectral centroid and zero crossing rate are printed
    and compared against fixed thresholds (50 Hz / 0.02) to state whether the
    accents look acoustically distinct.

    Prints to stdout and returns ``None``.  A missing ``ACCENT_DIR`` is
    reported instead of raising.
    """
    print("=" * 70)
    print("ANALYZING ACCENT VERIFICATION FILES")
    print("=" * 70)

    if not os.path.isdir(ACCENT_DIR):
        print(f"Directory not found: {ACCENT_DIR}")
        return

    accent_files = []
    emotion_files = []
    # Sorted so the report order does not depend on filesystem listing order.
    for filename in sorted(os.listdir(ACCENT_DIR)):
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.join(ACCENT_DIR, filename)
        if 'accent' in filename:
            accent_files.append((filename, file_path))
        elif 'emotion' in filename:
            emotion_files.append((filename, file_path))

    # Analyze accent files
    print("\n🔊 ACCENT FILES ANALYSIS:")
    print("-" * 70)
    accent_stats = _analyze_group(accent_files)

    # Analyze emotion files
    print("\n😊 EMOTION FILES ANALYSIS:")
    print("-" * 70)
    emotion_stats = _analyze_group(emotion_files)

    # Compare accent and emotion characteristics
    _print_comparison("ACCENT", accent_stats)
    _print_comparison("EMOTION", emotion_stats)

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Total accent files: {len(accent_files)}")
    print(f"Total emotion files: {len(emotion_files)}")

    # Check if accents are distinct (a spread needs at least two files)
    if len(accent_stats) >= 2:
        centroid_std = np.std([s['spectral_centroid'] for s in accent_stats])
        zcr_std = np.std([s['zcr'] for s in accent_stats])
        print("\nAccent distinctiveness metrics:")
        print(f"Spectral centroid std: {centroid_std:.2f}Hz (higher = more distinct)")
        print(f"Zero crossing rate std: {zcr_std:.4f} (higher = more distinct)")
        if centroid_std > 50 or zcr_std > 0.02:
            print("✅ Accents appear to be distinct based on acoustic features")
        else:
            print("⚠️ Accents may sound similar based on acoustic features")

    print("\n" + "=" * 70)
# Run the accent/emotion report only when this file is executed as a script.
if __name__ == "__main__":
    analyze_accent_verification()

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
Audio quality analysis tool for VoxCPM generated files
Analyzes waveform characteristics to determine if audio sounds human
"""
import os
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
from scipy import signal
from scipy.stats import skew, kurtosis
def analyze_audio_file(file_path):
    """Load one audio file, print its waveform statistics and grade it.

    Args:
        file_path: Path of the audio file to read with ``sf.read``.

    Returns:
        A dict with the keys ``file``, ``sample_rate``, ``duration``,
        ``rms_energy``, ``zero_crossing_rate``, ``spectral_centroid``,
        ``quality_score`` and ``quality`` (``'good'`` when the score is
        above 60, otherwise ``'poor'``), or ``None`` when the file does not
        exist or any step of the analysis raises.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    try:
        # Read audio file
        samples, sample_rate = sf.read(file_path)
        print(f"✓ Successfully loaded: {os.path.basename(file_path)}")
        print(f" Sample rate: {sample_rate} Hz")
        duration = len(samples) / sample_rate
        print(f" Duration: {duration:.2f} seconds")
        channel_count = samples.shape[1] if samples.ndim != 1 else 1
        print(f" Channels: {channel_count}")

        # Multi-channel input is averaged across channels into one signal
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Basic audio statistics
        rms_energy = np.sqrt(np.mean(samples * samples))
        peak_amplitude = np.abs(samples).max()
        sign_steps = np.diff(np.sign(samples))
        zero_crossing_rate = np.mean(np.abs(sign_steps))
        spectral_centroid = calculate_spectral_centroid(samples, sample_rate)
        skewness = skew(samples)
        kurt = kurtosis(samples)

        print(f"\n📊 Audio Statistics:")
        print(f" RMS Energy: {rms_energy:.4f}")
        print(f" Peak Amplitude: {peak_amplitude:.4f}")
        print(f" Zero Crossing Rate: {zero_crossing_rate:.4f}")
        print(f" Spectral Centroid: {spectral_centroid:.2f} Hz")
        print(f" Skewness: {skewness:.4f}")
        print(f" Kurtosis: {kurt:.4f}")

        # Quality assessment
        scoring_input = {
            'rms_energy': rms_energy,
            'zero_crossing_rate': zero_crossing_rate,
            'spectral_centroid': spectral_centroid,
            'skewness': skewness,
            'kurtosis': kurt,
            'duration': duration,
        }
        quality_score = assess_audio_quality(scoring_input)

        return {
            'file': file_path,
            'sample_rate': sample_rate,
            'duration': duration,
            'rms_energy': rms_energy,
            'zero_crossing_rate': zero_crossing_rate,
            'spectral_centroid': spectral_centroid,
            'quality_score': quality_score,
            'quality': 'good' if quality_score > 60 else 'poor',
        }
    except Exception as e:
        print(f"Error analyzing {file_path}: {e}")
        return None
def calculate_spectral_centroid(audio_data, sample_rate):
    """Return the power-weighted mean frequency (brightness) of the signal.

    The spectrogram is computed with SciPy's default settings; each cell is
    weighted by its bin frequency and the weighted sum is divided by the
    total spectrogram power.  Zero total power yields 0.
    """
    # Compute spectrogram
    bin_frequencies, _frame_times, power = signal.spectrogram(audio_data, sample_rate)

    total_power = np.sum(power)
    if total_power == 0:
        return 0

    # Column vector of bin frequencies broadcasts across the time axis
    frequency_weighted = bin_frequencies.reshape(-1, 1) * power
    return np.sum(frequency_weighted) / total_power
def assess_audio_quality(metrics):
    """Turn measured audio metrics into a 0-100 heuristic score.

    Args:
        metrics: Mapping with the keys ``rms_energy``,
            ``zero_crossing_rate``, ``spectral_centroid``, ``duration``,
            ``skewness`` and ``kurtosis``.

    Returns:
        The sum of five sub-scores, each worth 20 (preferred range),
        10 (tolerated range) or 0 points.
    """
    def banded(value, preferred, tolerated):
        """Score 20 inside *preferred*, 10 inside *tolerated*, else 0."""
        # The preferred band is tested first, so the tolerated band can be
        # written as one enclosing interval.
        for (low, high), points in ((preferred, 20), (tolerated, 10)):
            if low <= value <= high:
                return points
        return 0

    # RMS Energy: Good range for speech is 0.05-0.3
    total = banded(metrics['rms_energy'], (0.05, 0.3), (0.02, 0.5))
    # Zero Crossing Rate: Good range for speech is 0.05-0.15
    total += banded(metrics['zero_crossing_rate'], (0.05, 0.15), (0.02, 0.2))
    # Spectral Centroid: Good range for speech is 800-2500 Hz
    total += banded(metrics['spectral_centroid'], (800, 2500), (500, 3500))
    # Duration: Speech should be reasonable length
    total += banded(metrics['duration'], (1.0, 10.0), (0.5, 15.0))

    # Skewness and Kurtosis: both magnitudes must be moderate together
    skew_magnitude = abs(metrics['skewness'])
    kurtosis_magnitude = abs(metrics['kurtosis'])
    if skew_magnitude < 2 and kurtosis_magnitude < 10:
        total += 20
    elif skew_magnitude < 5 and kurtosis_magnitude < 20:
        total += 10

    return total
def analyze_directory(directory):
    """Analyze every ``.wav`` file in *directory* and print a summary.

    Each file is passed to ``analyze_audio_file``; successful results have
    their quality score printed, and a summary with up to three good and
    three poor examples follows.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        ``None``.  A path that is missing or is not a directory is reported
        and skipped instead of raising from ``os.listdir``.
    """
    # isdir (not exists): a regular file would make os.listdir raise.
    if not os.path.isdir(directory):
        print(f"Directory not found: {directory}")
        return

    print(f"\n{'='*60}")
    print(f"ANALYZING AUDIO FILES IN: {directory}")
    print(f"{'='*60}")

    # Sorted so the report order does not depend on filesystem listing order.
    audio_files = sorted(f for f in os.listdir(directory) if f.endswith('.wav'))
    if not audio_files:
        print("No WAV files found")
        return

    results = []
    for audio_file in audio_files:
        result = analyze_audio_file(os.path.join(directory, audio_file))
        if result:
            results.append(result)
            print(f" Quality Score: {result['quality_score']}/100 ({result['quality']})")
        # NOTE(review): separator assumed to follow every file, including
        # failed ones; confirm against the original indentation.
        print(f"{'='*60}")

    # Summary
    if results:
        good_files = [r['file'] for r in results if r['quality'] == 'good']
        poor_files = [r['file'] for r in results if r['quality'] == 'poor']
        print("\n📋 Summary:")
        print(f"Total files analyzed: {len(results)}")
        print(f"Good quality files: {len(good_files)}")
        print(f"Poor quality files: {len(poor_files)}")
        if good_files:
            print("\nGood quality examples:")
            for f in good_files[:3]:
                print(f" - {os.path.basename(f)}")
        if poor_files:
            print("\nPoor quality examples:")
            for f in poor_files[:3]:
                print(f" - {os.path.basename(f)}")
if __name__ == "__main__":
    # Analyze both accent demo directories (paths are relative to the
    # current working directory)
    for demo_directory in ("accent_demos", "accent_demos_optimized"):
        analyze_directory(demo_directory)

    # Also analyze the reference audio files
    print(f"\n{'='*60}")
    print(f"ANALYZING REFERENCE AUDIO FILES")
    print(f"{'='*60}")

    reference_files = (
        "reference_indian.wav",
        "reference_russian.wav",
        "reference_singaporean.wav",
        "reference_hongkong.wav",
        "reference_cantonese.wav",
        "reference_indian_opt.wav",
        "reference_russian_opt.wav",
        "reference_singaporean_opt.wav",
        "reference_hongkong_opt.wav",
        "reference_cantonese_opt.wav",
    )
    # Reference files that are absent are skipped silently.
    # NOTE(review): separator assumed to print only for files that exist;
    # confirm against the original indentation.
    for ref_file in filter(os.path.exists, reference_files):
        analyze_audio_file(ref_file)
        print(f"{'='*60}")

View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Analyze only the local accent demos
"""
import os
import numpy as np
import soundfile as sf
from scipy import signal
from scipy.stats import skew, kurtosis
def analyze_audio_file(file_path):
    """Load one audio file, print its statistics and its quality verdict.

    Args:
        file_path: Path of the audio file to read with ``sf.read``.

    Returns:
        A dict with the keys ``file``, ``sample_rate``, ``duration``,
        ``rms_energy``, ``zero_crossing_rate``, ``spectral_centroid``,
        ``quality_score`` and ``quality`` (``'good'`` when the score is
        above 60, otherwise ``'poor'``), or ``None`` when the file does not
        exist or any step of the analysis raises.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    try:
        # Read audio file
        waveform, sample_rate = sf.read(file_path)
        print(f"✓ Successfully loaded: {os.path.basename(file_path)}")
        print(f" Sample rate: {sample_rate} Hz")
        duration = len(waveform) / sample_rate
        print(f" Duration: {duration:.2f} seconds")
        channels = waveform.shape[1] if waveform.ndim != 1 else 1
        print(f" Channels: {channels}")

        # Multi-channel input is averaged across channels into one signal
        if waveform.ndim > 1:
            waveform = waveform.mean(axis=1)

        # Basic audio statistics
        rms_energy = np.sqrt(np.mean(waveform * waveform))
        peak_amplitude = np.abs(waveform).max()
        sign_changes = np.abs(np.diff(np.sign(waveform)))
        zero_crossing_rate = np.mean(sign_changes)
        spectral_centroid = calculate_spectral_centroid(waveform, sample_rate)
        skewness = skew(waveform)
        kurt = kurtosis(waveform)

        print(f"\n📊 Audio Statistics:")
        print(f" RMS Energy: {rms_energy:.4f}")
        print(f" Peak Amplitude: {peak_amplitude:.4f}")
        print(f" Zero Crossing Rate: {zero_crossing_rate:.4f}")
        print(f" Spectral Centroid: {spectral_centroid:.2f} Hz")
        print(f" Skewness: {skewness:.4f}")
        print(f" Kurtosis: {kurt:.4f}")

        # Quality assessment
        scoring_input = {
            'rms_energy': rms_energy,
            'zero_crossing_rate': zero_crossing_rate,
            'spectral_centroid': spectral_centroid,
            'skewness': skewness,
            'kurtosis': kurt,
            'duration': duration,
        }
        quality_score = assess_audio_quality(scoring_input)
        quality = 'good' if quality_score > 60 else 'poor'
        print(f" Quality Score: {quality_score}/100 ({quality})")

        return {
            'file': file_path,
            'sample_rate': sample_rate,
            'duration': duration,
            'rms_energy': rms_energy,
            'zero_crossing_rate': zero_crossing_rate,
            'spectral_centroid': spectral_centroid,
            'quality_score': quality_score,
            'quality': quality,
        }
    except Exception as e:
        print(f"Error analyzing {file_path}: {e}")
        return None
def calculate_spectral_centroid(audio_data, sample_rate):
    """Return the power-weighted mean frequency (brightness) of the signal.

    The spectrogram is computed with SciPy's default settings; each cell is
    weighted by its bin frequency and the weighted sum is divided by the
    total spectrogram power.  Zero total power yields 0.
    """
    # Compute spectrogram
    bin_frequencies, _frame_times, power = signal.spectrogram(audio_data, sample_rate)

    overall_power = np.sum(power)
    if overall_power == 0:
        return 0

    # Column vector of bin frequencies broadcasts across the time axis
    weighted = bin_frequencies.reshape(-1, 1) * power
    return np.sum(weighted) / overall_power
def assess_audio_quality(metrics):
    """Turn measured audio metrics into a 0-100 heuristic score.

    Args:
        metrics: Mapping with the keys ``rms_energy``,
            ``zero_crossing_rate``, ``spectral_centroid``, ``duration``,
            ``skewness`` and ``kurtosis``.

    Returns:
        The sum of five sub-scores.  Each is worth 20 points in its
        preferred range and 10 in its tolerated range; the spectral
        centroid additionally earns 5 points between 200 and 500 Hz.
    """
    def tiered(value, tiers):
        """Return the points of the first ``(low, high, points)`` tier hit."""
        # Tiers are tested in order, so a later tier only needs to cover
        # the values the earlier ones left out.
        return next(
            (points for low, high, points in tiers if low <= value <= high),
            0,
        )

    # RMS Energy: Good range for speech is 0.05-0.3
    total = tiered(metrics['rms_energy'], ((0.05, 0.3, 20), (0.02, 0.5, 10)))
    # Zero Crossing Rate: Good range for speech is 0.05-0.15
    total += tiered(metrics['zero_crossing_rate'], ((0.05, 0.15, 20), (0.02, 0.2, 10)))
    # Spectral Centroid: Good range for speech is 800-2500 Hz
    total += tiered(
        metrics['spectral_centroid'],
        ((800, 2500, 20), (500, 3500, 10), (200, 500, 5)),
    )
    # Duration: Speech should be reasonable length
    total += tiered(metrics['duration'], ((1.0, 10.0, 20), (0.5, 15.0, 10)))

    # Skewness and Kurtosis: both magnitudes must be moderate together
    skew_magnitude = abs(metrics['skewness'])
    kurtosis_magnitude = abs(metrics['kurtosis'])
    if skew_magnitude < 2 and kurtosis_magnitude < 10:
        total += 20
    elif skew_magnitude < 5 and kurtosis_magnitude < 20:
        total += 10

    return total
def analyze_directory(directory):
    """Analyze every ``.wav`` file in *directory* and print a summary.

    Each file is passed to ``analyze_audio_file``; a summary with up to
    three good and three poor examples follows.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        ``None``.  A path that is missing or is not a directory is reported
        and skipped instead of raising from ``os.listdir``.
    """
    # isdir (not exists): a regular file would make os.listdir raise.
    if not os.path.isdir(directory):
        print(f"Directory not found: {directory}")
        return

    print(f"\n{'='*60}")
    print(f"ANALYZING LOCAL ACCENT DEMOS: {directory}")
    print(f"{'='*60}")

    # Sorted so the report order does not depend on filesystem listing order.
    audio_files = sorted(f for f in os.listdir(directory) if f.endswith('.wav'))
    if not audio_files:
        print("No WAV files found")
        return

    results = []
    for audio_file in audio_files:
        result = analyze_audio_file(os.path.join(directory, audio_file))
        if result:
            results.append(result)
        # NOTE(review): separator assumed to follow every file, including
        # failed ones; confirm against the original indentation.
        print(f"{'='*60}")

    # Summary
    if results:
        good_files = [r['file'] for r in results if r['quality'] == 'good']
        poor_files = [r['file'] for r in results if r['quality'] == 'poor']
        print("\n📋 Summary:")
        print(f"Total files analyzed: {len(results)}")
        print(f"Good quality files: {len(good_files)}")
        print(f"Poor quality files: {len(poor_files)}")
        if good_files:
            print("\nGood quality examples:")
            for f in good_files[:3]:
                print(f" - {os.path.basename(f)}")
        if poor_files:
            print("\nPoor quality examples:")
            for f in poor_files[:3]:
                print(f" - {os.path.basename(f)}")
# Script entry point: runs only when this file is executed directly.
if __name__ == "__main__":
    # Analyze only the local accent demos; the directory name is relative,
    # so it is resolved against the current working directory.
    analyze_directory("accent_demos_local")