huhan3000/phallic-worship-analysis/analysis/statistics/emperor_lifespan_analyzer.py

"""
北魏皇帝寿命统计分析器
分析北魏前期皇帝的寿命分布、生育焦虑与政治政策的关联性
"""

import statistics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any, Tuple
import pandas as pd
from dataclasses import asdict

from analysis.models import Emperor, ReliabilityLevel
from data.emperors.northern_wei_emperors import (
    NORTHERN_WEI_EMPERORS,
    EMPERORS_WITH_LIFESPAN,
    HIGH_RELIABILITY_EMPERORS,
    PRE_REFORM_EMPERORS,
    get_short_lived_emperors,
    get_high_fertility_anxiety_emperors
)

class EmperorLifespanAnalyzer:
    """皇帝寿命统计分析器"""

    def __init__(self, emperors: List[Emperor] = None):
        self.emperors = emperors or NORTHERN_WEI_EMPERORS
        self.emperors_with_lifespan = [emp for emp in self.emperors if emp.lifespan is not None]

    def calculate_basic_statistics(self) -> Dict[str, Any]:
        """计算基础统计数据"""
        if not self.emperors_with_lifespan:
            return {"error": "没有有效的寿命数据"}

        lifespans = [emp.lifespan for emp in self.emperors_with_lifespan]

        stats = {
            "sample_size": len(lifespans),
            "mean_lifespan": statistics.mean(lifespans),
            "median_lifespan": statistics.median(lifespans),
            "mode_lifespan": statistics.mode(lifespans) if len(set(lifespans)) < len(lifespans) else None,
            "std_deviation": statistics.stdev(lifespans) if len(lifespans) > 1 else 0,
            "variance": statistics.variance(lifespans) if len(lifespans) > 1 else 0,
            "min_lifespan": min(lifespans),
            "max_lifespan": max(lifespans),
            "range": max(lifespans) - min(lifespans)
        }

        # 计算四分位数
        if len(lifespans) >= 4:
            sorted_lifespans = sorted(lifespans)
            n = len(sorted_lifespans)
            stats["q1"] = sorted_lifespans[n//4]
            stats["q3"] = sorted_lifespans[3*n//4]
            stats["iqr"] = stats["q3"] - stats["q1"]

        return stats

    def analyze_short_lifespan_phenomenon(self, threshold: int = 30) -> Dict[str, Any]:
        """分析短寿现象"""
        short_lived = get_short_lived_emperors(threshold)
        total_with_data = len(self.emperors_with_lifespan)

        if total_with_data == 0:
            return {"error": "没有有效的寿命数据"}

        short_lived_rate = len(short_lived) / total_with_data

        # 分析短寿皇帝的特征
        short_lived_analysis = {
            "threshold": threshold,
            "short_lived_count": len(short_lived),
            "total_count": total_with_data,
            "short_lived_rate": short_lived_rate,
            "short_lived_emperors": [emp.name for emp in short_lived]
        }

        # 分析短寿与生育焦虑的关系
        if short_lived:
            anxiety_scores = [emp.fertility_anxiety_score for emp in short_lived
                            if emp.fertility_anxiety_score is not None]
            if anxiety_scores:
                short_lived_analysis["avg_fertility_anxiety"] = statistics.mean(anxiety_scores)

        # 分析短寿与子嗣数量的关系
        offspring_counts = [emp.offspring_count for emp in short_lived
                          if emp.offspring_count is not None]
        if offspring_counts:
            short_lived_analysis["avg_offspring_count"] = statistics.mean(offspring_counts)

        return short_lived_analysis

    def analyze_fertility_anxiety_correlation(self) -> Dict[str, Any]:
        """分析生育焦虑与各因素的相关性"""
        # 收集有效数据
        valid_emperors = [emp for emp in self.emperors
                         if emp.fertility_anxiety_score is not None and emp.lifespan is not None]

        if len(valid_emperors) < 3:
            return {"error": "数据不足，无法进行相关性分析"}

        anxiety_scores = [emp.fertility_anxiety_score for emp in valid_emperors]
        lifespans = [emp.lifespan for emp in valid_emperors]
        offspring_counts = [emp.offspring_count for emp in valid_emperors if emp.offspring_count is not None]

        correlations = {}

        # 生育焦虑与寿命的相关性
        if len(anxiety_scores) == len(lifespans):
            correlations["anxiety_lifespan"] = self._calculate_correlation(anxiety_scores, lifespans)

        # 生育焦虑与子嗣数量的相关性
        anxiety_with_offspring = [emp.fertility_anxiety_score for emp in valid_emperors
                                if emp.offspring_count is not None]
        if len(anxiety_with_offspring) == len(offspring_counts) and len(offspring_counts) >= 3:
            correlations["anxiety_offspring"] = self._calculate_correlation(anxiety_with_offspring, offspring_counts)

        return {
            "sample_size": len(valid_emperors),
            "correlations": correlations,
            "interpretation": self._interpret_correlations(correlations)
        }

    def _calculate_correlation(self, x: List[float], y: List[float]) -> Dict[str, float]:
        """计算皮尔逊相关系数"""
        if len(x) != len(y) or len(x) < 2:
            return {"correlation": 0.0, "p_value": 1.0}

        n = len(x)
        sum_x = sum(x)
        sum_y = sum(y)
        sum_xy = sum(xi * yi for xi, yi in zip(x, y))
        sum_x2 = sum(xi * xi for xi in x)
        sum_y2 = sum(yi * yi for yi in y)

        numerator = n * sum_xy - sum_x * sum_y
        denominator = ((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)) ** 0.5

        if denominator == 0:
            correlation = 0.0
        else:
            correlation = numerator / denominator

        # 简化的p值估算（实际应使用更精确的统计检验）
        t_stat = correlation * ((n - 2) / (1 - correlation**2)) ** 0.5 if correlation != 1 else float('inf')
        p_value = 2 * (1 - abs(t_stat) / (abs(t_stat) + n - 2)) if t_stat != float('inf') else 0.0

        return {
            "correlation": correlation,
            "p_value": p_value,
            "sample_size": n
        }

    def _interpret_correlations(self, correlations: Dict[str, Dict[str, float]]) -> Dict[str, str]:
        """解释相关性结果"""
        interpretations = {}

        for key, corr_data in correlations.items():
            corr = corr_data["correlation"]
            p_val = corr_data["p_value"]

            # 相关性强度解释
            if abs(corr) >= 0.7:
                strength = "强"
            elif abs(corr) >= 0.5:
                strength = "中等"
            elif abs(corr) >= 0.3:
                strength = "弱"
            else:
                strength = "很弱或无"

            # 方向解释
            direction = "正" if corr > 0 else "负"

            # 显著性解释
            significance = "显著" if p_val < 0.05 else "不显著"

            interpretations[key] = f"{direction}相关，强度：{strength}，统计显著性：{significance}"

        return interpretations

    def analyze_by_reliability(self) -> Dict[str, Any]:
        """按史料可靠性分析"""
        reliability_groups = {}

        for reliability in ReliabilityLevel:
            group_emperors = [emp for emp in self.emperors if emp.reliability == reliability]
            if group_emperors:
                group_with_lifespan = [emp for emp in group_emperors if emp.lifespan is not None]
                if group_with_lifespan:
                    lifespans = [emp.lifespan for emp in group_with_lifespan]
                    reliability_groups[reliability.value] = {
                        "count": len(group_emperors),
                        "with_lifespan_count": len(group_with_lifespan),
                        "mean_lifespan": statistics.mean(lifespans),
                        "emperors": [emp.name for emp in group_emperors]
                    }

        return reliability_groups

    def generate_lifespan_distribution_chart(self, save_path: str = None) -> str:
        """生成寿命分布图表"""
        if not self.emperors_with_lifespan:
            return "没有有效数据生成图表"

        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS']
        plt.rcParams['axes.unicode_minus'] = False

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

        lifespans = [emp.lifespan for emp in self.emperors_with_lifespan]
        names = [emp.name.split('拓跋')[0] for emp in self.emperors_with_lifespan]

        # 1. 寿命分布直方图
        ax1.hist(lifespans, bins=10, alpha=0.7, color='skyblue', edgecolor='black')
        ax1.axvline(statistics.mean(lifespans), color='red', linestyle='--',
                   label=f'平均寿命: {statistics.mean(lifespans):.1f}岁')
        ax1.axvline(30, color='orange', linestyle='--', label='短寿阈值: 30岁')
        ax1.set_xlabel('寿命（岁）')
        ax1.set_ylabel('频数')
        ax1.set_title('北魏皇帝寿命分布')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. 皇帝寿命条形图
        colors = ['red' if lifespan < 30 else 'blue' for lifespan in lifespans]
        bars = ax2.bar(range(len(names)), lifespans, color=colors, alpha=0.7)
        ax2.set_xlabel('皇帝')
        ax2.set_ylabel('寿命（岁）')
        ax2.set_title('各皇帝寿命对比')
        ax2.set_xticks(range(len(names)))
        ax2.set_xticklabels(names, rotation=45, ha='right')
        ax2.axhline(30, color='orange', linestyle='--', alpha=0.7)

        # 添加数值标签
        for i, (bar, lifespan) in enumerate(zip(bars, lifespans)):
            ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                    str(lifespan), ha='center', va='bottom', fontsize=8)

        # 3. 寿命与生育焦虑散点图
        anxiety_data = [(emp.lifespan, emp.fertility_anxiety_score)
                       for emp in self.emperors_with_lifespan
                       if emp.fertility_anxiety_score is not None]

        if anxiety_data:
            lifespans_with_anxiety, anxiety_scores = zip(*anxiety_data)
            ax3.scatter(lifespans_with_anxiety, anxiety_scores, alpha=0.7, s=60)

            # 添加趋势线
            z = np.polyfit(lifespans_with_anxiety, anxiety_scores, 1)
            p = np.poly1d(z)
            ax3.plot(lifespans_with_anxiety, p(lifespans_with_anxiety), "r--", alpha=0.8)

            ax3.set_xlabel('寿命（岁）')
            ax3.set_ylabel('生育焦虑评分')
            ax3.set_title('寿命与生育焦虑关系')
            ax3.grid(True, alpha=0.3)

        # 4. 箱线图
        reliability_data = {}
        for emp in self.emperors_with_lifespan:
            rel = emp.reliability.value
            if rel not in reliability_data:
                reliability_data[rel] = []
            reliability_data[rel].append(emp.lifespan)

        if reliability_data:
            ax4.boxplot(reliability_data.values(), labels=reliability_data.keys())
            ax4.set_xlabel('史料可靠性')
            ax4.set_ylabel('寿命（岁）')
            ax4.set_title('不同可靠性史料的寿命分布')
            ax4.grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            return f"图表已保存到: {save_path}"
        else:
            plt.show()
            return "图表已显示"

    def generate_comprehensive_report(self) -> Dict[str, Any]:
        """生成综合分析报告"""
        report = {
            "analysis_date": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
            "data_summary": {
                "total_emperors": len(self.emperors),
                "emperors_with_lifespan": len(self.emperors_with_lifespan),
                "data_completeness": len(self.emperors_with_lifespan) / len(self.emperors)
            }
        }

        # 基础统计
        report["basic_statistics"] = self.calculate_basic_statistics()

        # 短寿现象分析
        report["short_lifespan_analysis"] = self.analyze_short_lifespan_phenomenon()

        # 生育焦虑相关性分析
        report["fertility_anxiety_analysis"] = self.analyze_fertility_anxiety_correlation()

        # 可靠性分析
        report["reliability_analysis"] = self.analyze_by_reliability()

        # 关键发现
        report["key_findings"] = self._extract_key_findings(report)

        return report

    def _extract_key_findings(self, report: Dict[str, Any]) -> List[str]:
        """提取关键发现"""
        findings = []

        # 平均寿命发现
        if "mean_lifespan" in report["basic_statistics"]:
            mean_age = report["basic_statistics"]["mean_lifespan"]
            findings.append(f"北魏前期皇帝平均寿命为 {mean_age:.1f} 岁，证实了短寿现象")

        # 短寿比例发现
        if "short_lived_rate" in report["short_lifespan_analysis"]:
            short_rate = report["short_lifespan_analysis"]["short_lived_rate"]
            findings.append(f"{short_rate:.1%} 的皇帝寿命不足30岁，显示严重的短寿问题")

        # 生育焦虑相关性发现
        if "correlations" in report["fertility_anxiety_analysis"]:
            correlations = report["fertility_anxiety_analysis"]["correlations"]
            if "anxiety_offspring" in correlations:
                corr = correlations["anxiety_offspring"]["correlation"]
                if corr < -0.3:
                    findings.append(f"生育焦虑与子嗣数量呈负相关 (r={corr:.3f})，支持生育焦虑假说")

        # 史料可靠性发现
        high_rel_data = report["reliability_analysis"].get("high", {})
        if high_rel_data and "mean_lifespan" in high_rel_data:
            findings.append(f"高可靠性史料显示平均寿命 {high_rel_data['mean_lifespan']:.1f} 岁，验证了分析结果")

        return findings

# 创建分析器实例
emperor_analyzer = EmperorLifespanAnalyzer()

def run_emperor_analysis():
    """运行皇帝分析"""
    print("开始北魏皇帝寿命统计分析...")

    # 生成综合报告
    report = emperor_analyzer.generate_comprehensive_report()

    print("\n=== 北魏皇帝寿命分析报告 ===")
    print(f"分析时间: {report['analysis_date']}")
    print(f"数据样本: {report['data_summary']['total_emperors']} 位皇帝")
    print(f"有效寿命数据: {report['data_summary']['emperors_with_lifespan']} 位")
    print(f"数据完整性: {report['data_summary']['data_completeness']:.1%}")

    # 基础统计
    stats = report['basic_statistics']
    if 'error' not in stats:
        print(f"\n平均寿命: {stats['mean_lifespan']:.1f} 岁")
        print(f"中位寿命: {stats['median_lifespan']:.1f} 岁")
        print(f"标准差: {stats['std_deviation']:.1f} 岁")
        print(f"寿命范围: {stats['min_lifespan']}-{stats['max_lifespan']} 岁")

    # 短寿分析
    short_analysis = report['short_lifespan_analysis']
    if 'error' not in short_analysis:
        print(f"\n短寿皇帝 (<30岁): {short_analysis['short_lived_count']}/{short_analysis['total_count']}")
        print(f"短寿比例: {short_analysis['short_lived_rate']:.1%}")

    # 关键发现
    print("\n=== 关键发现 ===")
    for i, finding in enumerate(report['key_findings'], 1):
        print(f"{i}. {finding}")

    return report

if __name__ == "__main__":
    report = run_emperor_analysis()

    # 生成可视化图表
    chart_result = emperor_analyzer.generate_lifespan_distribution_chart("emperor_lifespan_analysis.png")
    print(f"\n{chart_result}")