Initial commit: 胡汉三千年项目 - 北朝宇宙理论体系
This commit is contained in:
497
phallic-worship-analysis/analysis/quality_control.py
Normal file
497
phallic-worship-analysis/analysis/quality_control.py
Normal file
@@ -0,0 +1,497 @@
|
||||
"""
|
||||
数据质量控制系统
|
||||
实现史料来源验证、可靠性评分和多重史料交叉验证
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
import re
|
||||
from collections import Counter
|
||||
import statistics
|
||||
|
||||
from analysis.models import ReliabilityLevel, Emperor, ReligiousBuilding, FolkCustom, CulturalTransmission
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ValidationResult(Enum):
|
||||
"""验证结果"""
|
||||
PASS = "pass"
|
||||
WARNING = "warning"
|
||||
FAIL = "fail"
|
||||
|
||||
@dataclass
|
||||
class QualityReport:
|
||||
"""质量报告"""
|
||||
data_id: str
|
||||
data_type: str
|
||||
overall_score: float
|
||||
reliability_level: ReliabilityLevel
|
||||
validation_results: List[Dict[str, Any]]
|
||||
recommendations: List[str]
|
||||
cross_validation_status: bool
|
||||
|
||||
class SourceValidator:
|
||||
"""史料来源验证器"""
|
||||
|
||||
# 可信史料来源等级
|
||||
TRUSTED_SOURCES = {
|
||||
"high": [
|
||||
"魏书", "北史", "资治通鉴", "竹书纪年", "山海经",
|
||||
"日本书纪", "古事记", "续日本纪", "元史", "明史"
|
||||
],
|
||||
"medium": [
|
||||
"太平御览", "册府元龟", "文献通考", "通典",
|
||||
"三国志", "晋书", "宋书", "南齐书"
|
||||
],
|
||||
"low": [
|
||||
"野史", "传说", "民间故事", "口传资料"
|
||||
]
|
||||
}
|
||||
|
||||
# 现代学术来源
|
||||
ACADEMIC_SOURCES = {
|
||||
"high": [
|
||||
"中国社会科学院", "北京大学", "清华大学", "复旦大学",
|
||||
"东京大学", "京都大学", "哈佛大学", "剑桥大学"
|
||||
],
|
||||
"medium": [
|
||||
"省级社科院", "重点大学", "专业研究机构"
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
self.source_patterns = self._compile_source_patterns()
|
||||
|
||||
def _compile_source_patterns(self) -> Dict[str, re.Pattern]:
|
||||
"""编译史料来源识别模式"""
|
||||
patterns = {}
|
||||
|
||||
# 古代史料模式
|
||||
ancient_sources = []
|
||||
for level_sources in self.TRUSTED_SOURCES.values():
|
||||
ancient_sources.extend(level_sources)
|
||||
patterns['ancient'] = re.compile(f"({'|'.join(ancient_sources)})")
|
||||
|
||||
# 现代学术模式
|
||||
academic_sources = []
|
||||
for level_sources in self.ACADEMIC_SOURCES.values():
|
||||
academic_sources.extend(level_sources)
|
||||
patterns['academic'] = re.compile(f"({'|'.join(academic_sources)})")
|
||||
|
||||
# 考古发现模式
|
||||
patterns['archaeological'] = re.compile(r"考古|出土|发掘|遗址|文物")
|
||||
|
||||
return patterns
|
||||
|
||||
def validate_sources(self, sources: List[str]) -> Dict[str, Any]:
|
||||
"""验证史料来源"""
|
||||
if not sources:
|
||||
return {
|
||||
"status": ValidationResult.FAIL,
|
||||
"score": 0.0,
|
||||
"message": "缺少史料来源",
|
||||
"source_analysis": {}
|
||||
}
|
||||
|
||||
source_analysis = {
|
||||
"total_count": len(sources),
|
||||
"ancient_sources": 0,
|
||||
"academic_sources": 0,
|
||||
"archaeological_sources": 0,
|
||||
"reliability_distribution": Counter()
|
||||
}
|
||||
|
||||
total_score = 0.0
|
||||
|
||||
for source in sources:
|
||||
source_score = self._evaluate_single_source(source)
|
||||
total_score += source_score['score']
|
||||
|
||||
# 统计来源类型
|
||||
if source_score['type'] == 'ancient':
|
||||
source_analysis['ancient_sources'] += 1
|
||||
elif source_score['type'] == 'academic':
|
||||
source_analysis['academic_sources'] += 1
|
||||
elif source_score['type'] == 'archaeological':
|
||||
source_analysis['archaeological_sources'] += 1
|
||||
|
||||
source_analysis['reliability_distribution'][source_score['reliability']] += 1
|
||||
|
||||
average_score = total_score / len(sources)
|
||||
|
||||
# 确定验证状态
|
||||
if average_score >= 0.8:
|
||||
status = ValidationResult.PASS
|
||||
elif average_score >= 0.5:
|
||||
status = ValidationResult.WARNING
|
||||
else:
|
||||
status = ValidationResult.FAIL
|
||||
|
||||
return {
|
||||
"status": status,
|
||||
"score": average_score,
|
||||
"message": f"平均史料可靠性评分: {average_score:.2f}",
|
||||
"source_analysis": source_analysis
|
||||
}
|
||||
|
||||
def _evaluate_single_source(self, source: str) -> Dict[str, Any]:
|
||||
"""评估单个史料来源"""
|
||||
source_lower = source.lower()
|
||||
|
||||
# 检查古代史料
|
||||
for reliability, source_list in self.TRUSTED_SOURCES.items():
|
||||
if any(s in source for s in source_list):
|
||||
score_map = {"high": 1.0, "medium": 0.7, "low": 0.4}
|
||||
return {
|
||||
"score": score_map[reliability],
|
||||
"type": "ancient",
|
||||
"reliability": reliability
|
||||
}
|
||||
|
||||
# 检查现代学术来源
|
||||
for reliability, source_list in self.ACADEMIC_SOURCES.items():
|
||||
if any(s in source for s in source_list):
|
||||
score_map = {"high": 0.9, "medium": 0.6}
|
||||
return {
|
||||
"score": score_map[reliability],
|
||||
"type": "academic",
|
||||
"reliability": reliability
|
||||
}
|
||||
|
||||
# 检查考古来源
|
||||
if self.source_patterns['archaeological'].search(source):
|
||||
return {
|
||||
"score": 0.8,
|
||||
"type": "archaeological",
|
||||
"reliability": "high"
|
||||
}
|
||||
|
||||
# 未知来源
|
||||
return {
|
||||
"score": 0.2,
|
||||
"type": "unknown",
|
||||
"reliability": "low"
|
||||
}
|
||||
|
||||
class DataIntegrityChecker:
|
||||
"""数据完整性检查器"""
|
||||
|
||||
def __init__(self):
|
||||
self.required_fields = {
|
||||
"Emperor": ["name", "reign_period"],
|
||||
"ReligiousBuilding": ["name", "location", "construction_period"],
|
||||
"FolkCustom": ["name", "region", "historical_period"],
|
||||
"CulturalTransmission": ["source_region", "target_region", "transmission_period"]
|
||||
}
|
||||
|
||||
def check_completeness(self, data: Any) -> Dict[str, Any]:
|
||||
"""检查数据完整性"""
|
||||
data_type = type(data).__name__
|
||||
required = self.required_fields.get(data_type, [])
|
||||
|
||||
missing_fields = []
|
||||
empty_fields = []
|
||||
|
||||
for field in required:
|
||||
if not hasattr(data, field):
|
||||
missing_fields.append(field)
|
||||
else:
|
||||
value = getattr(data, field)
|
||||
if value is None or (isinstance(value, (str, list)) and len(value) == 0):
|
||||
empty_fields.append(field)
|
||||
|
||||
completeness_score = 1.0 - (len(missing_fields) + len(empty_fields)) / len(required)
|
||||
|
||||
if completeness_score >= 0.9:
|
||||
status = ValidationResult.PASS
|
||||
elif completeness_score >= 0.7:
|
||||
status = ValidationResult.WARNING
|
||||
else:
|
||||
status = ValidationResult.FAIL
|
||||
|
||||
return {
|
||||
"status": status,
|
||||
"score": completeness_score,
|
||||
"missing_fields": missing_fields,
|
||||
"empty_fields": empty_fields,
|
||||
"message": f"数据完整性: {completeness_score:.2%}"
|
||||
}
|
||||
|
||||
def detect_anomalies(self, data: Any) -> Dict[str, Any]:
|
||||
"""检测异常值"""
|
||||
anomalies = []
|
||||
|
||||
if isinstance(data, Emperor):
|
||||
# 检查皇帝寿命异常
|
||||
if data.lifespan and (data.lifespan < 0 or data.lifespan > 120):
|
||||
anomalies.append(f"异常寿命: {data.lifespan}")
|
||||
|
||||
# 检查在位时长异常
|
||||
if data.reign_duration and (data.reign_duration < 0 or data.reign_duration > 80):
|
||||
anomalies.append(f"异常在位时长: {data.reign_duration}")
|
||||
|
||||
# 检查子嗣数量异常
|
||||
if data.offspring_count and (data.offspring_count < 0 or data.offspring_count > 50):
|
||||
anomalies.append(f"异常子嗣数量: {data.offspring_count}")
|
||||
|
||||
elif isinstance(data, CulturalTransmission):
|
||||
# 检查传播时间跨度异常
|
||||
if data.time_span and (data.time_span < 0 or data.time_span > 1000):
|
||||
anomalies.append(f"异常传播时间跨度: {data.time_span}")
|
||||
|
||||
status = ValidationResult.FAIL if anomalies else ValidationResult.PASS
|
||||
|
||||
return {
|
||||
"status": status,
|
||||
"anomalies": anomalies,
|
||||
"message": f"发现 {len(anomalies)} 个异常值" if anomalies else "未发现异常值"
|
||||
}
|
||||
|
||||
class CrossValidator:
|
||||
"""交叉验证器"""
|
||||
|
||||
def __init__(self):
|
||||
self.validation_rules = self._load_validation_rules()
|
||||
|
||||
def _load_validation_rules(self) -> Dict[str, List[str]]:
|
||||
"""加载验证规则"""
|
||||
return {
|
||||
"emperor_lifespan": [
|
||||
"北魏前期皇帝平均寿命应在25-30岁之间",
|
||||
"短寿现象应与史料记录一致",
|
||||
"生育焦虑评分应与子嗣数量负相关"
|
||||
],
|
||||
"cultural_transmission": [
|
||||
"传播时期应与历史事件时间线一致",
|
||||
"传播路径应符合地理逻辑",
|
||||
"文化载体应有史料支撑"
|
||||
],
|
||||
"religious_building": [
|
||||
"建造时期应与政治背景一致",
|
||||
"建筑功能应与文化需求匹配",
|
||||
"地理位置应符合选址逻辑"
|
||||
]
|
||||
}
|
||||
|
||||
def cross_validate_emperors(self, emperors: List[Emperor]) -> Dict[str, Any]:
|
||||
"""交叉验证皇帝数据"""
|
||||
if len(emperors) < 3:
|
||||
return {
|
||||
"status": ValidationResult.WARNING,
|
||||
"message": "样本数量不足,无法进行有效交叉验证"
|
||||
}
|
||||
|
||||
# 计算统计指标
|
||||
lifespans = [emp.lifespan for emp in emperors if emp.lifespan]
|
||||
if not lifespans:
|
||||
return {
|
||||
"status": ValidationResult.FAIL,
|
||||
"message": "缺少寿命数据,无法进行交叉验证"
|
||||
}
|
||||
|
||||
avg_lifespan = statistics.mean(lifespans)
|
||||
median_lifespan = statistics.median(lifespans)
|
||||
|
||||
# 验证平均寿命是否符合预期
|
||||
expected_range = (25, 30)
|
||||
lifespan_valid = expected_range[0] <= avg_lifespan <= expected_range[1]
|
||||
|
||||
# 验证生育焦虑与子嗣数量的关系
|
||||
fertility_correlation = self._calculate_fertility_correlation(emperors)
|
||||
|
||||
validation_results = []
|
||||
|
||||
if lifespan_valid:
|
||||
validation_results.append({
|
||||
"rule": "平均寿命范围验证",
|
||||
"status": ValidationResult.PASS,
|
||||
"message": f"平均寿命 {avg_lifespan:.1f} 岁符合预期范围"
|
||||
})
|
||||
else:
|
||||
validation_results.append({
|
||||
"rule": "平均寿命范围验证",
|
||||
"status": ValidationResult.WARNING,
|
||||
"message": f"平均寿命 {avg_lifespan:.1f} 岁超出预期范围 {expected_range}"
|
||||
})
|
||||
|
||||
if fertility_correlation < -0.3:
|
||||
validation_results.append({
|
||||
"rule": "生育焦虑相关性验证",
|
||||
"status": ValidationResult.PASS,
|
||||
"message": f"生育焦虑与子嗣数量呈负相关 (r={fertility_correlation:.3f})"
|
||||
})
|
||||
else:
|
||||
validation_results.append({
|
||||
"rule": "生育焦虑相关性验证",
|
||||
"status": ValidationResult.WARNING,
|
||||
"message": f"生育焦虑与子嗣数量相关性不明显 (r={fertility_correlation:.3f})"
|
||||
})
|
||||
|
||||
overall_status = ValidationResult.PASS
|
||||
if any(result["status"] == ValidationResult.FAIL for result in validation_results):
|
||||
overall_status = ValidationResult.FAIL
|
||||
elif any(result["status"] == ValidationResult.WARNING for result in validation_results):
|
||||
overall_status = ValidationResult.WARNING
|
||||
|
||||
return {
|
||||
"status": overall_status,
|
||||
"statistics": {
|
||||
"sample_size": len(emperors),
|
||||
"avg_lifespan": avg_lifespan,
|
||||
"median_lifespan": median_lifespan,
|
||||
"fertility_correlation": fertility_correlation
|
||||
},
|
||||
"validation_results": validation_results
|
||||
}
|
||||
|
||||
def _calculate_fertility_correlation(self, emperors: List[Emperor]) -> float:
|
||||
"""计算生育焦虑与子嗣数量的相关性"""
|
||||
anxiety_scores = []
|
||||
offspring_counts = []
|
||||
|
||||
for emp in emperors:
|
||||
if emp.fertility_anxiety_score is not None and emp.offspring_count is not None:
|
||||
anxiety_scores.append(emp.fertility_anxiety_score)
|
||||
offspring_counts.append(emp.offspring_count)
|
||||
|
||||
if len(anxiety_scores) < 3:
|
||||
return 0.0
|
||||
|
||||
# 计算皮尔逊相关系数
|
||||
n = len(anxiety_scores)
|
||||
sum_x = sum(anxiety_scores)
|
||||
sum_y = sum(offspring_counts)
|
||||
sum_xy = sum(x * y for x, y in zip(anxiety_scores, offspring_counts))
|
||||
sum_x2 = sum(x * x for x in anxiety_scores)
|
||||
sum_y2 = sum(y * y for y in offspring_counts)
|
||||
|
||||
numerator = n * sum_xy - sum_x * sum_y
|
||||
denominator = ((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)) ** 0.5
|
||||
|
||||
if denominator == 0:
|
||||
return 0.0
|
||||
|
||||
return numerator / denominator
|
||||
|
||||
class QualityControlManager:
|
||||
"""质量控制管理器"""
|
||||
|
||||
def __init__(self):
|
||||
self.source_validator = SourceValidator()
|
||||
self.integrity_checker = DataIntegrityChecker()
|
||||
self.cross_validator = CrossValidator()
|
||||
|
||||
def comprehensive_quality_check(self, data: Any, data_id: str = None) -> QualityReport:
|
||||
"""综合质量检查"""
|
||||
data_type = type(data).__name__
|
||||
data_id = data_id or f"{data_type}_{id(data)}"
|
||||
|
||||
validation_results = []
|
||||
recommendations = []
|
||||
|
||||
# 1. 史料来源验证
|
||||
if hasattr(data, 'sources'):
|
||||
source_result = self.source_validator.validate_sources(data.sources)
|
||||
validation_results.append({
|
||||
"category": "史料来源验证",
|
||||
"result": source_result
|
||||
})
|
||||
|
||||
if source_result["status"] != ValidationResult.PASS:
|
||||
recommendations.append("增加更多可靠的史料来源")
|
||||
|
||||
# 2. 数据完整性检查
|
||||
completeness_result = self.integrity_checker.check_completeness(data)
|
||||
validation_results.append({
|
||||
"category": "数据完整性检查",
|
||||
"result": completeness_result
|
||||
})
|
||||
|
||||
if completeness_result["status"] != ValidationResult.PASS:
|
||||
recommendations.append("补充缺失的必要字段")
|
||||
|
||||
# 3. 异常值检测
|
||||
anomaly_result = self.integrity_checker.detect_anomalies(data)
|
||||
validation_results.append({
|
||||
"category": "异常值检测",
|
||||
"result": anomaly_result
|
||||
})
|
||||
|
||||
if anomaly_result["status"] != ValidationResult.PASS:
|
||||
recommendations.append("检查并修正异常数据")
|
||||
|
||||
# 4. 计算总体评分
|
||||
scores = []
|
||||
for result in validation_results:
|
||||
if 'score' in result['result']:
|
||||
scores.append(result['result']['score'])
|
||||
|
||||
overall_score = statistics.mean(scores) if scores else 0.0
|
||||
|
||||
# 5. 确定可靠性等级
|
||||
if overall_score >= 0.8:
|
||||
reliability_level = ReliabilityLevel.HIGH
|
||||
elif overall_score >= 0.6:
|
||||
reliability_level = ReliabilityLevel.MEDIUM
|
||||
elif overall_score >= 0.4:
|
||||
reliability_level = ReliabilityLevel.LOW
|
||||
else:
|
||||
reliability_level = ReliabilityLevel.UNCERTAIN
|
||||
|
||||
# 6. 交叉验证状态
|
||||
cross_validation_status = overall_score >= 0.6
|
||||
|
||||
return QualityReport(
|
||||
data_id=data_id,
|
||||
data_type=data_type,
|
||||
overall_score=overall_score,
|
||||
reliability_level=reliability_level,
|
||||
validation_results=validation_results,
|
||||
recommendations=recommendations,
|
||||
cross_validation_status=cross_validation_status
|
||||
)
|
||||
|
||||
def batch_quality_check(self, data_list: List[Any]) -> List[QualityReport]:
|
||||
"""批量质量检查"""
|
||||
reports = []
|
||||
for i, data in enumerate(data_list):
|
||||
report = self.comprehensive_quality_check(data, f"batch_{i}")
|
||||
reports.append(report)
|
||||
return reports
|
||||
|
||||
def generate_quality_summary(self, reports: List[QualityReport]) -> Dict[str, Any]:
|
||||
"""生成质量摘要报告"""
|
||||
if not reports:
|
||||
return {"message": "无数据报告"}
|
||||
|
||||
reliability_distribution = Counter(report.reliability_level for report in reports)
|
||||
avg_score = statistics.mean(report.overall_score for report in reports)
|
||||
|
||||
high_quality_count = sum(1 for report in reports if report.overall_score >= 0.8)
|
||||
low_quality_count = sum(1 for report in reports if report.overall_score < 0.4)
|
||||
|
||||
return {
|
||||
"total_records": len(reports),
|
||||
"average_quality_score": avg_score,
|
||||
"reliability_distribution": dict(reliability_distribution),
|
||||
"high_quality_records": high_quality_count,
|
||||
"low_quality_records": low_quality_count,
|
||||
"quality_pass_rate": high_quality_count / len(reports),
|
||||
"recommendations": self._generate_batch_recommendations(reports)
|
||||
}
|
||||
|
||||
def _generate_batch_recommendations(self, reports: List[QualityReport]) -> List[str]:
|
||||
"""生成批量改进建议"""
|
||||
all_recommendations = []
|
||||
for report in reports:
|
||||
all_recommendations.extend(report.recommendations)
|
||||
|
||||
recommendation_counts = Counter(all_recommendations)
|
||||
|
||||
# 返回最常见的建议
|
||||
return [rec for rec, count in recommendation_counts.most_common(5)]
|
||||
|
||||
# 全局质量控制管理器实例
|
||||
quality_manager = QualityControlManager()
|
||||
Reference in New Issue
Block a user