"""
FPN vs 滑窗性能对标脚本

功能：比较 FPN 推理路径与传统图像金字塔滑窗路径的性能差异。

输出指标：
  - 推理时间（ms）
  - 内存占用（MB）
  - 检测到的关键点数
  - 检测精度（匹配内点数）

使用示例：
  uv run python tests/benchmark_fpn.py \
    --layout /path/to/layout.png \
    --template /path/to/template.png \
    --num-runs 5 \
    --output benchmark_results.json
"""

import argparse
import json
import sys
import time
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import psutil
import torch
from PIL import Image

# 添加项目根目录到 Python 路径
sys.path.insert(0, str(Path(__file__).parent.parent))

from models.rord import RoRD
from utils.config_loader import load_config, to_absolute_path
from utils.data_utils import get_transform


def get_memory_usage() -> float:
    """获取当前进程的内存占用（MB）"""
    process = psutil.Process()
    return process.memory_info().rss / 1024 / 1024


def get_gpu_memory_usage() -> float:
    """获取 GPU 显存占用（MB）"""
    if torch.cuda.is_available():
        return torch.cuda.memory_allocated() / 1024 / 1024
    return 0


def benchmark_fpn(
    model: torch.nn.Module,
    layout_image: Image.Image,
    template_image: Image.Image,
    transform,
    matching_cfg,
    num_runs: int = 5,
) -> Dict[str, float]:
    """
    测试 FPN 性能
    
    Args:
        model: RoRD 模型
        layout_image: 大版图
        template_image: 模板
        transform: 图像预处理管道
        matching_cfg: 匹配配置
        num_runs: 运行次数
    
    Returns:
        性能指标字典
    """
    from match import extract_from_pyramid, extract_features_sliding_window, mutual_nearest_neighbor
    
    device = next(model.parameters()).device
    times = []
    keypoint_counts = []
    inlier_counts = []
    
    print(f"\n{'=' * 60}")
    print(f"性能测试：FPN 路径")
    print(f"{'=' * 60}")
    
    for run in range(num_runs):
        # 版图特征提取
        layout_tensor = transform(layout_image).unsqueeze(0).to(device)
        
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        start_time = time.time()
        
        layout_kps, layout_descs = extract_from_pyramid(
            model, 
            layout_tensor, 
            float(matching_cfg.keypoint_threshold),
            getattr(matching_cfg, 'nms', {})
        )
        
        # 模板特征提取（单尺度，取 1.0）
        template_tensor = transform(template_image).unsqueeze(0).to(device)
        template_kps, template_descs = extract_from_pyramid(
            model,
            template_tensor,
            float(matching_cfg.keypoint_threshold),
            getattr(matching_cfg, 'nms', {})
        )
        
        # 匹配
        if len(layout_descs) > 0 and len(template_descs) > 0:
            matches = mutual_nearest_neighbor(template_descs, layout_descs)
            inlier_count = len(matches)
        else:
            inlier_count = 0
        
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        elapsed = (time.time() - start_time) * 1000  # 转换为 ms
        
        times.append(elapsed)
        keypoint_counts.append(len(layout_kps))
        inlier_counts.append(inlier_count)
        
        print(f"  Run {run + 1}/{num_runs}: {elapsed:.2f}ms, KPs: {len(layout_kps)}, Matches: {inlier_count}")
    
    mean_time = np.mean(times)
    std_time = np.std(times)
    mean_kps = np.mean(keypoint_counts)
    mean_inliers = np.mean(inlier_counts)
    gpu_mem = get_gpu_memory_usage()
    
    return {
        "method": "FPN",
        "mean_time_ms": float(mean_time),
        "std_time_ms": float(std_time),
        "min_time_ms": float(np.min(times)),
        "max_time_ms": float(np.max(times)),
        "all_times_ms": [float(t) for t in times],
        "mean_keypoints": float(mean_kps),
        "mean_matches": float(mean_inliers),
        "gpu_memory_mb": float(gpu_mem),
        "num_runs": num_runs,
    }


def benchmark_sliding_window(
    model: torch.nn.Module,
    layout_image: Image.Image,
    template_image: Image.Image,
    transform,
    matching_cfg,
    num_runs: int = 5,
) -> Dict[str, float]:
    """
    测试滑窗性能（图像金字塔路径）
    
    Args:
        model: RoRD 模型
        layout_image: 大版图
        template_image: 模板
        transform: 图像预处理管道
        matching_cfg: 匹配配置
        num_runs: 运行次数
    
    Returns:
        性能指标字典
    """
    from match import extract_features_sliding_window, extract_keypoints_and_descriptors, mutual_nearest_neighbor
    
    device = next(model.parameters()).device
    times = []
    keypoint_counts = []
    inlier_counts = []
    
    print(f"\n{'=' * 60}")
    print(f"性能测试：滑窗路径")
    print(f"{'=' * 60}")
    
    for run in range(num_runs):
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        start_time = time.time()
        
        # 版图滑窗特征提取
        layout_kps, layout_descs = extract_features_sliding_window(
            model, 
            layout_image, 
            transform, 
            matching_cfg
        )
        
        # 模板单尺度特征提取
        template_tensor = transform(template_image).unsqueeze(0).to(device)
        template_kps, template_descs = extract_keypoints_and_descriptors(
            model,
            template_tensor,
            float(matching_cfg.keypoint_threshold)
        )
        
        # 匹配
        if len(layout_descs) > 0 and len(template_descs) > 0:
            matches = mutual_nearest_neighbor(template_descs, layout_descs)
            inlier_count = len(matches)
        else:
            inlier_count = 0
        
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        elapsed = (time.time() - start_time) * 1000  # 转换为 ms
        
        times.append(elapsed)
        keypoint_counts.append(len(layout_kps))
        inlier_counts.append(inlier_count)
        
        print(f"  Run {run + 1}/{num_runs}: {elapsed:.2f}ms, KPs: {len(layout_kps)}, Matches: {inlier_count}")
    
    mean_time = np.mean(times)
    std_time = np.std(times)
    mean_kps = np.mean(keypoint_counts)
    mean_inliers = np.mean(inlier_counts)
    gpu_mem = get_gpu_memory_usage()
    
    return {
        "method": "Sliding Window",
        "mean_time_ms": float(mean_time),
        "std_time_ms": float(std_time),
        "min_time_ms": float(np.min(times)),
        "max_time_ms": float(np.max(times)),
        "all_times_ms": [float(t) for t in times],
        "mean_keypoints": float(mean_kps),
        "mean_matches": float(mean_inliers),
        "gpu_memory_mb": float(gpu_mem),
        "num_runs": num_runs,
    }


def compute_speedup(fpn_result: Dict, sw_result: Dict) -> Dict[str, float]:
    """计算 FPN 相对于滑窗的性能改进"""
    speedup = (sw_result["mean_time_ms"] - fpn_result["mean_time_ms"]) / sw_result["mean_time_ms"] * 100
    memory_saving = (sw_result["gpu_memory_mb"] - fpn_result["gpu_memory_mb"]) / sw_result["gpu_memory_mb"] * 100 if sw_result["gpu_memory_mb"] > 0 else 0
    
    return {
        "speedup_percent": float(speedup),
        "memory_saving_percent": float(memory_saving),
        "fpn_faster": speedup > 0,
        "meets_speedup_target": speedup >= 30,
        "meets_memory_target": memory_saving >= 20,
    }


def print_results(fpn_result: Dict, sw_result: Dict, comparison: Dict) -> None:
    """打印性能对比结果"""
    
    print(f"\n{'=' * 80}")
    print(f"{'性能基准测试结果':^80}")
    print(f"{'=' * 80}\n")
    
    print(f"{'指标':<30} {'FPN':<20} {'滑窗':<20}")
    print("-" * 70)
    
    print(f"{'平均推理时间 (ms)':<30} {fpn_result['mean_time_ms']:<20.2f} {sw_result['mean_time_ms']:<20.2f}")
    print(f"{'标准差 (ms)':<30} {fpn_result['std_time_ms']:<20.2f} {sw_result['std_time_ms']:<20.2f}")
    print(f"{'最小时间 (ms)':<30} {fpn_result['min_time_ms']:<20.2f} {sw_result['min_time_ms']:<20.2f}")
    print(f"{'最大时间 (ms)':<30} {fpn_result['max_time_ms']:<20.2f} {sw_result['max_time_ms']:<20.2f}")
    print()
    
    print(f"{'平均关键点数':<30} {fpn_result['mean_keypoints']:<20.0f} {sw_result['mean_keypoints']:<20.0f}")
    print(f"{'平均匹配数':<30} {fpn_result['mean_matches']:<20.0f} {sw_result['mean_matches']:<20.0f}")
    print()
    
    print(f"{'GPU 内存占用 (MB)':<30} {fpn_result['gpu_memory_mb']:<20.2f} {sw_result['gpu_memory_mb']:<20.2f}")
    print()
    
    print(f"{'=' * 80}")
    print(f"{'对标结果':^80}")
    print(f"{'=' * 80}\n")
    
    speedup = comparison["speedup_percent"]
    memory_saving = comparison["memory_saving_percent"]
    
    print(f"推理速度提升: {speedup:+.2f}% {'✅' if speedup >= 30 else '⚠️'}")
    print(f"  (目标: ≥30% | 达成: {'是' if comparison['meets_speedup_target'] else '否'})")
    print()
    
    print(f"内存节省: {memory_saving:+.2f}% {'✅' if memory_saving >= 20 else '⚠️'}")
    print(f"  (目标: ≥20% | 达成: {'是' if comparison['meets_memory_target'] else '否'})")
    print()
    
    if speedup > 0:
        print(f"🎉 FPN 相比滑窗快 {abs(speedup):.2f}%")
    elif speedup < 0:
        print(f"⚠️ FPN 相比滑窗慢 {abs(speedup):.2f}%")
    else:
        print(f"ℹ️ FPN 与滑窗性能相当")
    print()


def main():
    parser = argparse.ArgumentParser(description="RoRD FPN vs 滑窗性能对标测试")
    parser.add_argument('--config', type=str, default="configs/base_config.yaml", help="YAML 配置文件")
    parser.add_argument('--model_path', type=str, default=None, help="模型权重路径")
    parser.add_argument('--layout', type=str, required=True, help="版图路径")
    parser.add_argument('--template', type=str, required=True, help="模板路径")
    parser.add_argument('--num-runs', type=int, default=5, help="每个方法的运行次数")
    parser.add_argument('--output', type=str, default="benchmark_results.json", help="输出 JSON 文件路径")
    parser.add_argument('--device', type=str, default="cuda", help="使用设备: cuda 或 cpu")
    
    args = parser.parse_args()
    
    # 加载配置
    cfg = load_config(args.config)
    config_dir = Path(args.config).resolve().parent
    matching_cfg = cfg.matching
    
    model_path = args.model_path or str(to_absolute_path(cfg.paths.model_path, config_dir))
    
    # 设置设备
    device = torch.device(args.device if torch.cuda.is_available() or args.device == "cpu" else "cpu")
    print(f"使用设备: {device}")
    
    # 加载模型
    print(f"加载模型: {model_path}")
    model = RoRD().to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    
    # 加载图像
    print(f"加载版图: {args.layout}")
    layout_image = Image.open(args.layout).convert('L')
    print(f"  尺寸: {layout_image.size}")
    
    print(f"加载模板: {args.template}")
    template_image = Image.open(args.template).convert('L')
    print(f"  尺寸: {template_image.size}")
    
    # 获取预处理管道
    transform = get_transform()
    
    # 运行基准测试
    print(f"\n{'=' * 80}")
    print(f"{'开始性能基准测试':^80}")
    print(f"{'=' * 80}")
    print(f"运行次数: {args.num_runs}")
    print(f"配置: {args.config}")
    
    with torch.no_grad():
        fpn_result = benchmark_fpn(
            model, layout_image, template_image, transform, matching_cfg, args.num_runs
        )
        
        # 临时禁用 FPN，启用滑窗
        original_use_fpn = getattr(matching_cfg, 'use_fpn', True)
        matching_cfg.use_fpn = False
        
        sw_result = benchmark_sliding_window(
            model, layout_image, template_image, transform, matching_cfg, args.num_runs
        )
        
        # 恢复配置
        matching_cfg.use_fpn = original_use_fpn
    
    # 计算对比指标
    comparison = compute_speedup(fpn_result, sw_result)
    
    # 打印结果
    print_results(fpn_result, sw_result, comparison)
    
    # 保存结果
    results = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "config": str(args.config),
        "model_path": str(model_path),
        "layout_path": str(args.layout),
        "layout_size": list(layout_image.size),
        "template_path": str(args.template),
        "template_size": list(template_image.size),
        "device": str(device),
        "fpn": fpn_result,
        "sliding_window": sw_result,
        "comparison": comparison,
    }
    
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"\n✅ 结果已保存至: {output_path}")
    print(f"{'=' * 80}\n")
    
    # 退出状态码
    if comparison["meets_speedup_target"] and comparison["meets_memory_target"]:
        print("🎉 所有性能指标均达到预期目标！")
        return 0
    elif comparison["fpn_faster"]:
        print("✅ FPN 性能优于滑窗，但未完全达到目标。")
        return 1
    else:
        print("⚠️ FPN 性能未优于滑窗，需要优化。")
        return 2


if __name__ == "__main__":
    sys.exit(main())