# RoRD-Layout-Recognation/tests/benchmark_backbones.py
"""
Backbone A/B 基准测试脚本
目的在相同输入与重复次数下对比不同骨干vgg16/resnet34/efficientnet_b0
在单尺度与 FPN 前向推理的吞吐毫秒与显存占用MB
示例
uv run python tests/benchmark_backbones.py --device cpu --image-size 512 --runs 5
uv run python tests/benchmark_backbones.py --device cuda --runs 20 --backbones vgg16 resnet34 efficientnet_b0
"""
from __future__ import annotations

import argparse
import time
from typing import Dict, List

import numpy as np
import psutil
import torch

from models.rord import RoRD
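# Note: the import above assumes the script is launched from the repository
# root (e.g. via `uv run python tests/benchmark_backbones.py`) so that
# `models` is importable. If invoked from elsewhere, a sys.path tweak along
# these lines (a sketch, not part of the original script) may be needed:
#   import sys, pathlib
#   sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[1]))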


def get_mem_mb() -> float:
    """Return the resident set size (RSS) of this process in MB."""
    return psutil.Process().memory_info().rss / 1024 / 1024


def get_gpu_mem_mb() -> float:
    """Return currently allocated CUDA tensor memory in MB (0 if no GPU)."""
    if torch.cuda.is_available():
        return torch.cuda.memory_allocated() / 1024 / 1024
    return 0.0
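# Measurement note: memory_allocated() reports memory currently held by live
# tensors, not the peak of the run. If peak usage is wanted instead, PyTorch
# provides torch.cuda.max_memory_allocated() together with
# torch.cuda.reset_peak_memory_stats() (not used here, to keep the original
# behaviour).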


def warmup(model: torch.nn.Module, x: torch.Tensor, steps: int = 3, fpn: bool = False) -> None:
    """Run a few untimed forward passes so cuDNN autotuning, lazy CUDA
    context creation and cache effects do not pollute the timed runs."""
    with torch.inference_mode():
        for _ in range(steps):
            _ = model(x, return_pyramid=fpn)


def bench_once(model: torch.nn.Module, x: torch.Tensor, fpn: bool = False) -> float:
    """Time a single forward pass and return the latency in milliseconds."""
    # CUDA kernels launch asynchronously; synchronize before and after so the
    # wall-clock interval covers the full forward pass.
    if torch.cuda.is_available() and x.is_cuda:
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        _ = model(x, return_pyramid=fpn)
    if torch.cuda.is_available() and x.is_cuda:
        torch.cuda.synchronize()
    return (time.perf_counter() - t0) * 1000.0
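# Reporting note (a sketch, not part of the original script): means are easy
# to skew with a single slow run; percentiles are a common alternative.
# Given a list of per-run timings `ts`:
#   p50, p95 = np.percentile(ts, [50, 95])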


def run_benchmark(backbone: str, device: torch.device, image_size: int, runs: int) -> Dict[str, float | str]:
    # Build a minimal duck-typed config exposing only the attributes RoRD
    # reads (cfg.model.backbone.{name, pretrained} and cfg.model.attention.*),
    # avoiding a dependency on the full training config.
    cfg = type("cfg", (), {
        "model": type("m", (), {
            "backbone": type("b", (), {"name": backbone, "pretrained": False})(),
            "attention": type("a", (), {"enabled": False, "type": "none", "places": []})(),
        })()
    })()
    model = RoRD(cfg=cfg).to(device)
    model.eval()
    x = torch.randn(1, 3, image_size, image_size, device=device)
    # Warm up both forward paths before timing.
    warmup(model, x, steps=5, fpn=False)
    warmup(model, x, steps=5, fpn=True)
    # Single-scale timings.
    t_list_single: List[float] = []
    for _ in range(runs):
        t_list_single.append(bench_once(model, x, fpn=False))
    # FPN timings.
    t_list_fpn: List[float] = []
    for _ in range(runs):
        t_list_fpn.append(bench_once(model, x, fpn=True))
    return {
        "backbone": backbone,
        "single_ms_mean": float(np.mean(t_list_single)),
        "single_ms_std": float(np.std(t_list_single)),
        "fpn_ms_mean": float(np.mean(t_list_fpn)),
        "fpn_ms_std": float(np.std(t_list_fpn)),
        "gpu_mem_mb": float(get_gpu_mem_mb()),
        "cpu_mem_mb": float(get_mem_mb()),
        "runs": int(runs),
    }
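# Usage sketch (assumes the RoRD package is importable; for CUDA timings, a
# visible GPU):
#   res = run_benchmark("vgg16", torch.device("cpu"), image_size=256, runs=3)
#   print(res["single_ms_mean"], res["fpn_ms_mean"])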


def main():
    parser = argparse.ArgumentParser(description="RoRD backbone A/B benchmark")
    parser.add_argument("--backbones", nargs="*", default=["vgg16", "resnet34", "efficientnet_b0"],
                        help="backbones to benchmark")
    parser.add_argument("--image-size", type=int, default=512, help="input image size (square)")
    parser.add_argument("--runs", type=int, default=10, help="number of timed repetitions per setting")
    parser.add_argument("--device", type=str, default="cuda", help="cuda or cpu")
    args = parser.parse_args()
    # Fall back to CPU when CUDA is requested but unavailable.
    device = torch.device(args.device if torch.cuda.is_available() or args.device == "cpu" else "cpu")
    print(f"Using device: {device}")
    results: List[Dict[str, float | str]] = []
    for bk in args.backbones:
        print(f"\n=== Benchmark: {bk} ===")
        res = run_benchmark(bk, device, args.image_size, args.runs)
        print(f"single: {res['single_ms_mean']:.2f}±{res['single_ms_std']:.2f} ms | "
              f"fpn: {res['fpn_ms_mean']:.2f}±{res['fpn_ms_std']:.2f} ms | "
              f"gpu_mem: {res['gpu_mem_mb']:.1f} MB")
        results.append(res)
    # Brief side-by-side summary.
    print("\n===== Summary =====")
    for r in results:
        print(f"{r['backbone']:<16} single {r['single_ms_mean']:.2f} ms | fpn {r['fpn_ms_mean']:.2f} ms")


if __name__ == "__main__":
    main()