121 lines
3.9 KiB
Python
121 lines
3.9 KiB
Python
"""
|
||
Backbone A/B 基准测试脚本
|
||
|
||
目的:在相同输入与重复次数下,对比不同骨干(vgg16/resnet34/efficientnet_b0)
|
||
在单尺度与 FPN 前向推理的吞吐(毫秒)与显存占用(MB)。
|
||
|
||
示例:
|
||
uv run python tests/benchmark_backbones.py --device cpu --image-size 512 --runs 5
|
||
uv run python tests/benchmark_backbones.py --device cuda --runs 20 --backbones vgg16 resnet34 efficientnet_b0
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import time
|
||
from typing import Dict, List, Tuple
|
||
|
||
import numpy as np
|
||
import psutil
|
||
import torch
|
||
|
||
from models.rord import RoRD
|
||
|
||
|
||
def get_mem_mb() -> float:
|
||
p = psutil.Process()
|
||
return p.memory_info().rss / 1024 / 1024
|
||
|
||
|
||
def get_gpu_mem_mb() -> float:
|
||
if torch.cuda.is_available():
|
||
return torch.cuda.memory_allocated() / 1024 / 1024
|
||
return 0.0
|
||
|
||
|
||
def warmup(model: torch.nn.Module, x: torch.Tensor, steps: int = 3, fpn: bool = False) -> None:
|
||
with torch.inference_mode():
|
||
for _ in range(steps):
|
||
_ = model(x, return_pyramid=fpn)
|
||
|
||
|
||
def bench_once(model: torch.nn.Module, x: torch.Tensor, fpn: bool = False) -> float:
|
||
if torch.cuda.is_available() and x.is_cuda:
|
||
torch.cuda.synchronize()
|
||
t0 = time.time()
|
||
with torch.inference_mode():
|
||
_ = model(x, return_pyramid=fpn)
|
||
if torch.cuda.is_available() and x.is_cuda:
|
||
torch.cuda.synchronize()
|
||
return (time.time() - t0) * 1000.0
|
||
|
||
|
||
def run_benchmark(backbone: str, device: torch.device, image_size: int, runs: int) -> Dict[str, float]:
|
||
cfg = type("cfg", (), {
|
||
"model": type("m", (), {
|
||
"backbone": type("b", (), {"name": backbone, "pretrained": False})(),
|
||
"attention": type("a", (), {"enabled": False, "type": "none", "places": []})(),
|
||
})()
|
||
})()
|
||
|
||
model = RoRD(cfg=cfg).to(device)
|
||
model.eval()
|
||
|
||
x = torch.randn(1, 3, image_size, image_size, device=device)
|
||
|
||
# warmup
|
||
warmup(model, x, steps=5, fpn=False)
|
||
warmup(model, x, steps=5, fpn=True)
|
||
|
||
# single-scale
|
||
t_list_single: List[float] = []
|
||
for _ in range(runs):
|
||
t_list_single.append(bench_once(model, x, fpn=False))
|
||
|
||
# FPN
|
||
t_list_fpn: List[float] = []
|
||
for _ in range(runs):
|
||
t_list_fpn.append(bench_once(model, x, fpn=True))
|
||
|
||
return {
|
||
"backbone": backbone,
|
||
"single_ms_mean": float(np.mean(t_list_single)),
|
||
"single_ms_std": float(np.std(t_list_single)),
|
||
"fpn_ms_mean": float(np.mean(t_list_fpn)),
|
||
"fpn_ms_std": float(np.std(t_list_fpn)),
|
||
"gpu_mem_mb": float(get_gpu_mem_mb()),
|
||
"cpu_mem_mb": float(get_mem_mb()),
|
||
"runs": int(runs),
|
||
}
|
||
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(description="RoRD 骨干 A/B 基准测试")
|
||
parser.add_argument("--backbones", nargs="*", default=["vgg16", "resnet34", "efficientnet_b0"],
|
||
help="要测试的骨干列表")
|
||
parser.add_argument("--image-size", type=int, default=512, help="输入图像尺寸(正方形)")
|
||
parser.add_argument("--runs", type=int, default=10, help="每个设置的重复次数")
|
||
parser.add_argument("--device", type=str, default="cuda", help="cuda 或 cpu")
|
||
|
||
args = parser.parse_args()
|
||
|
||
device = torch.device(args.device if torch.cuda.is_available() or args.device == "cpu" else "cpu")
|
||
print(f"使用设备: {device}")
|
||
|
||
results: List[Dict[str, float]] = []
|
||
for bk in args.backbones:
|
||
print(f"\n=== Benchmark: {bk} ===")
|
||
res = run_benchmark(bk, device, args.image_size, args.runs)
|
||
print(f"single: {res['single_ms_mean']:.2f}±{res['single_ms_std']:.2f} ms | "
|
||
f"fpn: {res['fpn_ms_mean']:.2f}±{res['fpn_ms_std']:.2f} ms | "
|
||
f"gpu_mem: {res['gpu_mem_mb']:.1f} MB")
|
||
results.append(res)
|
||
|
||
# 简要对比打印
|
||
print("\n===== 汇总 =====")
|
||
for r in results:
|
||
print(f"{r['backbone']:<16} single {r['single_ms_mean']:.2f} ms | fpn {r['fpn_ms_mean']:.2f} ms")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|