2025-06-08 15:38:56 +08:00
|
|
|
# match.py
|
|
|
|
|
|
2025-06-07 23:45:32 +08:00
|
|
|
import torch
|
|
|
|
|
import torch.nn.functional as F
|
|
|
|
|
import numpy as np
|
|
|
|
|
import cv2
|
|
|
|
|
from PIL import Image
|
2025-06-08 15:38:56 +08:00
|
|
|
import argparse
|
|
|
|
|
import os
|
2025-06-07 23:45:32 +08:00
|
|
|
|
2025-06-08 15:38:56 +08:00
|
|
|
import config
|
|
|
|
|
from models.rord import RoRD
|
|
|
|
|
from utils.data_utils import get_transform
|
2025-06-07 23:45:32 +08:00
|
|
|
|
2025-07-22 23:43:35 +08:00
|
|
|
# --- Feature extraction functions (unchanged) ---
|
2025-06-09 01:49:13 +08:00
|
|
|
def extract_keypoints_and_descriptors(model, image_tensor, kp_thresh):
    """Detect keypoints in one image tensor and sample a descriptor at each.

    Args:
        model: Callable returning ``(detection_map, desc)`` for
            ``image_tensor``. The first two axes of ``detection_map`` are
            squeezed away and ``desc.shape[1:]`` is read as
            ``(channels, height, width)``, so batch size 1 is assumed.
        image_tensor: Input forwarded to ``model`` unchanged.
        kp_thresh: Detection scores strictly greater than this value become
            keypoints.

    Returns:
        Tuple ``(keypoints, descriptors)``:
        ``keypoints`` is ``(N, 2)`` in ``(x, y)`` order, expressed in image
        pixels (feature-map coordinates multiplied by 8);
        ``descriptors`` is ``(N, C)`` and L2-normalised per row.
        When nothing is detected both tensors have a leading dimension of 0.
    """
    with torch.no_grad():
        detection_map, desc = model(image_tensor)

    device = detection_map.device
    binary_map = (detection_map > kp_thresh).squeeze(0).squeeze(0)
    coords = torch.nonzero(binary_map).float()  # (N, 2) as (y, x)

    if len(coords) == 0:
        # Return correctly shaped empties so callers can index/concatenate
        # them like any other result; len() is still 0.
        return (
            torch.empty((0, 2), device=device),
            torch.empty((0, desc.shape[1]), dtype=desc.dtype, device=device),
        )

    # Descriptor sampling: grid_sample wants (1, N, 1, 2) in (x, y) order,
    # normalized to [-1, 1].
    feat_h, feat_w = desc.shape[2], desc.shape[3]
    # max(..., 1) avoids 0/0 -> NaN when the feature map is one pixel wide or
    # high; the only valid coordinate (0) then maps to -1, i.e. pixel 0.
    half_extent = torch.tensor(
        [max(feat_w - 1, 1) / 2, max(feat_h - 1, 1) / 2], device=device
    )
    coords_for_grid = coords.flip(1).view(1, -1, 1, 2) / half_extent - 1

    sampled = F.grid_sample(desc, coords_for_grid, align_corners=True)  # (1, C, N, 1)
    # Squeeze only the batch and trailing axes. A blanket squeeze() would also
    # drop N or C when either equals 1 and break the dim=1 normalisation.
    descriptors = sampled.squeeze(0).squeeze(-1).T  # (N, C)
    descriptors = F.normalize(descriptors, p=2, dim=1)

    # Convert keypoint coordinates from feature map scale back to image scale.
    # VGG downsampling rate to relu4_3 is 8.
    keypoints = coords.flip(1) * 8.0  # x, y

    return keypoints, descriptors
|
|
|
|
|
|
2025-07-22 23:43:35 +08:00
|
|
|
# --- (New) Sliding window feature extraction function ---
|
2025-06-09 01:49:13 +08:00
|
|
|
def extract_features_sliding_window(model, large_image, transform):
    """
    Tile a large image with windows and pool the features found in every tile.

    The image is scanned in steps of ``config.INFERENCE_STRIDE``; each tile is
    at most ``config.INFERENCE_WINDOW_SIZE`` on a side and is clipped at the
    right and bottom borders. Keypoints found in a tile are shifted by the
    tile's top-left corner so that they are expressed in full-image
    coordinates.

    Args:
        model: Network handed to ``extract_keypoints_and_descriptors``; its
            first parameter determines the device tiles are moved to.
        large_image: Image exposing ``.size`` as ``(width, height)`` and
            ``.crop(box)``.
        transform: Callable turning a cropped tile into a tensor.

    Returns:
        ``(keypoints, descriptors)`` concatenated over all tiles, or a pair of
        empty tensors when no tile produced a keypoint.
    """
    print("Using sliding window to extract features from large layout...")
    device = next(model.parameters()).device
    img_w, img_h = large_image.size
    win = config.INFERENCE_WINDOW_SIZE
    step = config.INFERENCE_STRIDE

    kp_chunks = []
    desc_chunks = []

    for top in range(0, img_h, step):
        for left in range(0, img_w, step):
            # Clip the tile so it never reaches past the image border.
            box = (left, top, min(left + win, img_w), min(top + win, img_h))
            tile_tensor = transform(large_image.crop(box)).unsqueeze(0).to(device)

            tile_kps, tile_descs = extract_keypoints_and_descriptors(
                model, tile_tensor, config.KEYPOINT_THRESHOLD
            )
            if len(tile_kps) == 0:
                continue

            # Tile-local (x, y) -> full-image (x, y).
            tile_kps[:, 0].add_(left)
            tile_kps[:, 1].add_(top)
            kp_chunks.append(tile_kps)
            desc_chunks.append(tile_descs)

    if len(kp_chunks) == 0:
        return torch.tensor([], device=device), torch.tensor([], device=device)

    total_kps = sum(len(chunk) for chunk in kp_chunks)
    print(f"Large layout feature extraction completed, found {total_kps} keypoints in total.")
    return torch.cat(kp_chunks, dim=0), torch.cat(desc_chunks, dim=0)
|
2025-06-07 23:45:32 +08:00
|
|
|
|
2025-06-09 01:49:13 +08:00
|
|
|
|
2025-07-22 23:43:35 +08:00
|
|
|
# --- Mutual nearest neighbor matching (unchanged) ---
|
2025-06-08 15:38:56 +08:00
|
|
|
def mutual_nearest_neighbor(descs1, descs2):
    """Return index pairs of descriptors that are each other's best match.

    Similarity is the dot product ``descs1 @ descs2.T``. A pair ``(i, j)`` is
    kept when ``j`` is the most similar row of ``descs2`` for row ``i`` of
    ``descs1`` and ``i`` is the most similar row of ``descs1`` for row ``j``.

    Args:
        descs1: ``(N1, C)`` descriptor tensor.
        descs2: ``(N2, C)`` descriptor tensor.

    Returns:
        ``(M, 2)`` int64 tensor of ``[index_in_descs1, index_in_descs2]`` rows;
        shape ``(0, 2)`` when either input is empty.
    """
    if len(descs1) == 0 or len(descs2) == 0:
        # Create the empty result on the inputs' device so it matches what a
        # populated result would use. getattr keeps non-tensor empties working.
        return torch.empty(
            (0, 2), dtype=torch.int64, device=getattr(descs1, "device", None)
        )

    sim = descs1 @ descs2.T
    best_in_2 = torch.max(sim, dim=1).indices  # for each row of descs1
    best_in_1 = torch.max(sim, dim=0).indices  # for each row of descs2

    ids1 = torch.arange(sim.shape[0], device=sim.device)
    # Mutual check: following i -> best_in_2[i] -> best_in_1[...] returns to i.
    mutual = best_in_1[best_in_2] == ids1
    return torch.stack([ids1[mutual], best_in_2[mutual]], dim=1)
|
|
|
|
|
|
2025-07-22 23:43:35 +08:00
|
|
|
# --- (Modified) Multi-scale, multi-instance matching main function ---
|
2025-06-09 01:49:13 +08:00
|
|
|
def _extract_template_pyramid(model, template_image, transform, device):
    """Extract template features once per scale in ``config.PYRAMID_SCALES``.

    Returns a list of ``(scale, keypoints, descriptors)`` tuples. Scales that
    collapse the template to an empty image, or that yield fewer than the 4
    keypoints a homography needs, are skipped.
    """
    base_w, base_h = template_image.size
    pyramid = []
    for scale in config.PYRAMID_SCALES:
        new_w, new_h = int(base_w * scale), int(base_h * scale)
        if new_w < 1 or new_h < 1:
            # PIL refuses to resize to an empty image; nothing to match here.
            continue

        scaled_template = template_image.resize((new_w, new_h), Image.LANCZOS)
        template_tensor = transform(scaled_template).unsqueeze(0).to(device)
        template_kps, template_descs = extract_keypoints_and_descriptors(
            model, template_tensor, config.KEYPOINT_THRESHOLD
        )
        if len(template_kps) < 4:
            continue
        pyramid.append((scale, template_kps, template_descs))
    return pyramid


def match_template_multiscale(model, layout_image, template_image, transform):
    """
    Search for template at different scales and detect multiple instances.

    Layout features are extracted once with the sliding window. The function
    then repeatedly picks, over all template scales, the RANSAC homography
    with the most inliers, records the bounding box of its inlier layout
    keypoints, deactivates every layout keypoint inside that box and searches
    again. The search stops when no scale yields more than
    ``config.MIN_INLIERS`` inliers or too few active keypoints remain.

    Args:
        model: Feature network passed through to the extraction helpers.
        layout_image: Large image to search in.
        template_image: Image to look for; must expose ``.size`` and
            ``.resize``.
        transform: Callable converting an image into a tensor.

    Returns:
        List of dicts with keys ``'x'``, ``'y'``, ``'width'``, ``'height'``
        (ints, layout pixels) and ``'homography'`` (matrix returned by
        ``cv2.findHomography``, mapping original-size template coordinates to
        layout coordinates). Empty when nothing is found.
    """
    # 1. Use sliding window to extract all features from large layout
    layout_kps, layout_descs = extract_features_sliding_window(model, layout_image, transform)

    if len(layout_kps) < config.MIN_INLIERS:
        print("Too few keypoints extracted from large layout, cannot perform matching.")
        return []

    # Template features do not depend on which layout keypoints are still
    # active, so run the network once per scale instead of once per scale for
    # every detected instance.
    template_pyramid = _extract_template_pyramid(
        model, template_image, transform, layout_kps.device
    )

    found_instances = []
    active_layout_mask = torch.ones(len(layout_kps), dtype=torch.bool, device=layout_kps.device)

    # 2. Multi-instance iterative detection
    while True:
        current_active_indices = torch.nonzero(active_layout_mask).squeeze(1)

        # Stop if remaining active keypoints are too few
        if len(current_active_indices) < config.MIN_INLIERS:
            break

        current_layout_descs = layout_descs[current_active_indices]

        best_match_info = {'inliers': 0, 'H': None, 'mask': None, 'scale': None, 'dst_pts': None}

        # 3. Image pyramid: iterate through each scale of template
        print("Searching for template at new scale...")
        for scale, template_kps, template_descs in template_pyramid:
            # Match current scale template with active layout features
            matches = mutual_nearest_neighbor(template_descs, current_layout_descs)
            if len(matches) < 4:
                continue

            # RANSAC
            # Note: template keypoint coordinates need to be restored to
            # original size to calculate correct H
            src_pts = template_kps[matches[:, 0]].cpu().numpy() / scale
            # matches index into the active subset; map back to global indices.
            dst_pts_indices = current_active_indices[matches[:, 1]]
            dst_pts = layout_kps[dst_pts_indices].cpu().numpy()

            homography, mask = cv2.findHomography(
                src_pts, dst_pts, cv2.RANSAC, config.RANSAC_REPROJ_THRESHOLD
            )
            if homography is None:
                continue

            inlier_count = int(mask.sum())
            if inlier_count > best_match_info['inliers']:
                best_match_info = {
                    'inliers': inlier_count,
                    'H': homography,
                    'mask': mask,
                    'scale': scale,
                    'dst_pts': dst_pts,
                }

        # 4. If best match found across all scales, record and mask
        if best_match_info['inliers'] > config.MIN_INLIERS:
            print(f"Found a matching instance! Inliers: {best_match_info['inliers']}, Template scale used: {best_match_info['scale']:.2f}x")

            inlier_mask = best_match_info['mask'].ravel().astype(bool)
            inlier_layout_kps = best_match_info['dst_pts'][inlier_mask]

            # Plain Python floats keep the tensor comparisons below independent
            # of NumPy scalar types.
            x_min, y_min = (float(v) for v in inlier_layout_kps.min(axis=0))
            x_max, y_max = (float(v) for v in inlier_layout_kps.max(axis=0))

            instance = {
                'x': int(x_min),
                'y': int(y_min),
                'width': int(x_max - x_min),
                'height': int(y_max - y_min),
                'homography': best_match_info['H'],
            }
            found_instances.append(instance)

            # Mask keypoints in matched region to detect next instance. The
            # inliers themselves lie in the box, so every round removes at
            # least one active keypoint and the loop terminates.
            kp_x, kp_y = layout_kps[:, 0], layout_kps[:, 1]
            region_mask = (kp_x >= x_min) & (kp_x <= x_max) & (kp_y >= y_min) & (kp_y <= y_max)
            active_layout_mask[region_mask] = False

            print(f"Remaining active keypoints: {active_layout_mask.sum()}")
        else:
            # If no good match found across all scales, end search
            print("No new matching instances found across all scales, search ended.")
            break

    return found_instances
|
|
|
|
|
|
2025-06-09 01:49:13 +08:00
|
|
|
|
|
|
|
|
def visualize_matches(layout_path, bboxes, output_path):
    """Draw labelled bounding boxes on the layout image and save the result.

    Args:
        layout_path: Path of the image to annotate.
        bboxes: Iterable of dicts with integer ``'x'``, ``'y'``, ``'width'``
            and ``'height'`` entries.
        output_path: Destination file for the annotated image.

    Raises:
        FileNotFoundError: If the layout image cannot be read.
        OSError: If OpenCV reports that the output could not be written.
    """
    layout_img = cv2.imread(layout_path)
    if layout_img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read layout image: {layout_path}")

    for i, bbox in enumerate(bboxes, start=1):
        x, y, w, h = bbox['x'], bbox['y'], bbox['width'], bbox['height']
        cv2.rectangle(layout_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # Keep the label baseline inside the image for boxes near the top edge.
        label_y = max(y - 10, 20)
        cv2.putText(layout_img, f"Match {i}", (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    # cv2.imwrite returns False on failure; do not claim success in that case.
    if not cv2.imwrite(output_path, layout_img):
        raise OSError(f"Failed to write visualization to: {output_path}")
    print(f"Visualization result saved to: {output_path}")
|
2025-06-07 23:45:32 +08:00
|
|
|
|
2025-06-09 01:49:13 +08:00
|
|
|
|
2025-06-07 23:45:32 +08:00
|
|
|
if __name__ == "__main__":
    # Command-line entry point: load the model, run matching, print the
    # detected boxes and optionally save a visualization.
    parser = argparse.ArgumentParser(description="Multi-scale template matching using RoRD")
    parser.add_argument('--model_path', type=str, default=config.MODEL_PATH)
    parser.add_argument('--layout', type=str, required=True)
    parser.add_argument('--template', type=str, required=True)
    parser.add_argument('--output', type=str)
    args = parser.parse_args()

    transform = get_transform()

    # Use the GPU when present, but stay runnable on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = RoRD().to(device)
    # map_location lets a checkpoint saved on GPU load on whatever device is
    # in use. NOTE(review): torch.load unpickles the file, so only load
    # checkpoints from a trusted source.
    model.load_state_dict(torch.load(args.model_path, map_location=device))
    model.eval()

    layout_image = Image.open(args.layout).convert('L')
    template_image = Image.open(args.template).convert('L')

    detected_bboxes = match_template_multiscale(model, layout_image, template_image, transform)

    print("\nDetected bounding boxes:")
    for bbox in detected_bboxes:
        print(bbox)

    if args.output:
        visualize_matches(args.layout, detected_bboxes, args.output)
|