""" ETH3D multi-view benchmark, used for line matching evaluation. """ import logging import os import shutil import numpy as np import cv2 import torch from pathlib import Path import zipfile from .base_dataset import BaseDataset from .utils import scale_intrinsics from ..geometry.wrappers import Camera, Pose from ..settings import DATA_PATH from ..utils.image import ImagePreprocessor, load_image logger = logging.getLogger(__name__) def read_cameras(camera_file, scale_factor=None): """Read the camera intrinsics from a file in COLMAP format.""" with open(camera_file, "r") as f: raw_cameras = f.read().rstrip().split("\n") raw_cameras = raw_cameras[3:] cameras = [] for c in raw_cameras: data = c.split(" ") fx, fy, cx, cy = np.array(list(map(float, data[4:]))) K = np.array([[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]], dtype=np.float32) if scale_factor is not None: K = scale_intrinsics(K, np.array([scale_factor, scale_factor])) cameras.append(Camera.from_calibration_matrix(K).float()) return cameras def qvec2rotmat(qvec): """Convert from quaternions to rotation matrix.""" return np.array( [ [ 1 - 2 * qvec[2] ** 2 - 2 * qvec[3] ** 2, 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2], ], [ 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], 1 - 2 * qvec[1] ** 2 - 2 * qvec[3] ** 2, 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1], ], [ 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], 1 - 2 * qvec[1] ** 2 - 2 * qvec[2] ** 2, ], ] ) class ETH3DDataset(BaseDataset): default_conf = { "data_dir": "ETH3D_undistorted", "grayscale": True, "downsize_factor": 8, "min_covisibility": 500, "batch_size": 1, "two_view": True, "min_overlap": 0.5, "max_overlap": 1.0, "sort_by_overlap": False, "seed": 0, } def _init(self, conf): self.grayscale = conf.grayscale self.downsize_factor = conf.downsize_factor # Set random seeds np.random.seed(conf.seed) torch.manual_seed(conf.seed) # Auto-download the dataset if not (DATA_PATH / conf.data_dir).exists(): logger.info("Downloading the ETH3D dataset...") self.download_eth3d() # Form pairs of images from the multiview dataset self.img_dir = DATA_PATH / conf.data_dir self.data = [] for folder in self.img_dir.iterdir(): img_folder = Path(folder, "images", "dslr_images_undistorted") depth_folder = Path(folder, "ground_truth_depth/undistorted_depth") depth_ext = ".png" names = [img.name for img in img_folder.iterdir()] names.sort() # Read intrinsics and extrinsics data cameras = read_cameras( str(Path(folder, "dslr_calibration_undistorted", "cameras.txt")), 1 / self.downsize_factor, ) name_to_cam_idx = {name: {} for name in names} with open( str(Path(folder, "dslr_calibration_jpg", "images.txt")), "r" ) as f: raw_data = f.read().rstrip().split("\n")[4::2] for raw_line in raw_data: line = raw_line.split(" ") img_name = os.path.basename(line[-1]) name_to_cam_idx[img_name]["dist_camera_idx"] = int(line[-2]) T_world_to_camera = {} image_visible_points3D = {} with open( str(Path(folder, "dslr_calibration_undistorted", "images.txt")), "r" ) as f: lines = f.readlines()[4:] # Skip the header raw_poses = [line.strip("\n").split(" ") for line in lines[::2]] raw_points = [line.strip("\n").split(" ") for line in lines[1::2]] for raw_pose, raw_pts in zip(raw_poses, raw_points): img_name = os.path.basename(raw_pose[-1]) # Extract the transform from world to camera target_extrinsics = list(map(float, raw_pose[1:8])) pose = np.eye(4, dtype=np.float32) pose[:3, :3] = qvec2rotmat(target_extrinsics[:4]) pose[:3, 3] 
                pose[:3, 3] = target_extrinsics[4:]
                T_world_to_camera[img_name] = pose
                name_to_cam_idx[img_name]["undist_camera_idx"] = int(raw_pose[-2])

                # Extract the visible 3D points
                point3D_ids = [pid for pid in map(int, raw_pts[2::3]) if pid != -1]
                image_visible_points3D[img_name] = set(point3D_ids)

            # Extract the covisibility of each image pair
            num_imgs = len(names)
            n_covisible_points = np.zeros((num_imgs, num_imgs))
            for i in range(num_imgs - 1):
                for j in range(i + 1, num_imgs):
                    visible_points3D1 = image_visible_points3D[names[i]]
                    visible_points3D2 = image_visible_points3D[names[j]]
                    n_covisible_points[i, j] = len(
                        visible_points3D1 & visible_points3D2
                    )

            # Keep only the pairs with enough covisibility
            valid_pairs = np.where(n_covisible_points >= conf.min_covisibility)
            valid_pairs = np.stack(valid_pairs, axis=1)

            self.data += [
                {
                    "view0": {
                        "name": names[i][:-4],
                        "img_path": str(Path(img_folder, names[i])),
                        "depth_path": str(Path(depth_folder, names[i][:-4]))
                        + depth_ext,
                        "camera": cameras[
                            name_to_cam_idx[names[i]]["dist_camera_idx"]
                        ],
                        "T_w2cam": Pose.from_4x4mat(T_world_to_camera[names[i]]),
                    },
                    "view1": {
                        "name": names[j][:-4],
                        "img_path": str(Path(img_folder, names[j])),
                        "depth_path": str(Path(depth_folder, names[j][:-4]))
                        + depth_ext,
                        "camera": cameras[
                            name_to_cam_idx[names[j]]["dist_camera_idx"]
                        ],
                        "T_w2cam": Pose.from_4x4mat(T_world_to_camera[names[j]]),
                    },
                    "T_world_to_ref": Pose.from_4x4mat(T_world_to_camera[names[i]]),
                    "T_world_to_target": Pose.from_4x4mat(
                        T_world_to_camera[names[j]]
                    ),
                    "T_0to1": Pose.from_4x4mat(
                        np.float32(
                            T_world_to_camera[names[j]]
                            @ np.linalg.inv(T_world_to_camera[names[i]])
                        )
                    ),
                    "T_1to0": Pose.from_4x4mat(
                        np.float32(
                            T_world_to_camera[names[i]]
                            @ np.linalg.inv(T_world_to_camera[names[j]])
                        )
                    ),
                    "n_covisible_points": n_covisible_points[i, j],
                }
                for (i, j) in valid_pairs
            ]

        # Print some info
        print("[Info] Successfully initialized dataset")
        print("\t Name: ETH3D")
        print("----------------------------------------")

    def download_eth3d(self):
        data_dir = DATA_PATH / self.conf.data_dir
        tmp_dir = data_dir.parent / "ETH3D_tmp"
        if tmp_dir.exists():
            shutil.rmtree(tmp_dir)
        tmp_dir.mkdir(exist_ok=True, parents=True)

        url_base = "https://cvg-data.inf.ethz.ch/ETH3D_undistorted/"
        zip_name = "ETH3D_undistorted.zip"
        zip_path = tmp_dir / zip_name
        torch.hub.download_url_to_file(url_base + zip_name, zip_path)
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(tmp_dir)
        # The archive extracts to a folder named like the zip; move it into place
        shutil.move(tmp_dir / zip_name.split(".")[0], data_dir)

    def get_dataset(self, split):
        return ETH3DDataset(self.conf)

    def _read_image(self, img_path):
        img = load_image(img_path, grayscale=self.grayscale)
        shape = img.shape[-2:]
        # Instead of INTER_AREA, this does bilinear interpolation with antialiasing
        img_data = ImagePreprocessor({"resize": max(shape) // self.downsize_factor})(
            img
        )
        return img_data

    def read_depth(self, depth_path):
        if self.downsize_factor != 8:
            raise ValueError(
                "Undistorted depth is only available for low-resolution "
                "images (downsize_factor = 8)."
            )
        depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
        # Decode the depth map, which is stored with a scale factor of 256
        depth_img = depth_img.astype(np.float32) / 256
        return depth_img

    def __getitem__(self, idx):
        """Return the data associated with a pair of co-visible images
        (reference, target)."""
        # Shallow-copy so that popping the views does not mutate the cached entry
        data = dict(self.data[idx])

        # Load the images
        view0 = data.pop("view0")
        view1 = data.pop("view1")
        view0 = {**view0, **self._read_image(view0["img_path"])}
        view1 = {**view1, **self._read_image(view1["img_path"])}
        view0["scales"] = np.array([1.0, 1.0]).astype(np.float32)
        view1["scales"] = np.array([1.0, 1.0]).astype(np.float32)

        # Load the depths
        view0["depth"] = self.read_depth(view0["depth_path"])
        view1["depth"] = self.read_depth(view1["depth_path"])

        outputs = {
            **data,
            "view0": view0,
            "view1": view1,
            "name": f"{view0['name']}_{view1['name']}",
        }
        return outputs

    def __len__(self):
        return len(self.data)
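

# --- Hypothetical usage sketch, not part of the original module ---
# A minimal smoke test of the pair generation above. It assumes that the
# BaseDataset constructor accepts a plain dict of config overrides (merged
# with default_conf) and that the ETH3D data sits under DATA_PATH, being
# downloaded automatically otherwise.
if __name__ == "__main__":
    dataset = ETH3DDataset({})
    print(f"Loaded {len(dataset)} covisible pairs")
    sample = dataset[0]  # One (reference, target) pair with poses and depths
    print(sample["name"], sample["n_covisible_points"])
    print("view0 keys:", list(sample["view0"].keys()))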