# glue-factory-custom/gluefactory/datasets/eth3d.py
"""
ETH3D multi-view benchmark, used for line matching evaluation.
"""
import logging
import os
import shutil
import zipfile
from pathlib import Path

import cv2
import numpy as np
import torch

from ..geometry.wrappers import Camera, Pose
from ..settings import DATA_PATH
from ..utils.image import ImagePreprocessor, load_image
from .base_dataset import BaseDataset
from .utils import scale_intrinsics

logger = logging.getLogger(__name__)


def read_cameras(camera_file, scale_factor=None):
    """Read the camera intrinsics from a file in COLMAP format."""
    with open(camera_file, "r") as f:
        raw_cameras = f.read().rstrip().split("\n")
    # Skip the three header lines of the COLMAP cameras.txt file
    raw_cameras = raw_cameras[3:]
    cameras = []
    for c in raw_cameras:
        data = c.split(" ")
        # The PINHOLE camera model stores its parameters as fx, fy, cx, cy
        fx, fy, cx, cy = np.array(list(map(float, data[4:])))
        K = np.array(
            [[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]], dtype=np.float32
        )
        if scale_factor is not None:
            K = scale_intrinsics(K, np.array([scale_factor, scale_factor]))
        cameras.append(Camera.from_calibration_matrix(K).float())
    return cameras
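

# For reference, each data line of cameras.txt follows the COLMAP convention
#   CAMERA_ID MODEL WIDTH HEIGHT PARAMS[]
# where PARAMS[] is fx fy cx cy for the PINHOLE model parsed above, e.g.
# (illustrative values, not taken from an actual scene):
#   0 PINHOLE 6048 4032 3410.34 3409.98 3024.0 2016.0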


def qvec2rotmat(qvec):
    """Convert a quaternion in (w, x, y, z) order to a rotation matrix."""
    return np.array(
        [
            [
                1 - 2 * qvec[2] ** 2 - 2 * qvec[3] ** 2,
                2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
                2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2],
            ],
            [
                2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
                1 - 2 * qvec[1] ** 2 - 2 * qvec[3] ** 2,
                2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1],
            ],
            [
                2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
                2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
                1 - 2 * qvec[1] ** 2 - 2 * qvec[2] ** 2,
            ],
        ]
    )
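

# Quick sanity check (illustrative, not part of the original module): the
# identity quaternion (w=1, x=y=z=0) must map to the identity rotation.
#   >>> np.allclose(qvec2rotmat([1.0, 0.0, 0.0, 0.0]), np.eye(3))
#   True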


class ETH3DDataset(BaseDataset):
    default_conf = {
        "data_dir": "ETH3D_undistorted",
        "grayscale": True,
        "downsize_factor": 8,
        "min_covisibility": 500,
        "batch_size": 1,
        "two_view": True,
        "min_overlap": 0.5,
        "max_overlap": 1.0,
        "sort_by_overlap": False,
        "seed": 0,
    }
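
    # Descriptive notes on the main entries above (inferred from the code
    # below, not from upstream documentation):
    # - downsize_factor: images and intrinsics are downscaled by this factor;
    #   the ground-truth depth maps are only shipped at 1/8 resolution.
    # - min_covisibility: minimum number of 3D points two images must observe
    #   in common for the pair to be kept.
    # - seed: seeds numpy and torch for reproducibility.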

    def _init(self, conf):
        self.grayscale = conf.grayscale
        self.downsize_factor = conf.downsize_factor

        # Set the random seeds
        np.random.seed(conf.seed)
        torch.manual_seed(conf.seed)

        # Auto-download the dataset
        if not (DATA_PATH / conf.data_dir).exists():
            logger.info("Downloading the ETH3D dataset...")
            self.download_eth3d()

        # Form pairs of images from the multi-view dataset
        self.img_dir = DATA_PATH / conf.data_dir
        self.data = []
        for folder in self.img_dir.iterdir():
            img_folder = Path(folder, "images", "dslr_images_undistorted")
            depth_folder = Path(folder, "ground_truth_depth", "undistorted_depth")
            depth_ext = ".png"
            names = sorted(img.name for img in img_folder.iterdir())

            # Read the intrinsics, rescaled to the downsized image resolution
            cameras = read_cameras(
                str(Path(folder, "dslr_calibration_undistorted", "cameras.txt")),
                1 / self.downsize_factor,
            )

            # Map each image name to the index of its distorted camera
            name_to_cam_idx = {name: {} for name in names}
            with open(
                str(Path(folder, "dslr_calibration_jpg", "images.txt")), "r"
            ) as f:
                raw_data = f.read().rstrip().split("\n")[4::2]
            for raw_line in raw_data:
                line = raw_line.split(" ")
                img_name = os.path.basename(line[-1])
                name_to_cam_idx[img_name]["dist_camera_idx"] = int(line[-2])

            # Read the world-to-camera poses and the 3D points visible in
            # each image (COLMAP images.txt alternates one pose line and one
            # line of X, Y, POINT3D_ID triplets per image)
            T_world_to_camera = {}
            image_visible_points3D = {}
            with open(
                str(Path(folder, "dslr_calibration_undistorted", "images.txt")),
                "r",
            ) as f:
                lines = f.readlines()[4:]  # Skip the header
            raw_poses = [line.strip("\n").split(" ") for line in lines[::2]]
            raw_points = [line.strip("\n").split(" ") for line in lines[1::2]]
            for raw_pose, raw_pts in zip(raw_poses, raw_points):
                img_name = os.path.basename(raw_pose[-1])
                # Extract the transform from world to camera
                target_extrinsics = list(map(float, raw_pose[1:8]))
                pose = np.eye(4, dtype=np.float32)
                pose[:3, :3] = qvec2rotmat(target_extrinsics[:4])
                pose[:3, 3] = target_extrinsics[4:]
                T_world_to_camera[img_name] = pose
                name_to_cam_idx[img_name]["undist_camera_idx"] = int(raw_pose[-2])
                # Extract the IDs of the visible 3D points
                point3D_ids = [id for id in map(int, raw_pts[2::3]) if id != -1]
                image_visible_points3D[img_name] = set(point3D_ids)

            # Count the 3D points covisible in each pair of images
            num_imgs = len(names)
            n_covisible_points = np.zeros((num_imgs, num_imgs))
            for i in range(num_imgs - 1):
                for j in range(i + 1, num_imgs):
                    visible_points3D1 = image_visible_points3D[names[i]]
                    visible_points3D2 = image_visible_points3D[names[j]]
                    n_covisible_points[i, j] = len(
                        visible_points3D1 & visible_points3D2
                    )

            # Keep only the pairs with enough covisibility
            valid_pairs = np.where(n_covisible_points >= conf.min_covisibility)
            valid_pairs = np.stack(valid_pairs, axis=1)

            self.data += [
                {
                    "view0": {
                        "name": names[i][:-4],
                        "img_path": str(Path(img_folder, names[i])),
                        "depth_path": str(Path(depth_folder, names[i][:-4]))
                        + depth_ext,
                        "camera": cameras[
                            name_to_cam_idx[names[i]]["dist_camera_idx"]
                        ],
                        "T_w2cam": Pose.from_4x4mat(T_world_to_camera[names[i]]),
                    },
                    "view1": {
                        "name": names[j][:-4],
                        "img_path": str(Path(img_folder, names[j])),
                        "depth_path": str(Path(depth_folder, names[j][:-4]))
                        + depth_ext,
                        "camera": cameras[
                            name_to_cam_idx[names[j]]["dist_camera_idx"]
                        ],
                        "T_w2cam": Pose.from_4x4mat(T_world_to_camera[names[j]]),
                    },
                    "T_world_to_ref": Pose.from_4x4mat(T_world_to_camera[names[i]]),
                    "T_world_to_target": Pose.from_4x4mat(
                        T_world_to_camera[names[j]]
                    ),
                    "T_0to1": Pose.from_4x4mat(
                        np.float32(
                            T_world_to_camera[names[j]]
                            @ np.linalg.inv(T_world_to_camera[names[i]])
                        )
                    ),
                    "T_1to0": Pose.from_4x4mat(
                        np.float32(
                            T_world_to_camera[names[i]]
                            @ np.linalg.inv(T_world_to_camera[names[j]])
                        )
                    ),
                    "n_covisible_points": n_covisible_points[i, j],
                }
                for (i, j) in valid_pairs
            ]

        logger.info("Successfully initialized the ETH3D dataset.")

    def download_eth3d(self):
        data_dir = DATA_PATH / self.conf.data_dir
        tmp_dir = data_dir.parent / "ETH3D_tmp"
        if tmp_dir.exists():
            shutil.rmtree(tmp_dir)
        tmp_dir.mkdir(exist_ok=True, parents=True)
        url_base = "https://cvg-data.inf.ethz.ch/ETH3D_undistorted/"
        zip_name = "ETH3D_undistorted.zip"
        zip_path = tmp_dir / zip_name
        torch.hub.download_url_to_file(url_base + zip_name, zip_path)
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(tmp_dir)
        shutil.move(tmp_dir / zip_name.split(".")[0], data_dir)

    def get_dataset(self, split):
        return ETH3DDataset(self.conf)

    def _read_image(self, img_path):
        img = load_image(img_path, grayscale=self.grayscale)
        shape = img.shape[-2:]
        # Unlike cv2.INTER_AREA, this resize performs bilinear interpolation
        # with antialiasing
        img_data = ImagePreprocessor(
            {"resize": max(shape) // self.downsize_factor}
        )(img)
        return img_data

    def read_depth(self, depth_path):
        if self.downsize_factor != 8:
            raise ValueError(
                "Undistorted depth is only available for low-resolution"
                " images (downsize_factor = 8)."
            )
        # The PNG encodes the depth scaled by a factor of 256
        depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
        depth_img = depth_img.astype(np.float32) / 256
        return depth_img

    def __getitem__(self, idx):
        """Return the data associated with a pair of co-visible images
        (reference, target)."""
        # Copy the outer dict so that popping the views below does not
        # mutate the cached entry in self.data
        data = dict(self.data[idx])

        # Load the images
        view0 = data.pop("view0")
        view1 = data.pop("view1")
        view0 = {**view0, **self._read_image(view0["img_path"])}
        view1 = {**view1, **self._read_image(view1["img_path"])}
        view0["scales"] = np.array([1.0, 1.0], dtype=np.float32)
        view1["scales"] = np.array([1.0, 1.0], dtype=np.float32)

        # Load the depth maps
        view0["depth"] = self.read_depth(view0["depth_path"])
        view1["depth"] = self.read_depth(view1["depth_path"])

        return {
            **data,
            "view0": view0,
            "view1": view1,
            "name": f"{view0['name']}_{view1['name']}",
        }

    def __len__(self):
        return len(self.data)
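

# A minimal usage sketch (an assumption, not part of the original module):
# instantiating the dataset triggers the download on first use, and indexing
# it returns one dictionary per covisible image pair.
if __name__ == "__main__":
    dataset = ETH3DDataset({})  # an empty dict falls back to default_conf
    print(f"Number of covisible pairs: {len(dataset)}")
    sample = dataset[0]
    print(sample["name"], sample["view0"]["depth"].shape)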