# Source code for fvdb_reality_capture.sfm_scene.sfm_metadata
# Copyright Contributors to the OpenVDB Project
# SPDX-License-Identifier: Apache-2.0
#
from enum import Enum
from typing import Any
import cv2
import numpy as np
class SfmCameraType(Enum):
    """
    Camera models that can be attached to a camera in a structure-from-motion (SfM) pipeline.

    Every member's value is its own upper-case name as a string, so a member can be looked
    up from its serialized form, e.g. ``SfmCameraType("PINHOLE")``.
    """

    PINHOLE = "PINHOLE"
    """
    Standard pinhole camera without lens distortion.
    Described by per-axis focal lengths (fx, fy) and a principal point (cx, cy).
    """

    SIMPLE_PINHOLE = "SIMPLE_PINHOLE"
    """
    Simplified pinhole camera without lens distortion that uses one focal length. The principal point is at the image center.
    """

    SIMPLE_RADIAL = "SIMPLE_RADIAL"
    """
    Simplified radial distortion camera that uses one focal length and one radial distortion coefficient.
    """

    RADIAL = "RADIAL"
    """
    Radial distortion camera with separate focal lengths and two radial distortion coefficients.
    """

    OPENCV = "OPENCV"
    """
    `OpenCV <http://opencv.org/>`_ camera model: separate focal lengths, a principal point, and five distortion coefficients ``(k1, k2, p1, p2, k3)``.
    Details are in the `OpenCV camera documentation <https://docs.opencv.org/4.x/d9/d0c/group__calib3d.html>`_.
    """

    OPENCV_FISHEYE = "OPENCV_FISHEYE"
    """
    `OpenCV <http://opencv.org/>`_ fisheye camera model: separate focal lengths, a principal point, and four distortion coefficients ``(k1, k2, k3, k4)``.
    Details are in the `OpenCV fisheye documentation <https://docs.opencv.org/4.x/db/d58/group__calib3d__fisheye.html>`_.
    """
[docs]
class SfmCameraMetadata:
"""
This class encodes metadata about a camera used to capture images in an :class:`SfmScene`.
It contains information about the camera's intrinsic parameters (focal length, principal point, etc.),
the camera type (see :class:`SfmCameraType`) (e.g., pinhole, radial distortion), and distortion parameters if applicable.
The camera metadata is used to project 3D points into 2D pixel coordinates and to undistort images captured by the camera.
"""
[docs]
def __init__(
self,
img_width: int,
img_height: int,
fx: float,
fy: float,
cx: float,
cy: float,
camera_type: SfmCameraType,
distortion_parameters: np.ndarray,
):
"""
Create a new :class:`SfmCameraMetadata` object.
Args:
img_width (int): The width of the camera image in pixel units (must be a positive integer).
img_height (int): The height of the camera image in pixel units (must be a positive integer).
fx (float): The focal length in the x direction in pixel units.
fy (float): The focal length in the y direction in pixel units.
cx (float): The x-coordinate of the principal point (optical center) in pixel units.
cy (float): The y-coordinate of the principal point (optical center) in pixel units.
camera_type (SfmCameraType): The type of camera used to capture the image (e.g., "PINHOLE", "SIMPLE_PINHOLE", etc.). See :class:`SfmCameraType` for details.
distortion_parameters (np.ndarray): An array of distortion coefficients corresponding to the camera type, or an empty array if no distortion is present.
"""
# camera intrinsics assuming a perspective projection model
projection_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
if len(distortion_parameters) != 0:
undistorted_proj_mat, undistort_roi = cv2.getOptimalNewCameraMatrix(
projection_matrix, distortion_parameters, (img_width, img_height), 0
)
undistort_map_x, undistort_map_y = cv2.initUndistortRectifyMap(
projection_matrix, distortion_parameters, None, undistorted_proj_mat, (img_width, img_height), cv2.CV_32FC1 # type: ignore
)
self._projection_matrix = undistorted_proj_mat
self._undistort_roi = tuple([v for v in undistort_roi])
assert len(self._undistort_roi) == 4, "Undistort ROI must be a tuple of (x, y, width, height)"
self._undistort_map_x = undistort_map_x
self._undistort_map_y = undistort_map_y
else:
self._projection_matrix = projection_matrix
self._undistort_roi = None
self._undistort_map_x = None
self._undistort_map_y = None
self._fx = self._projection_matrix[0, 0]
self._fy = self._projection_matrix[1, 1]
self._cx = self._projection_matrix[0, 2]
self._cy = self._projection_matrix[1, 2]
self._width = img_width
self._height = img_height
self._camera_type = camera_type
self._distortion_parameters = distortion_parameters
[docs]
def state_dict(self) -> dict[str, Any]:
"""
Return a state dictionary representing the camera metadata.
This dictionary can be used to serialize and deserialize the camera metadata.
Returns:
state_dict (dict[str, Any]): A dictionary containing the camera metadata.
"""
return {
"img_width": self.width,
"img_height": self.height,
"fx": self.fx,
"fy": self.fy,
"cx": self.cx,
"cy": self.cy,
"camera_type": self.camera_type.value,
"distortion_parameters": self.distortion_parameters.tolist(),
}
[docs]
@classmethod
def from_state_dict(cls, state_dict: dict[str, Any]) -> "SfmCameraMetadata":
"""
Create a new :class:`SfmCameraMetadata` object from a state dictionary originally created by :meth:`state_dict`.
Args:
state_dict (dict[str, Any]): A dictionary containing the camera metadata.
Returns:
SfmCameraMetadata: A new :class:`SfmCameraMetadata` object.
"""
if "img_width" not in state_dict:
raise KeyError("img_width is missing from state_dict")
if "img_height" not in state_dict:
raise KeyError("img_height is missing from state_dict")
if "fx" not in state_dict:
raise KeyError("fx is missing from state_dict")
if "fy" not in state_dict:
raise KeyError("fy is missing from state_dict")
if "cx" not in state_dict:
raise KeyError("cx is missing from state_dict")
if "cy" not in state_dict:
raise KeyError("cy is missing from state_dict")
if "camera_type" not in state_dict:
raise KeyError("camera_type is missing from state_dict")
if "distortion_parameters" not in state_dict:
raise KeyError("distortion_parameters is missing from state_dict")
img_width = int(state_dict["img_width"])
img_height = int(state_dict["img_height"])
fx = float(state_dict["fx"])
fy = float(state_dict["fy"])
cx = float(state_dict["cx"])
cy = float(state_dict["cy"])
camera_type = SfmCameraType(state_dict["camera_type"])
distortion_parameters = np.array(state_dict["distortion_parameters"])
return cls(
img_width=img_width,
img_height=img_height,
fx=fx,
fy=fy,
cx=cx,
cy=cy,
camera_type=camera_type,
distortion_parameters=distortion_parameters,
)
@property
def projection_matrix(self) -> np.ndarray:
"""
Return the camera projection matrix.
The projection matrix is a 3x3 matrix that maps 3D points in camera coordinates to 2D points in pixel coordinates.
Returns:
projection_matrix (np.ndarray): The camera projection matrix as a 3x3 numpy array.
"""
return self._projection_matrix
@property
def fx(self) -> float:
"""
Return the focal length in the x direction in pixel units.
Returns:
fx (float): The focal length in the x direction in pixel units.
"""
return self._fx
@property
def fy(self) -> float:
"""
Return the focal length in the y direction in pixel units.
Returns:
fy (float): The focal length in the y direction in pixel units.
"""
return self._fy
@property
def cx(self) -> float:
"""
Return the x-coordinate of the principal point (optical center) in pixel units.
Returns:
cx (float): The x-coordinate of the principal point in pixel units.
"""
return self._cx
@property
def cy(self) -> float:
"""
Return the y-coordinate of the principal point (optical center) in pixel units.
Returns:
cy (float): The y-coordinate of the principal point in pixel units.
"""
return self._cy
@property
def fovx(self) -> float:
"""
Return the horizontal field of view in radians.
Returns:
fovx (float): The horizontal field of view in radians.
"""
return self._focal2fov(self.fx, self.width)
@property
def fovy(self) -> float:
"""
Return the vertical field of view in radians.
Returns:
fovy (float): The vertical field of view in radians.
"""
return self._focal2fov(self.fy, self.height)
@property
def width(self) -> int:
"""
Return the width of the camera image in pixel units.
Returns:
width (int): The width of the camera image in pixels.
"""
return self._width
@property
def height(self) -> int:
"""
Return the height of the camera image in pixel units.
Returns:
height (int): The height of the camera image in pixels.
"""
return self._height
@property
def camera_type(self) -> SfmCameraType:
"""
Return the type of camera used to capture the image.
Returns:
camera_type (SfmCameraType): The camera type (e.g., "PINHOLE", "SIMPLE_PINHOLE", etc.). See :class:`SfmCameraType` for details.
"""
return self._camera_type
@property
def aspect(self) -> float:
"""
Return the aspect ratio of the camera image.
The aspect ratio is defined as the width divided by the height.
Returns:
aspect (float): The aspect ratio of the camera image.
"""
return self.width / self.height
@property
def distortion_parameters(self) -> np.ndarray:
"""
Return the distortion parameters of the camera.
The distortion parameters are used to correct lens distortion in the captured images.
Returns:
distortion_parameters (np.ndarray): An array of distortion coefficients.
"""
return self._distortion_parameters
[docs]
def resize(self, new_width, new_height) -> "SfmCameraMetadata":
"""
Return a new :class:`SfmCameraMetadata` object with the camera parameters resized to the new image dimensions.
Args:
new_width (int): The new width of the camera image (must be a positive integer)
new_height (int): The new height of the camera image (must be a positive integer)
Returns:
SfmCameraMetadata: A new :class:`SfmCameraMetadata` object with the resized camera parameters.
"""
if new_width <= 0 or new_height <= 0:
raise ValueError("New size must be positive integers.")
rescale_w = self.width / new_width
rescale_h = self.height / new_height
new_fx = self.fx / rescale_w
new_fy = self.fy / rescale_h
new_cx = self.cx / rescale_w
new_cy = self.cy / rescale_h
return SfmCameraMetadata(
new_width, new_height, new_fx, new_fy, new_cx, new_cy, self.camera_type, self.distortion_parameters
)
@property
def undistort_roi(self) -> tuple[int, int, int, int] | None:
"""
Return the region of interest (ROI) for undistorted images.
The ROI is defined as a tuple of ``(x, y, width, height)`` that specifies the valid pixel range in an undistorted image.
If the camera does not have distortion parameters, this will be None.
Returns:
undistort_roi (tuple[int, int, int, int] | None): The ROI for undistorted images or None if no distortion parameters are present.
"""
if self._undistort_roi is not None:
assert len(self._undistort_roi) == 4, "Undistort ROI must be a tuple of (x, y, width, height)"
return self._undistort_roi
@property
def undistort_map_x(self) -> np.ndarray | None:
"""
Return the undistortion map for the x-coordinates of the image.
The undistortion map is used to remap the pixel coordinates in a distorted image to correct for lens distortion.
If the camera does not have distortion parameters, this will be None.
Returns:
undistort_map_x (np.ndarray | None): The undistortion map for the x-coordinates or None if no distortion parameters are present.
"""
return self._undistort_map_x
@property
def undistort_map_y(self) -> np.ndarray | None:
"""
Return the undistortion map for the y-coordinates of the image.
The undistortion map is used to remap the pixel coordinates in a distorted image to correct for lens distortion.
If the camera does not have distortion parameters, this will be None.
Returns:
undistort_map_y (np.ndarray | None): The undistortion map for the y-coordinates or None if no distortion parameters are present.
"""
return self._undistort_map_y
@staticmethod
def _focal2fov(focal: float, pixels: float) -> float:
"""
Convert a focal length in pixel units to a field of view in radians.
Args:
focal (float): The focal length in pixel units.
pixels (float): The number of pixels corresponding to the field of view.
Returns:
float: The field of view in radians.
"""
return 2 * np.arctan(pixels / (2 * focal))
[docs]
def undistort_image(self, image: np.ndarray) -> np.ndarray:
"""
Undistort an image using the camera's distortion parameters.
Args:
image (np.ndarray): The distorted image to undistort.
Returns:
undistorted_image (np.ndarray): The undistorted image.
"""
if self.undistort_map_x is not None and self.undistort_map_y is not None:
image_remap = cv2.remap(image, self.undistort_map_x, self.undistort_map_y, interpolation=cv2.INTER_LINEAR)
assert self.undistort_roi is not None
x, y, w, h = self.undistort_roi
return image_remap[y : y + h, x : x + w]
else:
return image
[docs]
class SfmPosedImageMetadata:
    """
    Metadata for one posed image in an :class:`SfmScene`.

    Bundles the camera pose (both the world-to-camera and camera-to-world matrices), the
    :class:`SfmCameraMetadata` of the camera that took the image together with that camera's ID,
    the image and mask file paths, the image ID, and optionally the indices of the scene points
    visible in the image.
    """

    def __init__(
        self,
        world_to_camera_matrix: np.ndarray,
        camera_to_world_matrix: np.ndarray,
        camera_metadata: SfmCameraMetadata,
        camera_id: int,
        image_path: str,
        mask_path: str,
        point_indices: np.ndarray | None,
        image_id: int,
    ):
        """
        Create a new :class:`SfmPosedImageMetadata` object.

        The arguments are stored as given; the two pose matrices are not checked against each other.

        Args:
            world_to_camera_matrix (np.ndarray): A 4x4 matrix representing the transformation from world coordinates to camera coordinates.
            camera_to_world_matrix (np.ndarray): A 4x4 matrix representing the transformation from camera coordinates to world coordinates.
            camera_metadata (SfmCameraMetadata): The metadata for the camera that captured this image.
            camera_id (int): The unique identifier for the camera that captured this image.
            image_path (str): The file path to the image on the filesystem.
            mask_path (str): The file path to the mask image on the filesystem (can be an empty string if no mask is available).
            point_indices (np.ndarray | None): An optional array of point indices that are visible in this image (can be None if not available).
            image_id (int): The unique identifier for the image.
        """
        # Pose
        self._world_to_camera_matrix = world_to_camera_matrix
        self._camera_to_world_matrix = camera_to_world_matrix
        # Camera that took the image
        self._camera_metadata = camera_metadata
        self._camera_id = camera_id
        # Image identity and files
        self._image_id = image_id
        self._image_path = image_path
        self._mask_path = mask_path
        # Optional visibility information
        self._point_indices = point_indices

    def state_dict(self) -> dict[str, Any]:
        """
        Return a state dictionary representing the image metadata.

        The camera metadata object itself is not included, only its ``camera_id``; it has to be
        supplied again to :meth:`from_state_dict`.

        Returns:
            state_dict (dict[str, Any]): A dictionary containing the image metadata.
        """
        return {
            "world_to_camera_matrix": self.world_to_camera_matrix.tolist(),
            "camera_to_world_matrix": self.camera_to_world_matrix.tolist(),
            "camera_id": self.camera_id,
            "image_path": self.image_path,
            "mask_path": self.mask_path,
            "point_indices": None if self.point_indices is None else self.point_indices.tolist(),
            "image_id": self.image_id,
        }

    @classmethod
    def from_state_dict(
        cls,
        state_dict: dict[str, Any],
        camera_metadata: dict[int, SfmCameraMetadata],
    ) -> "SfmPosedImageMetadata":
        """
        Create a new :class:`SfmPosedImageMetadata` object from a state dictionary and camera metadata (see :meth:`state_dict`).

        ``point_indices`` is optional in ``state_dict``; a missing or ``None`` entry yields ``None``.

        Args:
            state_dict (dict[str, Any]): A dictionary containing the image metadata.
            camera_metadata (dict[int, SfmCameraMetadata]): A dictionary mapping camera IDs to :class:`SfmCameraMetadata` objects.

        Returns:
            SfmPosedImageMetadata: A new :class:`SfmPosedImageMetadata` object.

        Raises:
            KeyError: If a required key is missing from ``state_dict`` or the camera ID is not in ``camera_metadata``.
        """
        mandatory = (
            "world_to_camera_matrix",
            "camera_to_world_matrix",
            "camera_id",
            "image_path",
            "mask_path",
            "image_id",
        )
        for name in mandatory:
            if name not in state_dict:
                raise KeyError(f"{name} is missing from state_dict")

        w2c = np.array(state_dict["world_to_camera_matrix"])
        c2w = np.array(state_dict["camera_to_world_matrix"])
        cam_id = int(state_dict["camera_id"])
        img_path = str(state_dict["image_path"])
        msk_path = str(state_dict["mask_path"])
        raw_indices = state_dict.get("point_indices")
        indices = None if raw_indices is None else np.array(raw_indices)
        img_id = int(state_dict["image_id"])

        if cam_id not in camera_metadata:
            raise KeyError(f"Camera ID {cam_id} not found in camera_metadata")

        return cls(
            world_to_camera_matrix=w2c,
            camera_to_world_matrix=c2w,
            camera_metadata=camera_metadata[cam_id],
            camera_id=cam_id,
            image_path=img_path,
            mask_path=msk_path,
            point_indices=indices,
            image_id=img_id,
        )

    def transform(self, transformation_matrix: np.ndarray) -> "SfmPosedImageMetadata":
        """
        Return a new :class:`SfmPosedImageMetadata` object with the camera pose transformed by the given transformation matrix.

        The transformation is applied on the left of the camera-to-world matrix, meaning it moves the camera in world space,
        *i.e.* ``new_camera_to_world_matrix = transformation_matrix @ self.camera_to_world_matrix``.
        The new world-to-camera matrix is the inverse of that product. All other fields are carried over unchanged.

        Args:
            transformation_matrix (np.ndarray): A 4x4 transformation matrix to apply.

        Returns:
            SfmPosedImageMetadata: A new :class:`SfmPosedImageMetadata` object with the transformed matrices.
        """
        moved_c2w = transformation_matrix @ self.camera_to_world_matrix
        moved_w2c = np.linalg.inv(moved_c2w)
        return SfmPosedImageMetadata(
            world_to_camera_matrix=moved_w2c,
            camera_to_world_matrix=moved_c2w,
            camera_metadata=self.camera_metadata,
            camera_id=self.camera_id,
            image_path=self.image_path,
            mask_path=self.mask_path,
            point_indices=self.point_indices,
            image_id=self.image_id,
        )

    @property
    def world_to_camera_matrix(self) -> np.ndarray:
        """
        Return the world-to-camera transformation matrix for this posed image.

        This matrix transforms points from world coordinates to camera coordinates.

        Returns:
            world_to_camera_matrix (np.ndarray): The world-to-camera transformation matrix as a 4x4 numpy array.
        """
        return self._world_to_camera_matrix

    @property
    def camera_to_world_matrix(self) -> np.ndarray:
        """
        Return the camera-to-world transformation matrix for this posed image.

        This matrix transforms points from camera coordinates to world coordinates.

        Returns:
            camera_to_world_matrix (np.ndarray): The camera-to-world transformation matrix as a 4x4 numpy array.
        """
        return self._camera_to_world_matrix

    @property
    def camera_id(self) -> int:
        """
        Return the unique identifier for the camera that captured this image.

        Returns:
            camera_id (int): The camera ID.
        """
        return self._camera_id

    @property
    def image_size(self) -> tuple[int, int]:
        """
        Return the resolution of the posed image in pixels as ``(height, width)``, taken from the camera metadata.

        Returns:
            image_size (tuple[int, int]): The image resolution as ``(height, width)``.
        """
        camera = self._camera_metadata
        return camera.height, camera.width

    @property
    def image_path(self) -> str:
        """
        Return the file path to the color image for this posed image.

        Returns:
            image_path (str): The path to the color image file for this posed image.
        """
        return self._image_path

    @property
    def mask_path(self) -> str:
        """
        Return the file path to the mask for this posed image.

        The mask image is used to indicate which pixels in the image are valid (e.g., not occluded).
        An empty string indicates that no mask is available.

        Returns:
            mask_path (str): The path to the posed mask image file.
        """
        return self._mask_path

    @property
    def point_indices(self) -> np.ndarray | None:
        """
        Return the indices of the 3D points that are visible in this posed image or ``None`` if the indices are not available.

        These indices correspond to the points in the :class:`SfmScene`'s point cloud that are visible in this posed image.

        Returns:
            point_indices (np.ndarray | None): An array of indices of the visible 3D points or ``None`` if not available.
        """
        return self._point_indices

    @property
    def camera_metadata(self) -> SfmCameraMetadata:
        """
        Return metadata about the camera that captured this posed image (see :class:`SfmCameraMetadata`).

        The camera metadata contains information about the camera's intrinsic parameters, such as focal length and distortion coefficients.

        Returns:
            SfmCameraMetadata: The camera metadata object.
        """
        return self._camera_metadata

    @property
    def image_id(self) -> int:
        """
        Return the unique identifier for this image.

        Returns:
            int: The image ID.
        """
        return self._image_id

    @property
    def lookat(self):
        """
        Return the camera lookat vector.

        This is the third column (first three rows) of the camera-to-world matrix, returned
        without negation, i.e. the camera's positive z-axis expressed in world coordinates.

        Returns:
            lookat (np.ndarray): The camera lookat vector as a 3D numpy array.
        """
        return self.camera_to_world_matrix[:3, 2]

    @property
    def origin(self):
        """
        Return the origin of the posed image, *i.e.* the position of the camera in world coordinates when it captured the image.

        This is the translation part of the camera-to-world matrix (first three rows of its fourth column).

        Returns:
            origin (np.ndarray): The camera origin as a 3D numpy array.
        """
        return self.camera_to_world_matrix[:3, 3]

    @property
    def up(self):
        """
        Return the camera up vector.

        This is the negated second column (first three rows) of the camera-to-world matrix,
        i.e. the camera's negative y-axis expressed in world coordinates.

        Returns:
            up (np.ndarray): The camera up vector as a 3D numpy array.
        """
        return -self.camera_to_world_matrix[:3, 1]

    @property
    def right(self):
        """
        Return the camera right vector.

        This is the first column (first three rows) of the camera-to-world matrix,
        i.e. the camera's x-axis expressed in world coordinates.

        Returns:
            right (np.ndarray): The camera right vector as a 3D numpy array.
        """
        return self.camera_to_world_matrix[:3, 0]