Source code for trifinger_simulation.camera

"""Classes and functions for simulating cameras.


Simple vs Calibrated Camera
---------------------------

There are two different camera implementations: :class:`Camera` and
:class:`CalibratedCamera`.

The first one, :class:`Camera`, only takes the camera's position, orientation
and field of view as input and uses PyBullet functions to compute the
corresponding transformation/projection from it.

:class:`CalibratedCamera` expects a full camera matrix and distortion
coefficients (as they are obtained through camera calibration).  By using the
calibration parameters of a real camera here, the rendered images are much
closer to the real ones.  On the downside, the rendering process is a bit
slower, as applying the distortion takes some time.

Loading calibration parameters from a single YAML file can be done using
:meth:`CameraParameters.load`.  To load the calibration parameters of all three
cameras of a TriFinger platform, :func:`load_camera_parameters` can be used.
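
For illustration, loading the parameters of a single camera could look like
this (the file name ``camera60.yml`` is only an example)::

    from trifinger_simulation import camera

    with open("camera60.yml") as f:
        params = camera.CameraParameters.load(f)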


Camera Arrays
-------------

:class:`CameraArray` provides a simple interface to get images from an
arbitrary number of cameras.

When specifically simulating the three-camera-setup of the TriFinger platform,
you can use :class:`TriFingerCameras` for the simple camera model or
:func:`create_trifinger_camera_array_from_config` for the model using the full
calibration parameters.
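
For example, rendering images from the three simulated cameras might look
like this (a sketch; it assumes nothing about the scene beyond a pybullet
connection)::

    import pybullet
    from trifinger_simulation import camera

    pybullet.connect(pybullet.DIRECT)
    cameras = camera.TriFingerCameras()
    # DIRECT mode provides no OpenGL context, so use the software renderer
    images = cameras.get_images(renderer=pybullet.ER_TINY_RENDERER)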
"""

import itertools
import typing
import pathlib

import yaml
import numpy as np
import pybullet
from scipy.spatial.transform import Rotation


def calib_data_to_matrix(data: dict) -> np.ndarray:
    """Extract a matrix from a camera parameter dict (as loaded from YAML)."""
    return np.array(data["data"]).reshape(data["rows"], data["cols"])

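# For illustration, the dict layout expected by calib_data_to_matrix (the
# values here are made up):
#
#     calib_data_to_matrix(
#         {"rows": 2, "cols": 2, "data": [1.0, 0.0, 0.0, 1.0]}
#     )
#     # -> array([[1., 0.],
#     #           [0., 1.]])
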
# TODO: this is more or less a duplicate of trifinger_cameras/CameraParameters
class CameraParameters(typing.NamedTuple):
    """Represents intrinsic and extrinsic parameters of a camera.

    See description of properties for the meaning of the constructor
    arguments.
    """

    #: Name of the camera.
    name: str

    #: Width of the images.
    width: int

    #: Height of the images.
    height: int

    #: Camera projection matrix.  Shape = (3, 3)
    camera_matrix: np.ndarray

    #: Distortion coefficients.  Shape = (5,)
    distortion_coefficients: np.ndarray

    #: Transformation matrix from world to camera frame.  Shape = (4, 4)
    tf_world_to_camera: np.ndarray

    @classmethod
    def load(cls, stream: typing.TextIO) -> "CameraParameters":
        """Load camera parameters from a YAML stream.

        Args:
            stream: Input stream of configuration in YAML format.

        Returns:
            Instance of CameraParameters with values set based on the input
            YAML.
        """
        data = yaml.safe_load(stream)

        name = data["camera_name"]
        width = data["image_width"]
        height = data["image_height"]
        camera_matrix = calib_data_to_matrix(data["camera_matrix"])
        dist_coeffs = calib_data_to_matrix(data["distortion_coefficients"])[0]
        tf_world_to_camera = calib_data_to_matrix(data["tf_world_to_camera"])

        return cls(
            name,
            width,
            height,
            camera_matrix,
            dist_coeffs,
            tf_world_to_camera,
        )

    def dump(self, stream: typing.TextIO):
        """Dump camera parameters in YAML format to the given output stream.

        Args:
            stream: Output stream.
        """
        # save all the data
        calibration_data = {
            "camera_name": self.name,
            # make sure width and height are plain ints (not numpy.int64 or
            # the like)
            "image_width": int(self.width),
            "image_height": int(self.height),
        }
        calibration_data["camera_matrix"] = {
            "rows": 3,
            "cols": 3,
            "data": self.camera_matrix.flatten().tolist(),
        }
        calibration_data["distortion_coefficients"] = {
            "rows": 1,
            "cols": 5,
            "data": self.distortion_coefficients.flatten().tolist(),
        }
        calibration_data["tf_world_to_camera"] = {
            "rows": 4,
            "cols": 4,
            "data": self.tf_world_to_camera.flatten().tolist(),
        }

        yaml.dump(
            calibration_data,
            stream,
            default_flow_style=False,
            sort_keys=False,
        )
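
# Round-trip sketch for CameraParameters.load/dump (the file names are
# hypothetical):
#
#     with open("camera60.yml") as f:
#         params = CameraParameters.load(f)
#     with open("camera60_copy.yml", "w") as f:
#         params.dump(f)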

class BaseCamera:
    def get_image(
        self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
    ) -> np.ndarray:
        """Render an image."""
        raise NotImplementedError()

    def get_width(self) -> int:
        """Get width of the images rendered by this camera instance."""
        raise NotImplementedError()

    def get_height(self) -> int:
        """Get height of the images rendered by this camera instance."""
        raise NotImplementedError()

class Camera(BaseCamera):
    """Represents a camera in the simulation environment.

    Note:
        This class uses a simplified camera model.  For images that better
        match with the real cameras, use ``CalibratedCamera``.
    """

    def __init__(
        self,
        camera_position,
        camera_orientation,
        image_size=(270, 270),
        field_of_view=52,
        near_plane_distance=0.001,
        far_plane_distance=100.0,
        pybullet_client_id=0,
    ):
        """
        Args:
            camera_position:  Position (x, y, z) of the camera w.r.t. the
                world frame.
            camera_orientation:  Quaternion (x, y, z, w) representing the
                orientation of the camera.
            image_size:  Tuple (width, height) specifying the size of the
                image.
            field_of_view:  Field of view of the camera.
            near_plane_distance:  See OpenGL's documentation for details.
            far_plane_distance:  See OpenGL's documentation for details.
            pybullet_client_id:  Id of the pybullet client (needed when
                multiple clients are running in parallel).
        """
        self._pybullet_client_id = pybullet_client_id
        self._width = image_size[0]
        self._height = image_size[1]

        # target position and up vector are derived from the camera pose
        camera_rot = Rotation.from_quat(camera_orientation)
        target_position = camera_rot.apply([0, 0, 1])
        camera_up_vector = camera_rot.apply([0, -1, 0])

        self._view_matrix = pybullet.computeViewMatrix(
            cameraEyePosition=camera_position,
            cameraTargetPosition=target_position,
            cameraUpVector=camera_up_vector,
            physicsClientId=self._pybullet_client_id,
        )

        self._proj_matrix = pybullet.computeProjectionMatrixFOV(
            fov=field_of_view,
            aspect=float(self._width) / self._height,
            nearVal=near_plane_distance,
            farVal=far_plane_distance,
            physicsClientId=self._pybullet_client_id,
        )

    def get_width(self) -> int:
        return self._width

    def get_height(self) -> int:
        return self._height

    def get_image(
        self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
    ) -> np.ndarray:
        """Get a rendered image from the camera.

        Args:
            renderer:  Specify which renderer is to be used.  The default
                renderer uses OpenGL and thus requires an X server.  When
                running without visualization (i.e. connected in "DIRECT"
                mode), use ``pybullet.ER_TINY_RENDERER`` instead.

        Returns:
            array, shape=(height, width, 3):  Rendered RGB image from the
            simulated camera.
        """
        (_, _, img, _, _) = pybullet.getCameraImage(
            width=self._width,
            height=self._height,
            viewMatrix=self._view_matrix,
            projectionMatrix=self._proj_matrix,
            renderer=renderer,
            physicsClientId=self._pybullet_client_id,
        )

        # remove the alpha channel
        return img[:, :, :3]
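
# Usage sketch for the simple camera model, reusing the camera60 pose from
# ``TriFingerCameras`` below (assumes a connected pybullet client):
#
#     cam = Camera(
#         camera_position=[0.2496, 0.2458, 0.4190],
#         camera_orientation=[0.3760, 0.8690, -0.2918, -0.1354],
#     )
#     image = cam.get_image(renderer=pybullet.ER_TINY_RENDERER)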

class CalibratedCamera(BaseCamera):
    """Simulate a camera based on calibration parameters.

    This class renders images from the simulation, using calibration
    parameters from a real camera.  It uses a more accurate projection matrix
    than ``Camera`` and also takes distortion into account.
    """

    # How this class works internally
    # ===============================
    #
    # The "projection" and "view" matrices required by pyBullet are computed
    # in __init__ based on the given camera_matrix and tf_world_to_camera.
    #
    # The distortion cannot be directly integrated there, so it is applied
    # after rendering, using the given distortion_coefficients.
    #
    # When distorting the rendered image, there is the problem that the image
    # shrinks a bit, that is, if the size of the image is to be preserved,
    # some pixels close to the edge will be empty.
    # To avoid this, some padding is added for the rendering, i.e. the
    # rendered image is a bit larger than the desired output.  After
    # distortion, the padding is removed again, so that the resulting image
    # has the desired size.

    def __init__(
        self,
        camera_matrix,
        distortion_coefficients,
        tf_world_to_camera,
        image_size,
        near_plane_distance,
        far_plane_distance,
        pybullet_client_id=0,
    ):
        r"""
        Args:
            camera_matrix:  Camera matrix containing focal length and centre
                point:

                .. math::

                    \begin{bmatrix}
                        f_x &   0 & c_x \\
                          0 & f_y & c_y \\
                          0 &   0 &   1
                    \end{bmatrix}

            distortion_coefficients:  Distortion coefficients
                ``(k_1, k_2, p_1, p_2, k_3)``
            tf_world_to_camera:  Homogeneous transformation matrix from world
                to camera frame.
            image_size:  Size of the image given as ``(width, height)``.
            near_plane_distance:  Minimum distance to camera for objects to
                be rendered.  Objects that are closer to the camera are
                clipped.
            far_plane_distance:  Maximum distance to the camera for objects
                to be rendered.  Objects that are further away are clipped.
            pybullet_client_id:  Id of the pybullet client (needed when
                multiple clients are running in parallel).
        """
        # store the original camera matrix for use in C++ bindings
        # (trifinger_cameras)
        self._original_camera_matrix = camera_matrix

        self._pybullet_client_id = pybullet_client_id

        #: Width of the output images.
        self._output_width = image_size[0]
        #: Height of the output images.
        self._output_height = image_size[1]

        # Padding that is added to the rendered image
        self._padding = (
            round(self._output_width * 0.1),
            round(self._output_height * 0.1),
        )

        # size of the rendered (undistorted) image
        self._render_width = self._output_width + 2 * self._padding[0]
        self._render_height = self._output_height + 2 * self._padding[1]

        # adjust the centre point in the camera matrix to the padding
        center_offset = np.array(
            [[0, 0, self._padding[0]], [0, 0, self._padding[1]], [0, 0, 0]]
        )
        self._camera_matrix = camera_matrix + center_offset
        self._distortion_coefficients = distortion_coefficients

        # In OpenGL the camera is looking into negative z-direction, so we
        # need to rotate the camera pose by 180 degrees around x.
        rot_x_180 = np.array(
            [
                [1, 0, 0, 0],
                [0, -1, 0, 0],
                [0, 0, -1, 0],
                [0, 0, 0, 1],
            ]
        )
        tf_mat = rot_x_180 @ tf_world_to_camera
        self._view_matrix = tf_mat.flatten(order="F")

        # pyBullet's computeProjectionMatrix() makes some simplifying
        # assumptions (f_x == f_y, c_x, c_y = image_size / 2) which are not
        # generally true.  Therefore compute the projection matrix manually.
        #
        # https://stackoverflow.com/a/60450420/2095383
        # https://www.songho.ca/opengl/gl_projectionmatrix.html
        # https://amytabb.com/ts/2019_06_28/

        # get focal length and centre point from camera matrix
        f_x = self._camera_matrix[0, 0]
        f_y = self._camera_matrix[1, 1]
        c_x = self._camera_matrix[0, 2]
        c_y = self._camera_matrix[1, 2]

        x_scale = 2 / self._render_width * f_x
        y_scale = 2 / self._render_height * f_y
        near = near_plane_distance
        far = far_plane_distance

        x_shift = 1 - 2 * c_x / self._render_width
        y_shift = (2 * c_y - self._render_height) / self._render_height

        proj_mat = np.array(
            [
                [x_scale, 0, x_shift, 0],
                [0, y_scale, y_shift, 0],
                [
                    0,
                    0,
                    (near + far) / (near - far),
                    2 * near * far / (near - far),
                ],
                [0, 0, -1, 0],
            ]
        )
        self._proj_matrix = proj_mat.flatten(order="F")

    def get_width(self) -> int:
        return self._output_width

    def get_height(self) -> int:
        return self._output_height

    def distort_image(self, image):
        """Distort an image based on the camera's distortion coefficients.

        Args:
            image:  The undistorted image.

        Returns:
            The distorted image.
        """
        # this function is based on the formulas from here:
        # https://stackoverflow.com/a/58131157/2095383
        # Computations are done on numpy arrays as much as possible for
        # performance reasons.

        distorted_image = np.zeros_like(image)

        f_x = self._camera_matrix[0, 0]
        f_y = self._camera_matrix[1, 1]
        c_x = self._camera_matrix[0, 2]
        c_y = self._camera_matrix[1, 2]
        k_1, k_2, p_1, p_2, k_3 = self._distortion_coefficients

        # points are in (row, column) = (y, x) order
        f = np.array([f_y, f_x])
        c = np.array([c_y, c_x])

        image_points = np.array(
            tuple(
                itertools.product(
                    range(self._render_height), range(self._render_width)
                )
            )
        )

        # normalize the image coordinates
        norm_points = (image_points - c) / f
        norm_points_square = norm_points**2
        norm_points_xy = norm_points.prod(axis=1)

        # determining the radial distortion
        r2 = np.sum(norm_points_square, axis=1)
        icdist = 1 / (1 - ((k_3 * r2 + k_2) * r2 + k_1) * r2)

        # determining the tangential distortion
        p = np.array([[p_2, p_1]])

        r2_plus_2_point_sq = r2[:, None] + 2 * norm_points_square
        # note: p[:, ::-1] reverses the coefficient order to (p_1, p_2) for
        # the r^2 + 2*point^2 term
        delta = (
            2 * p * norm_points_xy[:, None]
            + p[:, ::-1] * r2_plus_2_point_sq
        )

        distorted_points = (norm_points + delta) * icdist[:, None]

        # un-normalise
        distorted_points = distorted_points * f + c

        # float to int
        distorted_points = distorted_points.round().astype(int)

        # filter out points that are outside the image
        in_image_idx = np.all(
            np.logical_and(
                (0, 0) <= distorted_points,
                distorted_points < (self._render_height, self._render_width),
            ),
            axis=1,
        )
        distorted_points = distorted_points[in_image_idx]
        image_points = image_points[in_image_idx]

        # finally construct the distorted image
        distorted_image[tuple(distorted_points.T)] = image[
            tuple(image_points.T)
        ]

        return distorted_image

    def get_image(
        self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
    ) -> np.ndarray:
        """Get a rendered and distorted image from the camera.

        Args:
            renderer:  Specify which renderer is to be used.  The default
                renderer uses OpenGL and thus requires an X server.  When
                running without visualization (i.e. connected in "DIRECT"
                mode), use ``pybullet.ER_TINY_RENDERER`` instead.

        Returns:
            array, shape=(height, width, 3):  Rendered RGB image from the
            simulated camera.
        """
        (_, _, img, _, _) = pybullet.getCameraImage(
            width=self._render_width,
            height=self._render_height,
            viewMatrix=self._view_matrix,
            projectionMatrix=self._proj_matrix,
            renderer=renderer,
            physicsClientId=self._pybullet_client_id,
        )

        # remove the alpha channel
        img = img[:, :, :3]

        # distort image
        img = self.distort_image(img)

        # remove the padding
        img = img[
            self._padding[1] : -self._padding[1],
            self._padding[0] : -self._padding[0],
        ]

        return img

class CameraArray:
    """Array of an arbitrary number of cameras."""

    def __init__(self, cameras: typing.Sequence[BaseCamera]):
        """
        Args:
            cameras: List of cameras.
        """
        self.cameras = cameras

    def get_images(
        self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
    ) -> typing.List[np.ndarray]:
        """Get images.  See :meth:`Camera.get_image` for details.

        Returns:
            List of RGB images, one per camera.
        """
        return [c.get_image(renderer=renderer) for c in self.cameras]

    def get_bayer_images(
        self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
    ) -> typing.List[np.ndarray]:
        """Get Bayer images.

        Same as :meth:`get_images` but returning the images as BG-Bayer
        patterns instead of RGB.
        """
        return [rbg_to_bayer_bg(img) for img in self.get_images(renderer)]

def create_trifinger_camera_array(
    camera_parameters: typing.Iterable[CameraParameters],
    pybullet_client_id=0,
) -> CameraArray:
    """Create a TriFinger camera array using camera calibration parameters.

    Args:
        camera_parameters:  List of camera calibration parameters for the
            three cameras.
        pybullet_client_id:  Id of the pybullet client (needed when multiple
            clients are running in parallel).

    Returns:
        CameraArray with three cameras.
    """
    camera_ids = (60, 180, 300)
    cameras = []
    for camera_id, params in zip(camera_ids, camera_parameters):
        # Sanity check to verify the camera parameters are given in the
        # correct order.
        camera_name = "camera{}".format(camera_id)
        if params.name != camera_name:
            raise ValueError(
                "Expected parameters for camera {} but got {}".format(
                    camera_name, params.name
                )
            )

        image_size = (params.width, params.height)

        camera = CalibratedCamera(
            params.camera_matrix,
            params.distortion_coefficients,
            params.tf_world_to_camera,
            image_size=image_size,
            near_plane_distance=0.02,
            far_plane_distance=2.0,
            pybullet_client_id=pybullet_client_id,
        )
        cameras.append(camera)

    return CameraArray(cameras)

def load_camera_parameters(
    config_dir: pathlib.Path,
    filename_pattern: str = "camera{id}.yml",
) -> typing.Tuple[CameraParameters, ...]:
    """Load the calibration parameters of the three TriFinger cameras.

    Args:
        config_dir:  Directory containing the camera calibration files.
        filename_pattern:  Template for the calibration file names.  '{id}'
            will be replaced with the camera id (60, 180, 300).

    Returns:
        Tuple with the parameters of the three cameras.
    """
    camera_ids = (60, 180, 300)

    def load_params(id):
        with open(config_dir / filename_pattern.format(id=id)) as f:
            return CameraParameters.load(f)

    camera_parameters = tuple(load_params(id) for id in camera_ids)

    return camera_parameters

def create_trifinger_camera_array_from_config(
    config_dir: pathlib.Path,
    calib_filename_pattern="camera{id}.yml",
    pybullet_client_id=0,
) -> CameraArray:
    """Create a TriFinger camera array using camera calibration files.

    Loads camera calibration files from the given directory and uses them to
    create a :class:`CameraArray` of :class:`CalibratedCamera`.

    Args:
        config_dir:  Directory containing the camera calibration files.
        calib_filename_pattern:  Template for the camera calibration file
            names.  '{id}' will be replaced with the camera id (60, 180,
            300).
        pybullet_client_id:  Id of the pybullet client (needed when multiple
            clients are running in parallel).

    Returns:
        CameraArray with three cameras.
    """
    camera_parameters = load_camera_parameters(
        config_dir, calib_filename_pattern
    )

    return create_trifinger_camera_array(
        camera_parameters, pybullet_client_id
    )
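
# Usage sketch (the calibration directory is hypothetical):
#
#     cameras = create_trifinger_camera_array_from_config(
#         pathlib.Path("path/to/camera_calibration")
#     )
#     images = cameras.get_images()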

class TriFingerCameras(CameraArray):
    """Simulate the three cameras of the TriFinger platform.

    .. note::

        This uses the simple camera model (see :class:`Camera`).  To get
        images that more closely match those of the real cameras, use
        :func:`create_trifinger_camera_array_from_config` instead.
    """

    def __init__(self, **kwargs):
        cameras = [
            # camera60
            Camera(
                camera_position=[0.2496, 0.2458, 0.4190],
                camera_orientation=[0.3760, 0.8690, -0.2918, -0.1354],
                **kwargs,
            ),
            # camera180
            Camera(
                camera_position=[0.0047, -0.2834, 0.4558],
                camera_orientation=[0.9655, -0.0098, -0.0065, -0.2603],
                **kwargs,
            ),
            # camera300
            Camera(
                camera_position=[-0.2470, 0.2513, 0.3943],
                camera_orientation=[-0.3633, 0.8686, -0.3141, 0.1220],
                **kwargs,
            ),
        ]

        super().__init__(cameras)

def rbg_to_bayer_bg(image: np.ndarray) -> np.ndarray:
    """Convert an RGB image to a BG Bayer pattern.

    This can be used to generate simulated raw camera data in Bayer format.
    Note that there will be some loss in image quality.  It is mostly meant
    for testing the full software pipeline with the same conditions as on the
    real robot.  It is not optimized for realistic images.

    Args:
        image:  RGB image.

    Returns:
        Bayer pattern based on the input image.  Height and width are the
        same as of the input image.  The image can be converted using
        OpenCV's ``COLOR_BAYER_BG2*``.
    """
    # there is only one channel but it still needs the third dimension, so
    # that the conversion to a cv::Mat in C++ is easier
    bayer_img = np.zeros((image.shape[0], image.shape[1], 1), dtype=np.uint8)

    # channel names, assuming input is RGB
    CHANNEL_RED = 0
    CHANNEL_GREEN = 1
    CHANNEL_BLUE = 2

    # channel map to get the following pattern (called "BG" in OpenCV):
    #
    #   RG
    #   GB
    #
    channel_map = {
        (0, 0): CHANNEL_RED,
        (1, 0): CHANNEL_GREEN,
        (0, 1): CHANNEL_GREEN,
        (1, 1): CHANNEL_BLUE,
    }

    for r in range(image.shape[0]):
        for c in range(image.shape[1]):
            channel = channel_map[(r % 2, c % 2)]
            bayer_img[r, c] = image[r, c, channel]

    return bayer_img
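
# Sketch of converting a generated Bayer image back to BGR with OpenCV
# (cv2 is not a dependency of this module; shown only for illustration):
#
#     import cv2
#     bgr = cv2.cvtColor(bayer_img, cv2.COLOR_BAYER_BG2BGR)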