"""Classes and functions for simulating cameras.

Simple vs Calibrated Camera
---------------------------

There are two different camera implementations: :class:`Camera` and
:class:`CalibratedCamera`.

:class:`Camera` only takes the camera's position, orientation and field of
view as input and uses PyBullet functions to compute the corresponding
transformation/projection from it.

:class:`CalibratedCamera` expects a full camera matrix and distortion
coefficients (as they are acquired through camera calibration). By using
the calibration parameters of a real camera here, the rendered images are
much closer to the real ones. On the downside, the rendering process is a
bit slower, as applying the distortion takes some time.

Loading calibration parameters from a single YAML file can be done using
:meth:`CameraParameters.load`. To load the calibration parameters of all three
cameras of a TriFinger platform, :func:`load_camera_parameters` can be used.

Camera Arrays
-------------

:class:`CameraArray` provides a simple interface to get images from an
arbitrary number of cameras.

When specifically simulating the three-camera setup of the TriFinger
platform, you can use :class:`TriFingerCameras` for the simple camera model
or :func:`create_trifinger_camera_array_from_config` for the model using
the full calibration parameters.
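
Example
-------
A minimal usage sketch of both camera models (assuming a pybullet
simulation is already set up; the calibration directory and its file names
are placeholders):

.. code-block:: python

    import pathlib

    # simple camera model
    cameras = TriFingerCameras()
    images = cameras.get_images()

    # model using full calibration parameters
    cameras = create_trifinger_camera_array_from_config(
        pathlib.Path("path/to/camera_calib")
    )
    images = cameras.get_images()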
"""
import itertools
import typing
import pathlib
import yaml
import numpy as np
import pybullet
from scipy.spatial.transform import Rotation
def calib_data_to_matrix(data: dict) -> np.ndarray:
"""Extract a matrix from a camera parameter dict (as loaded from YAML)."""
return np.array(data["data"]).reshape(data["rows"], data["cols"])
# TODO: this is more or less a duplicate of trifinger_cameras/CameraParameters
class CameraParameters(typing.NamedTuple):
"""Represents intrinsic and extrinsic parameters of a camera.
See description of properties for the meaning of the constructor arguments.
"""
#: Name of the camera.
name: str
#: Width of the images.
width: int
#: Height of the images.
height: int
#: Camera projection matrix. Shape = (3, 3)
camera_matrix: np.ndarray
#: Distortion coefficients. Shape = (5,)
distortion_coefficients: np.ndarray
#: Transformation matrix from world to camera frame. Shape = (4, 4)
tf_world_to_camera: np.ndarray
@classmethod
def load(cls, stream: typing.TextIO) -> "CameraParameters":
"""Load camera parameters from a YAML stream.
Args:
stream: Input stream of configuration in YAML format.
Returns:
Instance of CameraParameters with values set based on the input
YAML.
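
        Example (``camera60.yml`` is a placeholder for an actual
        calibration file):

        .. code-block:: python

            with open("camera60.yml") as f:
                params = CameraParameters.load(f)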
"""
data = yaml.safe_load(stream)
name = data["camera_name"]
width = data["image_width"]
height = data["image_height"]
camera_matrix = calib_data_to_matrix(data["camera_matrix"])
dist_coeffs = calib_data_to_matrix(data["distortion_coefficients"])[0]
tf_world_to_camera = calib_data_to_matrix(data["tf_world_to_camera"])
return cls(
name, width, height, camera_matrix, dist_coeffs, tf_world_to_camera
)
def dump(self, stream: typing.TextIO):
"""Dump camera parameters in YAML format to the given output stream.
Args:
stream: Output stream.
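
        Example (a minimal sketch; ``params`` stands for an existing
        :class:`CameraParameters` instance and the file name is a
        placeholder):

        .. code-block:: python

            with open("camera60.yml", "w") as f:
                params.dump(f)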
"""
# save all the data
calibration_data = {
"camera_name": self.name,
            # make sure width and height are plain ints (not numpy.int64 or
            # the like)
"image_width": int(self.width),
"image_height": int(self.height),
}
calibration_data["camera_matrix"] = {
"rows": 3,
"cols": 3,
"data": self.camera_matrix.flatten().tolist(),
}
calibration_data["distortion_coefficients"] = {
"rows": 1,
"cols": 5,
"data": self.distortion_coefficients.flatten().tolist(),
}
calibration_data["tf_world_to_camera"] = {
"rows": 4,
"cols": 4,
"data": self.tf_world_to_camera.flatten().tolist(),
}
yaml.dump(
calibration_data,
stream,
default_flow_style=False,
sort_keys=False,
)
class BaseCamera:
def get_image(
self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
) -> np.ndarray:
"""Render an image."""
raise NotImplementedError()
def get_width(self) -> int:
"""Get width of the images rendered by this camera instance."""
raise NotImplementedError()
def get_height(self) -> int:
"""Get height of the images rendered by this camera instance."""
raise NotImplementedError()
class Camera(BaseCamera):
"""Represents a camera in the simulation environment.
Note: This class uses a simplified camera model. For images that better
match with the real cameras, use ``CalibratedCamera``.
"""
def __init__(
self,
camera_position,
camera_orientation,
image_size=(270, 270),
field_of_view=52,
near_plane_distance=0.001,
far_plane_distance=100.0,
pybullet_client_id=0,
):
"""
Args:
camera_position: Position (x, y, z) of the camera w.r.t. the world
frame.
camera_orientation: Quaternion (x, y, z, w) representing the
orientation of the camera.
image_size: Tuple (width, height) specifying the size of the
image.
            field_of_view: Field of view of the camera (in degrees).
            near_plane_distance: see OpenGL's documentation for details
            far_plane_distance: see OpenGL's documentation for details
pybullet_client_id: Id of the pybullet client (needed when
multiple clients are running in parallel).
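
        Example (using the pose of "camera60" from
        :class:`TriFingerCameras`; assumes pybullet is already connected):

        .. code-block:: python

            camera = Camera(
                camera_position=[0.2496, 0.2458, 0.4190],
                camera_orientation=[0.3760, 0.8690, -0.2918, -0.1354],
            )
            image = camera.get_image()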
"""
self._pybullet_client_id = pybullet_client_id
self._width = image_size[0]
self._height = image_size[1]
        # Compute view target and up vector from the camera orientation
        # (the camera's local z-axis gives the viewing direction, its
        # negative y-axis the up direction in the image).
        camera_rot = Rotation.from_quat(camera_orientation)
        target_position = camera_rot.apply([0, 0, 1])
        camera_up_vector = camera_rot.apply([0, -1, 0])
self._view_matrix = pybullet.computeViewMatrix(
cameraEyePosition=camera_position,
cameraTargetPosition=target_position,
cameraUpVector=camera_up_vector,
physicsClientId=self._pybullet_client_id,
)
self._proj_matrix = pybullet.computeProjectionMatrixFOV(
fov=field_of_view,
aspect=float(self._width) / self._height,
nearVal=near_plane_distance,
farVal=far_plane_distance,
physicsClientId=self._pybullet_client_id,
)
def get_width(self) -> int:
return self._width
def get_height(self) -> int:
return self._height
def get_image(
self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
) -> np.ndarray:
"""Get a rendered image from the camera.
Args:
            renderer: Specify which renderer is to be used. The default
                renderer (``ER_BULLET_HARDWARE_OPENGL``) relies on an X
                server, i.e. it needs the visualization to have access to
                OpenGL. To render without visualization (e.g. when
                connected in "DIRECT" mode), use ``ER_TINY_RENDERER``
                instead.
Returns:
array, shape=(height, width, 3): Rendered RGB image from the
simulated camera.
"""
(_, _, img, _, _) = pybullet.getCameraImage(
width=self._width,
height=self._height,
viewMatrix=self._view_matrix,
projectionMatrix=self._proj_matrix,
renderer=renderer,
physicsClientId=self._pybullet_client_id,
)
# remove the alpha channel
return img[:, :, :3]
class CalibratedCamera(BaseCamera):
"""Simulate a camera based on calibration parameters.
    This class renders images from the simulation, using calibration
    parameters from a real camera. Compared to ``Camera``, it uses a more
    accurate projection matrix and also takes distortion into account.
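
    Example (a minimal sketch; the calibration file name and the near/far
    plane values are placeholders):

    .. code-block:: python

        with open("camera60.yml") as f:
            params = CameraParameters.load(f)

        camera = CalibratedCamera(
            params.camera_matrix,
            params.distortion_coefficients,
            params.tf_world_to_camera,
            image_size=(params.width, params.height),
            near_plane_distance=0.02,
            far_plane_distance=2.0,
        )
        image = camera.get_image()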
"""
# How this class works internally
# ===============================
#
# The "projection" and "view" matrices required by pyBullet are computed in
# __init__ based on the given camera_matrix and tf_world_to_camera.
#
# The distortion cannot be directly integrated there, so it is applied
# after rendering, using the given distortion_coefficients.
#
    # When distorting the rendered image, there is the problem that the
    # image shrinks a bit, that is, if the size of the image is to be
    # preserved, some pixels close to the edges will be empty.
    # To avoid this, some padding is added for the rendering, i.e. the
    # rendered image is a bit larger than the desired output. After
    # distortion, the padding is removed again, so that the resulting image
    # has the desired size.
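    #
    # As a concrete example: for a desired output size of 540x540 the
    # padding is (54, 54), so rendering happens at 648x648 and the result
    # is cropped back to 540x540 after the distortion is applied.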
def __init__(
self,
camera_matrix,
distortion_coefficients,
tf_world_to_camera,
image_size,
near_plane_distance,
far_plane_distance,
pybullet_client_id=0,
):
r"""
Args:
camera_matrix: Camera matrix containing focal length and centre
point:
                .. math::

                    \begin{bmatrix}
                        f_x & 0 & c_x \\
                        0 & f_y & c_y \\
                        0 & 0 & 1
                    \end{bmatrix}

distortion_coefficients: Distortion coefficients
``(k_1, k_2, p_1, p_2, k_3)``
tf_world_to_camera: Homogeneous transformation matrix from world
to camera frame.
image_size: Size of the image given as ``(width, height)``.
near_plane_distance: Minimum distance to camera for objects to be
rendered. Objects that are closer to the camera are clipped.
far_plane_distance: Maximum distance to the camera for objects to
be rendered. Objects that are further away are clipped.
pybullet_client_id: Id of the pybullet client (needed when
multiple clients are running in parallel).
"""
# store the original camera matrix for use in C++ bindings (trifinger_cameras)
self._original_camera_matrix = camera_matrix
self._pybullet_client_id = pybullet_client_id
#: Width of the output images.
self._output_width = image_size[0]
#: Height of the output images
self._output_height = image_size[1]
# Padding that is added to the rendered image
self._padding = (
round(self._output_width * 0.1),
round(self._output_height * 0.1),
)
# size of the rendered (undistorted) image
self._render_width = self._output_width + 2 * self._padding[0]
self._render_height = self._output_height + 2 * self._padding[1]
# adjust the centre point in the camera matrix to the padding
center_offset = np.array(
[[0, 0, self._padding[0]], [0, 0, self._padding[1]], [0, 0, 0]]
)
self._camera_matrix = camera_matrix + center_offset
self._distortion_coefficients = distortion_coefficients
# In OpenGL the camera is looking into negative z-direction, so we need
# to rotate the camera pose by 180 degree around x.
rot_x_180 = np.array(
[
[1, 0, 0, 0],
[0, -1, 0, 0],
[0, 0, -1, 0],
[0, 0, 0, 1],
]
)
tf_mat = rot_x_180 @ tf_world_to_camera
self._view_matrix = tf_mat.flatten(order="F")
        # pyBullet's computeProjectionMatrix() makes some simplifying
        # assumptions (f_x == f_y and (c_x, c_y) = image_size / 2) which
        # are not generally true. Therefore compute the projection matrix
        # manually.
#
# https://stackoverflow.com/a/60450420/2095383
# https://www.songho.ca/opengl/gl_projectionmatrix.html
# https://amytabb.com/ts/2019_06_28/
# get focal length and centre point from camera matrix
f_x = self._camera_matrix[0, 0]
f_y = self._camera_matrix[1, 1]
c_x = self._camera_matrix[0, 2]
c_y = self._camera_matrix[1, 2]
x_scale = 2 / self._render_width * f_x
y_scale = 2 / self._render_height * f_y
near = near_plane_distance
far = far_plane_distance
x_shift = 1 - 2 * c_x / self._render_width
y_shift = (2 * c_y - self._render_height) / self._render_height
proj_mat = np.array(
[
[x_scale, 0, x_shift, 0],
[0, y_scale, y_shift, 0],
[
0,
0,
(near + far) / (near - far),
2 * near * far / (near - far),
],
[0, 0, -1, 0],
]
)
self._proj_matrix = proj_mat.flatten(order="F")
def get_width(self) -> int:
return self._output_width
def get_height(self) -> int:
return self._output_height
def distort_image(self, image):
"""Distort an image based on the cameras distortion coefficients.
Args:
image: The undistorted image.
Returns:
The distorted image.
"""
# this function is based on the formulas from here:
# https://stackoverflow.com/a/58131157/2095383
# Computations are done on numpy arrays as much as possible for
# performance reasons.
distorted_image = np.zeros_like(image)
f_x = self._camera_matrix[0, 0]
f_y = self._camera_matrix[1, 1]
c_x = self._camera_matrix[0, 2]
c_y = self._camera_matrix[1, 2]
k_1, k_2, p_1, p_2, k_3 = self._distortion_coefficients
        # note: image points are handled in (row, column) = (y, x) order,
        # so focal length and centre point are arranged accordingly
        f = np.array([f_y, f_x])
        c = np.array([c_y, c_x])
image_points = np.array(
tuple(
itertools.product(
range(self._render_height), range(self._render_width)
)
)
)
# normalize the image coordinates
norm_points = (image_points - c) / f
norm_points_square = norm_points**2
norm_points_xy = norm_points.prod(axis=1)
# determining the radial distortion
r2 = np.sum(norm_points_square, axis=1)
icdist = 1 / (1 - ((k_3 * r2 + k_2) * r2 + k_1) * r2)
# determining the tangential distortion
p = np.array([[p_2, p_1]])
r2_plus_2_point_sq = r2[:, None] + 2 * norm_points_square
delta = 2 * p * norm_points_xy[:, None] + p[::-1] * r2_plus_2_point_sq
distorted_points = (norm_points + delta) * icdist[:, None]
# un-normalise
distorted_points = distorted_points * f + c
# float to int
distorted_points = distorted_points.round().astype(int)
# filter out points that are outside the image
in_image_idx = np.all(
np.logical_and(
(0, 0) <= distorted_points,
distorted_points < (self._render_height, self._render_width),
),
axis=1,
)
distorted_points = distorted_points[in_image_idx]
image_points = image_points[in_image_idx]
# finally construct the distorted image
distorted_image[tuple(distorted_points.T)] = image[
tuple(image_points.T)
]
return distorted_image
def get_image(
self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
) -> np.ndarray:
"""Get a rendered and distorted image from the camera.
Args:
            renderer: Specify which renderer is to be used. The default
                renderer (``ER_BULLET_HARDWARE_OPENGL``) relies on an X
                server, i.e. it needs the visualization to have access to
                OpenGL. To render without visualization (e.g. when
                connected in "DIRECT" mode), use ``ER_TINY_RENDERER``
                instead.
Returns:
array, shape=(height, width, 3): Rendered RGB image from the
simulated camera.
"""
(_, _, img, _, _) = pybullet.getCameraImage(
width=self._render_width,
height=self._render_height,
viewMatrix=self._view_matrix,
projectionMatrix=self._proj_matrix,
renderer=renderer,
physicsClientId=self._pybullet_client_id,
)
# remove the alpha channel
img = img[:, :, :3]
# distort image
img = self.distort_image(img)
# remove the padding
img = img[
self._padding[1] : -self._padding[1],
self._padding[0] : -self._padding[0],
]
return img
class CameraArray:
"""Array of an arbitrary number of cameras."""
def __init__(self, cameras: typing.Sequence[BaseCamera]):
"""
Args:
cameras: List of cameras.
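
        Example (the camera poses are arbitrary placeholder values):

        .. code-block:: python

            cameras = CameraArray([
                Camera([0.3, 0.0, 0.42], [0.0, 0.0, 0.0, 1.0]),
                Camera([0.0, 0.3, 0.42], [0.0, 0.0, 0.0, 1.0]),
            ])
            images = cameras.get_images()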
"""
self.cameras = cameras
def get_images(
self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
) -> typing.List[np.ndarray]:
"""Get images.
See Camera.get_image() for details.
Returns:
List of RGB images, one per camera.
"""
return [c.get_image(renderer=renderer) for c in self.cameras]
def get_bayer_images(
self, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL
) -> typing.List[np.ndarray]:
"""Get Bayer images.
Same as get_images() but returning the images as BG-Bayer patterns
instead of RGB.
"""
return [rbg_to_bayer_bg(img) for img in self.get_images(renderer)]
def create_trifinger_camera_array(
camera_parameters: typing.Iterable[CameraParameters],
pybullet_client_id=0,
) -> CameraArray:
"""Create a TriFinger camera array using camera calibration parameters.
Args:
camera_parameters: List of camera calibration parameters for the three
cameras.
pybullet_client_id: Id of the pybullet client (needed when multiple
clients are running in parallel).
Returns:
CameraArray with three cameras.
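
    Example (the configuration directory is a placeholder):

    .. code-block:: python

        params = load_camera_parameters(
            pathlib.Path("path/to/camera_calib")
        )
        cameras = create_trifinger_camera_array(params)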
"""
camera_ids = (60, 180, 300)
cameras = []
for camera_id, params in zip(camera_ids, camera_parameters):
# Sanity check to verify the camera parameters are given in the correct
# order.
camera_name = "camera{}".format(camera_id)
if params.name != camera_name:
raise ValueError(
"Expected parameters for camera {} but got {}".format(
camera_name, params.name
)
)
image_size = (params.width, params.height)
camera = CalibratedCamera(
params.camera_matrix,
params.distortion_coefficients,
params.tf_world_to_camera,
image_size=image_size,
near_plane_distance=0.02,
far_plane_distance=2.0,
pybullet_client_id=pybullet_client_id,
)
cameras.append(camera)
return CameraArray(cameras)
def load_camera_parameters(
config_dir: pathlib.Path,
filename_pattern: str = "camera{id}.yml",
) -> typing.Tuple[CameraParameters, ...]:
    """Load camera calibration parameters of the three TriFinger cameras.

    Args:
        config_dir: Directory containing the camera calibration files.
        filename_pattern: Template for the calibration file names. '{id}'
            will be replaced with the camera id (60, 180, 300).

    Returns:
        Tuple with the parameters of cameras 60, 180 and 300 (in this
        order).
    """
    camera_ids = (60, 180, 300)
def load_params(id):
with open(config_dir / filename_pattern.format(id=id)) as f:
return CameraParameters.load(f)
camera_parameters = tuple(load_params(id) for id in camera_ids)
return camera_parameters
def create_trifinger_camera_array_from_config(
config_dir: pathlib.Path,
calib_filename_pattern="camera{id}.yml",
pybullet_client_id=0,
) -> CameraArray:
"""Create a TriFinger camera array using camera calibration files.
Loads camera calibration files from the given directory and uses them to
create a :class:`CameraArray` of :class:`CalibratedCamera`.
Args:
config_dir: Directory containing the camera calibration files.
calib_filename_pattern: Template for the camera calibration file
names. '{id}' will be replaced with the camera id (60, 180, 300).
pybullet_client_id: Id of the pybullet client (needed when multiple
clients are running in parallel).
Returns:
CameraArray with three cameras.
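
    Example (the configuration directory is a placeholder; it is expected
    to contain ``camera60.yml``, ``camera180.yml`` and ``camera300.yml``):

    .. code-block:: python

        cameras = create_trifinger_camera_array_from_config(
            pathlib.Path("path/to/camera_calib")
        )
        images = cameras.get_images()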
"""
camera_parameters = load_camera_parameters(
config_dir, calib_filename_pattern
)
return create_trifinger_camera_array(camera_parameters, pybullet_client_id)
class TriFingerCameras(CameraArray):
"""Simulate the three cameras of the TriFinger platform.
.. note::
This uses the simple camera model (see :class:`Camera`). To get images
that are more closely matching those of the real cameras use
:func:`create_trifinger_camera_array_from_config` instead.
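
    Example:

    .. code-block:: python

        cameras = TriFingerCameras()
        # one image per camera, in the order camera60, camera180, camera300
        images = cameras.get_images()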
"""
def __init__(self, **kwargs):
cameras = [
# camera60
Camera(
camera_position=[0.2496, 0.2458, 0.4190],
camera_orientation=[0.3760, 0.8690, -0.2918, -0.1354],
**kwargs,
),
# camera180
Camera(
camera_position=[0.0047, -0.2834, 0.4558],
camera_orientation=[0.9655, -0.0098, -0.0065, -0.2603],
**kwargs,
),
# camera300
Camera(
camera_position=[-0.2470, 0.2513, 0.3943],
camera_orientation=[-0.3633, 0.8686, -0.3141, 0.1220],
**kwargs,
),
]
super().__init__(cameras)
def rbg_to_bayer_bg(image: np.ndarray) -> np.ndarray:
"""Convert an rgb image to a BG Bayer pattern.
This can be used to generate simulated raw camera data in Bayer format.
Note that there will be some loss in image quality. It is mostly meant for
testing the full software pipeline with the same conditions as on the real
robot. It is not optimized of realistic images.
Args:
image: RGB image.
Returns:
Bayer pattern based on the input image. Height and width are the same
as of the input image. The image can be converted using OpenCV's
`COLOR_BAYER_BG2*`.
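
    Example (``rgb_image`` is a placeholder for an RGB input image;
    converting back requires OpenCV):

    .. code-block:: python

        import cv2

        bayer_image = rbg_to_bayer_bg(rgb_image)
        rgb_again = cv2.cvtColor(bayer_image, cv2.COLOR_BayerBG2RGB)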
"""
# there is only one channel but it still needs the third dimension, so that
# the conversion to a cv::Mat in C++ is easier
bayer_img = np.zeros((image.shape[0], image.shape[1], 1), dtype=np.uint8)
# channel names, assuming input is RGB
CHANNEL_RED = 0
CHANNEL_GREEN = 1
CHANNEL_BLUE = 2
# channel map to get the following pattern (called "BG" in OpenCV):
#
# RG
# GB
#
channel_map = {
(0, 0): CHANNEL_RED,
(1, 0): CHANNEL_GREEN,
(0, 1): CHANNEL_GREEN,
(1, 1): CHANNEL_BLUE,
}
for r in range(image.shape[0]):
for c in range(image.shape[1]):
channel = channel_map[(r % 2, c % 2)]
bayer_img[r, c] = image[r, c, channel]
return bayer_img