Source code for lisbet.transforms_extra

"""Augmentation module for transforming samples in a dataset.

This module provides data augmentation and preprocessing transforms for pose tracking
datasets stored as xarray.Dataset objects or canonical NumPy arrays. NumPy pose arrays
must have shape ``(time, individuals, keypoints, space)``. The transforms can be used
in training pipelines to improve model robustness and generalization.

Available Transforms
--------------------
GaussianJitter
    Adds independent Gaussian noise to every pose coordinate across the full time
    window and clamps the result to the normalized [0, 1] range.

RandomPermutation
    Randomly permutes both coordinate labels and their associated data together across
    the entire time window. Useful for making models invariant to coordinate ordering
    (e.g., individual identities, spatial axes).

RandomBlockPermutation
    Randomly permutes data within a contiguous block of frames while keeping coordinate
    labels unchanged. Creates temporal identity confusion within part of the window.
    Useful for more challenging augmentation scenarios.

RandomRotation
    Applies a random rotation to keypoint coordinates in normalized [0, 1] space.
    Supports 2D and 3D keypoints with configurable maximum angle and post-rotation
    normalization modes (truncate, rescale, or none).

KeypointAblation
    Independently selects (keypoint, individual) pairs and sets their coordinates to
    0.0 across the full time window. Simulates sustained missing or occluded keypoints
    for robustness testing.

PoseToTensor
    Converts xarray or canonical NumPy pose tracking data to PyTorch tensors by
    stacking spatial dimensions into a single feature dimension.

PoseToVideo
    Renders pose tracking data as video frames (RGB images) using OpenCV, with
    customizable body specifications for visualization. Coordinate labels are
    required, so this transform is xarray-only.

VideoToTensor
    Converts video frames from NumPy arrays to PyTorch tensors with optional
    normalization for video model inputs.

Usage Examples
--------------
>>> from lisbet.transforms_extra import RandomPermutation, PoseToTensor
>>> from torchvision import transforms
>>>
>>> # Simple augmentation pipeline
>>> transform = transforms.Compose([
...     RandomPermutation(seed=42, coordinate='individuals'),
...     PoseToTensor(),
... ])
>>>
>>> # Apply with probability using torchvision.transforms.RandomApply
>>> transform = transforms.Compose([
...     transforms.RandomApply([
...         RandomPermutation(seed=42, coordinate='individuals')
...     ], p=0.5),
...     PoseToTensor(),
... ])
>>>
>>> # Block permutation for temporal identity confusion
>>> from lisbet.transforms_extra import RandomBlockPermutation
>>> transform = transforms.Compose([
...     RandomBlockPermutation(
...         seed=42, coordinate='individuals', permute_fraction=0.3
...     ),
...     PoseToTensor(),
... ])
>>>
>>> # Keypoint ablation for robustness to missing data
>>> from lisbet.transforms_extra import KeypointAblation
>>> transform = transforms.Compose([
...     transforms.RandomApply([
...         KeypointAblation(seed=42, pB=0.05)
...     ], p=1.0),
...     PoseToTensor(),
... ])
>>>
>>> # Random rotation augmentation for spatial invariance
>>> from lisbet.transforms_extra import RandomRotation
>>> transform = transforms.Compose([
...     RandomRotation(seed=42, max_angle=30.0, mode='truncate'),
...     PoseToTensor(),
... ])

Notes
-----
- Augmentations should be applied thoughtfully based on dataset characteristics
- Spatial axis permutation (coordinate='space') is only suitable for top-down view
  datasets where axes are symmetric
- Identity permutations work best for datasets where individual labels are
  interchangeable
"""

import cv2
import numpy as np
import torch
import xarray as xr

from lisbet.drawing import BodySpecs, body_specs_registry, color_to_bgr

_CANONICAL_DIMS = ("time", "individuals", "keypoints", "space")
_NUMPY_AXES = {dim: axis for axis, dim in enumerate(_CANONICAL_DIMS)}


def _canonical_position(posetracks):
    """Return position values in canonical NumPy axis order."""
    if isinstance(posetracks, np.ndarray):
        if posetracks.ndim != 4:
            raise ValueError(
                "NumPy pose arrays must have shape "
                "(time, individuals, keypoints, space)."
            )
        return posetracks

    if not isinstance(posetracks, xr.Dataset):
        raise TypeError("Pose transforms require an xarray Dataset or NumPy array.")

    pos_var = posetracks["position"]
    missing_dims = set(_CANONICAL_DIMS) - set(pos_var.dims)
    if missing_dims:
        raise ValueError(
            f"Position variable must contain {set(_CANONICAL_DIMS)} dimensions. "
            f"Missing: {missing_dims}"
        )
    return pos_var.transpose(*_CANONICAL_DIMS).values


def _restore_position(posetracks, position):
    """Return transformed values in the same container type as the input."""
    if isinstance(posetracks, np.ndarray):
        return np.ascontiguousarray(position, dtype=posetracks.dtype)

    original_dims = posetracks["position"].dims
    axes = tuple(_NUMPY_AXES[dim] for dim in original_dims)
    posetracks["position"].values[...] = np.transpose(position, axes)
    return posetracks


def _random_permutation(n, generator, exclude_identity=False):
    """Generate a random permutation of n elements.

    Parameters
    ----------
    n : int
        Number of elements to permute.
    generator : torch.Generator
        Random number generator.
    exclude_identity : bool
        If True, the identity permutation is excluded.

    Returns
    -------
    list
        A random permutation as a list of indices.

    Raises
    ------
    ValueError
        If exclude_identity=True and n < 2.
    """
    if exclude_identity and n < 2:
        raise ValueError("Cannot exclude identity permutation for n < 2")

    perm = torch.randperm(n, generator=generator).tolist()

    if exclude_identity:
        identity = list(range(n))
        while perm == identity:
            perm = torch.randperm(n, generator=generator).tolist()

    return perm



[docs]
class GaussianJitter:
    """Apply Gaussian jitter across the full window.

    Gaussian noise drawn from N(0, sigma^2) is added independently to every
    coordinate. Coordinates are assumed normalized in [0, 1] and are clamped to that
    range post-perturbation.

    Parameters
    ----------
    seed : int
        RNG seed for reproducibility.
    sigma : float
        Standard deviation of the Gaussian noise.
    """

    def __init__(self, seed: int, sigma: float):
        self.seed = seed
        self.sigma = float(sigma)
        # Dedicated generator so repeated calls follow one reproducible stream.
        self.g = torch.Generator().manual_seed(seed)

    def __call__(self, posetracks):
        """
        Add clamped Gaussian noise to every pose coordinate.

        Parameters
        ----------
        posetracks : xarray.Dataset or numpy.ndarray
            Pose tracks represented by an xarray dataset with a ``position`` variable,
            or by a NumPy array in canonical
            ``(time, individuals, keypoints, space)`` order.

        Returns
        -------
        xarray.Dataset or numpy.ndarray
            Pose tracks with jittered coordinates clamped to [0, 1]. The returned
            container type matches ``posetracks``.
        """
        position = _canonical_position(posetracks)
        # Draw in movement's xarray storage order (time, space, keypoints,
        # individuals), then transpose to canonical order. This preserves the exact
        # seeded output of the pre-NumPy implementation for this minor release.
        # TODO: In the next major release, replace the legacy_shape/noise block with
        # ``noise = torch.randn(position.shape, generator=self.g) * self.sigma``
        # and document the resulting reproducibility change.
        legacy_shape = (
            position.shape[0],
            position.shape[3],
            position.shape[2],
            position.shape[1],
        )
        noise = (torch.randn(legacy_shape, generator=self.g) * self.sigma).permute(
            0, 3, 2, 1
        )
        transformed = torch.from_numpy(np.ascontiguousarray(position)) + noise
        transformed.clamp_(0.0, 1.0)
        return _restore_position(posetracks, transformed.numpy())




[docs]
class KeypointAblation:
    """Apply keypoint ablation with per-(keypoint, individual) Bernoulli sampling.

    Probability ``pB`` is applied independently to each (keypoint, individual) pair.
    For every selected pair, all spatial coordinates (x, y, z, etc.) are set to zero
    across the entire time window, simulating sustained missing or occluded keypoints.

    This augmentation helps models become robust to missing data, which commonly occurs
    due to occlusions, tracking failures, or low-confidence detections.

    Parameters
    ----------
    seed : int
        RNG seed for reproducibility.
    pB : float
        Bernoulli probability for each (keypoint, individual) pair across the full
        window.

    Examples
    --------
    >>> from lisbet.transforms_extra import KeypointAblation
    >>> ablation = KeypointAblation(seed=42, pB=0.05)
    >>> ablated_ds = ablation(posetracks)
    """

    def __init__(self, seed: int, pB: float):
        self.seed = seed
        self.pB = float(pB)
        # Dedicated generator so repeated calls follow one reproducible stream.
        self.g = torch.Generator().manual_seed(seed)

    def __call__(self, posetracks):
        """
        Zero out randomly selected (keypoint, individual) pairs for all frames.

        Parameters
        ----------
        posetracks : xarray.Dataset or numpy.ndarray
            Pose tracks represented by an xarray dataset with a ``position`` variable,
            or by a NumPy array in canonical
            ``(time, individuals, keypoints, space)`` order.

        Returns
        -------
        xarray.Dataset or numpy.ndarray
            Pose tracks in which every selected pair has all spatial coordinates set
            to 0.0. The returned container type matches ``posetracks``.
        """
        position = _canonical_position(posetracks)
        # Draw in movement's xarray mask layout (time, space, keypoints,
        # individuals), then transpose to canonical order. This preserves the exact
        # seeded output of the pre-NumPy implementation for this minor release.
        # TODO: In the next major release, replace the legacy_mask_shape/bern block
        # with:
        # bern = torch.rand(
        #     (1, position.shape[1], position.shape[2], 1),
        #     generator=self.g,
        # ) < self.pB
        legacy_mask_shape = (1, 1, position.shape[2], position.shape[1])
        bern = (torch.rand(legacy_mask_shape, generator=self.g) < self.pB).permute(
            0, 3, 2, 1
        )

        pos = torch.from_numpy(np.ascontiguousarray(position))
        # The (1, individuals, keypoints, 1) mask broadcasts over time and space.
        transformed = torch.where(bern, torch.zeros((), dtype=pos.dtype), pos)
        return _restore_position(posetracks, transformed.numpy())




[docs]
class RandomPermutation:
    """
    Randomly permute the order of a specified pose coordinate.

    For an xarray dataset, both the coordinate labels and their associated data are
    reordered. For a NumPy array, ``coordinate`` identifies the corresponding axis in
    the canonical ``(time, individuals, keypoints, space)`` layout; raw arrays have no
    coordinate labels to reorder.

    This augmentation can be used to increase invariance to coordinate order (e.g.,
    fixed identity, axis orientation). The permutation is applied to the entire dataset.

    Parameters
    ----------
    seed : int
        Random seed for reproducibility.
    coordinate : str
        Name of the coordinate to permute (e.g., 'individuals', 'keypoints', 'space').
        For NumPy input, this must name an axis in the canonical pose layout.
    exclude_identity : bool
        If True, the identity permutation (no change) is excluded. This guarantees
        that at least one element will be moved. Default is False.

    Methods
    -------
    __call__(posetracks)
        Apply the random permutation and return the same container type as the input.

    Examples
    --------
    >>> permute = RandomPermutation(seed=42, coordinate='individuals')
    >>> permuted_ds = permute(posetracks)
    >>> # Guarantee a permutation occurs
    >>> permute = RandomPermutation(seed=42, coordinate='space', exclude_identity=True)
    >>> permuted_ds = permute(posetracks)
    """

    def __init__(self, seed, coordinate="individuals", exclude_identity=False):
        self.seed = seed
        self.coordinate = coordinate
        self.exclude_identity = exclude_identity
        # Dedicated generator so repeated calls follow one reproducible stream.
        self.g = torch.Generator().manual_seed(seed)

    def __call__(self, posetracks):
        """
        Apply random permutation to the specified coordinate.

        Parameters
        ----------
        posetracks : xarray.Dataset or numpy.ndarray
            Pose tracks represented by an xarray dataset with a ``position`` variable,
            or by a NumPy array in canonical
            ``(time, individuals, keypoints, space)`` order.

        Returns
        -------
        xarray.Dataset or numpy.ndarray
            Pose tracks with the selected coordinate or canonical axis permuted. The
            returned container type matches ``posetracks``.

        Raises
        ------
        ValueError
            If the input is a NumPy array that is not 4-dimensional, or
            ``coordinate`` does not name a canonical axis for NumPy input.
        TypeError
            If ``posetracks`` is neither an xarray Dataset nor a NumPy array.
        """
        if isinstance(posetracks, np.ndarray):
            # Called only for its shape validation; the array itself is used below.
            _canonical_position(posetracks)
            if self.coordinate not in _NUMPY_AXES:
                raise ValueError(f"Unknown NumPy pose coordinate '{self.coordinate}'.")
            coordinate_axis = _NUMPY_AXES[self.coordinate]
            perm = _random_permutation(
                posetracks.shape[coordinate_axis], self.g, self.exclude_identity
            )
            return np.take(posetracks, perm, axis=coordinate_axis)

        if isinstance(posetracks, xr.Dataset):
            perm = _random_permutation(
                posetracks.coords[self.coordinate].size, self.g, self.exclude_identity
            )
            # For xarray, retain the public behavior of reordering coordinate labels
            # and every variable that uses the selected coordinate.
            return posetracks.isel({self.coordinate: perm})

        raise TypeError("Pose transforms require an xarray Dataset or NumPy array.")





[docs]
class RandomBlockPermutation:
    """
    Randomly permutes the data (but not coordinate labels) of a specified coordinate
    within a random contiguous block of frames.

    For an xarray dataset, coordinate labels remain fixed while the associated pose
    data are permuted inside the block. For a NumPy array, ``coordinate`` identifies
    the corresponding axis in the canonical
    ``(time, individuals, keypoints, space)`` layout; raw arrays have no coordinate
    labels.

    This augmentation is useful to create identity swaps within a portion of the time
    series, mimicking the effects of a tracking error, while maintaining consistent
    coordinate labels throughout.

    Parameters
    ----------
    seed : int
        Random seed for reproducibility.
    coordinate : str
        Name of the coordinate to permute (e.g., 'individuals', 'keypoints').
        For NumPy input, this must name an axis in the canonical pose layout.
    permute_fraction : float
        Fraction of the time window to which the permutation is applied.
        Must be in (0, 1]. A continuous block of frames of this relative size will be
        selected at random, and the permutation will be applied only to the data
        within this block, keeping coordinate labels unchanged.
    exclude_identity : bool
        If True, the identity permutation (no change) is excluded. This guarantees
        that at least one element will be moved. Default is False.

    Methods
    -------
    __call__(posetracks)
        Apply the random block permutation and return the same container type as the
        input.

    Notes
    -----
    This implementation uses uniform frame probability sampling to ensure that every
    frame in the window has an equal probability of being affected by the permutation,
    regardless of its position. This is achieved by allowing the block's starting
    position to extend beyond window boundaries, then clipping to the valid range.

    As a consequence, the actual number of affected frames may be smaller than
    ``permute_fraction * window_size`` when the block overlaps with window boundaries.
    On average, the expected probability for any given frame to be affected is::

        block_size / (window_size + block_size - 1)

    which simplifies to approximately ``permute_fraction / (1 + permute_fraction)``
    for large windows. For example, with ``permute_fraction=0.3``, the expected
    probability per frame is approximately 0.23 (about 77% of the nominal fraction).

    Note that ``permute_fraction`` specifies the *nominal* block size, not the
    expected fraction of affected frames. Even with ``permute_fraction=1.0``, the
    expected probability per frame would be ~0.5, not 1.0, because the block can
    "hang off" either edge of the window. This is the expected tradeoff for
    achieving uniform frame probability.

    Examples
    --------
    >>> permute = RandomBlockPermutation(seed=42, coordinate='individuals',
    ...                                   permute_fraction=0.3)
    >>> permuted_ds = permute(posetracks)
    >>> # Guarantee a permutation occurs within the block
    >>> permute = RandomBlockPermutation(seed=42, coordinate='individuals',
    ...                                   permute_fraction=0.3, exclude_identity=True)
    >>> permuted_ds = permute(posetracks)
    """

    def __init__(
        self,
        seed,
        coordinate="individuals",
        permute_fraction=0.5,
        exclude_identity=False,
    ):
        if not 0 < permute_fraction <= 1:
            raise ValueError("permute_fraction must be a float in (0, 1].")
        self.seed = seed
        self.coordinate = coordinate
        self.permute_fraction = permute_fraction
        self.exclude_identity = exclude_identity
        # Dedicated generator so repeated calls follow one reproducible stream.
        self.g = torch.Generator().manual_seed(seed)

    def __call__(self, posetracks):
        """
        Apply random block permutation to the specified coordinate.

        Parameters
        ----------
        posetracks : xarray.Dataset or numpy.ndarray
            Pose tracks represented by an xarray dataset with a ``position`` variable,
            or by a NumPy array in canonical
            ``(time, individuals, keypoints, space)`` order.

        Returns
        -------
        xarray.Dataset or numpy.ndarray
            Pose tracks with data permuted inside a random block. The returned
            container type matches ``posetracks``; xarray coordinates remain
            unchanged.

        Raises
        ------
        TypeError
            If ``posetracks`` is neither an xarray Dataset nor a NumPy array.
        """
        if isinstance(posetracks, xr.Dataset):
            return self._apply_xarray(posetracks)
        if isinstance(posetracks, np.ndarray):
            return self._apply_numpy(posetracks)
        raise TypeError("Pose transforms require an xarray Dataset or NumPy array.")

    def _sample_block(self, window_size):
        """Sample the clipped ``(start, end)`` frame range of the permuted block.

        Returns ``None`` without consuming random numbers when the nominal block
        size ``int(permute_fraction * window_size)`` is zero.
        """
        block_size = int(self.permute_fraction * window_size)
        if block_size == 0:
            return None

        # Sample start_idx from extended range to ensure uniform frame probability.
        # Range: [1 - block_size, window_size - 1] gives each frame exactly
        # block_size chances to be included in the block.
        start_idx = torch.randint(
            1 - block_size, window_size, (1,), generator=self.g
        ).item()

        # Clip to valid range
        return max(0, start_idx), min(window_size, start_idx + block_size)

    def _apply_xarray(self, posetracks):
        """Apply the legacy xarray implementation without changing its semantics."""
        # Get current coordinate values
        coord_vals = list(posetracks.coords[self.coordinate].values)

        # Generate the permutation before sampling the block so the generator is
        # consumed in the same order as in previous releases.
        perm = _random_permutation(len(coord_vals), self.g, self.exclude_identity)

        block = self._sample_block(posetracks.sizes["time"])
        if block is None:
            # No permutation needed
            return posetracks
        actual_start, actual_end = block

        # For block permutation, we permute only the data
        # while keeping coordinates unchanged across the full time series
        block_to_permute = posetracks.isel(time=slice(actual_start, actual_end))

        # Get the dimension index for the coordinate
        coord_dim = list(posetracks["position"].dims).index(self.coordinate)

        # Permute the data along the coordinate dimension
        permuted_data = np.take(
            block_to_permute["position"].values, perm, axis=coord_dim
        )

        # Create a new block with permuted data but original coordinates
        permuted_block = block_to_permute.copy(deep=True)
        permuted_block["position"].values[:] = permuted_data

        # Split and concatenate
        before_block = posetracks.isel(time=slice(None, actual_start))
        after_block = posetracks.isel(time=slice(actual_end, None))

        return xr.concat(
            [before_block, permuted_block, after_block], dim="time", join="outer"
        )

    def _apply_numpy(self, posetracks):
        """Apply a block permutation to a canonical NumPy pose array."""
        position = _canonical_position(posetracks)
        if self.coordinate not in _NUMPY_AXES:
            raise ValueError(f"Unknown pose coordinate '{self.coordinate}'.")
        coordinate_axis = _NUMPY_AXES[self.coordinate]

        # Generate the permutation before sampling the block so the generator is
        # consumed in the same order as in the xarray path.
        perm = _random_permutation(
            position.shape[coordinate_axis], self.g, self.exclude_identity
        )

        block = self._sample_block(position.shape[0])
        if block is None:
            return _restore_position(posetracks, position)
        actual_start, actual_end = block

        # Work on a copy so the caller's array is left untouched.
        transformed = np.array(position, copy=True)
        transformed[actual_start:actual_end] = np.take(
            position[actual_start:actual_end], perm, axis=coordinate_axis
        )
        return _restore_position(posetracks, transformed)




[docs]
class RandomRotation:
    """Apply a random rotation to keypoint coordinates in normalized [0, 1] space.

    The input may be an xarray dataset or a NumPy array in canonical
    ``(time, individuals, keypoints, space)`` order. The returned container type
    matches the input.

    Samples a rotation angle uniformly from [-max_angle, +max_angle] and applies it
    consistently across all frames in the window. For 2D data, rotates around the
    center (0.5, 0.5). For 3D data, rotates around (0.5, 0.5, 0.5) about a randomly
    sampled unit axis using Rodrigues' formula.

    After rotation, coordinates can be normalized back to [0, 1] using one of three
    modes: ``"truncate"`` (clamp), ``"rescale"`` (min-max rescaling per spatial
    dimension), or ``"none"`` (no normalization).

    Note: input data is assumed to be free of NaN values. NaN values are replaced
    with 0.0 at load time (see ``lisbet.io.core._load_posetracks``).

    Parameters
    ----------
    seed : int
        RNG seed for reproducibility.
    max_angle : float
        Maximum rotation angle in degrees. The angle is sampled uniformly from
        [-max_angle, +max_angle]. Default is 180.0.
    mode : str
        Normalization mode after rotation. One of:

        - ``"truncate"``: Clamp coordinates to [0, 1].
        - ``"rescale"``: If any coordinate falls outside [0, 1] after rotation,
          rescale each spatial dimension independently so that the min maps to 0
          and the max maps to 1 (across all keypoints, individuals, and time).
          If all coordinates are already within [0, 1], no rescaling is applied.
        - ``"none"``: No normalization is applied.

        Default is ``"truncate"``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported normalization modes.

    Examples
    --------
    >>> from lisbet.transforms_extra import RandomRotation
    >>> rotation = RandomRotation(seed=42, max_angle=30.0)
    >>> rotated_ds = rotation(posetracks)
    >>> # Rescale mode for 3D data
    >>> rotation = RandomRotation(seed=42, max_angle=45.0, mode='rescale')
    >>> rotated_ds = rotation(posetracks)
    """

    def __init__(self, seed: int, max_angle: float = 180.0, mode: str = "truncate"):
        valid_modes = ("truncate", "rescale", "none")
        if mode not in valid_modes:
            raise ValueError(f"mode must be one of {valid_modes}, got '{mode}'")
        self.seed = seed
        self.max_angle = float(max_angle)
        self.mode = mode
        # Dedicated generator so repeated calls follow one reproducible stream.
        self.g = torch.Generator().manual_seed(seed)

    def _sample_rotation_matrix(self, n_space):
        """Draw a random 2x2 (``n_space == 2``) or 3x3 rotation matrix.

        The angle is drawn first and, for 3D only, the rotation axis second, so the
        generator is consumed in a fixed order.
        """
        # Sample rotation angle uniformly from [-max_angle, +max_angle]
        angle_deg = (
            torch.rand(1, generator=self.g).item() * 2.0 - 1.0
        ) * self.max_angle
        angle_rad = angle_deg * (np.pi / 180.0)

        c, s = np.cos(angle_rad), np.sin(angle_rad)
        if n_space == 2:
            return np.array([[c, -s], [s, c]])

        # 3D: sample a random unit axis uniformly on the unit sphere
        axis = torch.randn(3, generator=self.g).numpy()
        axis = axis / np.linalg.norm(axis)
        # Rodrigues' rotation formula: R = I + sin(θ)K + (1 - cos(θ))K²
        kx, ky, kz = axis
        K = np.array([[0.0, -kz, ky], [kz, 0.0, -kx], [-ky, kx, 0.0]])
        return np.eye(3) + s * K + (1.0 - c) * (K @ K)

    def __call__(self, posetracks):
        """
        Apply random rotation to keypoint coordinates.

        Parameters
        ----------
        posetracks : xarray.Dataset or numpy.ndarray
            Pose tracks represented by an xarray dataset with a ``position`` variable,
            or by a NumPy array in canonical
            ``(time, individuals, keypoints, space)`` order.

        Returns
        -------
        xarray.Dataset or numpy.ndarray
            Pose tracks with rotated position coordinates. The returned container type
            matches ``posetracks``.

        Raises
        ------
        ValueError
            If the 'space' dimension has a size other than 2 or 3.
        """
        position = _canonical_position(posetracks)
        n_space = position.shape[3]

        # Validate before touching the generator so a bad input draws nothing.
        if n_space not in (2, 3):
            raise ValueError(f"'space' dimension must have size 2 or 3, got {n_space}")

        R = self._sample_rotation_matrix(n_space)

        # Rotate around center of the [0, 1] space
        pos = (position - 0.5) @ R.T + 0.5

        # Apply normalization mode
        if self.mode == "truncate":
            np.clip(pos, 0.0, 1.0, out=pos)
        elif self.mode == "rescale" and (np.any(pos < 0.0) or np.any(pos > 1.0)):
            for s_i in range(n_space):
                spatial_slice = pos[..., s_i]
                vmin, vmax = spatial_slice.min(), spatial_slice.max()
                # A constant dimension has no range to stretch; leave it as is.
                if vmin != vmax:
                    pos[..., s_i] = (spatial_slice - vmin) / (vmax - vmin)

        return _restore_position(posetracks, pos)




[docs]
class PoseToTensor:
    """
    Convert xarray or canonical NumPy pose data into a PyTorch tensor.

    This transformation stacks the 'individuals', 'keypoints', and 'space' dimensions
    into a single 'features' dimension, resulting in a tensor of shape
    (time, features), where features = individuals * keypoints * space.

    Parameters
    ----------
    None

    Methods
    -------
    __call__(posetracks)
        Stack the 'individuals', 'keypoints', and 'space' dimensions of the 'position'
        variable and return as a PyTorch tensor.

    Examples
    --------
    >>> tensor = PoseToTensor()(posetracks)
    >>> tensor.shape
    torch.Size([time, features])
    """

    def __call__(self, posetracks):
        """
        Stack the 'individuals', 'keypoints', and 'space' dimensions of the 'position'
        data and return them as a PyTorch tensor.

        Parameters
        ----------
        posetracks : xarray.Dataset or numpy.ndarray
            Pose tracks dataset, or a NumPy array with canonical shape
            ``(time, individuals, keypoints, space)``.

        Returns
        -------
        torch.Tensor
            Float32 tensor of shape (time, features), where features =
            individuals * keypoints * space, containing the stacked position data.

        Raises
        ------
        ValueError
            If a NumPy input is not 4-dimensional.
        TypeError
            If ``posetracks`` is neither an xarray Dataset nor a NumPy array.
        """
        if isinstance(posetracks, np.ndarray):
            # Called only for its shape validation; the array is flattened below.
            _canonical_position(posetracks)
            values = np.ascontiguousarray(
                posetracks.reshape(posetracks.shape[0], -1), dtype=np.float32
            )
            return torch.from_numpy(values)

        if not isinstance(posetracks, xr.Dataset):
            raise TypeError("PoseToTensor requires an xarray Dataset or NumPy array.")

        # Stacking in (individuals, keypoints, space) order gives the same feature
        # layout as the row-major reshape used for canonical NumPy arrays.
        stacked = posetracks.stack(features=("individuals", "keypoints", "space"))
        return torch.from_numpy(stacked.position.values.astype("float32"))





[docs]
class PoseToVideo:
    """
    Fast OpenCV-based transformation: posetracks (xarray.Dataset) to a sequence of RGB
    images.
    """

    def __init__(
        self,
        body_specs: dict[str, BodySpecs],
        image_size=(256, 256),
        bg_color="black",
    ):
        """
        Fast OpenCV-based transformation using BodySpecs for each individual.

        Parameters
        ----------
        body_specs : dict of str to BodySpecs
            Dictionary mapping individual_name (or species) to BodySpecs.
        image_size : tuple of int, optional
            (width, height) of output frames. Default is (256, 256).
        bg_color : tuple or str, optional
            BGR tuple or color name/hex for background color (default is black).
        """
        self.body_specs = body_specs
        self.width, self.height = image_size
        self.bg_color = color_to_bgr(bg_color)

    def __call__(self, posetracks, show_progress=False):
        """
        Render every frame of the pose tracks and stack them into one array.

        Parameters
        ----------
        posetracks : xarray.Dataset
            Pose tracks dataset with a ``position`` variable and coordinate labels.
        show_progress : bool, optional
            Currently unused; accepted for interface compatibility.

        Returns
        -------
        numpy.ndarray
            Rendered RGB frames stacked along a new leading axis, one per time step.

        Raises
        ------
        TypeError
            If ``posetracks`` is not an xarray Dataset.
        """
        if not isinstance(posetracks, xr.Dataset):
            raise TypeError(
                "PoseToVideo requires an xarray Dataset with coordinate labels."
            )
        frames = [
            self.render_frame(posetracks, t) for t in range(posetracks.sizes["time"])
        ]
        return np.stack(frames, axis=0)

    def _to_pixel(self, x, y):
        """Scale normalized (x, y) to integer pixel coordinates; None if either is NaN."""
        if np.isnan(x) or np.isnan(y):
            return None
        return int(x * self.width), int(y * self.height)

    def render_frame(self, posetracks, t_idx):
        """
        Render a single frame of pose tracks as an RGB image.

        Parameters
        ----------
        posetracks : xarray.Dataset
            The pose tracks dataset containing keypoints and individuals.
            Must have a "position" variable with dimensions ("time", "individuals",
            "keypoints", "space").
        t_idx : int
            The time index of the frame to render.

        Returns
        -------
        frame : numpy.ndarray
            The rendered frame as a (height, width, 3) uint8 RGB image.
        """
        frame = np.full((self.height, self.width, 3), self.bg_color, dtype=np.uint8)
        pos = (
            posetracks["position"]
            .isel(time=t_idx)
            .transpose("individuals", "keypoints", "space")
            .values
        )
        keypoints = list(posetracks.keypoints.values)
        individuals = list(posetracks.individuals.values)

        # Name -> first index, built once instead of calling list.index() for every
        # polygon vertex and skeleton edge.
        kp_index = {}
        for idx, name in enumerate(keypoints):
            kp_index.setdefault(name, idx)

        for ind_idx, ind_name in enumerate(individuals):
            spec = self.body_specs.get(ind_name, body_specs_registry.get(ind_name))
            if spec is None:
                # No drawing specification for this individual: leave it out.
                continue

            # Draw polygons (with alpha blending)
            for poly in spec.polygons:
                pts = []
                for kp in poly:
                    idx = kp_index.get(kp)
                    if idx is None:
                        continue
                    x, y = pos[ind_idx, idx, :]
                    # NaN coordinates cannot be converted to pixels; skip them.
                    pixel = self._to_pixel(x, y)
                    if pixel is not None:
                        pts.append(pixel)
                if len(pts) >= 3:
                    pts_np = np.array([pts], dtype=np.int32)
                    overlay = frame.copy()
                    color = color_to_bgr(spec.polygon_color)
                    cv2.fillPoly(overlay, pts_np, color)
                    frame = cv2.addWeighted(
                        overlay, spec.polygon_alpha, frame, 1 - spec.polygon_alpha, 0
                    )

            # Draw skeleton
            for edge in spec.skeleton_edges:
                idx1 = kp_index.get(edge[0])
                idx2 = kp_index.get(edge[1])
                if idx1 is None or idx2 is None:
                    continue
                x1, y1 = pos[ind_idx, idx1, :]
                x2, y2 = pos[ind_idx, idx2, :]
                # Skip the edge if either endpoint has a NaN coordinate.
                start = self._to_pixel(x1, y1)
                end = self._to_pixel(x2, y2)
                if start is not None and end is not None:
                    color = color_to_bgr(spec.skeleton_color)
                    cv2.line(
                        frame,
                        start,
                        end,
                        color=color,
                        thickness=spec.skeleton_thickness,
                        lineType=cv2.LINE_AA,
                    )

            # Draw keypoints
            for k, kp in enumerate(keypoints):
                x, y = pos[ind_idx, k, :]
                # Skip keypoints with NaN coordinates.
                center = self._to_pixel(x, y)
                if center is not None:
                    color = color_to_bgr(spec.get_keypoint_color(kp))
                    cv2.circle(
                        frame,
                        center,
                        spec.keypoint_size,
                        color=color,
                        thickness=-1,
                        lineType=cv2.LINE_AA,
                    )

        # Convert a BGR frame (OpenCV) to RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        return frame





[docs]
class VideoToTensor:
    """
    Transform a video (NumPy RGB array) into a PyTorch tensor suitable for video models.

    Converts (frames, H, W, 3) RGB uint8/float arrays to contiguous (frames, 3, H, W)
    tensors of the requested dtype, with optional scaling to [0, 1] and optional
    per-channel mean/std standardization.

    Parameters
    ----------
    normalize : bool, optional
        If True, uint8 input is divided by 255 and any other input is clipped to
        [0, 1] (default: True). If False, pixel values are passed through unscaled.
    mean : tuple or list or np.ndarray or torch.Tensor, optional
        Per-channel mean for normalization (applied after scaling to [0, 1]).
        If None, no mean subtraction is performed.
    std : tuple or list or np.ndarray or torch.Tensor, optional
        Per-channel std for normalization (applied after mean subtraction).
        If None, no std division is performed.
    dtype : torch.dtype, optional
        Output tensor dtype (default: torch.float32).
    """

    def __init__(self, normalize=True, mean=None, std=None, dtype=torch.float32):
        self.normalize = normalize
        self.mean = mean
        self.std = std
        self.dtype = dtype

    def __call__(self, video):
        """
        Convert a channel-last video array into a channel-first tensor.

        Parameters
        ----------
        video : np.ndarray
            Video as (frames, H, W, 3) RGB, dtype uint8 or float. Strided views
            (e.g. a flipped clip such as ``video[:, :, ::-1]``) are accepted.

        Returns
        -------
        torch.Tensor
            Contiguous video tensor as (frames, 3, H, W), dtype as specified.

        Raises
        ------
        TypeError
            If ``video`` is not a NumPy array.
        ValueError
            If ``video`` is not 4-dimensional with a trailing axis of size 3.
        """
        if not isinstance(video, np.ndarray):
            raise TypeError("Input video must be a numpy ndarray.")
        if video.ndim != 4 or video.shape[-1] != 3:
            raise ValueError("Input video must have shape (frames, H, W, 3) [RGB].")

        if video.dtype == np.uint8:
            # Promote to float32 first so the division below is a float operation.
            video = video.astype(np.float32)
            if self.normalize:
                video = video / 255.0
        elif self.normalize:
            # Non-uint8 input is assumed to be in [0, 1] already; enforce the range.
            video = np.clip(video, 0.0, 1.0)

        # Rearrange to (frames, 3, H, W) and materialize the result as a C-contiguous
        # array. torch.from_numpy rejects arrays with negative strides (which a
        # flipped input view has when no copy was made above), and a bare transposed
        # view would otherwise yield a non-contiguous tensor.
        video = np.ascontiguousarray(np.transpose(video, (0, 3, 1, 2)))
        tensor = torch.from_numpy(video).to(self.dtype)

        # Optional mean/std normalization (per channel)
        if self.mean is not None:
            tensor = tensor - self._channel_stat(self.mean, tensor)
        if self.std is not None:
            tensor = tensor / self._channel_stat(self.std, tensor)

        return tensor

    def _channel_stat(self, value, like):
        """Return ``value`` as a tensor that broadcasts over (frames, 3, H, W)."""
        stat = torch.as_tensor(value, dtype=self.dtype, device=like.device)
        if stat.ndim == 1:
            # One entry per RGB channel: align it with the channel axis.
            stat = stat.view(1, 3, 1, 1)
        return stat