Source code for lisbet.analysis

"""LISBET analysis.
Module for analyzing sequences of states (motifs) in a dataset.

This module provides functions to compute various statistics for sequences of states,
including bout statistics, transition probabilities, and F1 score matrices. The
functions are designed to work with sequences of states, where each state represents a
motif in the sequence.

Functions
---------
bout_stats(sequences, lengths, fps, groups=None)
    Compute statistics for sequences of states (motifs) in a dataset.

transition_probability(sequences, lengths, dummy_state_id=None, groups=None)
    Compute transition probabilities between states in a sequence.

f1_score_matrix(labels, predictions)
    Compute an F1 score matrix comparing predicted states to true labels.

"""

from itertools import groupby

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from tqdm.auto import trange


[docs] def bout_stats(sequences, lengths, fps, groups=None): """ Compute statistics for sequences of states (motifs) in a dataset. This function calculates the mean bout duration and event rate for each motif in the sequence. Parameters ---------- sequences : list or array-like The sequence of states, where each state represents a motif. lengths : list or array-like The length (duration) of each sequence. Must be the same length as `sequences`. fps : int or float The frame rate at which the sequences were recorded. Used to convert bout durations and event rates to seconds and minutes respectively. groups : list or array-like, optional A list of group labels corresponding to each sequence. If None, all sequences are assumed to belong to a default group. Default is None. Returns ------- pandas.DataFrame A DataFrame with the following columns: "Motif ID", "Group label", "Mean bout duration (s)", "Rate (events / min)". Each row corresponds to a unique motif in the sequences, and the DataFrame is grouped by the group label if provided. """ analysis_results = [] for i, seq_duration in enumerate(lengths): # Select sequence data start = sum(lengths[:i]) stop = start + seq_duration seq_pred = sequences[start:stop] # Set group label, if available group_label = groups[i] if groups is not None else "default" # Compute sequence of states events = pd.DataFrame( [(k, sum(1 for i in g)) for k, g in groupby(seq_pred)], columns=["motif_id", "bout_duration"], ) # Compute statistics events_stats = events.groupby("motif_id").agg(["mean", "std", "count", "sum"]) for motif_id in events_stats.index: analysis_results.append( { "Motif ID": motif_id, "Group label": group_label, "Mean bout duration (s)": ( events_stats.loc[motif_id, "bout_duration"]["mean"] / fps ), "Rate (events / min)": events_stats.loc[motif_id, "bout_duration"][ "count" ] / (events_stats["bout_duration"]["sum"].sum() / fps / 60), } ) analysis_results = pd.DataFrame( analysis_results, columns=[ "Motif ID", "Group label", "Mean bout duration (s)", "Rate (events / min)", ], ) return analysis_results
[docs] def transition_probability(sequences, lengths, dummy_state_id=None, groups=None): """ Compute transition probabilities between states in a sequence. This function calculates the probability of transitioning from one state to another in a sequence of states, optionally ignoring a specified dummy state. Parameters ---------- sequences : list or array-like The sequence of states, where each state represents a motif. lengths : list or array-like The length (duration) of each sequence. Must be the same length as `sequences`. dummy_state_id : int, optional The state ID to be ignored in the analysis (e.g., a dummy state). If None, no state is ignored. Default is None. groups : list or array-like, optional A list of group labels corresponding to each sequence. If None, all sequences are assumed to belong to a default group. Default is None. Returns ------- dict A dictionary where each key is a group label and each value is a 2D numpy array representing the transition probability matrix for that group. The matrix has dimensions (number of states) x (number of states). """ # Find number of states assert np.min(sequences) == 0 num_states = int(np.max(sequences)) + 1 if dummy_state_id is not None: num_states = num_states - 1 # Count transitions if groups is None: occurrences = {"default": np.zeros((num_states, num_states))} groups = ["default"] * len(lengths) else: occurrences = {k: np.zeros((num_states, num_states)) for k in np.unique(groups)} for j, seq_duration in enumerate(lengths): # Select sequence data start = sum(lengths[:j]) stop = start + seq_duration seq_pred = sequences[start:stop] group = groups[j] # Compute sequence of states, ignoring the dummy state events = [ (k, sum(1 for i in g)) for k, g in groupby(seq_pred) if k != dummy_state_id ] for i in range(len(events) - 1): src = events[i][0] dst = events[i + 1][0] # Skip self-events, introduced by ignoring the dummy state if src != dst: occurrences[group][src][dst] += 1 # Make probability trans_prob = { group: occurrences[group] / np.sum(occurrences[group], axis=1)[:, np.newaxis] for group in occurrences } return trans_prob
[docs] def f1_score_matrix(labels, predictions): """ Compute an F1 score matrix comparing predicted states to true labels. This function calculates the F1 score for each pair of predicted and true states, resulting in a matrix of F1 scores where each row corresponds to a predicted state and each column corresponds to a true state. Parameters ---------- labels : list or array-like The true state labels. predictions : list or array-like The predicted state labels. Returns ------- numpy.ndarray A 2D numpy array where each element [i, j] represents the F1 score for predicting state i as state j. """ n_states = np.max(predictions) + 1 n_classes = np.max(labels) + 1 f1_matrix = [] for s in trange(n_states, desc="Computing F1 score"): bin_pred = np.array(predictions == s, dtype=int) score = [] for lbl in range(n_classes): bin_lab = np.array(labels == lbl, dtype=int) score.append(f1_score(bin_lab, bin_pred, zero_division=0.0)) f1_matrix.append(score) f1_matrix = np.array(f1_matrix) return f1_matrix