# Source code for neuroconv.tools.spikeinterface.spikeinterfacerecordingdatachunkiterator

from typing import Iterable, Optional

import numpy as np
from spikeinterface import BaseRecording
from tqdm import tqdm

from neuroconv.tools.hdmf import GenericDataChunkIterator


class SpikeInterfaceRecordingDataChunkIterator(GenericDataChunkIterator):
    """DataChunkIterator specifically for use on RecordingExtractor objects."""

    def __init__(
        self,
        recording: BaseRecording,
        segment_index: int = 0,
        return_scaled: bool = False,
        buffer_gb: Optional[float] = None,
        buffer_shape: Optional[tuple] = None,
        chunk_mb: Optional[float] = None,
        chunk_shape: Optional[tuple] = None,
        display_progress: bool = False,
        progress_bar_class: Optional[type[tqdm]] = None,
        progress_bar_options: Optional[dict] = None,
    ):
        """
        Initialize an Iterable object which returns DataChunks with data and their selections on each iteration.

        Parameters
        ----------
        recording : SpikeInterfaceRecording
            The SpikeInterfaceRecording object (RecordingExtractor or BaseRecording) which handles the data access.
        segment_index : int, optional
            The recording segment to iterate on.
            Defaults to 0.
        return_scaled : bool, optional
            Whether to return the trace data in scaled units (uV, if True) or in the raw data type (if False).
            Defaults to False.
        buffer_gb : float, optional
            The upper bound on size in gigabytes (GB) of each selection from the iteration.
            The buffer_shape will be set implicitly by this argument.
            Cannot be set if `buffer_shape` is also specified.
            The default is 1GB.
        buffer_shape : tuple, optional
            Manual specification of buffer shape to return on each iteration.
            Must be a multiple of chunk_shape along each axis.
            Cannot be set if `buffer_gb` is also specified.
            The default is None.
        chunk_mb : float, optional
            The upper bound on size in megabytes (MB) of the internal chunk for the HDF5 dataset.
            The chunk_shape will be set implicitly by this argument.
            Cannot be set if `chunk_shape` is also specified.
            The default is 10MB, as recommended by the HDF5 group.
            For more details, search the hdf5 documentation for "Improving IO Performance Compressed Datasets".
        chunk_shape : tuple, optional
            Manual specification of the internal chunk shape for the HDF5 dataset.
            Cannot be set if `chunk_mb` is also specified.
            The default is None.
        display_progress : bool, optional
            Display a progress bar with iteration rate and estimated completion time.
        progress_bar_class : type[tqdm], optional
            The progress bar class (not an instance) to use.
            Defaults to tqdm.tqdm if the TQDM package is installed.
        progress_bar_options : dict, optional
            Dictionary of keyword arguments to be passed directly to tqdm.
            See https://github.com/tqdm/tqdm#parameters for options.
        """
        # These attributes are read by the `_get_*` hooks below.
        # NOTE(review): the parent initializer presumably invokes those hooks, so keep these
        # assignments ahead of `super().__init__` - confirm against GenericDataChunkIterator.
        self.recording = recording
        self.segment_index = segment_index
        self.return_scaled = return_scaled
        self.channel_ids = recording.get_channel_ids()
        super().__init__(
            buffer_gb=buffer_gb,
            buffer_shape=buffer_shape,
            chunk_mb=chunk_mb,
            chunk_shape=chunk_shape,
            display_progress=display_progress,
            progress_bar_class=progress_bar_class,
            progress_bar_options=progress_bar_options,
        )

    def _get_default_chunk_shape(self, chunk_mb: float = 10.0) -> tuple[int, int]:
        """
        Compute the default (frames, channels) chunk shape for the selected segment.

        Parameters
        ----------
        chunk_mb : float, optional
            The upper bound on size in megabytes (MB) of a single chunk.
            Must be greater than zero.

        Returns
        -------
        tuple[int, int]
            The chunk shape produced by `get_electrical_series_chunk_shape` for this recording.
        """
        assert chunk_mb > 0, f"chunk_mb ({chunk_mb}) must be greater than zero!"

        number_of_channels = self.recording.get_num_channels()
        # Use the same accessor as `_get_maxshape` so both hooks agree on the length of the time axis.
        number_of_frames = self.recording.get_num_samples(segment_index=self.segment_index)
        dtype = self.recording.get_dtype()

        chunk_shape = get_electrical_series_chunk_shape(
            number_of_channels=number_of_channels, number_of_frames=number_of_frames, dtype=dtype, chunk_mb=chunk_mb
        )

        return chunk_shape

    def _get_data(self, selection: tuple[slice, slice]) -> Iterable:
        """
        Fetch the traces covered by `selection` from the recording.

        Parameters
        ----------
        selection : tuple[slice, slice]
            `selection[0]` bounds the frames (its start/stop become `start_frame`/`end_frame`) and
            `selection[1]` picks the channels out of `self.channel_ids`.

        Returns
        -------
        Iterable
            Whatever `recording.get_traces` returns for the requested frames and channels.
        """
        return self.recording.get_traces(
            segment_index=self.segment_index,
            channel_ids=self.channel_ids[selection[1]],
            start_frame=selection[0].start,
            end_frame=selection[0].stop,
            return_scaled=self.return_scaled,
        )

    def _get_dtype(self):
        """Return the data type reported by the recording."""
        return self.recording.get_dtype()

    def _get_maxshape(self):
        """Return the full (number of samples, number of channels) shape of the selected segment."""
        return (self.recording.get_num_samples(segment_index=self.segment_index), self.recording.get_num_channels())


def get_electrical_series_chunk_shape(
    number_of_channels: int, number_of_frames: int, dtype: np.dtype, chunk_mb: float = 10.0
) -> tuple[int, int]:
    """
    Estimate good chunk shape for an ElectricalSeries dataset.

    This function gives good estimates for cloud access patterns.

    Parameters
    ----------
    number_of_channels : int
        The number of channels in the ElectricalSeries dataset.
    number_of_frames : int
        The number of frames in the ElectricalSeries dataset.
    dtype : np.dtype
        The data type of the ElectricalSeries dataset.
        Anything accepted by `np.dtype(...)` (e.g. "int16") is also allowed.
    chunk_mb : float, optional
        The upper bound on size in megabytes (MB) of the internal chunk for the HDF5 dataset.
        The chunk_shape will be set implicitly by this argument.
        Must be greater than zero. Defaults to 10.0.

    Returns
    -------
    tuple[int, int]
        The chunk shape for the ElectricalSeries dataset as (frames, channels), both plain Python ints.

    Raises
    ------
    AssertionError
        If `chunk_mb` is not greater than zero.
    """
    assert chunk_mb > 0, f"chunk_mb ({chunk_mb}) must be greater than zero!"

    # We use 64 channels as that gives enough time for common sampling rates when chunk_mb == 10.0
    # See https://github.com/flatironinstitute/neurosift/issues/52#issuecomment-1671405249
    chunk_channels = min(64, number_of_channels)

    # np.dtype(...) returns an equivalent dtype for dtype instances and also accepts specifiers such as "int16"
    size_of_chunk_channels_bytes = chunk_channels * np.dtype(dtype).itemsize
    total_chunk_space_bytes = chunk_mb * 1e6

    # We allocate as many frames as possible with the remaining space of the chunk
    # (this is a float because total_chunk_space_bytes is a float)
    chunk_frames = total_chunk_space_bytes // size_of_chunk_channels_bytes

    # We clip by the number of frames if the samples are too small
    chunk_frames = min(chunk_frames, number_of_frames)

    # Cast explicitly: without it the frame count can leak out as a float (e.g. 78125.0),
    # which is not a valid dataset chunk dimension and contradicts the annotated return type
    return (int(chunk_frames), int(chunk_channels))