csi_images.csi_events

Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.

The Event class holds the position of the event in the frame, which can be converted to scanner or slide coordinates. See the csi_utils.csi_scans documentation page for more information on the coordinate systems.
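
As a quick orientation, here is a minimal usage sketch. It assumes `scan` (a csi_scans.Scan) and `tile` (a csi_tiles.Tile) for a supported scanner already exist; constructing those objects is covered by the csi_scans and csi_tiles pages, so they are not shown here.

    from csi_images.csi_events import Event, EventArray

    # Assumes `scan` and `tile` were created elsewhere (e.g. an Axioscan or BZScanner scan)
    event = Event(scan, tile, x=512, y=384)   # pixel position within the tile
    print(event.get_scan_position())          # (x_um, y_um) in the scanner frame
    print(event.get_slide_position())         # (x_um, y_um) in the slide frame

    # Lists of events can be combined for DataFrame-style analysis
    array = EventArray.from_events([event])
    df = array.to_dataframe()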

  1"""
  2Contains the Event class, which represents a single event in a scan.
  3The Event class optionally holds metadata and features. Lists of events with
  4similar metadata or features can be combined into DataFrames for analysis.
  5
  6The Event class holds the position of the event in the frame, which can be converted
  7to scanner or slide coordinates. See the
  8csi_utils.csi_scans documentation page for more information on the coordinate systems.
  9"""
 10
 11import os
 12import math
 13import typing
 14
 15import numpy as np
 16import pandas as pd
 17
 18import pyreadr
 19
 20from .csi_scans import Scan
 21from .csi_tiles import Tile
 22from .csi_frames import Frame
 23
 24
 25class Event:
 26    """
 27    A class that represents a single event in a scan, making it easy to evaluate
 28    singular events. Required metadata is exposed as attributes, and optional
 29    metadata and features are stored as DataFrames.
 30    """
 31
 32    SCAN_TO_SLIDE_TRANSFORM = {
 33        # Axioscan zero is in the top-right corner instead of top-left
 34        Scan.Type.AXIOSCAN7: np.array(
 35            [
 36                [1, 0, 75000],
 37                [0, 1, 0],
 38                [0, 0, 1],
 39            ]
 40        ),
 41        # BZScanner coordinates are a special kind of messed up:
 42        # - The slide is upside-down.
 43        # - The slide is oriented vertically, with the barcode at the bottom.
 44        # - Tiles are numbered from the top-right
 45        Scan.Type.BZSCANNER: np.array(
 46            [
 47                [0, -1, 75000],
 48                [-1, 0, 25000],
 49                [0, 0, 1],
 50            ]
 51        ),
 52    }
 53    """
 54    Homogeneous transformation matrices for converting between scanner and slide
 55    coordinates. The matrices are 3x3, with the final column representing the
 56    translation in micrometers (um). For more information, see 
 57    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 58    
 59    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 60    imperfections in slides and alignment in the scanners. Units are in micrometers.
 61    """
 62
 63    def __init__(
 64        self,
 65        scan: Scan,
 66        tile: Tile,
 67        x: int,
 68        y: int,
 69        size: int = 12,  # End-to-end size in pixels
 70        metadata: pd.Series = None,
 71        features: pd.Series = None,
 72    ):
 73        self.scan = scan
 74        self.tile = tile
 75        self.x = x
 76        self.y = y
 77        self.size = size
 78        self.metadata = metadata
 79        self.features = features
 80
 81    def __repr__(self) -> str:
 82        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
 83
 84    def __eq__(self, other) -> bool:
 85        return self.__repr__() == other.__repr__()
 86
 87    def __lt__(self, other):
 88        return self.__repr__() < other.__repr__()
 89
 90    def get_scan_position(self) -> tuple[float, float]:
 91        """
 92        Get the position of the event in the scanner's coordinate frame.
 93        :return: the scan position of the event in micrometers (um).
 94        """
 95        # Get overall pixel position
 96        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
 97        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
 98        # Convert to micrometers
 99        x_um = pixel_x * self.scan.pixel_size_um
100        y_um = pixel_y * self.scan.pixel_size_um
101        # Add the scan's origin in the scanner frame
102        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
103        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
104        return x_um, y_um
105
106    def get_slide_position(self) -> tuple[float, float]:
107        """
108        Get the slide position of the event in micrometers (um).
109        :return: the slide position of the event.
110        """
111        # Turn scan_position into a 3x1 vector
112        scan_position = self.get_scan_position()
113        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
114
115        # Multiply by the appropriate homogeneous matrix
116        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
117            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
118        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
119            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
120        else:
121            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
122        slide_position = np.matmul(transform, scan_position)
123        return float(slide_position[0][0]), float(slide_position[1][0])
124
125    def crop_images(
126        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
127    ) -> list[np.ndarray]:
128        """
129        Get the event crops from the frame images. This does not need to read anything
130        from file; it is very quick for cropping multiple events from
131        the same tile.
132        Use this if you're interested in many events.
133        :param images: the frame images.
134        :param crop_size: the square size of the image crop to get for this event.
135        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
136        :return: crop_size x crop_size crops of the event in the provided frames. If
137        the event is too close to the edge, the crop will be smaller and not centered.
138        """
139        # Convert a crop size in micrometers to pixels
140        if not in_pixels:
141            crop_size = round(crop_size / self.scan.pixel_size_um)
142        # Find the crop bounds
143        bounds = [
144            self.x - crop_size // 2,
145            self.y - crop_size // 2,
146            self.x + math.ceil(crop_size / 2),
147            self.y + math.ceil(crop_size / 2),
148        ]
149        # Determine how much the bounds violate the image size
150        displacements = [
151            max(0, -bounds[0]),
152            max(0, -bounds[1]),
153            max(0, bounds[2] - images[0].shape[1]),
154            max(0, bounds[3] - images[0].shape[0]),
155        ]
156        # Cap off the bounds
157        bounds = [
158            max(0, bounds[0]),
159            max(0, bounds[1]),
160            min(images[0].shape[1], bounds[2]),
161            min(images[0].shape[0], bounds[3]),
162        ]
163
164        # Crop the images
165        cropped_images = []
166        for image in images:
167            # Create a blank image of the right size
168            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
169
170            # Insert the cropped image into the blank image, leaving a black buffer
171            # around the edges if the crop would go beyond the original image bounds
172            cropped_image[
173                displacements[1] : crop_size - displacements[3],
174                displacements[0] : crop_size - displacements[2],
175            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
176            cropped_images.append(cropped_image)
177        return cropped_images
178
179    def extract_images(
180        self, crop_size: int = 100, in_pixels: bool = True
181    ) -> list[np.ndarray]:
182        """
183        Extract the images from the scan and tile, reading from the file. Called
184        "extract" because it must read and extract the images from file, which is slow.
185        Use this if you're interested in only a few events, as it is inefficient when
186        reading multiple events from the same tile.
187        :param crop_size: the square size of the image crop to get for this event.
188        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
189        :return: a list of cropped images from the scan in the order of the channels.
190        """
191        frames = Frame.get_frames(self.tile)
192        images = [frame.get_image() for frame in frames]
193        return self.crop_images(images, crop_size, in_pixels)
194
195    @classmethod
196    def extract_images_for_list(
197        cls,
198        events: list[typing.Self],
199        crop_size: int | list[int] = None,
200        in_pixels: bool = True,
201    ) -> list[list[np.ndarray]]:
202        """
203        Get the images for a list of events, ensuring that there is no wasteful reading
204        of the same tile multiple times. This function is more efficient than calling
205        extract_images for each event.
206        TODO: test this function
207        :param events: the events to extract images for.
208        :param crop_size: the square size of the image crop to get for this event.
209                          Defaults to four times the size of the event.
210        :param in_pixels: whether the crop size is in pixels or micrometers.
211                          Defaults to pixels, and is ignored if crop_size is None.
212        :return: a list of lists of cropped images for each event.
213        """
214        if len(events) == 0:
215            return []
216
217        # Populate a crop size if none provided
218        if crop_size is None:
219            crop_size = [4 * event.size for event in events]
220            in_pixels = True
221        # Propagate a constant crop size
222        elif isinstance(crop_size, int):
223            crop_size = [crop_size] * len(events)
224
225        # Sort the events by tile; use a shallow copy to avoid modifying the original
226        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
227
228        # Allocate the list to size
229        images = [None] * len(events)
230        last_tile = None
231        frame_images = None  # Holds large numpy arrays, so expensive to compare
232        # Iterate through in sorted order
233        for i in order:
234            if last_tile != events[i].tile:
235                # Gather the frame images, preserving them for the next event
236                frames = Frame.get_frames(events[i].tile)
237                frame_images = [frame.get_image() for frame in frames]
238
239                last_tile = events[i].tile
240            # Use the frame images to crop the event images
241            # Preserve the original order using order[i]
242            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
243        return images
244
245
246class EventArray:
247    """
248    A class that holds a large number of events' data, making it easy to analyze and
249    manipulate many events at once. A more separated version of the Event class.
250    """
251
252    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
253
254    def __init__(
255        self,
256        info: pd.DataFrame = None,
257        metadata: pd.DataFrame = None,
258        features: pd.DataFrame = None,
259    ):
260        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
261        if info is not None and (
262            not all(col in info.columns for col in self.INFO_COLUMNS)
263            or len(info.columns) != 6
264        ):
265            raise ValueError(
266                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
267            )
268        # All DataFrames must all have the same number of rows
269        if metadata is not None and (info is None or len(info) != len(metadata)):
270            raise ValueError(
271                "If EventArray.metadata is not None, it should match rows with .info"
272            )
273        if features is not None and (info is None or len(info) != len(features)):
274            raise ValueError(
275                "If EventArray.features is not None, it should match rows with .info"
276            )
277        self.info = info
278        self.metadata = metadata
279        self.features = features
280
281    def __len__(self) -> int:
282        # Convenience method to get the number of events
283        if self.info is None:
284            return 0
285        else:
286            return len(self.info)
287
288    def __eq__(self, other):
289        is_equal = True
290        # Parse all possibilities for info
291        if isinstance(self.info, pd.DataFrame):
292            if isinstance(other.info, pd.DataFrame):
293                is_equal = self.info.equals(other.info)
294                if not is_equal:
295                    return False
296            else:
297                return False
298        elif self.info is None:
299            if other.info is not None:
300                return False
301
302        # Parse all possibilities for metadata
303        if isinstance(self.metadata, pd.DataFrame):
304            if isinstance(other.metadata, pd.DataFrame):
305                is_equal = self.metadata.equals(other.metadata)
306                if not is_equal:
307                    return False
308            else:
309                return False
310        elif self.metadata is None:
311            if other.metadata is not None:
312                return False
313
314        # Parse all possibilities for features
315        if isinstance(self.features, pd.DataFrame):
316            if isinstance(other.features, pd.DataFrame):
317                is_equal = self.features.equals(other.features)
318                if not is_equal:
319                    return False
320            else:
321                return False
322        elif self.features is None:
323            if other.features is not None:
324                return False
325
326        return is_equal
327
328    def sort(self, by: str | list[str], ascending: bool = True) -> typing.Self:
329        """
330        Sort the EventArray by a column in the info, metadata, or features DataFrames.
331        :param by: name of the column to sort by.
332        :param ascending: whether to sort in ascending order.
333        :return: self, with rows reordered in place.
334        """
335        everything = pd.concat([self.info, self.metadata, self.features], axis=1)
336        order = everything.sort_values(by=by, ascending=ascending).index
337        self.info = self.info.loc[order].reset_index(drop=True)
338        if self.metadata is not None:
339            self.metadata = self.metadata.loc[order].reset_index(drop=True)
340        if self.features is not None:
341            self.features = self.features.loc[order].reset_index(drop=True)
342        return self
343
344    def add_metadata(self, new_metadata: pd.DataFrame) -> None:
345        """
346        Add metadata to the EventArray.
347        :param new_metadata: the metadata to add.
348        """
349        if self.metadata is None:
350            if len(self) != len(new_metadata):
351                raise ValueError("New metadata does not match length of existing info")
352            self.metadata = new_metadata
353        else:
354            # Add the new metadata columns to the existing metadata
355            self.metadata = pd.concat([self.metadata, new_metadata], axis=1)
356
357    def add_features(self, new_features: pd.DataFrame) -> None:
358        """
359        Add features to the EventArray.
360        :param new_features: the features to add.
361        """
362        if self.features is None:
363            if len(self) != len(new_features):
364                raise ValueError("New features do not match length of existing info")
365            self.features = new_features
366        else:
367            # Add the new feature columns to the existing features
368            self.features = pd.concat([self.features, new_features], axis=1)
369
370    @classmethod
371    def from_list(cls, events: list[typing.Self]) -> typing.Self:
372        """
373        Combine EventArrays in a list into a single EventArray.
374        :param events: the list of EventArrays to combine.
375        """
376        all_info = []
377        all_metadata = []
378        all_features = []
379        for event_array in events:
380            # Skip empty EventArrays
381            if event_array.info is not None:
382                all_info.append(event_array.info)
383            if event_array.metadata is not None:
384                all_metadata.append(event_array.metadata)
385            if event_array.features is not None:
386                all_features.append(event_array.features)
387        if len(all_info) == 0:
388            return EventArray()
389        else:
390            all_info = pd.concat(all_info, ignore_index=True)
391        if len(all_metadata) == 0:
392            all_metadata = None
393        else:
394            all_metadata = pd.concat(all_metadata, ignore_index=True)
395        if len(all_features) == 0:
396            all_features = None
397        else:
398            all_features = pd.concat(all_features, ignore_index=True)
399
400        return EventArray(all_info, all_metadata, all_features)
401
402    @classmethod
403    def from_events(cls, events: list[Event]) -> typing.Self:
404        """
405        Create an EventArray from a list of Event objects.
406        :param events: the list of events to convert.
407        """
408        # Return an empty array if we were passed nothing
409        if events is None or len(events) == 0:
410            return EventArray()
411        # Otherwise, grab the info
412        info = pd.DataFrame(
413            {
414                "slide_id": [event.scan.slide_id for event in events],
415                "tile": [event.tile.n for event in events],
416                "roi": [event.tile.n_roi for event in events],
417                "x": [event.x for event in events],
418                "y": [event.y for event in events],
419                "size": [event.size for event in events],
420            }
421        )
422        metadata_list = [event.metadata for event in events]
423        # Iterate through and ensure that all metadata is the same shape
424        for metadata in metadata_list:
425            if type(metadata) != type(metadata_list[0]):
426                raise ValueError("All metadata must be the same type.")
427            if metadata is not None and metadata.shape != metadata_list[0].shape:
428                raise ValueError("All metadata must be the same shape.")
429        if metadata_list[0] is None:
430            metadata = None
431        else:
432            metadata = pd.DataFrame(metadata_list)
433        features_list = [event.features for event in events]
434        # Iterate through and ensure that all features are the same shape
435        for features in features_list:
436            if type(features) != type(features_list[0]):
437                raise ValueError("All features must be the same type.")
438            if features is not None and features.shape != features_list[0].shape:
439                raise ValueError("All features must be the same shape.")
440        if features_list[0] is None:
441            features = None
442        else:
443            features = pd.DataFrame(features_list)
444        return EventArray(info=info, metadata=metadata, features=features)
445
446    def to_events(
447        self,
448        scans: list[Scan],
449        ignore_missing_scans=True,
450        ignore_metadata=False,
451        ignore_features=False,
452    ) -> list[Event]:
453        """
454        Get the events in the EventArray as a list of events.
455        :param scans: the scans that the events belong to. Pass an empty list if you
456                      don't care about scan metadata.
457        :param ignore_missing_scans: whether to create placeholder scans when a scan is missing.
458        :param ignore_metadata: whether to exclude metadata from the events.
459        :param ignore_features: whether to exclude features from the events.
460        :return: a list of Event objects.
461        """
462        events = []
463        for i in range(len(self.info)):
464            # Determine the associated scan
465            scan = None
466            for s in scans:
467                if s.slide_id == self.info["slide_id"][i]:
468                    scan = s
469                    break
470            if scan is None:
471                if ignore_missing_scans:
472                    # Create a placeholder scan if the scan is missing
473                    scan = Scan.make_placeholder(
474                        self.info["slide_id"][i],
475                        self.info["tile"][i],
476                        self.info["roi"][i],
477                    )
478                else:
479                    raise ValueError(
480                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
481                    )
482            # Add to the list
483            events.append(
484                Event(
485                    scan,
486                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
487                    self.info["x"][i],
488                    self.info["y"][i],
489                    size=self.info["size"][i],
490                    metadata=None if ignore_metadata or self.metadata is None else self.metadata.loc[i],
491                    features=None if ignore_features or self.features is None else self.features.loc[i],
492                )
493            )
494        return events
495
496    def to_dataframe(self) -> pd.DataFrame:
497        """
498        Convert all the data in the EventArray to a single DataFrame.
499        :return: a DataFrame with all the data in the EventArray.
500        """
501        # Make a copy of the info DataFrame and prepend "info_" to the column names
502        output = self.info.copy()
503        output.columns = [f"info_{col}" for col in output.columns]
504        # Combine with the metadata and prepend "metadata_" to the column names
505        if self.metadata is not None:
506            metadata = self.metadata.copy()
507            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
508            output = pd.concat([output, metadata], axis=1)
509        # Combine with the features and prepend "features_" to the column names
510        if self.features is not None:
511            features = self.features.copy()
512            features.columns = [f"features_{col}" for col in features.columns]
513            output = pd.concat([output, features], axis=1)
514        return output
515
516    @classmethod
517    def from_dataframe(cls, df) -> typing.Self:
518        """
519        Create an EventArray from a single DataFrame, as produced by to_dataframe().
520        :return: an EventArray populated with the data from the DataFrame.
521        """
522        # Split the columns into info, metadata, and features and strip prefix
523        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
524        info.columns = [col.replace("info_", "") for col in info.columns]
525        if info.size == 0:
526            info = None
527        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
528        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
529        if metadata.size == 0:
530            metadata = None
531        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
532        features.columns = [col.replace("features_", "") for col in features.columns]
533        if features.size == 0:
534            features = None
535        return cls(info=info, metadata=metadata, features=features)
536
537    def save_csv(self, output_path: str) -> bool:
538        """
539        Save the events to a CSV file, including metadata and features.
540        :param output_path: the file path to save the CSV to.
541        :return: True if the file exists after saving.
542        """
543        self.to_dataframe().to_csv(output_path, index=False)
544        return os.path.exists(output_path)
545
546    @classmethod
547    def load_csv(cls, input_path: str) -> typing.Self:
548        """
549        Load the events from a CSV file, including metadata and features.
550        :param input_path: the path to the CSV file.
551        :return: an EventArray loaded from the file.
552        """
553        # Load the CSV file
554        df = pd.read_csv(input_path)
555        return cls.from_dataframe(df)
556
557    def save_hdf5(self, output_path: str) -> bool:
558        """
559        Save the events to an HDF5 file, including metadata and features.
560        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
561        though these files are slightly harder to view in HDFView or similar.
562        :param output_path: the file path to save the HDF5 file to.
563        :return: True if the file exists after saving.
564        """
565        # Open the output_path as an HDF5 file
566        with pd.HDFStore(output_path) as store:
567            # Store the dataframes in the HDF5 file
568            if self.info is not None:
569                store.put("info", self.info, index=False)
570            if self.metadata is not None:
571                store.put("metadata", self.metadata, index=False)
572            if self.features is not None:
573                store.put("features", self.features, index=False)
574        return os.path.exists(output_path)
575
576    @classmethod
577    def load_hdf5(cls, input_path: str) -> typing.Self:
578        """
579        Load the events from an HDF5 file, including metadata and features.
580        :param input_path: the path to the HDF5 file.
581        :return: an EventArray loaded from the file.
582        """
583        # Open the input_path as an HDF5 file
584        with pd.HDFStore(input_path) as store:
585            # Load the dataframes from the HDF5 file
586            info = store.get("info") if "info" in store else None
587            metadata = store.get("metadata") if "metadata" in store else None
588            features = store.get("features") if "features" in store else None
589        return cls(info=info, metadata=metadata, features=features)
590
591    @classmethod
592    def load_ocular(
593        cls,
594        input_path: str,
595        event_type="cells",
596        cell_data_files=(
597            "rc-final1.rds",
598            "rc-final2.rds",
599            "rc-final3.rds",
600            "rc-final4.rds",
601            "ocular_interesting.rds",
602        ),
603        others_data_files=(
604            "others-final1.rds",
605            "others-final2.rds",
606            "others-final3.rds",
607            "others-final4.rds",
608        ),
609        atlas_data_files=(
610            "ocular_interesting.rds",
611            "ocular_not_interesting.rds",
612        ),
613        merge_event_data_with_stats=True,
614        filter_and_generate_morphs=True,
615        drop_common_events=True,
616        log=None,
617    ) -> typing.Self:
618        """
619        Load events from OCULAR output files (.rds), including metadata and features.
620        :param input_path: path to the OCULAR output directory or to a single .rds file.
621        :param event_type: the type of events to load; "cells" or "others".
622        :param cell_data_files: the .rds files to load when event_type is "cells".
623        :param others_data_files: the .rds files to load when event_type is "others".
624        :param atlas_data_files: atlas .rds files, which may have common events dropped.
625        :param merge_event_data_with_stats:
626        :param filter_and_generate_morphs:
627        :param drop_common_events: whether to drop events classified as common cells.
628        :param log: optional logger for progress and warning messages.
629        :return: an EventArray with the loaded events, metadata, and features.
630        """
631        # Check if the input path is a directory or a file
632        if os.path.isfile(input_path):
633            data_files = [os.path.basename(input_path)]
634            input_path = os.path.dirname(input_path)
635        elif event_type == "cells":
636            data_files = cell_data_files
637        elif event_type == "others":
638            data_files = others_data_files
639        else:
640            raise ValueError("Invalid event type.")
641
642        # Load the data from the OCULAR files
643        file_data = {}
644        for file in data_files:
645            file_path = os.path.join(input_path, file)
646            if not os.path.isfile(file_path):
647                if log is not None:
648                    log.warning(f"{file} not found in {input_path}")
649                continue
650            file_data[file] = pyreadr.read_r(file_path)
651            # Get the DataFrame associated with None (pyreadr dict quirk)
652            file_data[file] = file_data[file][None]
653            if len(file_data[file]) == 0:
654                # File gets dropped from the dict
655                file_data.pop(file)
656                if log is not None:
657                    log.warning(f"{file} has no cells")
658                continue
659
660            if log is not None:
661                log.debug(f"{file} has {len(file_data[file])} cells")
662
663            # Drop common cells if requested and in this file
664            if file in atlas_data_files and drop_common_events:
665                common_cell_indices = (
666                    file_data[file]["catalogue_classification"] == "common_cell"
667                )
668                if log is not None:
669                    log.debug(
670                        f"Dropping {int(pd.Series.sum(common_cell_indices))} "
671                        f"common cells from {file}"
672                    )
673                file_data[file] = file_data[file][common_cell_indices == False]
674
675            if len(file_data[file]) == 0:
676                # File gets dropped from the dict
677                file_data.pop(file)
678                if log is not None:
679                    log.warning(f"{file} has no cells after dropping common cells")
680                continue
681
682            # Extract frame_id and cell_id
683            # DAPI- events already have frame_id cell_id outside rowname
684            if event_type == "cells":
685                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
686                # get frame_id cell_id from rownames column and split into two columns
687                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
688                if len(split_res.columns) != 2 and log is not None:
689                    log.warning(
690                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
691                    )
692                # then assign it back to the dataframe
693                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
694            # reset indexes since they can cause NaN values in concat
695            file_data[file].reset_index(drop=True, inplace=True)
696
697        # Merge the data from all files
698        if len(file_data) == 0:
699            return EventArray()
700        elif len(file_data) == 1:
701            data = [file_data[file] for file in file_data.keys()][0]
702        else:
703            data = pd.concat(file_data.values())
704
705        if log is not None:
706            log.debug(f"Gathered a total of {len(data)} events")
707
708        # Others is missing the "slide_id". Insert it right before "frame_id" column
709        if event_type == "others" and "slide_id" not in data.columns:
710            if os.path.basename(input_path) == "ocular":
711                slide_id = os.path.basename(os.path.dirname(input_path))
712            else:
713                slide_id = "UNKNOWN"
714            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
715
716        # Sort according to ascending cell_id to keep the original, which is in manual_df
717        data = data.sort_values(by=["cell_id"], ascending=True)
718        # Filter out duplicates by x & y
719        data = data.assign(
720            unique_id=data["slide_id"]
721            + "_"
722            + data["frame_id"].astype(str)
723            + "_"
724            + data["cellx"].astype(int).astype(str)
725            + "_"
726            + data["celly"].astype(int).astype(str)
727        )
728        data = data.drop_duplicates(subset=["unique_id"], keep="first", inplace=False)
729        # Filter out duplicates by cell_id
730        data = data.assign(
731            unique_id=data["slide_id"]
732            + "_"
733            + data["frame_id"].astype(str)
734            + "_"
735            + data["cell_id"].astype(str)
736        )
737        data.reset_index(drop=True, inplace=True)
738        # All columns up to "slide_id" are features; drop the "slide_id"
739        features = data.loc[:, :"slide_id"].iloc[:, :-1]
740        data = data.loc[:, "slide_id":]
741        # Grab the info columns
742        info = data[["slide_id", "frame_id", "cellx", "celly"]]
743        info.columns = ["slide_id", "tile", "x", "y"]
744        info = info.assign(
745            roi=0,  # OCULAR only works on 1 ROI, as far as known
746            size=25,  # Static, for later montaging
747        )
748        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
749        # Metadata has duplicate columns for later convenience
750        metadata = data
751        return EventArray(info, metadata, features)
752
753    def save_ocular(self, output_path: str, event_type: str = "cells") -> bool:
754        """
755        Save the events to an OCULAR file. Relies on the dataframe originating
756        from an OCULAR file (same columns; duplicate metadata/info).
757        :param output_path: path to the output directory.
758        :return:
759        """
760        if event_type == "cells":
761            file_stub = "rc-final"
762        elif event_type == "others":
763            file_stub = "others-final"
764        else:
765            raise ValueError("Invalid event type. Must be cells or others.")
766
767        # Check for the "ocular_interesting" column
768        if event_type == "cells" and "ocular_interesting" in self.metadata.columns:
769            interesting = self.metadata["ocular_interesting"]
770            # Split the metadata into interesting and regular
771            # Interesting will only have dropped columns, with no internal changes
772            interesting = pd.concat(
773                [self.features[interesting], self.metadata[interesting]], axis=1
774            ).reset_index(drop=True)
775            # Data will get some columns changed, so copy it
776            data = (
777                pd.concat(
778                    [self.features[~interesting], self.metadata[~interesting]], axis=1
779                )
780                .copy(deep=True)
781                .reset_index(drop=True)
782                .drop(columns=["ocular_interesting"])
783            )
784
785            # Drop particular columns for "interesting"
786            interesting = interesting.drop(
787                [
788                    "clust",
789                    "hcpc",
790                    "frame_id",
791                    "cell_id",
792                    "unique_id",
793                    "ocular_interesting",
794                ],
795                axis=1,
796            )
797            # Save both .csv and .rds
798            interesting.to_csv(
799                os.path.join(output_path, "ocular_interesting.csv"), index=False
800            )
801            pyreadr.write_rds(
802                os.path.join(output_path, "ocular_interesting.rds"), interesting
803            )
804        else:
805            # Get all data, copying it
806            data = (
807                pd.concat([self.features, self.metadata], axis=1)
808                .copy(deep=True)
809                .reset_index(drop=True)
810            )
811
812        # Split based on cluster number to conform to *-final[1-4].rds
813        n_clusters = max(data["clust"]) + 1
814        split_idx = [round(i * n_clusters / 4) for i in range(5)]
815        for i in range(4):
816            subset = (split_idx[i] <= data["clust"]) & (
817                data["clust"] < split_idx[i + 1]
818            )
819            subset = data[subset].reset_index(drop=True)
820            subset["hcpc"] = i + 1
821            pyreadr.write_rds(
822                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
823            )
824
825        # Create new example cell strings
826        data["example_cell_id"] = (
827            data["slide_id"]
828            + " "
829            + data["frame_id"].astype(str)
830            + " "
831            + data["cell_id"].astype(str)
832            + " "
833            + data["cellx"].astype(int).astype(str)
834            + " "
835            + data["celly"].astype(int).astype(str)
836        )
837        # Find averagable data columns
838        if "cellcluster_id" in data.columns:
839            avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist()
840        else:
841            avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist()
842        # Group by cluster and average
843        data = data.groupby("clust").agg(
844            **{col: (col, "mean") for col in avg_cols},
845            count=("clust", "size"),  # count rows in each cluster
846            example_cells=("example_cell_id", lambda x: ",".join(x)),
847            hcpc=("hcpc", lambda x: x.iloc[0]),
848        )
849        data = data.reset_index()  # Do NOT drop, index is "clust"
850        # Create new columns
851        metadata = pd.DataFrame(
852            {
853                "count": data["count"],
854                "example_cells": data["example_cells"],
855                "clust": data["clust"].astype(int),
856                "hcpc": data["hcpc"].astype(int),
857                "id": data["clust"].astype(int).astype(str),
858                "cccluster": "0",  # Dummy value
859                "ccdistance": 0.0,  # Dummy value
860                "rownum": list(range(len(data))),
861                "framegroup": 0,  # Dummy value
862            }
863        )
864        data = pd.concat([data.loc[:, avg_cols], metadata], axis=1)
865        # Save the data
866        data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
867        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
class Event:
 26class Event:
 27    """
 28    A class that represents a single event in a scan, making it easy to evaluate
 29    singular events. Required metadata is exposed as attributes, and optional
 30    metadata and features are stored as DataFrames.
 31    """
 32
 33    SCAN_TO_SLIDE_TRANSFORM = {
 34        # Axioscan zero is in the top-right corner instead of top-left
 35        Scan.Type.AXIOSCAN7: np.array(
 36            [
 37                [1, 0, 75000],
 38                [0, 1, 0],
 39                [0, 0, 1],
 40            ]
 41        ),
 42        # BZScanner coordinates are a special kind of messed up:
 43        # - The slide is upside-down.
 44        # - The slide is oriented vertically, with the barcode at the bottom.
 45        # - Tiles are numbered from the top-right
 46        Scan.Type.BZSCANNER: np.array(
 47            [
 48                [0, -1, 75000],
 49                [-1, 0, 25000],
 50                [0, 0, 1],
 51            ]
 52        ),
 53    }
 54    """
 55    Homogeneous transformation matrices for converting between scanner and slide
 56    coordinates. The matrices are 3x3, with the final column representing the
 57    translation in micrometers (um). For more information, see 
 58    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 59    
 60    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 61    imperfections in slides and alignment in the scanners. Units are in micrometers.
 62    """
 63
 64    def __init__(
 65        self,
 66        scan: Scan,
 67        tile: Tile,
 68        x: int,
 69        y: int,
 70        size: int = 12,  # End-to-end size in pixels
 71        metadata: pd.Series = None,
 72        features: pd.Series = None,
 73    ):
 74        self.scan = scan
 75        self.tile = tile
 76        self.x = x
 77        self.y = y
 78        self.size = size
 79        self.metadata = metadata
 80        self.features = features
 81
 82    def __repr__(self) -> str:
 83        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
 84
 85    def __eq__(self, other) -> bool:
 86        return self.__repr__() == other.__repr__()
 87
 88    def __lt__(self, other):
 89        return self.__repr__() < other.__repr__()
 90
 91    def get_scan_position(self) -> tuple[float, float]:
 92        """
 93        Get the position of the event in the scanner's coordinate frame.
 94        :return: the scan position of the event in micrometers (um).
 95        """
 96        # Get overall pixel position
 97        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
 98        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
 99        # Convert to micrometers
100        x_um = pixel_x * self.scan.pixel_size_um
101        y_um = pixel_y * self.scan.pixel_size_um
102        # Add the scan's origin in the scanner frame
103        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
104        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
105        return x_um, y_um
106
107    def get_slide_position(self) -> tuple[float, float]:
108        """
109        Get the slide position of the event in micrometers (um).
110        :return: the slide position of the event.
111        """
112        # Turn scan_position into a 3x1 vector
113        scan_position = self.get_scan_position()
114        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
115
116        # Multiply by the appropriate homogeneous matrix
117        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
118            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
119        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
120            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
121        else:
122            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
123        slide_position = np.matmul(transform, scan_position)
124        return float(slide_position[0][0]), float(slide_position[1][0])
125
126    def crop_images(
127        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
128    ) -> list[np.ndarray]:
129        """
130        Get the event crops from the frame images. This does not need to read anything
131        from file; it is very quick for cropping multiple events from
132        the same tile.
133        Use this if you're interested in many events.
134        :param images: the frame images.
135        :param crop_size: the square size of the image crop to get for this event.
136        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
137        :return: crop_size x crop_size crops of the event in the provided frames. If
138        the event is too close to the edge, the crop will be smaller and not centered.
139        """
140        # Convert a crop size in micrometers to pixels
141        if not in_pixels:
142            crop_size = round(crop_size / self.scan.pixel_size_um)
143        # Find the crop bounds
144        bounds = [
145            self.x - crop_size // 2,
146            self.y - crop_size // 2,
147            self.x + math.ceil(crop_size / 2),
148            self.y + math.ceil(crop_size / 2),
149        ]
150        # Determine how much the bounds violate the image size
151        displacements = [
152            max(0, -bounds[0]),
153            max(0, -bounds[1]),
154            max(0, bounds[2] - images[0].shape[1]),
155            max(0, bounds[3] - images[0].shape[0]),
156        ]
157        # Cap off the bounds
158        bounds = [
159            max(0, bounds[0]),
160            max(0, bounds[1]),
161            min(images[0].shape[1], bounds[2]),
162            min(images[0].shape[0], bounds[3]),
163        ]
164
165        # Crop the images
166        cropped_images = []
167        for image in images:
168            # Create a blank image of the right size
169            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
170
171            # Insert the cropped image into the blank image, leaving a black buffer
172            # around the edges if the crop would go beyond the original image bounds
173            cropped_image[
174                displacements[1] : crop_size - displacements[3],
175                displacements[0] : crop_size - displacements[2],
176            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
177            cropped_images.append(cropped_image)
178        return cropped_images
179
180    def extract_images(
181        self, crop_size: int = 100, in_pixels: bool = True
182    ) -> list[np.ndarray]:
183        """
184        Extract the images from the scan and tile, reading from the file. Called
185        "extract" because it must read and extract the images from file, which is slow.
186        Use this if you're interested in only a few events, as it is inefficient when
187        reading multiple events from the same tile.
188        :param crop_size: the square size of the image crop to get for this event.
189        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
190        :return: a list of cropped images from the scan in the order of the channels.
191        """
192        frames = Frame.get_frames(self.tile)
193        images = [frame.get_image() for frame in frames]
194        return self.crop_images(images, crop_size, in_pixels)
195
196    @classmethod
197    def extract_images_for_list(
198        cls,
199        events: list[typing.Self],
200        crop_size: int | list[int] = None,
201        in_pixels: bool = True,
202    ) -> list[list[np.ndarray]]:
203        """
204        Get the images for a list of events, ensuring that there is no wasteful reading
205        of the same tile multiple times. This function is more efficient than calling
206        extract_images for each event.
207        TODO: test this function
208        :param events: the events to extract images for.
209        :param crop_size: the square size of the image crop to get for this event.
210                          Defaults to four times the size of the event.
211        :param in_pixels: whether the crop size is in pixels or micrometers.
212                          Defaults to pixels, and is ignored if crop_size is None.
213        :return: a list of lists of cropped images for each event.
214        """
215        if len(events) == 0:
216            return []
217
218        # Populate a crop size if none provided
219        if crop_size is None:
220            crop_size = [4 * event.size for event in events]
221            in_pixels = True
222        # Propagate a constant crop size
223        elif isinstance(crop_size, int):
224            crop_size = [crop_size] * len(events)
225
226        # Sort the events by tile; use a shallow copy to avoid modifying the original
227        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
228
229        # Allocate the list to size
230        images = [None] * len(events)
231        last_tile = None
232        frame_images = None  # Holds large numpy arrays, so expensive to compare
233        # Iterate through in sorted order
234        for i in order:
235            if last_tile != events[i].tile:
236                # Gather the frame images, preserving them for the next event
237                frames = Frame.get_frames(events[i].tile)
238                frame_images = [frame.get_image() for frame in frames]
239
240                last_tile = events[i].tile
241            # Use the frame images to crop the event images
242            # Preserve the original order using order[i]
243            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
244        return images

A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.

Event( scan: csi_images.csi_scans.Scan, tile: csi_images.csi_tiles.Tile, x: int, y: int, size: int = 12, metadata: pandas.core.series.Series = None, features: pandas.core.series.Series = None)
64    def __init__(
65        self,
66        scan: Scan,
67        tile: Tile,
68        x: int,
69        y: int,
70        size: int = 12,  # End-to-end size in pixels
71        metadata: pd.Series = None,
72        features: pd.Series = None,
73    ):
74        self.scan = scan
75        self.tile = tile
76        self.x = x
77        self.y = y
78        self.size = size
79        self.metadata = metadata
80        self.features = features
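
Optional per-event metadata and features are plain pandas Series. A small sketch follows; the column names are arbitrary examples, and `scan` and `tile` are assumed to exist as in the module-level sketch near the top of this page.

    import pandas as pd

    metadata = pd.Series({"classification": "unknown", "reviewed": False})
    features = pd.Series({"mean_intensity": 123.4, "area_px": 87.0})
    event = Event(scan, tile, x=100, y=200, size=15,
                  metadata=metadata, features=features)

    # Events with matching metadata/feature fields combine cleanly into an EventArray
    array = EventArray.from_events([event])
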
SCAN_TO_SLIDE_TRANSFORM = {<Type.AXIOSCAN7: 'axioscan7'>: array([[ 1, 0, 75000], [ 0, 1, 0], [ 0, 0, 1]]), <Type.BZSCANNER: 'bzscanner'>: array([[ 0, -1, 75000], [ -1, 0, 25000], [ 0, 0, 1]])}

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.

Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
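
To illustrate how these matrices are applied (mirroring get_slide_position below), the scan position is treated as a homogeneous column vector and left-multiplied by the matrix; using the BZScanner entry above with example coordinates:

    import numpy as np

    transform = np.array([
        [0, -1, 75000],
        [-1, 0, 25000],
        [0, 0, 1],
    ])
    scan_x_um, scan_y_um = 10000.0, 5000.0  # example scan coordinates in um
    scan_vec = np.array([[scan_x_um], [scan_y_um], [1]])
    slide_vec = transform @ scan_vec
    # slide position: x = -5000 + 75000 = 70000 um, y = -10000 + 25000 = 15000 um
    print(float(slide_vec[0][0]), float(slide_vec[1][0]))
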

scan
tile
x
y
size
metadata
features
def get_scan_position(self) -> tuple[float, float]:
 91    def get_scan_position(self) -> tuple[float, float]:
 92        """
 93        Get the position of the event in the scanner's coordinate frame.
 94        :return: the scan position of the event in micrometers (um).
 95        """
 96        # Get overall pixel position
 97        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
 98        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
 99        # Convert to micrometers
100        x_um = pixel_x * self.scan.pixel_size_um
101        y_um = pixel_y * self.scan.pixel_size_um
102        # Add the scan's origin in the scanner frame
103        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
104        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
105        return x_um, y_um

Get the position of the event in the scanner's coordinate frame.

Returns

the scan position of the event in micrometers (um).
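
A worked numeric version of the computation in the source above; the tile size, pixel size, and ROI origin here are illustrative values, not real scanner parameters.

    # Illustrative scan parameters
    tile_width_px, tile_height_px = 2000, 2000
    pixel_size_um = 0.5
    origin_x_um, origin_y_um = 1000.0, 2000.0  # ROI origin in the scanner frame
    tile_x, tile_y = 3, 2                      # tile indices within the ROI
    x, y = 512, 384                            # event position within the tile, in pixels

    x_um = (x + tile_width_px * tile_x) * pixel_size_um + origin_x_um
    y_um = (y + tile_height_px * tile_y) * pixel_size_um + origin_y_um
    print(x_um, y_um)  # 4256.0 4192.0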

def get_slide_position(self) -> tuple[float, float]:
107    def get_slide_position(self) -> tuple[float, float]:
108        """
109        Get the slide position of the event in micrometers (um).
110        :return: the slide position of the event.
111        """
112        # Turn scan_position into a 3x1 vector
113        scan_position = self.get_scan_position()
114        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
115
116        # Multiply by the appropriate homogeneous matrix
117        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
118            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
119        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
120            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
121        else:
122            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
123        slide_position = np.matmul(transform, scan_position)
124        return float(slide_position[0][0]), float(slide_position[1][0])

Get the slide position of the event in micrometers (um).

Returns

the slide position of the event.

def crop_images( self, images: list[numpy.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
126    def crop_images(
127        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
128    ) -> list[np.ndarray]:
129        """
130        Get the event crops from the frame images. This does not need to read anything
131        from file; it is very quick for cropping multiple events from
132        the same tile.
133        Use this if you're interested in many events.
134        :param images: the frame images.
135        :param crop_size: the square size of the image crop to get for this event.
136        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
137        :return: crop_size x crop_size crops of the event in the provided frames. If
138        the event is too close to the edge, the crop will be smaller and not centered.
139        """
140        # Convert a crop size in micrometers to pixels
141        if not in_pixels:
142            crop_size = round(crop_size / self.scan.pixel_size_um)
143        # Find the crop bounds
144        bounds = [
145            self.x - crop_size // 2,
146            self.y - crop_size // 2,
147            self.x + math.ceil(crop_size / 2),
148            self.y + math.ceil(crop_size / 2),
149        ]
150        # Determine how much the bounds violate the image size
151        displacements = [
152            max(0, -bounds[0]),
153            max(0, -bounds[1]),
154            max(0, bounds[2] - images[0].shape[1]),
155            max(0, bounds[3] - images[0].shape[0]),
156        ]
157        # Cap off the bounds
158        bounds = [
159            max(0, bounds[0]),
160            max(0, bounds[1]),
161            min(images[0].shape[1], bounds[2]),
162            min(images[0].shape[0], bounds[3]),
163        ]
164
165        # Crop the images
166        cropped_images = []
167        for image in images:
168            # Create a blank image of the right size
169            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
170
171            # Insert the cropped image into the blank image, leaving a black buffer
172            # around the edges if the crop would go beyond the original image bounds
173            cropped_image[
174                displacements[1] : crop_size - displacements[3],
175                displacements[0] : crop_size - displacements[2],
176            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
177            cropped_images.append(cropped_image)
178        return cropped_images

Get the event crops from the frame images. This does not need to read anything from file; it is very quick for cropping multiple events from the same tile. Use this if you're interested in many events.

Parameters
  • images: the frame images.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop will be smaller and not centered.
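
A sketch of cropping several events from one tile while reading the tile's frame images only once; `tile`, `event_a`, and `event_b` are assumed to be existing Tile and Event objects.

    from csi_images.csi_frames import Frame

    # Read each channel's image for the tile once...
    frames = Frame.get_frames(tile)
    images = [frame.get_image() for frame in frames]

    # ...then crop as many events as needed from those arrays
    crops_a = event_a.crop_images(images, crop_size=100)                  # 100 x 100 px
    crops_b = event_b.crop_images(images, crop_size=50, in_pixels=False)  # 50 x 50 um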

def extract_images( self, crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
180    def extract_images(
181        self, crop_size: int = 100, in_pixels: bool = True
182    ) -> list[np.ndarray]:
183        """
184        Extract the images from the scan and tile, reading from the file. Called
185        "extract" because it must read and extract the images from file, which is slow.
186        Use this if you're interested in only a few events, as it is inefficient when
187        reading multiple events from the same tile.
188        :param crop_size: the square size of the image crop to get for this event.
189        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
190        :return: a list of cropped images from the scan in the order of the channels.
191        """
192        frames = Frame.get_frames(self.tile)
193        images = [frame.get_image() for frame in frames]
194        return self.crop_images(images, crop_size, in_pixels)

Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.

Parameters
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

a list of cropped images from the scan in the order of the channels.
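
For a one-off event, this wrapper reads the tile's frames and crops in a single call; a minimal sketch, assuming `event` is an existing Event.

    crops = event.extract_images(crop_size=100)  # one crop per channel, in channel order
    first_channel_crop = crops[0]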

@classmethod
def extract_images_for_list( cls, events: list[typing.Self], crop_size: int | list[int] = None, in_pixels: bool = True) -> list[list[numpy.ndarray]]:
196    @classmethod
197    def extract_images_for_list(
198        cls,
199        events: list[typing.Self],
200        crop_size: int | list[int] = None,
201        in_pixels: bool = True,
202    ) -> list[list[np.ndarray]]:
203        """
204        Get the images for a list of events, ensuring that there is no wasteful reading
205        of the same tile multiple times. This function is more efficient than calling
206        extract_event_images for each event.
207        TODO: test this function
208        :param events: the events to extract images for.
209        :param crop_size: the square size of the image crop to get for this event.
210                          Defaults to four times the size of the event.
211        :param in_pixels: whether the crop size is in pixels or micrometers.
212                          Defaults to pixels, and is ignored if crop_size is None.
213        :return: a list of lists of cropped images for each event.
214        """
215        if len(events) == 0:
216            return []
217
218        # Populate a crop size if none provided
219        if crop_size is None:
220            crop_size = [4 * event.size for event in events]
221            in_pixels = True
222        # Propagate a constant crop size
223        elif isinstance(crop_size, int):
224            crop_size = [crop_size] * len(events)
225
226        # Sort the events by tile; use a shallow copy to avoid modifying the original
227        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
228
229        # Allocate the list to size
230        images = [None] * len(events)
231        last_tile = None
232        frame_images = None  # Holds large numpy arrays, so expensive to compare
233        # Iterate through in sorted order
234        for i in order:
235            if last_tile != events[i].tile:
236                # Gather the frame images, preserving them for the next event
237                frames = Frame.get_frames(events[i].tile)
238                frame_images = [frame.get_image() for frame in frames]
239
240                last_tile = events[i].tile
241            # Use the frame images to crop the event images
242            # Preserve the original order using order[i]
243            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
244        return images

Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_event_images for each event. TODO: test this function

Parameters
  • events: the events to extract images for.
  • crop_size: the square size of the image crop to get for this event. Defaults to four times the size of the event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns

a list of lists of cropped images for each event.

class EventArray:
247class EventArray:
248    """
249    A class that holds a large number of events' data, making it easy to analyze and
250    manipulate many events at once. A more separated version of the Event class.
251    """
252
253    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
254
255    def __init__(
256        self,
257        info: pd.DataFrame = None,
258        metadata: pd.DataFrame = None,
259        features: pd.DataFrame = None,
260    ):
261        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
262        if info is not None and (
263            not all(col in info.columns for col in self.INFO_COLUMNS)
264            or len(info.columns) != 6
265        ):
266            raise ValueError(
267                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
268            )
269        # All DataFrames must all have the same number of rows
270        if metadata is not None and (info is None or len(info) != len(metadata)):
271            raise ValueError(
272                "If EventArray.metadata is not None, it should match rows with .info"
273            )
274        if features is not None and (info is None or len(info) != len(features)):
275            raise ValueError(
276                "If EventArray.features is not None, it should match rows with .info"
277            )
278        self.info = info
279        self.metadata = metadata
280        self.features = features
281
282    def __len__(self) -> int:
283        # Convenience method to get the number of events
284        if self.info is None:
285            return 0
286        else:
287            return len(self.info)
288
289    def __eq__(self, other):
290        is_equal = True
291        # Parse all possibilities for info
292        if isinstance(self.info, pd.DataFrame):
293            if isinstance(other.info, pd.DataFrame):
294                is_equal = self.info.equals(other.info)
295                if not is_equal:
296                    return False
297            else:
298                return False
299        elif self.info is None:
300            if other.info is not None:
301                return False
302
303        # Parse all possibilities for metadata
304        if isinstance(self.metadata, pd.DataFrame):
305            if isinstance(other.metadata, pd.DataFrame):
306                is_equal = self.metadata.equals(other.metadata)
307                if not is_equal:
308                    return False
309            else:
310                return False
311        elif self.metadata is None:
312            if other.metadata is not None:
313                return False
314
315        # Parse all possibilities for features
316        if isinstance(self.features, pd.DataFrame):
317            if isinstance(other.features, pd.DataFrame):
318                is_equal = self.features.equals(other.features)
319                if not is_equal:
320                    return False
321            else:
322                return False
323        elif self.features is None:
324            if other.features is not None:
325                return False
326
327        return is_equal
328
329    def sort(self, by: str | list[str], ascending: bool = True) -> typing.Self:
330        """
331        Sort the EventArray by a column in the info, metadata, or features DataFrames.
332        :param by: name of the column to sort by.
333        :param ascending: whether to sort in ascending order.
334        :return:
335        """
336        everything = pd.concat([self.info, self.metadata, self.features], axis=1)
337        order = everything.sort_values(by=by, ascending=ascending).index
338        self.info = self.info.loc[order].reset_index(drop=True)
339        if self.metadata is not None:
340            self.metadata = self.metadata.loc[order].reset_index(drop=True)
341        if self.features is not None:
342            self.features = self.features.loc[order].reset_index(drop=True)
343        return self
344
345    def add_metadata(self, new_metadata: pd.DataFrame) -> None:
346        """
347        Add metadata to the EventArray.
348        :param new_metadata: the metadata to add.
349        """
350        if self.metadata is None:
351            if len(self) != len(new_metadata):
352                raise ValueError("New metadata does not match length of existing info")
353            self.metadata = new_metadata
354        else:
355            # Add the new metadata columns to the existing metadata
356            self.metadata = pd.concat([self.metadata, new_metadata], axis=1)
357
358    def add_features(self, new_features: pd.DataFrame) -> None:
359        """
360        Add features to the EventArray.
361        :param new_features: the metadata to add.
362        """
363        if self.features is None:
364            if len(self) != len(new_features):
365                raise ValueError("New metadata does not match length of existing info")
366            self.features = new_features
367        else:
368            # Add the new metadata columns to the existing metadata
369            self.features = pd.concat([self.features, new_features], axis=1)
370
371    @classmethod
372    def from_list(cls, events: list[typing.Self]) -> typing.Self:
373        """
374        Combine EventArrays in a list into a single EventArray.
375        :param events: the new list of events.
376        """
377        all_info = []
378        all_metadata = []
379        all_features = []
380        for event_array in events:
381            # Skip empty EventArrays
382            if event_array.info is not None:
383                all_info.append(event_array.info)
384            if event_array.metadata is not None:
385                all_metadata.append(event_array.metadata)
386            if event_array.features is not None:
387                all_features.append(event_array.features)
388        if len(all_info) == 0:
389            return EventArray()
390        else:
391            all_info = pd.concat(all_info, ignore_index=True)
392        if len(all_metadata) == 0:
393            all_metadata = None
394        else:
395            all_metadata = pd.concat(all_metadata, ignore_index=True)
396        if len(all_features) == 0:
397            all_features = None
398        else:
399            all_features = pd.concat(all_features, ignore_index=True)
400
401        return EventArray(all_info, all_metadata, all_features)
402
403    @classmethod
404    def from_events(cls, events: list[Event]) -> typing.Self:
405        """
406        Set the events in the EventArray to a new list of events.
407        :param events: the new list of events.
408        """
409        # Return an empty array if we were passed nothing
410        if events is None or len(events) == 0:
411            return EventArray()
412        # Otherwise, grab the info
413        info = pd.DataFrame(
414            {
415                "slide_id": [event.scan.slide_id for event in events],
416                "tile": [event.tile.n for event in events],
417                "roi": [event.tile.n_roi for event in events],
418                "x": [event.x for event in events],
419                "y": [event.y for event in events],
420                "size": [event.size for event in events],
421            }
422        )
423        metadata_list = [event.metadata for event in events]
424        # Iterate through and ensure that all metadata is the same shape
425        for metadata in metadata_list:
426            if type(metadata) != type(metadata_list[0]):
427                raise ValueError("All metadata must be the same type.")
428            if metadata is not None and metadata.shape != metadata_list[0].shape:
429                raise ValueError("All metadata must be the same shape.")
430        if metadata_list[0] is None:
431            metadata = None
432        else:
433            metadata = pd.DataFrame(metadata_list)
434        features_list = [event.features for event in events]
435        # Iterate through and ensure that all features are the same shape
436        for features in features_list:
437            if type(features) != type(features_list[0]):
438                raise ValueError("All features must be the same type.")
439            if features is not None and features.shape != features_list[0].shape:
440                raise ValueError("All features must be the same shape.")
441        if features_list[0] is None:
442            features = None
443        else:
444            features = pd.DataFrame(features_list)
445        return EventArray(info=info, metadata=metadata, features=features)
446
447    def to_events(
448        self,
449        scans: list[Scan],
450        ignore_missing_scans=True,
451        ignore_metadata=False,
452        ignore_features=False,
453    ) -> list[Event]:
454        """
455        Get the events in the EventArray as a list of events.
456        :param scans: the scans that the events belong to. Pass an empty list if you
457                      don't care about scan metadata.
458        :param ignore_missing_scans: whether to create blank scans for events without scans.
459        :param ignore_metadata: whether to ignore metadata or not
460        :param ignore_features: whether to ignore features or not
461        :return:
462        """
463        events = []
464        for i in range(len(self.info)):
465            # Determine the associated scan
466            scan = None
467            for s in scans:
468                if s.slide_id == self.info["slide_id"][i]:
469                    scan = s
470                    break
471            if scan is None:
472                if ignore_missing_scans:
473                    # Create a placeholder scan if the scan is missing
474                    scan = Scan.make_placeholder(
475                        self.info["slide_id"][i],
476                        self.info["tile"][i],
477                        self.info["roi"][i],
478                    )
479                else:
480                    raise ValueError(
481                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
482                    )
483            # Add to the list
484            events.append(
485                Event(
486                    scan,
487                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
488                    self.info["x"][i],
489                    self.info["y"][i],
490                    size=self.info["size"][i],
491                    metadata=None if ignore_metadata else self.metadata.loc[i],
492                    features=None if ignore_features else self.features.loc[i],
493                )
494            )
495        return events
496
497    def to_dataframe(self) -> pd.DataFrame:
498        """
499        Convert all the data in the EventArray to a single DataFrame.
500        :return: a DataFrame with all the data in the EventArray.
501        """
502        # Make a copy of the info DataFrame and prepend "info_" to the column names
503        output = self.info.copy()
504        output.columns = [f"info_{col}" for col in output.columns]
505        # Combine with the metadata and prepend "metadata_" to the column names
506        if self.metadata is not None:
507            metadata = self.metadata.copy()
508            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
509            output = pd.concat([output, metadata], axis=1)
510        # Combine with the features and prepend "features_" to the column names
511        if self.features is not None:
512            features = self.features.copy()
513            features.columns = [f"features_{col}" for col in features.columns]
514            output = pd.concat([output, features], axis=1)
515        return output
516
517    @classmethod
518    def from_dataframe(cls, df) -> typing.Self:
519        """
520        From a single, special DataFrame, create an EventArray.
521        :return: a DataFrame with all the data in the EventArray.
522        """
523        # Split the columns into info, metadata, and features and strip prefix
524        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
525        info.columns = [col.replace("info_", "") for col in info.columns]
526        if info.size == 0:
527            info = None
528        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
529        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
530        if metadata.size == 0:
531            metadata = None
532        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
533        features.columns = [col.replace("features_", "") for col in features.columns]
534        if features.size == 0:
535            features = None
536        return cls(info=info, metadata=metadata, features=features)
537
538    def save_csv(self, output_path: str) -> bool:
539        """
540        Save the events to an CSV file, including metadata and features.
541        :param output_path:
542        :return:
543        """
544        self.to_dataframe().to_csv(output_path, index=False)
545        return os.path.exists(output_path)
546
547    @classmethod
548    def load_csv(cls, input_path: str) -> typing.Self:
549        """
550        Load the events from an CSV file, including metadata and features.
551        :param input_path:
552        :return:
553        """
554        # Load the CSV file
555        df = pd.read_csv(input_path)
556        return cls.from_dataframe(df)
557
558    def save_hdf5(self, output_path: str) -> bool:
559        """
560        Save the events to an HDF5 file, including metadata and features.
561        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
562        though these files are slightly harder to view in HDFView or similar.
563        :param output_path:
564        :return:
565        """
566        # Open the output_path as an HDF5 file
567        with pd.HDFStore(output_path) as store:
568            # Store the dataframes in the HDF5 file
569            if self.info is not None:
570                store.put("info", self.info, index=False)
571            if self.metadata is not None:
572                store.put("metadata", self.metadata, index=False)
573            if self.features is not None:
574                store.put("features", self.features, index=False)
575        return os.path.exists(output_path)
576
577    @classmethod
578    def load_hdf5(cls, input_path: str) -> typing.Self:
579        """
580        Load the events from an HDF5 file, including metadata and features.
581        :param input_path:
582        :return:
583        """
584        # Open the input_path as an HDF5 file
585        with pd.HDFStore(input_path) as store:
586            # Load the dataframes from the HDF5 file
587            info = store.get("info") if "info" in store else None
588            metadata = store.get("metadata") if "metadata" in store else None
589            features = store.get("features") if "features" in store else None
590        return cls(info=info, metadata=metadata, features=features)
591
592    @classmethod
593    def load_ocular(
594        cls,
595        input_path: str,
596        event_type="cells",
597        cell_data_files=(
598            "rc-final1.rds",
599            "rc-final2.rds",
600            "rc-final3.rds",
601            "rc-final4.rds",
602            "ocular_interesting.rds",
603        ),
604        others_data_files=(
605            "others-final1.rds",
606            "others-final2.rds",
607            "others-final3.rds",
608            "others-final4.rds",
609        ),
610        atlas_data_files=(
611            "ocular_interesting.rds",
612            "ocular_not_interesting.rds",
613        ),
614        merge_event_data_with_stats=True,
615        filter_and_generate_morphs=True,
616        drop_common_events=True,
617        log=None,
618    ) -> typing.Self:
619        """
620
621        :param input_path:
622        :param event_type:
623        :param cell_data_files:
624        :param others_data_files:
625        :param atlas_data_files:
626        :param merge_event_data_with_stats:
627        :param filter_and_generate_morphs:
628        :param drop_common_events:
629        :param log:
630        :return:
631        """
632        # Check if the input path is a directory or a file
633        if os.path.isfile(input_path):
634            data_files = [os.path.basename(input_path)]
635            input_path = os.path.dirname(input_path)
636        if event_type == "cells":
637            data_files = cell_data_files
638        elif event_type == "others":
639            data_files = others_data_files
640        else:
641            raise ValueError("Invalid event type.")
642
643        # Load the data from the OCULAR files
644        file_data = {}
645        for file in data_files:
646            file_path = os.path.join(input_path, file)
647            if not os.path.isfile(file_path):
648                if log is not None:
649                    log.warning(f"{file} not found for in {input_path}")
650                continue
651            file_data[file] = pyreadr.read_r(file_path)
652            # Get the DataFrame associated with None (pyreadr dict quirk)
653            file_data[file] = file_data[file][None]
654            if len(file_data[file]) == 0:
655                # File gets dropped from the dict
656                file_data.pop(file)
657                if log is not None:
658                    log.warning(f"{file} has no cells")
659                continue
660
661            if log is not None:
662                log.debug(f"{file} has {len(file_data[file])} cells")
663
664            # Drop common cells if requested and in this file
665            if file in atlas_data_files and drop_common_events:
666                common_cell_indices = (
667                    file_data[file]["catalogue_classification"] == "common_cell"
668                )
669                if log is not None:
670                    log.debug(
671                        f"Dropping {int(pd.Series.sum(common_cell_indices))}"
672                        f"common cells from {file}"
673                    )
674                file_data[file] = file_data[file][common_cell_indices == False]
675
676            if len(file_data[file]) == 0:
677                # File gets dropped from the dict
678                file_data.pop(file)
679                if log is not None:
680                    log.warning(f"{file} has no cells after dropping common cells")
681                continue
682
683            # Extract frame_id and cell_id
684            # DAPI- events already have frame_id cell_id outside rowname
685            if event_type == "cells":
686                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
687                # get frame_id cell_id from rownames column and split into two columns
688                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
689                if len(split_res.columns) != 2:
690                    log.warning(
691                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
692                    )
693                # then assign it back to the dataframe
694                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
695            # reset indexes since they can cause NaN values in concat
696            file_data[file].reset_index(drop=True, inplace=True)
697
698        # Merge the data from all files
699        if len(file_data) == 0:
700            return EventArray()
701        elif len(file_data) == 1:
702            data = [file_data[file] for file in file_data.keys()][0]
703        else:
704            data = pd.concat(file_data.values())
705
706        if log is not None:
707            log.debug(f"Gathered a total of {len(data)} events")
708
709        # Others is missing the "slide_id". Insert it right before "frame_id" column
710        if event_type == "others" and "slide_id" not in data.columns:
711            if os.path.basename(input_path) == "ocular":
712                slide_id = os.path.basename(os.path.dirname(input_path))
713            else:
714                slide_id = "UNKNOWN"
715            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
716
717        # Sort according to ascending cell_id to keep the original, which is in manual_df
718        data = data.sort_values(by=["cell_id"], ascending=True)
719        # Filter out duplicates by x & y
720        data = data.assign(
721            unique_id=data["slide_id"]
722            + "_"
723            + data["frame_id"].astype(str)
724            + "_"
725            + data["cellx"].astype(int).astype(str)
726            + "_"
727            + data["celly"].astype(int).astype(str)
728        )
729        data = data.drop_duplicates(subset=["unique_id"], keep="first", inplace=False)
730        # Filter out duplicates by cell_id
731        data = data.assign(
732            unique_id=data["slide_id"]
733            + "_"
734            + data["frame_id"].astype(str)
735            + "_"
736            + data["cell_id"].astype(str)
737        )
738        data.reset_index(drop=True, inplace=True)
739        # All columns up to "slide_id" are features; drop the "slide_id"
740        features = data.loc[:, :"slide_id"].iloc[:, :-1]
741        data = data.loc[:, "slide_id":]
742        # Grab the info columns
743        info = data[["slide_id", "frame_id", "cellx", "celly"]]
744        info.columns = ["slide_id", "tile", "x", "y"]
745        info = info.assign(
746            roi=0,  # OCULAR only works on 1 ROI, as far as known
747            size=25,  # Static, for later montaging
748        )
749        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
750        # Metadata has duplicate columns for later convenience
751        metadata = data
752        return EventArray(info, metadata, features)
753
754    def save_ocular(self, output_path: str, event_type: str = "cells") -> bool:
755        """
756        Save the events to an OCULAR file. Relies on the dataframe originating
757        from an OCULAR file (same columns; duplicate metadata/info).
758        :param output_path:
759        :return:
760        """
761        if event_type == "cells":
762            file_stub = "rc-final"
763        elif event_type == "others":
764            file_stub = "others-final"
765        else:
766            raise ValueError("Invalid event type. Must be cells or others.")
767
768        # Check for the "ocular_interesting" column
769        if event_type == "cells" and "ocular_interesting" in self.metadata.columns:
770            interesting = self.metadata["ocular_interesting"]
771            # Split the metadata into interesting and regular
772            # Interesting will only have dropped columns, with no internal changes
773            interesting = pd.concat(
774                [self.features[interesting], self.metadata[interesting]], axis=1
775            ).reset_index(drop=True)
776            # Data will get some columns changed, so copy it
777            data = (
778                pd.concat(
779                    [self.features[~interesting], self.metadata[~interesting]], axis=1
780                )
781                .copy(deep=True)
782                .reset_index(drop=True)
783                .drop(columns=["ocular_interesting"])
784            )
785
786            # Drop particular columns for "interesting"
787            interesting = interesting.drop(
788                [
789                    "clust",
790                    "hcpc",
791                    "frame_id",
792                    "cell_id",
793                    "unique_id",
794                    "ocular_interesting",
795                ],
796                axis=1,
797            )
798            # Save both .csv and .rds
799            interesting.to_csv(
800                os.path.join(output_path, "ocular_interesting.csv"), index=False
801            )
802            pyreadr.write_rds(
803                os.path.join(output_path, "ocular_interesting.rds"), interesting
804            )
805        else:
806            # Get all data, copying it
807            data = (
808                pd.concat([self.features, self.metadata], axis=1)
809                .copy(deep=True)
810                .reset_index(drop=True)
811            )
812
813        # Split based on cluster number to conform to *-final[1-4].rds
814        n_clusters = max(data["clust"]) + 1
815        split_idx = [round(i * n_clusters / 4) for i in range(5)]
816        for i in range(4):
817            subset = (split_idx[i] <= data["clust"]) & (
818                data["clust"] < split_idx[i + 1]
819            )
820            subset = data[subset].reset_index(drop=True)
821            subset["hcpc"] = i + 1
822            pyreadr.write_rds(
823                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
824            )
825
826        # Create new example cell strings
827        data["example_cell_id"] = (
828            data["slide_id"]
829            + " "
830            + data["frame_id"].astype(str)
831            + " "
832            + data["cell_id"].astype(str)
833            + " "
834            + data["cellx"].astype(int).astype(str)
835            + " "
836            + data["celly"].astype(int).astype(str)
837        )
838        # Find averagable data columns
839        if "cellcluster_id" in data.columns:
840            avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist()
841        else:
842            avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist()
843        # Group by cluster and average
844        data = data.groupby("clust").agg(
845            **{col: (col, "mean") for col in avg_cols},
846            count=("clust", "size"),  # count rows in each cluster
847            example_cells=("example_cell_id", lambda x: ",".join(x)),
848            hcpc=("hcpc", lambda x: x.iloc[0]),
849        )
850        data = data.reset_index()  # Do NOT drop, index is "clust"
851        # Create new columns
852        metadata = pd.DataFrame(
853            {
854                "count": data["count"],
855                "example_cells": data["example_cells"],
856                "clust": data["clust"].astype(int),
857                "hcpc": data["hcpc"].astype(int),
858                "id": data["clust"].astype(int).astype(str),
859                "cccluster": "0",  # Dummy value
860                "ccdistance": 0.0,  # Dummy value
861                "rownum": list(range(len(data))),
862                "framegroup": 0,  # Dummy value
863            }
864        )
865        data = pd.concat([data.loc[:, avg_cols], metadata], axis=1)
866        # Save the data
867        data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
868        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A more separated version of the Event class.

EventArray( info: pandas.core.frame.DataFrame = None, metadata: pandas.core.frame.DataFrame = None, features: pandas.core.frame.DataFrame = None)
255    def __init__(
256        self,
257        info: pd.DataFrame = None,
258        metadata: pd.DataFrame = None,
259        features: pd.DataFrame = None,
260    ):
261        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
262        if info is not None and (
263            not all(col in info.columns for col in self.INFO_COLUMNS)
264            or len(info.columns) != 6
265        ):
266            raise ValueError(
267                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
268            )
269        # All DataFrames must all have the same number of rows
270        if metadata is not None and (info is None or len(info) != len(metadata)):
271            raise ValueError(
272                "If EventArray.metadata is not None, it should match rows with .info"
273            )
274        if features is not None and (info is None or len(info) != len(features)):
275            raise ValueError(
276                "If EventArray.features is not None, it should match rows with .info"
277            )
278        self.info = info
279        self.metadata = metadata
280        self.features = features
INFO_COLUMNS = ['slide_id', 'tile', 'roi', 'x', 'y', 'size']
info
metadata
features
def sort(self, by: str | list[str], ascending: bool = True) -> Self:
329    def sort(self, by: str | list[str], ascending: bool = True) -> typing.Self:
330        """
331        Sort the EventArray by a column in the info, metadata, or features DataFrames.
332        :param by: name of the column to sort by.
333        :param ascending: whether to sort in ascending order.
334        :return:
335        """
336        everything = pd.concat([self.info, self.metadata, self.features], axis=1)
337        order = everything.sort_values(by=by, ascending=ascending).index
338        self.info = self.info.loc[order].reset_index(drop=True)
339        if self.metadata is not None:
340            self.metadata = self.metadata.loc[order].reset_index(drop=True)
341        if self.features is not None:
342            self.features = self.features.loc[order].reset_index(drop=True)
343        return self

Sort the EventArray by a column in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column to sort by.
  • ascending: whether to sort in ascending order.
Returns
def add_metadata(self, new_metadata: pandas.core.frame.DataFrame) -> None:
345    def add_metadata(self, new_metadata: pd.DataFrame) -> None:
346        """
347        Add metadata to the EventArray.
348        :param new_metadata: the metadata to add.
349        """
350        if self.metadata is None:
351            if len(self) != len(new_metadata):
352                raise ValueError("New metadata does not match length of existing info")
353            self.metadata = new_metadata
354        else:
355            # Add the new metadata columns to the existing metadata
356            self.metadata = pd.concat([self.metadata, new_metadata], axis=1)

Add metadata to the EventArray.

Parameters
  • new_metadata: the metadata to add.
def add_features(self, new_features: pandas.core.frame.DataFrame) -> None:
358    def add_features(self, new_features: pd.DataFrame) -> None:
359        """
360        Add features to the EventArray.
361        :param new_features: the metadata to add.
362        """
363        if self.features is None:
364            if len(self) != len(new_features):
365                raise ValueError("New metadata does not match length of existing info")
366            self.features = new_features
367        else:
368            # Add the new metadata columns to the existing metadata
369            self.features = pd.concat([self.features, new_features], axis=1)

Add features to the EventArray.

Parameters
  • new_features: the metadata to add.
@classmethod
def from_list(cls, events: list[typing.Self]) -> Self:
371    @classmethod
372    def from_list(cls, events: list[typing.Self]) -> typing.Self:
373        """
374        Combine EventArrays in a list into a single EventArray.
375        :param events: the new list of events.
376        """
377        all_info = []
378        all_metadata = []
379        all_features = []
380        for event_array in events:
381            # Skip empty EventArrays
382            if event_array.info is not None:
383                all_info.append(event_array.info)
384            if event_array.metadata is not None:
385                all_metadata.append(event_array.metadata)
386            if event_array.features is not None:
387                all_features.append(event_array.features)
388        if len(all_info) == 0:
389            return EventArray()
390        else:
391            all_info = pd.concat(all_info, ignore_index=True)
392        if len(all_metadata) == 0:
393            all_metadata = None
394        else:
395            all_metadata = pd.concat(all_metadata, ignore_index=True)
396        if len(all_features) == 0:
397            all_features = None
398        else:
399            all_features = pd.concat(all_features, ignore_index=True)
400
401        return EventArray(all_info, all_metadata, all_features)

Combine EventArrays in a list into a single EventArray.

Parameters
  • events: the new list of events.
@classmethod
def from_events(cls, events: list[Event]) -> Self:
403    @classmethod
404    def from_events(cls, events: list[Event]) -> typing.Self:
405        """
406        Set the events in the EventArray to a new list of events.
407        :param events: the new list of events.
408        """
409        # Return an empty array if we were passed nothing
410        if events is None or len(events) == 0:
411            return EventArray()
412        # Otherwise, grab the info
413        info = pd.DataFrame(
414            {
415                "slide_id": [event.scan.slide_id for event in events],
416                "tile": [event.tile.n for event in events],
417                "roi": [event.tile.n_roi for event in events],
418                "x": [event.x for event in events],
419                "y": [event.y for event in events],
420                "size": [event.size for event in events],
421            }
422        )
423        metadata_list = [event.metadata for event in events]
424        # Iterate through and ensure that all metadata is the same shape
425        for metadata in metadata_list:
426            if type(metadata) != type(metadata_list[0]):
427                raise ValueError("All metadata must be the same type.")
428            if metadata is not None and metadata.shape != metadata_list[0].shape:
429                raise ValueError("All metadata must be the same shape.")
430        if metadata_list[0] is None:
431            metadata = None
432        else:
433            metadata = pd.DataFrame(metadata_list)
434        features_list = [event.features for event in events]
435        # Iterate through and ensure that all features are the same shape
436        for features in features_list:
437            if type(features) != type(features_list[0]):
438                raise ValueError("All features must be the same type.")
439            if features is not None and features.shape != features_list[0].shape:
440                raise ValueError("All features must be the same shape.")
441        if features_list[0] is None:
442            features = None
443        else:
444            features = pd.DataFrame(features_list)
445        return EventArray(info=info, metadata=metadata, features=features)

Set the events in the EventArray to a new list of events.

Parameters
  • events: the new list of events.
def to_events( self, scans: list[csi_images.csi_scans.Scan], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]:
447    def to_events(
448        self,
449        scans: list[Scan],
450        ignore_missing_scans=True,
451        ignore_metadata=False,
452        ignore_features=False,
453    ) -> list[Event]:
454        """
455        Get the events in the EventArray as a list of events.
456        :param scans: the scans that the events belong to. Pass an empty list if you
457                      don't care about scan metadata.
458        :param ignore_missing_scans: whether to create blank scans for events without scans.
459        :param ignore_metadata: whether to ignore metadata or not
460        :param ignore_features: whether to ignore features or not
461        :return:
462        """
463        events = []
464        for i in range(len(self.info)):
465            # Determine the associated scan
466            scan = None
467            for s in scans:
468                if s.slide_id == self.info["slide_id"][i]:
469                    scan = s
470                    break
471            if scan is None:
472                if ignore_missing_scans:
473                    # Create a placeholder scan if the scan is missing
474                    scan = Scan.make_placeholder(
475                        self.info["slide_id"][i],
476                        self.info["tile"][i],
477                        self.info["roi"][i],
478                    )
479                else:
480                    raise ValueError(
481                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
482                    )
483            # Add to the list
484            events.append(
485                Event(
486                    scan,
487                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
488                    self.info["x"][i],
489                    self.info["y"][i],
490                    size=self.info["size"][i],
491                    metadata=None if ignore_metadata else self.metadata.loc[i],
492                    features=None if ignore_features else self.features.loc[i],
493                )
494            )
495        return events

Get the events in the EventArray as a list of events.

Parameters
  • scans: the scans that the events belong to. Pass an empty list if you don't care about scan metadata.
  • ignore_missing_scans: whether to create blank scans for events without scans.
  • ignore_metadata: whether to ignore metadata or not
  • ignore_features: whether to ignore features or not
Returns
def to_dataframe(self) -> pandas.core.frame.DataFrame:
497    def to_dataframe(self) -> pd.DataFrame:
498        """
499        Convert all the data in the EventArray to a single DataFrame.
500        :return: a DataFrame with all the data in the EventArray.
501        """
502        # Make a copy of the info DataFrame and prepend "info_" to the column names
503        output = self.info.copy()
504        output.columns = [f"info_{col}" for col in output.columns]
505        # Combine with the metadata and prepend "metadata_" to the column names
506        if self.metadata is not None:
507            metadata = self.metadata.copy()
508            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
509            output = pd.concat([output, metadata], axis=1)
510        # Combine with the features and prepend "features_" to the column names
511        if self.features is not None:
512            features = self.features.copy()
513            features.columns = [f"features_{col}" for col in features.columns]
514            output = pd.concat([output, features], axis=1)
515        return output

Convert all the data in the EventArray to a single DataFrame.

Returns

a DataFrame with all the data in the EventArray.

@classmethod
def from_dataframe(cls, df) -> Self:
517    @classmethod
518    def from_dataframe(cls, df) -> typing.Self:
519        """
520        From a single, special DataFrame, create an EventArray.
521        :return: a DataFrame with all the data in the EventArray.
522        """
523        # Split the columns into info, metadata, and features and strip prefix
524        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
525        info.columns = [col.replace("info_", "") for col in info.columns]
526        if info.size == 0:
527            info = None
528        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
529        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
530        if metadata.size == 0:
531            metadata = None
532        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
533        features.columns = [col.replace("features_", "") for col in features.columns]
534        if features.size == 0:
535            features = None
536        return cls(info=info, metadata=metadata, features=features)

From a single, special DataFrame, create an EventArray.

Returns

a DataFrame with all the data in the EventArray.

def save_csv(self, output_path: str) -> bool:
538    def save_csv(self, output_path: str) -> bool:
539        """
540        Save the events to an CSV file, including metadata and features.
541        :param output_path:
542        :return:
543        """
544        self.to_dataframe().to_csv(output_path, index=False)
545        return os.path.exists(output_path)

Save the events to an CSV file, including metadata and features.

Parameters
  • output_path:
Returns
@classmethod
def load_csv(cls, input_path: str) -> Self:
547    @classmethod
548    def load_csv(cls, input_path: str) -> typing.Self:
549        """
550        Load the events from an CSV file, including metadata and features.
551        :param input_path:
552        :return:
553        """
554        # Load the CSV file
555        df = pd.read_csv(input_path)
556        return cls.from_dataframe(df)

Load the events from an CSV file, including metadata and features.

Parameters
  • input_path:
Returns
def save_hdf5(self, output_path: str) -> bool:
558    def save_hdf5(self, output_path: str) -> bool:
559        """
560        Save the events to an HDF5 file, including metadata and features.
561        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
562        though these files are slightly harder to view in HDFView or similar.
563        :param output_path:
564        :return:
565        """
566        # Open the output_path as an HDF5 file
567        with pd.HDFStore(output_path) as store:
568            # Store the dataframes in the HDF5 file
569            if self.info is not None:
570                store.put("info", self.info, index=False)
571            if self.metadata is not None:
572                store.put("metadata", self.metadata, index=False)
573            if self.features is not None:
574                store.put("features", self.features, index=False)
575        return os.path.exists(output_path)

Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease, and external compatibility, though these files are slightly harder to view in HDFView or similar.

Parameters
  • output_path:
Returns
@classmethod
def load_hdf5(cls, input_path: str) -> Self:
577    @classmethod
578    def load_hdf5(cls, input_path: str) -> typing.Self:
579        """
580        Load the events from an HDF5 file, including metadata and features.
581        :param input_path:
582        :return:
583        """
584        # Open the input_path as an HDF5 file
585        with pd.HDFStore(input_path) as store:
586            # Load the dataframes from the HDF5 file
587            info = store.get("info") if "info" in store else None
588            metadata = store.get("metadata") if "metadata" in store else None
589            features = store.get("features") if "features" in store else None
590        return cls(info=info, metadata=metadata, features=features)

Load the events from an HDF5 file, including metadata and features.

Parameters
  • input_path:
Returns
@classmethod
def load_ocular( cls, input_path: str, event_type='cells', cell_data_files=('rc-final1.rds', 'rc-final2.rds', 'rc-final3.rds', 'rc-final4.rds', 'ocular_interesting.rds'), others_data_files=('others-final1.rds', 'others-final2.rds', 'others-final3.rds', 'others-final4.rds'), atlas_data_files=('ocular_interesting.rds', 'ocular_not_interesting.rds'), merge_event_data_with_stats=True, filter_and_generate_morphs=True, drop_common_events=True, log=None) -> Self:
592    @classmethod
593    def load_ocular(
594        cls,
595        input_path: str,
596        event_type="cells",
597        cell_data_files=(
598            "rc-final1.rds",
599            "rc-final2.rds",
600            "rc-final3.rds",
601            "rc-final4.rds",
602            "ocular_interesting.rds",
603        ),
604        others_data_files=(
605            "others-final1.rds",
606            "others-final2.rds",
607            "others-final3.rds",
608            "others-final4.rds",
609        ),
610        atlas_data_files=(
611            "ocular_interesting.rds",
612            "ocular_not_interesting.rds",
613        ),
614        merge_event_data_with_stats=True,
615        filter_and_generate_morphs=True,
616        drop_common_events=True,
617        log=None,
618    ) -> typing.Self:
619        """
620
621        :param input_path:
622        :param event_type:
623        :param cell_data_files:
624        :param others_data_files:
625        :param atlas_data_files:
626        :param merge_event_data_with_stats:
627        :param filter_and_generate_morphs:
628        :param drop_common_events:
629        :param log:
630        :return:
631        """
632        # Check if the input path is a directory or a file
633        if os.path.isfile(input_path):
634            data_files = [os.path.basename(input_path)]
635            input_path = os.path.dirname(input_path)
636        if event_type == "cells":
637            data_files = cell_data_files
638        elif event_type == "others":
639            data_files = others_data_files
640        else:
641            raise ValueError("Invalid event type.")
642
643        # Load the data from the OCULAR files
644        file_data = {}
645        for file in data_files:
646            file_path = os.path.join(input_path, file)
647            if not os.path.isfile(file_path):
648                if log is not None:
649                    log.warning(f"{file} not found for in {input_path}")
650                continue
651            file_data[file] = pyreadr.read_r(file_path)
652            # Get the DataFrame associated with None (pyreadr dict quirk)
653            file_data[file] = file_data[file][None]
654            if len(file_data[file]) == 0:
655                # File gets dropped from the dict
656                file_data.pop(file)
657                if log is not None:
658                    log.warning(f"{file} has no cells")
659                continue
660
661            if log is not None:
662                log.debug(f"{file} has {len(file_data[file])} cells")
663
664            # Drop common cells if requested and in this file
665            if file in atlas_data_files and drop_common_events:
666                common_cell_indices = (
667                    file_data[file]["catalogue_classification"] == "common_cell"
668                )
669                if log is not None:
670                    log.debug(
671                        f"Dropping {int(pd.Series.sum(common_cell_indices))}"
672                        f"common cells from {file}"
673                    )
674                file_data[file] = file_data[file][common_cell_indices == False]
675
676            if len(file_data[file]) == 0:
677                # File gets dropped from the dict
678                file_data.pop(file)
679                if log is not None:
680                    log.warning(f"{file} has no cells after dropping common cells")
681                continue
682
683            # Extract frame_id and cell_id
684            # DAPI- events already have frame_id cell_id outside rowname
685            if event_type == "cells":
686                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
687                # get frame_id cell_id from rownames column and split into two columns
688                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
689                if len(split_res.columns) != 2:
690                    log.warning(
691                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
692                    )
693                # then assign it back to the dataframe
694                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
695            # reset indexes since they can cause NaN values in concat
696            file_data[file].reset_index(drop=True, inplace=True)
697
698        # Merge the data from all files
699        if len(file_data) == 0:
700            return EventArray()
701        elif len(file_data) == 1:
702            data = [file_data[file] for file in file_data.keys()][0]
703        else:
704            data = pd.concat(file_data.values())
705
706        if log is not None:
707            log.debug(f"Gathered a total of {len(data)} events")
708
709        # Others is missing the "slide_id". Insert it right before "frame_id" column
710        if event_type == "others" and "slide_id" not in data.columns:
711            if os.path.basename(input_path) == "ocular":
712                slide_id = os.path.basename(os.path.dirname(input_path))
713            else:
714                slide_id = "UNKNOWN"
715            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
716
717        # Sort according to ascending cell_id to keep the original, which is in manual_df
718        data = data.sort_values(by=["cell_id"], ascending=True)
719        # Filter out duplicates by x & y
720        data = data.assign(
721            unique_id=data["slide_id"]
722            + "_"
723            + data["frame_id"].astype(str)
724            + "_"
725            + data["cellx"].astype(int).astype(str)
726            + "_"
727            + data["celly"].astype(int).astype(str)
728        )
729        data = data.drop_duplicates(subset=["unique_id"], keep="first", inplace=False)
730        # Filter out duplicates by cell_id
731        data = data.assign(
732            unique_id=data["slide_id"]
733            + "_"
734            + data["frame_id"].astype(str)
735            + "_"
736            + data["cell_id"].astype(str)
737        )
738        data.reset_index(drop=True, inplace=True)
739        # All columns up to "slide_id" are features; drop the "slide_id"
740        features = data.loc[:, :"slide_id"].iloc[:, :-1]
741        data = data.loc[:, "slide_id":]
742        # Grab the info columns
743        info = data[["slide_id", "frame_id", "cellx", "celly"]]
744        info.columns = ["slide_id", "tile", "x", "y"]
745        info = info.assign(
746            roi=0,  # OCULAR only works on 1 ROI, as far as known
747            size=25,  # Static, for later montaging
748        )
749        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
750        # Metadata has duplicate columns for later convenience
751        metadata = data
752        return EventArray(info, metadata, features)
Parameters
  • input_path:
  • event_type:
  • cell_data_files:
  • others_data_files:
  • atlas_data_files:
  • merge_event_data_with_stats:
  • filter_and_generate_morphs:
  • drop_common_events:
  • log:
Returns
def save_ocular(self, output_path: str, event_type: str = 'cells') -> bool:
754    def save_ocular(self, output_path: str, event_type: str = "cells") -> bool:
755        """
756        Save the events to an OCULAR file. Relies on the dataframe originating
757        from an OCULAR file (same columns; duplicate metadata/info).
758        :param output_path:
759        :return:
760        """
761        if event_type == "cells":
762            file_stub = "rc-final"
763        elif event_type == "others":
764            file_stub = "others-final"
765        else:
766            raise ValueError("Invalid event type. Must be cells or others.")
767
768        # Check for the "ocular_interesting" column
769        if event_type == "cells" and "ocular_interesting" in self.metadata.columns:
770            interesting = self.metadata["ocular_interesting"]
771            # Split the metadata into interesting and regular
772            # Interesting will only have dropped columns, with no internal changes
773            interesting = pd.concat(
774                [self.features[interesting], self.metadata[interesting]], axis=1
775            ).reset_index(drop=True)
776            # Data will get some columns changed, so copy it
777            data = (
778                pd.concat(
779                    [self.features[~interesting], self.metadata[~interesting]], axis=1
780                )
781                .copy(deep=True)
782                .reset_index(drop=True)
783                .drop(columns=["ocular_interesting"])
784            )
785
786            # Drop particular columns for "interesting"
787            interesting = interesting.drop(
788                [
789                    "clust",
790                    "hcpc",
791                    "frame_id",
792                    "cell_id",
793                    "unique_id",
794                    "ocular_interesting",
795                ],
796                axis=1,
797            )
798            # Save both .csv and .rds
799            interesting.to_csv(
800                os.path.join(output_path, "ocular_interesting.csv"), index=False
801            )
802            pyreadr.write_rds(
803                os.path.join(output_path, "ocular_interesting.rds"), interesting
804            )
805        else:
806            # Get all data, copying it
807            data = (
808                pd.concat([self.features, self.metadata], axis=1)
809                .copy(deep=True)
810                .reset_index(drop=True)
811            )
812
813        # Split based on cluster number to conform to *-final[1-4].rds
814        n_clusters = max(data["clust"]) + 1
815        split_idx = [round(i * n_clusters / 4) for i in range(5)]
816        for i in range(4):
817            subset = (split_idx[i] <= data["clust"]) & (
818                data["clust"] < split_idx[i + 1]
819            )
820            subset = data[subset].reset_index(drop=True)
821            subset["hcpc"] = i + 1
822            pyreadr.write_rds(
823                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
824            )
825
826        # Create new example cell strings
827        data["example_cell_id"] = (
828            data["slide_id"]
829            + " "
830            + data["frame_id"].astype(str)
831            + " "
832            + data["cell_id"].astype(str)
833            + " "
834            + data["cellx"].astype(int).astype(str)
835            + " "
836            + data["celly"].astype(int).astype(str)
837        )
838        # Find averagable data columns
839        if "cellcluster_id" in data.columns:
840            avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist()
841        else:
842            avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist()
843        # Group by cluster and average
844        data = data.groupby("clust").agg(
845            **{col: (col, "mean") for col in avg_cols},
846            count=("clust", "size"),  # count rows in each cluster
847            example_cells=("example_cell_id", lambda x: ",".join(x)),
848            hcpc=("hcpc", lambda x: x.iloc[0]),
849        )
850        data = data.reset_index()  # Do NOT drop, index is "clust"
851        # Create new columns
852        metadata = pd.DataFrame(
853            {
854                "count": data["count"],
855                "example_cells": data["example_cells"],
856                "clust": data["clust"].astype(int),
857                "hcpc": data["hcpc"].astype(int),
858                "id": data["clust"].astype(int).astype(str),
859                "cccluster": "0",  # Dummy value
860                "ccdistance": 0.0,  # Dummy value
861                "rownum": list(range(len(data))),
862                "framegroup": 0,  # Dummy value
863            }
864        )
865        data = pd.concat([data.loc[:, avg_cols], metadata], axis=1)
866        # Save the data
867        data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
868        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)

Save the events to an OCULAR file. Relies on the dataframe originating from an OCULAR file (same columns; duplicate metadata/info).

Parameters
  • output_path:
Returns