# Source code for locpix.preprocessing.functions
"""Preprocessing module.
This module contains functions to preprocess the data,
including (add as added):
- file to datastructure
- convert .csv to datastructure
- convert .parquet to datastructure
"""
import polars as pl
import os
from . import datastruc
# [docs]
def file_to_datastruc(
    input_file,
    file_type,
    dim,
    channel_col,
    frame_col,
    x_col,
    y_col,
    z_col,
    channel_choice=None,
    channel_label=None,
):
    """Loads in .csv or .parquet and converts to the required datastructure.

    Currently considers the following columns: channel frame x y z.
    Also user can specify the channels they want to consider; these
    should be present in the channels column.

    Args:
        input_file (string) : Location of the file
        file_type (string) : Either csv or parquet
        dim (int) : Dimensions to consider either 2 or 3
        channel_col (string) : Name of column which gives channel
            for localisation
        frame_col (string) : Name of column which gives frame for localisation
        x_col (string) : Name of column which gives x for localisation
        y_col (string) : Name of column which gives y for localisation
        z_col (string) : Name of column which gives z for localisation;
            must be None/empty when dim == 2
        channel_choice (list of ints) : If specified then this will be list
            of integers representing channels to be considered
        channel_label (list of strings) : If specified then this is the
            label for each channel i.e. ['egfr', 'ereg','unk'] means
            channel 0 is egfr protein, channel 1 is ereg proteins and
            channel 2 is unknown

    Returns:
        datastruc (SMLM_datastruc) : Datastructure containing the data

    Raises:
        ValueError : If dim is not 2 or 3, if z_col is inconsistent with
            dim, or if file_type is not csv or parquet
    """
    # Check dimensions correctly specified
    if dim not in (2, 3):
        raise ValueError("Dimensions must be 2 or 3")
    if dim == 2 and z_col:
        raise ValueError("If dimensions are two no z should be specified")
    if dim == 3 and not z_col:
        raise ValueError("If dimensions are 3 then z_col must be specified")
    # check file type parquet or csv
    if file_type not in ("csv", "parquet"):
        raise ValueError(f"{file_type} is not supported, should be csv or parquet")

    # Build the column selection and the mapping to the canonical
    # column names used by the datastructure, once, for both file types.
    if dim == 2:
        columns = [channel_col, frame_col, x_col, y_col]
        rename_map = {
            channel_col: "channel",
            frame_col: "frame",
            x_col: "x",
            y_col: "y",
        }
    else:
        # BUG FIX: the 3D parquet branch previously omitted z_col from the
        # column selection, so the rename of z_col -> "z" failed and the z
        # data was never loaded for parquet input.
        columns = [channel_col, frame_col, x_col, y_col, z_col]
        rename_map = {
            channel_col: "channel",
            frame_col: "frame",
            x_col: "x",
            y_col: "y",
            z_col: "z",
        }

    # Load in data
    if file_type == "csv":
        df = pl.read_csv(input_file, columns=columns)
    else:
        df = pl.read_parquet(input_file, columns=columns)
    df = df.rename(rename_map)

    # Channels the user wants to consider; None is passed through and the
    # datastructure decides how to handle the unfiltered case.
    channels = channel_choice

    # Get name of file - assumes last part of input file name
    name = os.path.basename(os.path.normpath(input_file)).removesuffix(
        f".{file_type}"
    )
    return datastruc.item(name, df, dim, channels, channel_label)