Source code for locpix.scripts.img_seg.img_train_prep

#!/usr/bin/env python
"""Prepare images for training module

Take in items and convert to images ready for training
"""

import os
import json

from locpix.preprocessing import datastruc
import numpy as np

# import skimage
from locpix.visualise import vis_img
import tifffile


def preprocess_train_files(project_folder, config, metadata, fold, model_name):
    """Preprocess the train and validation files of one fold

    Checks the train/val/test splits are disjoint and free of duplicates,
    creates the output folders and renders the annotated .parquet files of
    the fold into images and masks via parquet_2_img.

    Args:
        project_folder (string) : Project folder
        config (dict) : Configuration for the train script. The keys
            "channels", "sum_chan", "img_threshold" and "img_interpolate"
            are read and forwarded to parquet_2_img
        metadata (dict) : Metadata associated with the project. The keys
            "train_folds", "val_folds" and "test_files" are read
        fold (int) : Fold to be preprocessed
        model_name (string) : Name of the model, used to build the output
            folders train_files/{model_name}/train and
            train_files/{model_name}/val inside the project folder

    Raises:
        ValueError: If the splits share files or contain duplicates, if
            the annotate/annotated folder is missing or if one of the
            output folders already exists
    """

    # check train val test files
    train_files = metadata["train_folds"][fold]
    val_files = metadata["val_folds"][fold]
    test_files = metadata["test_files"]

    # check files
    if not set(train_files).isdisjoint(test_files):
        raise ValueError("Train files and test files shared files!!")
    if not set(train_files).isdisjoint(val_files):
        raise ValueError("Train files and val files shared files!!")
    if not set(val_files).isdisjoint(test_files):
        raise ValueError("Val files and test files shared files!!")
    if len(set(train_files)) != len(train_files):
        raise ValueError("Train files contains duplicates")
    if len(set(val_files)) != len(val_files):
        raise ValueError("Val files contains duplicates")
    if len(set(test_files)) != len(test_files):
        raise ValueError("Test files contains duplicates")

    print("Train files")
    print(train_files)
    print("Test files")
    print(test_files)
    print("Val files")
    print(val_files)

    # the listing itself is not needed, it only confirms the input folder exists
    input_root = os.path.join(project_folder, "annotate/annotated")
    try:
        os.listdir(input_root)
    except FileNotFoundError as err:
        raise ValueError("There should be some files to open") from err

    # make necessary folders if not present
    train_folder = os.path.join(project_folder, f"train_files/{model_name}/train")
    val_folder = os.path.join(project_folder, f"train_files/{model_name}/val")
    folders = [
        train_folder,
        val_folder,
    ]
    # check every folder before creating any, otherwise a failure on a later
    # folder leaves the earlier ones behind and every rerun fails on them
    for folder in folders:
        if os.path.exists(folder):
            raise ValueError(f"Cannot proceed as {folder} already exists")
    for folder in folders:
        os.makedirs(folder)

    # convert files into imgs and masks
    for split_files, split_folder in (
        (train_files, train_folder),
        (val_files, val_folder),
    ):
        parquet_paths = [
            os.path.join(input_root, file + ".parquet") for file in split_files
        ]
        parquet_2_img(
            parquet_paths,
            config["channels"],
            config["sum_chan"],
            config["img_threshold"],
            config["img_interpolate"],
            split_folder,
        )
def preprocess_all_files(project_folder, config, metadata, model_name):
    """Preprocess the train and test files together

    Checks the train and test files are disjoint and free of duplicates,
    creates the output folder and renders all the annotated .parquet files
    into images and masks via parquet_2_img.

    Args:
        project_folder (string) : Project folder
        config (dict) : Configuration for the train script. The keys
            "channels", "sum_chan", "img_threshold" and "img_interpolate"
            are read and forwarded to parquet_2_img
        metadata (dict) : Metadata associated with the project. The keys
            "train_files" and "test_files" are read
        model_name (string) : Name of the model, used to build the output
            folder train_files/{model_name}/all inside the project folder

    Raises:
        ValueError: If the train and test files share files or contain
            duplicates, if the annotate/annotated folder is missing or if
            the output folder already exists
    """

    # check train test files
    train_files = metadata["train_files"]
    test_files = metadata["test_files"]

    # check files
    if not set(train_files).isdisjoint(test_files):
        raise ValueError("Train files and test files shared files!!")
    if len(set(train_files)) != len(train_files):
        raise ValueError("Train files contains duplicates")
    if len(set(test_files)) != len(test_files):
        raise ValueError("Test files contains duplicates")

    all_files = train_files + test_files

    print("All files")
    print(all_files)

    # the listing itself is not needed, it only confirms the input folder exists
    input_root = os.path.join(project_folder, "annotate/annotated")
    try:
        os.listdir(input_root)
    except FileNotFoundError as err:
        raise ValueError("There should be some files to open") from err

    # make necessary folder, refusing to write into an existing one
    folder = os.path.join(project_folder, f"train_files/{model_name}/all")
    if os.path.exists(folder):
        raise ValueError(f"Cannot proceed as {folder} already exists")
    os.makedirs(folder)

    # convert files into imgs and masks
    all_files = [os.path.join(input_root, file + ".parquet") for file in all_files]
    parquet_2_img(
        all_files,
        config["channels"],
        config["sum_chan"],
        config["img_threshold"],
        config["img_interpolate"],
        folder,
    )
# def preprocess_test_files(project_folder, config, metadata):
#     """Preprocess data
#
#     Args:
#         project_folder (string) : Project folder
#         config (dict) : Configuration for the
#             train script
#         metadata (dict) : Metadata associated with
#             the project"""
#
#     folds = len(metadata["train_folds"])
#     for fold in range(folds):
#         # check train val test files
#         train_files = metadata["train_folds"][fold]
#         val_files = metadata["val_folds"][fold]
#         test_files = metadata["test_files"]
#         # check files
#         if not set(train_files).isdisjoint(test_files):
#             raise ValueError("Train files and test files shared files!!")
#         if not set(train_files).isdisjoint(val_files):
#             raise ValueError("Train files and val files shared files!!")
#         if not set(val_files).isdisjoint(test_files):
#             raise ValueError("Val files and test files shared files!!")
#         if len(set(train_files)) != len(train_files):
#             raise ValueError("Train files contains duplicates")
#         if len(set(val_files)) != len(val_files):
#             raise ValueError("Val files contains duplicates")
#         if len(set(test_files)) != len(test_files):
#             raise ValueError("Test files contains duplicates")
#         print("Fold: ", fold)
#         print("Train files")
#         print(train_files)
#         print("Test files")
#         print(test_files)
#         print("Val files")
#         print(val_files)
#
#     # list items
#     input_root = os.path.join(project_folder, "annotate/annotated")
#     try:
#         files = os.listdir(input_root)
#         files = [os.path.splitext(file)[0] for file in files]
#     except FileNotFoundError:
#         raise ValueError("There should be some files to open")
#
#     # make necessary folders if not present
#     test_folder = os.path.join(project_folder, f"test_files/{model_name}/")
#     folders = [
#         test_folder,
#     ]
#     for folder in folders:
#         if os.path.exists(folder):
#             raise ValueError(f"Cannot proceed as {folder} already exists")
#         else:
#             os.makedirs(folder)
#
#     # convert files into imgs and masks
#     test_files = [os.path.join(input_root, file + ".parquet") for file in test_files]
#     parquet_2_img(
#         test_files,
#         config["labels"],
#         config["sum_chan"],
#         config["img_threshold"],
#         config["img_interpolate"],
#         test_folder,
#     )
def clean_up(project_folder, model_name):
    """Remove the train and validation folders of a model

    Deletes every file inside train_files/{model_name}/train and
    train_files/{model_name}/val and then the two folders themselves.
    The train folder is fully removed before the val folder is touched.

    Args:
        project_folder (string) : Project folder
        model_name (string) : Name of the model whose train and val
            folders are removed
    """

    for split in ("train", "val"):
        folder = os.path.join(project_folder, f"train_files/{model_name}/{split}")
        # os.rmdir only removes empty folders, so remove the files first
        for file in os.listdir(folder):
            os.remove(os.path.join(folder, file))
        os.rmdir(folder)
def clean_up_all(project_folder, model_name):
    """Remove the folder holding all the preprocessed files of a model

    Deletes every file inside train_files/{model_name}/all and then the
    folder itself.

    Args:
        project_folder (string) : Project folder
        model_name (string) : Name of the model whose "all" folder is
            removed
    """

    img_folder = os.path.join(project_folder, f"train_files/{model_name}/all")

    # os.rmdir only removes empty folders, so remove the files first
    for file in os.listdir(img_folder):
        os.remove(os.path.join(img_folder, file))
    os.rmdir(img_folder)
def parquet_2_img(files, channels, sum_chan, img_threshold, img_interpolate, folder):
    """Convert data from .parquet files to .tif files and save

    For every file an image {name}.tif and a mask {name}_masks.tif are
    written to folder, where name is the name of the loaded item. Both
    are read back after writing and compared against the arrays in
    memory. The label map returned by the histogram rendering is written
    to img_info.json in the same folder.

    Args:
        files (list) : List of files (.parquet)
        channels (list) : List of channels id by label to render in img
        sum_chan (bool) : If True the first two rendered channels
            (histo[0] and histo[1]) are summed, otherwise only
            histo[0] is used
        img_threshold (float) : The threshold for the image
        img_interpolate (string) : How to interpolate the image
        folder (string) : Folder to save data to

    Raises:
        AssertionError: If an image or mask read back from disk differs
            from the array that was written
    """

    img_info_path = os.path.join(folder, "img_info.json")

    # for file in input
    for datum in files:
        item = datastruc.item(None, None, None, None, None)
        item.load_from_parquet(datum)

        # convert
        histo, _, label_map = item.render_histo(channels)
        label = item.render_seg()

        # rewritten for every item, so after the loop the file holds the
        # label map of the last item
        with open(img_info_path, "w") as outfile:
            json.dump(label_map, outfile)

        # transpose to img space
        label = label.T.astype("int32")

        if sum_chan:
            img = histo[0].T + histo[1].T
        else:
            img = histo[0].T

        img = vis_img.manual_threshold(img, img_threshold, how=img_interpolate)

        # save
        img_path = os.path.join(folder, item.name + ".tif")
        label_path = os.path.join(folder, item.name + "_masks.tif")
        tifffile.imwrite(img_path, img)
        tifffile.imwrite(label_path, label)

        # load and check the files on disk match what was written
        np.testing.assert_array_equal(img, tifffile.imread(img_path))
        np.testing.assert_array_equal(label, tifffile.imread(label_path))