Source code for pidp_tools.analysis

import numpy as np
import matplotlib.pyplot as plt
from pidp_tools.formatting import format_labels
import sklearn.metrics
import joblib
import tqdm
import pandas as pd
import warnings

[docs]
def install_ROOT():
  """
  Installs ROOT.

  Examples
  --------
  >>> from pidp_tools import \*
  >>> install_ROOT()
  >>> from ROOT import \*
  """
  import subprocess
  try:
    # If ROOT is already importable, there is nothing to install.
    import ROOT
  except ImportError:
    try:
      # Download and unpack a prebuilt ROOT binary, then install its system dependencies.
      subprocess.run(["wget", "-q", "https://github.com/MohamedElashri/ROOT/releases/download/ubuntu/root_v6.28.04_Ubuntu_20.04.zip"])
      subprocess.run(["unzip", "-o", "-qq", "/content/root_v6.28.04_Ubuntu_20.04.zip"])
      subprocess.run(["apt-get", "-qq", "install", "git", "dpkg-dev", "cmake", "g++", "gcc", "binutils", "libx11-dev", "libxpm-dev", "libxft-dev", "libxext-dev", "tar", "gfortran", "subversion"])
      subprocess.run(["rm", "-f", "root_v6.28.04_Ubuntu_20.04.zip"])
      subprocess.run(["wget", "-q", "http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb"])
      subprocess.run(["sudo", "dpkg", "-i", "libssl1.1_1.1.1f-1ubuntu2_amd64.deb"])
      subprocess.run(["rm", "-f", "libssl1.1_1.1.1f-1ubuntu2_amd64.deb"])
      # Make the ROOT build visible to Python and preload its core libraries.
      import sys
      sys.path.append("/content/root_build/")
      sys.path.append("/content/root_build/bin/")
      sys.path.append("/content/root_build/include/")
      sys.path.append("/content/root_build/lib/")
      import ctypes
      ctypes.cdll.LoadLibrary('/content/root_build/lib/libCore.so')
      ctypes.cdll.LoadLibrary('/content/root_build/lib/libThread.so')
      ctypes.cdll.LoadLibrary('/content/root_build/lib/libTreePlayer.so')
    except Exception:
      raise OSError("Unable to install ROOT. This install was designed specifically for use in Google Colab. Installing on personal computers is discouraged due to the file size.")
[docs]
def get_charge(ptype):
  """
  Returns the charge of the provided particle.

  Parameters
  ----------
  ptype \: str or int
    The particle to find the charge of. If int, the particle is assumed to be the element of the following list with the corresponding index\: ["Photon", "KLong", "Neutron", "Proton", "K+", "Pi+", "AntiMuon", "Positron", "AntiProton", "K-", "Pi-", "Muon", "Electron", "No ID"].

  Examples
  --------
  >>> get_charge("AntiProton")
  -1
  >>> get_charge(3)
  1
  """
  # Indices 8-12 are the negatively charged particles, indices 3-7 the positively charged ones.
  if ptype in ["Electron", "Muon", "Pi-", "K-", "AntiProton", 8, 9, 10, 11, 12]:
    return -1
  elif ptype in ["Positron", "AntiMuon", "Pi+", "K+", "Proton", 3, 4, 5, 6, 7]:
    return 1
  else:
    return 0
[docs]
def round_accuracies(num):
  """
  Rounds a number to 2 decimal places. If the rounded number is 0.00 or 1.00, an int is returned.

  Parameters
  ----------
  num \: float or int
    The number to round.

  Examples
  --------
  >>> round_accuracies(0.3333333)
  0.33
  >>> round_accuracies(0.001)
  0
  """
  new_num = round(num, 2)
  if new_num == 0.00:
    return 0
  elif new_num == 1.00:
    return 1
  else:
    return new_num
[docs]
class ConfusionMatrix():
  """
  Creates a confusion matrix based on a collection of labels and predictions.

  Parameters
  ----------
  labels \: list
    A list of strings or integers that represent the true particle type for a series of events.
  predictions \: list
    A list of strings or integers that represent the predicted particle type for a series of events.
  title \: str, default ""
    The title of the confusion matrix.
  purity \: bool, default False
    Normalize confusion matrix by columns instead of rows. If True, the sum of column values will be normalized to 1.
  label_selection \: {"all", "charge", "necessary"}, default "necessary"
    The way to determine which columns and rows to include in the confusion matrix:

    - "all" \: Includes all particle rows and columns, even if they are entirely empty.
    - "charge" \: Includes all of the particle types included in the labels and predictions, plus all particles of the same charge category (charged, neutral) as those included in the labels and predictions.
    - "necessary" \: Includes only those particles that are included in the labels or predictions.
  """
  def __init__(self, labels, predictions, title="", purity=False, label_selection="necessary"):
    self.title = title
    self.purity = purity
    self.label_selection = label_selection
    particle_list = ["Photon","KLong","Neutron","Proton","K+","Pi+","AntiMuon","Positron","AntiProton","K-","Pi-","Muon","Electron","No ID"]
    # Convert string labels and predictions to their integer indices in particle_list.
    if isinstance(labels[0], str):
      labels = [particle_list.index(label) for label in labels]
    if isinstance(predictions[0], str):
      predictions = [particle_list.index(prediction) for prediction in predictions]
    labels = [int(i) for i in labels]
    predictions = [int(i) for i in predictions]
    if len(labels) != len(predictions):
      raise ValueError("Labels and predictions must have the same length. Labels has length " + str(len(labels)) + " and predictions has length " + str(len(predictions)) + ".")
    self.calculate_matrix(labels, predictions)
    self.display_matrix(title)
[docs]
  @classmethod
  def from_estimator(cls, estimator, df, target='Generated As', title="", purity=False, label_selection="necessary"):
    """
    Creates a confusion matrix based on the predictions made by the provided estimator.

    Parameters
    ----------
    estimator \: function or method
      The estimator to be used to identify particles. Estimators can either take a single row of a dataframe and return a string (to be compatible with the .apply method of the dataframe object), or take an entire dataframe and return a series of strings.
    df \: :external:class:`pandas.DataFrame`
      The dataframe whose rows represent particles that can be identified by the estimator. Supplied dataframes should have a "Hypothesis" column, which contains either a str or int, and a "Number of Hypotheses" column, which contains an int.
    target \: str, default "Generated As"
      The target of the estimator. The supplied dataframe must have a column with this label.
    title \: str, default ""
      The title of the confusion matrix.
    purity \: bool, default False
      Normalize confusion matrix by columns instead of rows. If True, the sum of column values will be normalized to 1.
    label_selection \: {"all", "charge", "necessary"}, default "necessary"
      The way to determine which columns and rows to include in the confusion matrix:

      - "all" \: Includes all particle rows and columns, even if they are entirely empty.
      - "charge" \: Includes all of the particle types included in the labels and predictions, plus all particles of the same charge category (charged, neutral) as those included in the labels and predictions.
      - "necessary" \: Includes only those particles that are included in the labels or predictions.

    Returns
    -------
    :class:`ConfusionMatrix`
    """
    # Initialize variables
    particle_list = ["Photon","KLong","Neutron","Proton","K+","Pi+","AntiMuon","Positron","AntiProton","K-","Pi-","Muon","Electron","No ID"]
    dataset = df.copy().reset_index(drop=True)
    predictions = []
    identities = []
    # Ensure the "Hypothesis" and "Generated As" columns are strings for use in PID functions.
    if isinstance(df['Hypothesis'][0], np.int64) or isinstance(df['Hypothesis'][0], int):
      dataset['Hypothesis'] = dataset['Hypothesis'].apply(lambda x: particle_list[x])
    if isinstance(df['Generated As'][0], np.int64) or isinstance(df['Generated As'][0], int):
      dataset['Generated As'] = dataset['Generated As'].apply(lambda x: particle_list[x])
    # Estimators may accept a whole dataframe or a single row; try the dataframe form first.
    try:
      predictions_full = estimator(dataset)
      if len(predictions_full) != len(dataset.index):
        predictions_full = dataset.apply(estimator, axis=1)
    except Exception:
      predictions_full = dataset.apply(estimator, axis=1)
    # Convert the predictions, as well as the "Hypothesis" and "Generated As" columns, back to integers.
    dataset['Prediction'] = predictions_full.apply(particle_list.index).to_list()
    dataset['Hypothesis'] = dataset['Hypothesis'].apply(particle_list.index)
    dataset['Generated As'] = dataset['Generated As'].apply(particle_list.index)
    # Analyze the predictions using the hypothesis scheme.
    nRows = len(dataset.index)
    starting_index = 0
    dataset['is matched'] = (dataset['Hypothesis'] == dataset['Prediction']) | (dataset['Hypothesis'] == 13)
    index_list = []
    event_nos = []
    i = 0
    while starting_index <= nRows - 1:
      ending_index = starting_index + dataset['Number of Hypotheses'][starting_index]
      index_list.append((starting_index, ending_index))
      event_nos.extend([i for _ in range(starting_index, ending_index)])
      i += 1
      starting_index = ending_index
    number_of_events = i
    dataset['eventno'] = event_nos
    # For each event, keep one prediction that matches its hypothesis (or "No ID" if none do).
    reduced_dataset = dataset.loc[dataset['is matched']]
    grouped = reduced_dataset.sample(frac=1)[['Prediction','eventno']].groupby('eventno')
    predictions = grouped.head(1).set_index('eventno').sort_index().reindex(list(range(number_of_events)), fill_value=13)['Prediction'].to_list()
    identities = [int(dataset[target][starting_index]) for starting_index, ending_index in index_list]
    confusion_matrix = cls(identities, predictions, title=title, purity=purity, label_selection=label_selection)
    return confusion_matrix
[docs]
  @classmethod
  def from_model(cls, model, df, target="Generated As", title="", purity=False, match_hypothesis=False, label_selection="charge"):
    """
    Creates a confusion matrix based on the predictions made by the provided model.

    Parameters
    ----------
    model \: Any scikit-learn trained model with "predict" and "predict_proba" methods.
      The model to be used to predict the particle type of the particles supplied in the dataframe.
    df \: :external:class:`pandas.DataFrame`
      The dataframe whose rows represent particles that can be identified by the model. Supplied dataframes should have a "Hypothesis" column, which contains either a str or int, and a "Number of Hypotheses" column, which contains an int.
    target \: str, default "Generated As"
      The target of the model. The supplied dataframe must have a column with this label.
    title \: str, default ""
      The title of the confusion matrix.
    purity \: bool, default False
      Normalize confusion matrix by columns instead of rows. If True, the sum of column values will be normalized to 1.
    match_hypothesis \: bool, default False
      Require predictions to match the supplied hypothesis. If True, only considers predictions that match the hypothesis. Neutral particles, which have no hypothesis, are still considered in the typical sense. If False, the prediction of the model is the most frequent prediction among all hypotheses.
    label_selection \: {"all", "charge", "necessary"}, default "charge"
      The way to determine which columns and rows to include in the confusion matrix:

      - "all" \: Includes all particle rows and columns, even if they are entirely empty.
      - "charge" \: Includes all of the particle types included in the labels and predictions, plus all particles of the same charge category (charged, neutral) as those included in the labels and predictions.
      - "necessary" \: Includes only those particles that are included in the labels or predictions.

    Returns
    -------
    :class:`ConfusionMatrix`
    """
    particle_list = ["Photon","KLong","Neutron","Proton","K+","Pi+","AntiMuon","Positron","AntiProton","K-","Pi-","Muon","Electron","No ID"]
    dataset = df.copy().reset_index(drop=True)
    predictions = []
    identities = []
    data_to_test = dataset[[column for column in model.feature_names_in_]]
    # Ensure the "Hypothesis" and "Generated As" columns are integers.
    if isinstance(df['Hypothesis'][0], str):
      dataset['Hypothesis'] = dataset['Hypothesis'].apply(particle_list.index)
    if isinstance(df['Generated As'][0], str):
      dataset['Generated As'] = dataset['Generated As'].apply(particle_list.index)
    # Group rows into events using the "Number of Hypotheses" column.
    nRows = len(dataset.index)
    starting_index = 0
    index_list = []
    event_nos = []
    i = 0
    while starting_index <= nRows - 1:
      ending_index = starting_index + dataset['Number of Hypotheses'][starting_index]
      index_list.append((starting_index, ending_index))
      event_nos.extend([i for _ in range(starting_index, ending_index)])
      i += 1
      starting_index = ending_index
    number_of_events = i
    dataset['eventNo'] = event_nos
    if match_hypothesis:
      # Keep only predictions that agree with their hypothesis, then take the most confident one per event.
      temp_predictions = model.predict_proba(data_to_test)
      dataset['Confidence'] = [max(probs) for probs in temp_predictions]
      dataset['Prediction'] = model.classes_[np.argmax(temp_predictions, axis=1)]
      identities_grouped = dataset[['Generated As','Prediction','eventNo']].groupby('eventNo')
      identities = identities_grouped['Generated As'].head(1).to_list()
      matches_hypotheses_bool_list = (dataset['Prediction'] == dataset['Hypothesis']) | (dataset['Hypothesis'] == 13)
      matching_hypotheses = dataset.loc[matches_hypotheses_bool_list]
      grouped_df = matching_hypotheses[['Generated As','Prediction','Confidence','eventNo']].groupby('eventNo')
      max_confidence_indices = grouped_df['Confidence'].idxmax()
      predictions_temp = dataset[['Prediction','eventNo']].iloc[max_confidence_indices]
      predictions = predictions_temp.set_index('eventNo')['Prediction'].reindex(list(range(number_of_events)), fill_value=13).to_list()
    else:
      # Use the most frequent prediction among all hypotheses of each event.
      dataset['Prediction'] = model.predict(data_to_test)
      grouped_df = dataset[['Generated As','Prediction','eventNo']].groupby('eventNo')
      identities = grouped_df['Generated As'].head(1).to_list()
      predictions = grouped_df['Prediction'].agg(lambda x: x.value_counts().index[0]).to_list()
    confusion_matrix = cls(identities, predictions, title=title, purity=purity, label_selection=label_selection)
    return confusion_matrix
[docs]
  def calculate_matrix(self, labels, predictions):
    """
    Calculates the confusion matrix based on a collection of labels and predictions.

    Parameters
    ----------
    labels \: list
      A list of integers that represent the true particle type for a series of events.
    predictions \: list
      A list of integers that represent the predicted particle type for a series of events.
    """
    temp_confusion_matrix = np.zeros((13,14))
    particle_list = ["Photon","KLong","Neutron","Proton","K+","Pi+","AntiMuon","Positron","AntiProton","K-","Pi-","Muon","Electron","No ID"]
    particle_array = np.array(particle_list)
    # Choose which particle rows and columns to display.
    match self.label_selection:
      case 'charge':
        included_particles = list(set(list(labels) + list(predictions)))
        contains_neutral_particles = np.any([i in included_particles for i in [0,1,2]])
        contains_charged_particles = np.any([i in included_particles for i in [3,4,5,6,7,8,9,10,11,12]])
        if contains_charged_particles and not contains_neutral_particles:
          self.included_particles = [3,4,5,6,7,8,9,10,11,12]
        elif contains_neutral_particles and not contains_charged_particles:
          self.included_particles = [0,1,2]
        else:
          self.included_particles = list(range(13))
        self.x_labels = particle_array[[*self.included_particles, 13]]
        self.y_labels = particle_array[self.included_particles]
      case 'all':
        self.included_particles = list(range(13))
        self.x_labels = particle_array[[*self.included_particles, 13]]
        self.y_labels = particle_array[self.included_particles]
      case 'necessary':
        self.included_particles = list(set(list(labels) + list(predictions)))
        self.included_particles.sort()
        if 13 in self.included_particles:
          self.included_particles = [i for i in self.included_particles if int(i) != 13]
        self.x_labels = particle_array[[*self.included_particles, 13]]
        self.y_labels = particle_array[self.included_particles]
      case _:
        raise ValueError("Label selection must be one of the following: 'charge', 'all', 'necessary'")
    self.nXticks = len(self.x_labels)
    self.nYticks = len(self.y_labels)
    # Tally (label, prediction) pairs into the raw confusion matrix.
    np.add.at(temp_confusion_matrix, (labels, predictions), 1)
    if np.sum(temp_confusion_matrix[:, 13]) > 0:
      temp_confusion_matrix = temp_confusion_matrix[self.included_particles, :]
      temp_confusion_matrix = temp_confusion_matrix[:, [*self.included_particles, 13]]
    else:
      # No event was left unidentified, so drop the "No ID" column entirely.
      temp_confusion_matrix = temp_confusion_matrix[self.included_particles, :]
      temp_confusion_matrix = temp_confusion_matrix[:, self.included_particles]
      self.x_labels = [i for i in self.x_labels if i != "No ID"]
      self.nXticks -= 1
    # Normalize rows to 1 (or columns, if purity=True).
    if self.purity:
      temp_confusion_matrix = np.transpose(temp_confusion_matrix)
    self.confusion_matrix = np.zeros_like(temp_confusion_matrix)
    for i in range(len(temp_confusion_matrix)):
      self.confusion_matrix[i] = temp_confusion_matrix[i]/sum(temp_confusion_matrix[i]) if sum(temp_confusion_matrix[i]) > 0 else np.zeros_like(temp_confusion_matrix[i])
    if self.purity:
      self.confusion_matrix = np.transpose(self.confusion_matrix)
[docs]
  def display_matrix(self, title):
    """
    Displays the confusion matrix.

    Parameters
    ----------
    title \: str, default ""
      The title of the confusion matrix.
    """
    self.fig, self.ax = plt.subplots()
    self.im = self.ax.imshow(self.confusion_matrix)
    self.text = None
    cmap_min, cmap_max = self.im.cmap(0), self.im.cmap(1.0)
    self.text = np.empty_like(self.confusion_matrix, dtype=object)
    # Annotate each cell with its value, using a text color that contrasts with the cell background.
    thresh = (self.confusion_matrix.max() + self.confusion_matrix.min()) / 2.0
    for i in range(self.nYticks):
      for j in range(self.nXticks):
        color = cmap_max if self.confusion_matrix[i][j] < thresh else cmap_min
        text_cm = round(self.confusion_matrix[i][j], 2)
        if float(text_cm) == float(0):
          text_cm = 0
        default_text_kwargs = dict(ha="center", va="center", color=color)
        text_kwargs = {**default_text_kwargs}
        self.text[i][j] = self.ax.text(j, i, text_cm, **text_kwargs)
    self.fig.colorbar(self.im, ax=self.ax)
    self.ax.set(xticks=np.arange(self.nXticks), yticks=np.arange(self.nYticks),
                xticklabels=[format_labels(x) for x in self.x_labels],
                yticklabels=[format_labels(y) for y in self.y_labels],
                ylabel="Generated As", xlabel="Identified As")
    self.ax.set_ylim((self.nYticks - 0.5, -0.5))
    self.fig.set_figheight(7)
    self.fig.set_figwidth(7)
    self.ax.set_title(title)
    self.fig.show()
  def __repr__(self):
    return ""
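A minimal usage sketch (illustrative, not part of the module source). It builds a matrix directly from label and prediction lists, and shows the classmethod route for a trained scikit-learn model; `clf` and `df` are hypothetical stand-ins for a trained model and a dataframe with the "Hypothesis", "Number of Hypotheses", and "Generated As" columns described above.

labels = ["Proton", "Pi+", "K+", "Proton"]        # true particle types (toy data)
predictions = ["Proton", "Pi+", "Pi+", "No ID"]   # what a classifier returned (toy data)
cm = ConfusionMatrix(labels, predictions, title="Toy example")

# With a trained model and a hypothesis-structured dataframe:
# cm = ConfusionMatrix.from_model(clf, df, match_hypothesis=True, title="Model PID")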
[docs]
def split_df(input_df, training_fraction=0.9):
  """
  Splits the supplied dataframe into training data and test data, preserving hypothesis groups.

  Parameters
  ----------
  input_df \: :external:class:`pandas.DataFrame`
    The dataframe to split. The supplied dataframe should have a "Number of Hypotheses" column.
  training_fraction \: float, default 0.9
    The fraction of events to be included in the training dataset. All remaining events will be included in the test dataset.

  Returns
  -------
  training \: :external:class:`pandas.DataFrame`
    A dataframe containing the requested fraction of the input data.
  test \: :external:class:`pandas.DataFrame`
    A dataframe containing the rows of the input data not included in the training dataset.
  """
  if round(training_fraction, 2) == 1.:
    raise ValueError("Cannot create split dataset with such a large training fraction. Reduce the training fraction.")
  elif round(training_fraction, 2) == 0:
    raise ValueError("Cannot create split dataset with such a small training fraction. Increase the training fraction.")
  elif training_fraction > 1 or training_fraction < 0:
    raise ValueError("training_fraction must be between 0 and 1.")
  # Route every n-th event to the smaller split; swap the splits if the training fraction is below 0.5.
  if round(training_fraction, 2) < 0.5:
    switch_train_test = True
    every_n_events = int(round((1-training_fraction)/training_fraction) + 1)
  else:
    switch_train_test = False
    every_n_events = int(round(training_fraction/(1-training_fraction)) + 1)
  df = input_df.copy().reset_index(drop=True)
  nRows = len(df.index)
  starting_index = 0
  training_list = []
  test_list = []
  counter = 0
  while starting_index <= nRows - 1:
    counter += 1
    ending_index = starting_index + df['Number of Hypotheses'][starting_index]
    if counter % every_n_events == 0:
      test_list.extend([not switch_train_test for _ in range(starting_index, ending_index)])
      training_list.extend([switch_train_test for _ in range(starting_index, ending_index)])
    else:
      test_list.extend([switch_train_test for _ in range(starting_index, ending_index)])
      training_list.extend([not switch_train_test for _ in range(starting_index, ending_index)])
    starting_index = ending_index
  test = df.loc[test_list].reset_index(drop=True)
  training = df.loc[training_list].reset_index(drop=True)
  return training, test
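An illustrative call, assuming a dataframe `df` with a "Number of Hypotheses" column. Because the split is made event by event, all hypothesis rows belonging to one event land in the same partition.

training, test = split_df(df, training_fraction=0.8)  # roughly 80% of events for training, 20% for testing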
[docs]
def grab_events(input_df, n_each=5000, reverse=False, return_strings=False, allow_less=False):
  """
  Grabs the selected number of events for each particle type, preserving hypothesis groups.

  Parameters
  ----------
  input_df \: :external:class:`pandas.DataFrame`
    The dataframe to grab events from. The supplied dataframe should have a "Number of Hypotheses" column.
  n_each \: int, default 5000
    The number of events of each particle type to include in the resulting dataset. The number of events for each particle type may be smaller if "allow_less" is True.
  reverse \: bool, default False
    Grab events from the end of the dataframe first. If True, events are grabbed from the end of the file first.
  return_strings \: bool, default False
    Return a dataframe in which the "Hypothesis" and "Generated As" columns contain strings instead of integers. If True, the returned dataframe will have strings in the "Hypothesis" and "Generated As" columns.
  allow_less \: bool, default False
    Allow the final dataframe to have fewer than the requested number of events if not enough data is available. If True, the resulting dataframe may not have the requested number of events for each particle, and the number of events may be different for each particle type.

  Returns
  -------
  smaller_dataset \: :external:class:`pandas.DataFrame`
    A dataframe containing the events grabbed from the input dataframe.
  """
  particle_list = ["Photon","KLong","Neutron","Proton","K+","Pi+","AntiMuon","Positron","AntiProton","K-","Pi-","Muon","Electron","No ID"]
  if reverse:
    df = input_df[::-1].copy().reset_index(drop=True)
  else:
    df = input_df.copy().reset_index(drop=True)
  if isinstance(df['Hypothesis'][0], str):
    df['Hypothesis'] = df['Hypothesis'].apply(particle_list.index)
  if isinstance(df['Generated As'][0], str):
    df['Generated As'] = df['Generated As'].apply(particle_list.index)
  nRows = len(df.index)
  starting_index = 0
  include_list = []
  counter = 0
  if reverse:
    current_particle = 12
  else:
    current_particle = 0
  while starting_index <= nRows - 1:
    # When the generated particle type changes, check that enough events were collected, then move on.
    if df['Generated As'][starting_index] != current_particle:
      if counter < n_each and not allow_less:
        raise ValueError("Not enough rows in dataframe to grab " + str(n_each) + " " + particle_list[current_particle] + " events.")
      if reverse:
        current_particle -= 1
      else:
        current_particle += 1
      counter = 0
    counter += 1
    ending_index = starting_index + df['Number of Hypotheses'][starting_index]
    # Keep the first n_each events of each particle type; all hypothesis rows of an event stay together.
    if counter <= n_each:
      include_list.extend([True for _ in range(starting_index, ending_index)])
    else:
      include_list.extend([False for _ in range(starting_index, ending_index)])
    starting_index = ending_index
  if return_strings:
    df['Hypothesis'] = df['Hypothesis'].apply(lambda x: particle_list[x])
    df['Generated As'] = df['Generated As'].apply(lambda x: particle_list[x])
  smaller_dataset = df.loc[include_list].reset_index(drop=True)
  return smaller_dataset
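For example, a balanced subset for quick testing might be grabbed like this (again assuming a suitably formatted `df`; with allow_less=True, particle types with fewer than 1000 events are simply kept in full):

balanced = grab_events(df, n_each=1000, allow_less=True, return_strings=True)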
[docs]
def feature_importance(model, df, target='Generated As', match_hypothesis=False, n_repetitions=3, n_each=100):
  """
  Calculates and plots the permutation feature importances of the features supplied to the provided model.

  Parameters
  ----------
  model \: Any scikit-learn trained model with "predict" and "predict_proba" methods.
    The model to be used to predict the particle type of the particles supplied in the dataframe.
  df \: :external:class:`pandas.DataFrame`
    The dataframe whose rows represent particles that can be identified by the model. Supplied dataframes should have a "Hypothesis" column, which contains either a str or int, and a "Number of Hypotheses" column, which contains an int.
  target \: str, default "Generated As"
    The target of the model. The supplied dataframe must have a column with this label.
  match_hypothesis \: bool, default False
    Require predictions to match the supplied hypothesis. If True, only considers predictions that match the hypothesis. Neutral particles, which have no hypothesis, are still considered in the typical sense. If False, the prediction of the model is the most frequent prediction among all hypotheses.
  n_repetitions \: int, default 3
    The number of times to permute each feature. The feature importance is the average accuracy over all of the repetitions.
  n_each \: int, default 100
    The number of events of each particle type to include in each permutation test.
  """
  test_data = grab_events(df, n_each=n_each)
  x_test = test_data[[column for column in model.feature_names_in_]]
  y_test = test_data[target]
  importances = []
  n_features_to_shuffle = len([i for i in x_test.columns if i != "Number of Hypotheses"])
  starting_index = 0
  predictions = []
  identities = []
  hypotheses = test_data['Hypothesis'].to_list()
  length_of_df = len(x_test.index)
  dfs_to_combine = [x_test]
  total_rows = length_of_df
  # Build one large dataframe: the original test data followed by n_repetitions copies per feature,
  # each copy with that single feature column shuffled.
  for column_to_shuffle in x_test.columns:
    if column_to_shuffle == "Number of Hypotheses":
      continue
    for _ in range(n_repetitions):
      total_rows += length_of_df
      shuffled_dataframe = x_test.copy()
      shuffled_dataframe[column_to_shuffle] = shuffled_dataframe[column_to_shuffle].sample(frac=1, ignore_index=True)
      hypotheses.extend(test_data['Hypothesis'].to_list())
      dfs_to_combine.append(shuffled_dataframe)
  new_test = pd.concat(dfs_to_combine, ignore_index=True)
  # Group rows into events using the "Number of Hypotheses" column.
  index_list = []
  event_nos = []
  i = 0
  nRows = len(new_test.index)
  while starting_index <= nRows - 1:
    ending_index = starting_index + new_test['Number of Hypotheses'][starting_index]
    index_list.append((starting_index, ending_index))
    event_nos.extend([i for _ in range(starting_index, ending_index)])
    i += 1
    starting_index = ending_index
  number_of_events = i
  identities = [int(y_test[i % length_of_df]) for i, j in index_list]
  if match_hypothesis:
    # Keep only predictions that agree with their hypothesis, then take the most confident one per event.
    temp_predictions = model.predict_proba(new_test)
    new_test['eventNo'] = event_nos
    new_test['Hypothesis'] = hypotheses
    new_test['Confidence'] = [max(probs) for probs in temp_predictions]
    new_test['Prediction'] = model.classes_[np.argmax(temp_predictions, axis=1)]
    matches_hypotheses_bool_list = (new_test['Prediction'] == new_test['Hypothesis']) | (new_test['Hypothesis'] == 13)
    matching_hypotheses = new_test.loc[matches_hypotheses_bool_list]
    grouped_df = matching_hypotheses[['Prediction','Confidence','eventNo']].groupby('eventNo')
    max_confidence_indices = grouped_df['Confidence'].idxmax()
    predictions_temp = new_test[['Prediction','eventNo']].iloc[max_confidence_indices]
    predictions = predictions_temp.set_index('eventNo')['Prediction'].sort_index().reindex(list(range(number_of_events)), fill_value=13).to_list()
  else:
    # Use the most frequent prediction among all hypotheses of each event.
    new_test['Prediction'] = model.predict(new_test)
    new_test['eventNo'] = event_nos
    grouped_df = new_test[['Prediction','eventNo']].groupby('eventNo')
    predictions = grouped_df['Prediction'].agg(lambda x: x.value_counts().index[0]).to_list()
  identities = np.array(identities, dtype=int)
  predictions = np.array(predictions, dtype=int)
  # The first 13 * n_each events are unshuffled; each following block of 13 * n_each * n_repetitions events
  # corresponds to one shuffled feature. Importance = accuracy drop relative to the unshuffled baseline.
  starting_accuracy = sklearn.metrics.accuracy_score(identities[0:13*n_each], predictions[0:13*n_each])
  importances = [
    starting_accuracy - sklearn.metrics.accuracy_score(
      identities[13*n_each*(i*n_repetitions+1):13*n_each*((i+1)*n_repetitions+1)],
      predictions[13*n_each*(i*n_repetitions+1):13*n_each*((i+1)*n_repetitions+1)])
    for i in range(n_features_to_shuffle)
  ]
  important_features = [model.feature_names_in_[i] for i in range(n_features_to_shuffle) if importances[i] > 0.005]
  important_importances = [i for i in importances if i > 0.005]
  plt.bar(important_features, important_importances)
  plt.title("Feature Importances")
  plt.xlabel("Feature Name")
  plt.ylabel("Feature Importance")
  plt.xticks(rotation=90)
  plt.show()
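A sketch of a typical call, assuming a trained scikit-learn classifier `clf` (hypothetical name) whose feature_names_in_ include "Number of Hypotheses", and a dataframe `df` in the hypothesis-grouped format used throughout this module. Larger n_repetitions gives smoother importances at the cost of proportionally more prediction time.

feature_importance(clf, df, n_repetitions=5, n_each=200)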
[docs]
def save_model(model, path="my_model.joblib"):
  """
  Saves a model as a joblib dump at the specified path.

  Parameters
  ----------
  model \: Any scikit-learn trained model.
    The model to be saved.
  path \: str, default "my_model.joblib"
    The path to the model save location.
  """
  joblib.dump(model, path)
  print('Model saved as ' + path)
[docs]
def load_model(path="my_model.joblib"):
  """
  Loads a model from a joblib dump at the specified path.

  Parameters
  ----------
  path \: str, default "my_model.joblib"
    The path to the model save location.

  Returns
  -------
  model \: Scikit-learn trained model.
    The loaded scikit-learn model.
  """
  print("Loading model...")
  model = joblib.load(path)
  print("Done loading model")
  return model
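A round trip with these two helpers, where `clf` is any trained scikit-learn model (hypothetical name):

save_model(clf, "pid_model.joblib")
clf = load_model("pid_model.joblib")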