In [ ]:
# Prepend the parent directory so that packages located there (presumably the
# local vflow checkout) are found before any installed copy.
import sys

# Guard the insert so that re-running this cell does not keep stacking
# duplicate '../' entries onto sys.path.
if '../' not in sys.path:
    sys.path.insert(0, '../')
In [16]:
# % load_ext autoreload
# % autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import vflow
from vflow import Vset, init_args
from vflow.pipeline import build_graph
from functools import partial
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.utils
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.inspection import permutation_importance
In [1]:
# load data
data_dir = "../../_other/Enhancer/data/"


def _read_csv_skip_bad_lines(path):
    """Read a CSV file into a DataFrame, skipping malformed rows.

    ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed in
    pandas 2.0 in favour of ``on_bad_lines="skip"``. The current keyword is
    tried first; the legacy keyword is used only when the installed pandas
    rejects the new one.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file with unparseable lines dropped.
    """
    try:
        return pd.read_csv(path, on_bad_lines="skip")
    except TypeError:
        # pandas < 1.3 does not accept ``on_bad_lines``
        return pd.read_csv(path, error_bad_lines=False)


# The first column of every file is dropped (presumably a saved row index --
# TODO confirm): features keep all remaining columns, labels keep only the
# second column.
X_train = np.asarray(_read_csv_skip_bad_lines(data_dir + "01_X_train.csv").iloc[:, 1:])
X_test = np.asarray(_read_csv_skip_bad_lines(data_dir + "02_X_test.csv").iloc[:, 1:])
y_train = np.asarray(_read_csv_skip_bad_lines(data_dir + "03_y_train.csv").iloc[:, 1])
y_test = np.asarray(_read_csv_skip_bad_lines(data_dir + "04_y_test.csv").iloc[:, 1])

# initialize data
# seed numpy's global RNG before anything stochastic runs
np.random.seed(14)
arg_names = ['X_train', 'X_test', 'y_train', 'y_test']
X_train, X_test, y_train, y_test = init_args((X_train, X_test, y_train, y_test), names=arg_names)

# subsample
# three resampling functions drawing 1000 samples each; they differ only in random_state
subsampling_fns = [
    partial(sklearn.utils.resample, n_samples=1000, random_state=seed)
    for seed in range(3)
]
subsampling_set = Vset(name='subsampling', modules=subsampling_fns)

# calling the Vset applies every resampler to the training data
X_trains, y_trains = subsampling_set(X_train, y_train)

# model
# candidate classifiers, keyed by the short label used as their module key
models = {
    "RF": RandomForestClassifier(n_estimators=50, max_depth=5),
    "MLP": MLPClassifier(),
}
modeling_set = Vset(name='modeling',
                    modules=list(models.values()),
                    module_keys=list(models.keys()))

# fit on the subsampled training data, then predict on the test features
modeling_set.fit(X_trains, y_trains)
preds = modeling_set.predict(X_test)

# hard metrics
# label-based scores, keyed by the name under which each one is reported
metric_fns = {
    "Acc": accuracy_score,
    "Bal_Acc": balanced_accuracy_score,
}
hard_metrics_set = Vset(name='hard_metrics',
                        modules=list(metric_fns.values()),
                        module_keys=list(metric_fns.keys()))
hard_metrics = hard_metrics_set.evaluate(preds, y_test)

# permutation importance
feature_importance_set = Vset(
    name='feature_importance',
    modules=[permutation_importance],
)
# evaluated against modeling_set.out (presumably the fitted models -- TODO confirm)
# together with the held-out test data
importances = feature_importance_set.evaluate(
    modeling_set.out,
    X_test,
    y_test,
)

# build the pipeline graph from the final output and display it
G = build_graph(importances, draw=True)
plt.show()
False
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-29bfd0a6ce35> in <module>
      2 import os
      3 print(os.path.isfile("../../_other"))
----> 4 X_train = np.asarray(pd.read_csv("../data/enhancer/01_X_train.csv", error_bad_lines=False).iloc[:, 1:])
      5 X_test = np.asarray(pd.read_csv("../data/enhancer/02_X_test.csv", error_bad_lines=False).iloc[:, 1:])
      6 y_train = np.asarray(pd.read_csv("../data/enhancer/03_y_train.csv", error_bad_lines=False).iloc[:, 1])

NameError: name 'np' is not defined
In [ ]:
# show the results returned by hard_metrics_set.evaluate (modules keyed "Acc" and "Bal_Acc")
print(hard_metrics)
In [ ]:
# Indices of the five most important features (most important first) for each
# permutation-importance result.
# Entries are selected by the attribute that is actually read here instead of by
# comparing against `vflow.module_set.ModuleSet`: the notebook imports `Vset`,
# so that legacy class path may no longer exist, and any bookkeeping entry
# without `importances_mean` has to be skipped either way.
print({k: np.argsort(v.importances_mean)[-5:][::-1]
       for k, v in importances.items()
       if hasattr(v, "importances_mean")})