How to perform back projection of feature weights?

One potential issue with employing high dimensional parcellations is that they can be difficult to interpret. While it would be great if there were existing high dimensional, easy to interpret parcellations, in practice this is not the case. Instead, we propose that feature importances generated from parcellations that do not lend themselves to easy discussion (e.g., randomly generated parcellations, or parcellations with thousands of regions) be back projected onto their original surface representation. Once represented at the vertex / surface level, researchers should be able to interpret their findings, as there is an extensive literature of results presented and interpreted in standard space. One could even re-parcellate results into a familiar anatomical atlas if desired.

This example covers the back projection of feature weights to native surface space for a number of different pipeline / parcellation pairs explored in the main project.

We use the already saved dataset, but with one tweak to make our lives a little easier. Instead of using the consolidated data files, we load each modality separately so that the data are easier to plot and keep track of.
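For reference, here is a minimal sketch of what that separate loading might look like. The modality names, file layout, and use of add_data_files are illustrative assumptions, not the project's exact code:

```python
from glob import glob

import BPt as bp

# Hypothetical modality names and file layout, stand-ins for the real saved data
modalities = ['thickness', 'sulc', 'myelin', 'curv']

data = bp.Dataset()
for modality in modalities:
    # One saved surface file per subject, kept as its own data file column
    files = {modality: glob(f'data/{modality}/*.npy')}
    data = data.add_data_files(files, file_to_subject='auto')
```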

Next, we set up some common variables, wrap the evaluate function in a helper function, and define the code used to plot the average inverse transformed feature weights for each modality.
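A rough sketch of that setup is below. The helper names (run_eval, plot_avg_inverse_fis), the exact bp.evaluate arguments, and the use of nilearn's fsaverage surfaces for plotting are all illustrative assumptions; the project's actual surface space and plotting code may differ:

```python
import numpy as np
import pandas as pd
import BPt as bp
from nilearn import datasets, plotting

# Common settings shared by the examples below
target = 'anthro_waist_cm'   # regression target; a binary target also works
n_folds = 5
fsaverage = datasets.fetch_surf_fsaverage()

def run_eval(pipeline, dataset, target=target, cv=n_folds):
    '''Thin wrapper around bp.evaluate with the shared settings.'''
    return bp.evaluate(pipeline=pipeline, dataset=dataset,
                       target=target, cv=cv)

def plot_avg_inverse_fis(results, modality=0, surf_mesh=fsaverage['infl_left']):
    '''Average the back projected feature weights across folds for one
    modality and plot them on a surface (illustrative only).'''
    inv_fis = results.get_inverse_fis()
    avg = pd.concat([fold[modality] for fold in inv_fis], axis=1).mean(axis=1)
    plotting.plot_surf_stat_map(surf_mesh, np.asarray(avg), cmap='cold_hot')
```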

In this example, we will be predicting the regression target variable 'anthro_waist_cm'. That said, all of the above code is designed to work with binary targets as well (if running the code yourself, you can try changing the variable).

Base Elastic-Net

The first example simply back projects the weights from an elastic-net based pipeline fit on one of the randomly generated parcellations. Since this regressor generates beta weights, it is relatively straightforward to back project these values according to the parcellation's region assignments.
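A sketch of what such a pipeline could look like is below. The parcellation file name is a placeholder, and the specific option strings ('robust', 'elastic') and the use of SurfLabels from BPt.extensions are assumptions that may differ from the project's pipeline definitions:

```python
import BPt as bp
from BPt.extensions import SurfLabels

# Loader that averages vertex values within each region of a (randomly
# generated) parcellation - the file name here is a placeholder
parcel_loader = bp.Loader(SurfLabels(labels='parcels/random_100.npy'))

elastic_pipe = bp.Pipeline([parcel_loader,
                            bp.Scaler('robust'),
                            bp.Model('elastic')])

elastic_results = run_eval(elastic_pipe, data)
```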

The back projected feature importances can be obtained with the special BPt function get_inverse_fis.
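For example, with the evaluator returned by the elastic-net pipeline above:

```python
# Back project each fold's beta weights to vertex space
inv_fis = elastic_results.get_inverse_fis()
```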

Looking a little closer, we see that this returns a list of pandas Series.

We can further see that the first fold, first modality has the correct shape / number of values per vertex.
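A quick check of that structure (assuming, as the output above suggests, the list is nested fold-first, then by modality):

```python
print(type(inv_fis), len(inv_fis))   # list with one entry per fold (5 here)
print(inv_fis[0][0].shape)           # values per vertex for fold 0, modality 0
```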

Now, if we are interested in plotting, we can generate average values across all 5 folds to plot as such:
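A minimal version of that averaging, under the same nesting assumption:

```python
import pandas as pd

# Mean back projected weight per vertex across the 5 folds, first modality
avg_weights = pd.concat([fold_fis[0] for fold_fis in inv_fis],
                        axis=1).mean(axis=1)
```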

This logic is already wrapped up in the plotting function; let's try it here.
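Using the hypothetical helper sketched earlier:

```python
plot_avg_inverse_fis(elastic_results)
```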

Base - LGBM

As our next example, we simply use an LGBM based pipeline instead of the elastic-net one. In this case we repeat the same steps, except this time we plot LGBM's automatically computed feature importances instead of beta weights.
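The pipeline change itself might be as simple as swapping the model piece (again a sketch; 'lgbm' as the option string is an assumption):

```python
# Same loader / scaler as before, just an LGBM model in place of the elastic-net
lgbm_pipe = bp.Pipeline([parcel_loader,
                         bp.Scaler('robust'),
                         bp.Model('lgbm')])

lgbm_results = run_eval(lgbm_pipe, data)
plot_avg_inverse_fis(lgbm_results)
```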

What about a voting ensemble of Elastic-Net?

In this case, what goes on behind the scenes is that the coef_ from each of the base models are averaged. In more detail, when get_inverse_fis is called, the coef_ from each base estimator are first back projected to the original space and then averaged. Even though each base estimator has 100 coef_, it would be wrong to average them in that space, since each set of coef_ refers to a different parcellation; it is therefore necessary to average only after back projection. This is taken care of internally, since the nested estimators of the voting ensemble have Loader objects. Let's try it below.
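First, though, a toy numpy sketch (not the BPt internals) of why the order of operations matters:

```python
import numpy as np

# Toy example: two base models, each fit on a different random 100-region
# parcellation of the same 1000-vertex surface. Averaging the raw coef_
# would mix regions from different parcellations, so we back project first.
rng = np.random.default_rng(0)
n_vertices = 1000
parcels = [rng.integers(0, 100, n_vertices) for _ in range(2)]   # vertex -> region
coefs = [rng.normal(size=100) for _ in range(2)]                 # one beta per region

# Back project: each vertex takes the weight of the region it belongs to,
# then the two vertex-wise maps can be meaningfully averaged
vertex_maps = [c[p] for c, p in zip(coefs, parcels)]
avg_map = np.mean(vertex_maps, axis=0)
```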

This method is also dynamic enough to support averaging across parcellations of different sizes, or with LGBM feature importances instead of coef_.

SVM / Permutation Feature Importance

The Elastic-Net and LGBM pipelines both have default, already computed feature importances which we can back project and plot right away. For the SVM based pipelines, though, this isn't possible. Instead we need to calculate feature importances another way; in this example we will use permutation based feature importances.
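As a generic, self-contained illustration of the idea (plain scikit-learn on synthetic data, rather than the project's wrapped version): permute each feature in the validation set and measure how much the score drops.

```python
import numpy as np
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Small synthetic stand-in for one fold's transformed features / target
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
y = 2 * X[:, 0] + rng.normal(scale=.5, size=200)

X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)
model = SVR().fit(X_tr, y_tr)

# Shuffle each feature in turn and record the resulting drop in score
perm = permutation_importance(model, X_val, y_val, n_repeats=10, random_state=0)
importances = perm.importances_mean
```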

SVM based voting ensemble

With the voting ensemble before, we set up the feature importances by taking the average of each set once back projected. Now what about when we have an ensemble of SVM classifiers? In this case we need to compute the permutation feature importances again, but we want to ensure that the features being permuted are the fully transformed features. Let's get an example going before getting into more details...

What happens internally when we call permutation_importance is that the following internal function is used to set up the proper X_val and locate the proper sub estimator for each fold. In this case, we want just_model and nested_model to both be True.
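As a rough, hypothetical sketch of that idea (not BPt's actual internal code): push the raw validation data through every step except the final model, and optionally unwrap the nested estimator inside that final step, so permutation importance is run on fully transformed features.

```python
def get_fold_model_and_X_val(fitted_pipeline, X_val_raw, nested_model=True):
    '''Hypothetical helper illustrating the just_model / nested_model idea.'''
    # Transform with everything except the last (model) step
    X_val = X_val_raw
    for _, step in fitted_pipeline.steps[:-1]:
        X_val = step.transform(X_val)

    # Grab the final model, optionally digging out a wrapped / nested estimator
    model = fitted_pipeline.steps[-1][1]
    if nested_model and hasattr(model, 'estimator_'):
        model = model.estimator_

    return model, X_val
```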

Looking at X_val, we notice the shape might look a little funny - but we can confirm that it lines up with the feat names.

So what's going on here? First, note that we have a random ensemble with two random parcellations, each of size 100, and we also have 4 modalities. That means that, after being loaded and transformed, each parcellation will yield 400 features, and the two sub SVM models with front-end feature selection will each be fed 400 features. So why do we have 598 features here? We can get a better idea of what these features represent if we look at the feat names.

Two things are going on here. The first you'll note is that some features are missing. That is because these feat names and X_val represent the transformed data, after any features that the feature selection step chose to remove have already been dropped. The other piece you'll note is the '0_' prefix on the features above and the '1_' prefix on the last feature we printed. That is to say, the feat names and X_val represent the concatenated, fully transformed output from each of the two sub-SVM models.

This is exactly what we want, as we need all available features to already be present (i.e., not waiting to be transformed) when we pass them to the function responsible for calculating the permutation based feature importances. The last step that happens internally, which we don't need to worry about, is that the 'predict' function of BPtVotingEstimator is designed to automatically detect this alternative output format; we can confirm that here:

Now that we know what's going on behind the scenes, with BPt we can just do the exact same thing as before, and it will take care of the details!

Stacked Ensembles - Elastic-Net

Okay, now what about a stacking based ensemble? We can do something similar to the voting case, but instead of just taking the mean of the existing feature importances, we take a weighted average according to the feature importances of the stacking regressor itself. Note that in order for this to work we have to make a simplification: we throw away information on sign, and instead consider only the average of the absolute values, weighted by the absolute weights of the stacker.
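A toy numpy sketch of that weighted averaging (again not the BPt internals, just the idea):

```python
import numpy as np

rng = np.random.default_rng(0)
n_vertices = 1000

# Two base elastic-net models, each fit on its own random 100-region parcellation
parcels = [rng.integers(0, 100, n_vertices) for _ in range(2)]
coefs = [rng.normal(size=100) for _ in range(2)]

# The stacking regressor assigns one weight per base model's predictions
stacker_weights = np.array([.7, -.3])

# Back project each base model's coef_, drop sign by taking absolute values,
# then average weighted by the absolute stacker weights
vertex_maps = np.stack([np.abs(c[p]) for c, p in zip(coefs, parcels)])
avg_map = np.average(vertex_maps, axis=0, weights=np.abs(stacker_weights))
```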

We can also, in the same manner as before, consider generating and plotting permutation based feature importances, e.g., in the case of using an SVM based classifier instead of the elastic-net.