Decoding from fMRI data¶
In this notebook we will perform classification using the Haxby et al. data that were prepared in the Data Setup notebook.
```python
import os
import warnings

import h5py
import matplotlib.pyplot as plt
import nibabel as nib
import nilearn
import nilearn.plotting
import numpy as np
import pandas as pd
import seaborn as sns
from nilearn.decoding.searchlight import search_light
from nilearn.maskers import NiftiMasker
from sklearn import neighbors
from sklearn.feature_selection import f_classif, SelectKBest, VarianceThreshold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneGroupOut, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from utils import get_subject_runs

base_dir = '/Users/poldrack/data_unsynced/ds000105'
h5_file = os.path.join(base_dir, 'derivatives/cleaned/haxby_data_cleaned.h5')
```
Load the data¶
```python
def get_subject_data(
    subject,
    h5_file,
    bids_dir,
    data_key='vtmaskdata',
    condmeans=False,
    condition_subset=None,
):
    assert data_key in ['braindata', 'vtmaskdata', 'difumodata']
    meanstr = 'mean_' if condmeans else ''
    runs = get_subject_runs(subject, bids_dir)
    X, metadata_df = None, None
    with h5py.File(h5_file, 'r') as hf:
        for run in runs:
            # stack the data for this run onto the full data matrix
            if X is None:
                X = hf[f'sub-{subject}/run-{run}/{meanstr + data_key}'][:]
            else:
                X = np.vstack(
                    (X, hf[f'sub-{subject}/run-{run}/{meanstr + data_key}'][:])
                )
            # build up the metadata (condition labels and run numbers)
            conditions = [
                i.decode('utf-8')
                for i in hf[f'sub-{subject}/run-{run}/{meanstr}conditions'][:]
            ]
            run_df = pd.DataFrame({'conditions': conditions, 'run': run})
            if metadata_df is None:
                metadata_df = run_df
            else:
                metadata_df = pd.concat([metadata_df, run_df])

    # reset the index so that condition subsetting selects the correct rows of X
    metadata_df = metadata_df.reset_index(drop=True)

    if condition_subset is not None:
        metadata_df = metadata_df[
            metadata_df['conditions'].isin(condition_subset)
        ]
        X = X[metadata_df.index]

    assert X.shape[0] == metadata_df.shape[0]
    return X, metadata_df


X, metadata_df = get_subject_data(
    1, h5_file, base_dir, condmeans=True, data_key='vtmaskdata'
)
print(X.shape)
metadata_df.conditions.unique()
```
(96, 1404)
array(['bottle', 'cat', 'chair', 'face', 'house', 'scissors', 'scrambledpix', 'shoe'], dtype=object)
Run decoding model¶
We use a leave-one-run-out cross-validation scheme, in which each fold trains on all but one run and tests on the held-out run (see the minimal sketch below).
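As a minimal illustration of how this works, scikit-learn's `LeaveOneGroupOut` can be used with run labels as the groups. This sketch uses made-up data rather than the Haxby dataset:

```python
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

# toy data: 6 samples from 3 runs (2 samples per run)
X_toy = np.arange(12).reshape(6, 2)
runs_toy = np.array([1, 1, 2, 2, 3, 3])

logo = LeaveOneGroupOut()
for train_idx, test_idx in logo.split(X_toy, groups=runs_toy):
    # each fold holds out every sample from exactly one run
    print(f'test run: {runs_toy[test_idx][0]}, test indices: {test_idx}')
```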
```python
# leave one run out cross validation

def run_cv_subject(
    data,
    metadata_df,
    nfeatures=None,
    shuffle=False,
    varthresh=None,
    clf=None,
    suppress_warnings=False,
    standardize=True,
):
    """
    Perform cross-validation for decoding analysis.

    Parameters:
    - data (numpy.ndarray): The feature matrix (samples x features).
    - metadata_df (pandas.DataFrame): Metadata with 'conditions' and 'run' columns.
    - nfeatures (int): The number of features to select using ANOVA, or None to disable.
    - shuffle (bool): Whether to shuffle the labels. Default is False.
    - varthresh (float): The variance threshold for feature selection, or None to disable.
    - clf (sklearn classifier): The classifier to use. Default is None, which uses an SGDClassifier.
    - suppress_warnings (bool): Whether to suppress scikit-learn warnings.
    - standardize (bool): Whether to standardize features within each fold.

    Returns:
    - accs (list): A list of accuracy scores for each cross-validation fold.
    - mean_coefs: Mean coefficients across folds (None if the classifier has no coef_).
    """
    if clf is None:
        clf = SGDClassifier()  # by default is like SVM with l2 regularization

    # Suppress warnings from scikit-learn
    if suppress_warnings:
        warnings.filterwarnings('ignore', category=UserWarning)
        warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Leave one run out cross-validation
    logo = LeaveOneGroupOut()

    assert data.shape[0] == metadata_df.shape[0]

    if shuffle:
        metadata_df.conditions = np.random.permutation(metadata_df.conditions)

    # perform variance thresholding with default of zero
    if varthresh is not None:
        vt = VarianceThreshold(threshold=varthresh)
        data = vt.fit_transform(data)

    accs = []
    coefs = None

    for train_index, test_index in logo.split(data, groups=metadata_df.run):
        train_X = data[train_index]
        assert train_X.shape[0] == len(train_index)
        assert train_X.shape[1] == data.shape[1]
        test_X = data[test_index]
        train_y = metadata_df.conditions.iloc[train_index]
        test_y = metadata_df.conditions.iloc[test_index]

        if standardize:
            scaler = StandardScaler()
            train_X = scaler.fit_transform(train_X)
            test_X = scaler.transform(test_X)

        if nfeatures is not None:
            # Feature selection based on training data only
            selector = SelectKBest(score_func=f_classif, k=nfeatures)
            train_data_selected = selector.fit_transform(train_X, train_y)
            test_data_selected = selector.transform(test_X)
        else:
            train_data_selected = train_X
            test_data_selected = test_X

        clf.fit(train_data_selected, train_y)
        # accumulate coefficients across folds for classifiers that expose them
        if hasattr(clf, 'coef_'):
            if coefs is None:
                coefs = clf.coef_
            else:
                coefs += clf.coef_
        acc = accuracy_score(test_y, clf.predict(test_data_selected))
        accs.append(acc)

    if coefs is not None:
        mean_coefs = coefs / len(accs)
    else:
        mean_coefs = None
    return accs, mean_coefs


def get_runs_from_hf(hf, subject):
    runs = [int(i.split('-')[-1]) for i in hf[subject].keys()]
    runs.sort()
    return runs


def run_cv(
    clf=None,
    shuffle=False,
    nruns=1,
    nfeatures=None,
    varthresh=None,
    data_key='brain',
    condmeans=False,
    condition_subset=None,
):
    data_key = data_key + 'data'
    if clf is None:
        clf = SGDClassifier()
    print(f'Running cross-validation with {clf}')
    if nfeatures is not None:
        print(f'Selecting best {nfeatures} features: ')
    if varthresh is not None:
        print(f'Performing variance thresholding with threshold {varthresh}')
    if shuffle:
        print('Using shuffled labels')

    accs = {}
    coefs = {}
    for subject in range(1, 7):
        subdata, metadata_df = get_subject_data(
            subject,
            h5_file,
            base_dir,
            condmeans=condmeans,
            data_key=data_key,
            condition_subset=condition_subset,
        )
        print(f'Subject {subject}')
        shuffled_string = 'shuffled' if shuffle else 'orig'
        accs[subject] = {shuffled_string: []}
        for i in range(nruns):
            acc_, coef_ = run_cv_subject(
                subdata,
                metadata_df,
                nfeatures=nfeatures,
                shuffle=shuffle,
                varthresh=varthresh,
                clf=clf,
            )
            if shuffle is False:
                coefs[subject] = coef_
            accs[subject][shuffled_string].append(np.mean(acc_))
        print(
            f'Mean accuracy ({shuffled_string}): {np.mean(accs[subject][shuffled_string]):.03}'
        )
        print('')
    return accs, coefs


# run cross-validation across subjects with default settings
# need to run without thresholding or feature selection to get the full set of coefs
data_key = 'vtmask'
condmeans = True
condition_subset = None  # ['face', 'house', 'cat']

accs, coefs = run_cv(
    shuffle=False,
    data_key=data_key,
    condmeans=condmeans,
    varthresh=None,
    nfeatures=None,
    condition_subset=condition_subset,
)

results_df = pd.DataFrame(
    {f'SGDL2_{data_key}': [accs[i]['orig'][0] for i in accs.keys()]}
)
results_df
```
```
Running cross-validation with SGDClassifier()
Subject 1
Mean accuracy (orig): 0.802

Subject 2
Mean accuracy (orig): 0.646

Subject 3
Mean accuracy (orig): 0.729

Subject 4
Mean accuracy (orig): 0.635

Subject 5
Mean accuracy (orig): 0.875

Subject 6
Mean accuracy (orig): 0.729
```
|   | SGDL2_vtmask |
|---|---|
| 0 | 0.802083 |
| 1 | 0.645833 |
| 2 | 0.729167 |
| 3 | 0.635417 |
| 4 | 0.875000 |
| 5 | 0.729167 |
Shuffle the labels to assess performance against chance.
```python
n_shuf_runs = 1000
accs_shuf, _ = run_cv(
    shuffle=True,
    nruns=n_shuf_runs,
    data_key=data_key,
    condmeans=condmeans,
    varthresh=None,
    nfeatures=None,
    condition_subset=condition_subset,
)
```
```
Running cross-validation with SGDClassifier()
Using shuffled labels
Subject 1
Mean accuracy (shuffled): 0.112

Subject 2
Mean accuracy (shuffled): 0.112

Subject 3
Mean accuracy (shuffled): 0.112

Subject 4
Mean accuracy (shuffled): 0.114

Subject 5
Mean accuracy (shuffled): 0.112

Subject 6
Mean accuracy (shuffled): 0.113
```
```python
for i in accs_shuf.keys():
    # customary to add the actual value to the shuffled values to avoid p=0
    pval = np.mean(accs_shuf[i]['shuffled'] >= accs[i]['orig'][0]) + 1 / n_shuf_runs
    print(f'Subject {i}: p = {pval:.03f} versus null')
```
```
Subject 1: p = 0.001 versus null
Subject 2: p = 0.001 versus null
Subject 3: p = 0.001 versus null
Subject 4: p = 0.001 versus null
Subject 5: p = 0.001 versus null
Subject 6: p = 0.001 versus null
```
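The p-values above follow the usual permutation-test convention of never reporting exactly zero: the cell adds 1/n_shuf_runs to the empirical proportion, which is essentially equivalent to the common (count + 1)/(n + 1) formulation. A minimal sketch of that convention on made-up numbers (not the values from this analysis):

```python
import numpy as np

# hypothetical null distribution of accuracies obtained by label shuffling
null_accs = np.array([0.11, 0.13, 0.10, 0.12, 0.14])
observed_acc = 0.80
n_perm = len(null_accs)

# count the observed value as one of the permutations so p can never be zero
pval = (np.sum(null_accs >= observed_acc) + 1) / (n_perm + 1)
print(f'p = {pval:.3f}')
```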
Visualizing the coefficients¶
```python
def get_subject_vt_mask(subject, vtmask_dir, res=3):
    vt_mask_file = os.path.join(
        vtmask_dir,
        f'sub-{subject}_mask4vt_space-MNI152NLin2009cAsym_res-{res}.nii.gz',
    )
    return nib.load(vt_mask_file)


def visualize_coefs(coefs, conditions, vtmask_dir):
    # plot the mean coefficients
    for subject in range(1, 2):
        mask_img = get_subject_vt_mask(subject, vtmask_dir)
        masker = NiftiMasker(mask_img)
        masker.fit()
        print(coefs[subject].shape)
        coef_img = masker.inverse_transform(coefs[subject])
        for c in range(coefs[subject].shape[0]):
            nilearn.plotting.plot_stat_map(
                coef_img.slicer[..., c],
                display_mode='z',
                cut_coords=[-21, -18, -15, -9, 0],
                title=f'Subject {subject}, condition {conditions[c]}',
            )


if condition_subset is None:
    conditions = metadata_df.conditions.unique().tolist()
else:
    conditions = condition_subset

vtmask_dir = os.path.join(base_dir, 'derivatives/vtmasks')
visualize_coefs(coefs, conditions, vtmask_dir)
```
(8, 1404)
Effect of different regularization schemes¶
In the previous model, we used an SGD-trained linear classifier (which by default behaves like a support vector machine) with an L2 penalty, which penalizes the sum of squared weights. We can also look at the effect of using a different penalty, namely an L1 penalty, which penalizes the sum of absolute weights and in general leads to sparser solutions (more zero weights).

We can set the `SGDClassifier` to use L1 penalization via its `penalty` argument.
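To see the sparsity effect directly, one can compare the fraction of exactly-zero coefficients under the two penalties. This is a minimal sketch on synthetic data (not the fMRI data, with arbitrary `alpha`), just to illustrate the idea:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

# synthetic high-dimensional classification problem
X_syn, y_syn = make_classification(
    n_samples=200, n_features=500, n_informative=20, random_state=0
)

for penalty in ['l2', 'l1']:
    clf = SGDClassifier(penalty=penalty, alpha=0.01, random_state=0)
    clf.fit(X_syn, y_syn)
    # L1 regularization drives many weights to exactly zero
    frac_zero = np.mean(clf.coef_ == 0)
    print(f'{penalty}: fraction of zero weights = {frac_zero:.2f}')
```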
```python
accs_L1, coefs_L1 = run_cv(
    shuffle=False,
    data_key=data_key,
    condmeans=condmeans,
    clf=SGDClassifier(penalty='l1', alpha=0.01),
    varthresh=None,
    nfeatures=None,
)
results_df[f'SGDL1_{data_key}'] = [accs_L1[i]['orig'] for i in accs_L1.keys()]
visualize_coefs(coefs_L1, conditions, vtmask_dir)
```
```
Running cross-validation with SGDClassifier(alpha=0.01, penalty='l1')
Subject 1
Mean accuracy (orig): 0.76

Subject 2
Mean accuracy (orig): 0.667

Subject 3
Mean accuracy (orig): 0.719

Subject 4
Mean accuracy (orig): 0.656

Subject 5
Mean accuracy (orig): 0.795

Subject 6
Mean accuracy (orig): 0.708

(8, 1404)
```
Using dimensionality-reduced data¶
```python
data_key = 'difumo'
accs_difumo, _ = run_cv(
    shuffle=False,
    data_key=data_key,
    condmeans=condmeans,
    varthresh=0,
    nfeatures=50,
)
results_df[f'SGDL2_{data_key}'] = [accs_difumo[i]['orig'] for i in accs_difumo.keys()]
```
```
Running cross-validation with SGDClassifier()
Selecting best 50 features: 
Performing variance thresholding with threshold 0
Subject 1
Mean accuracy (orig): 0.812

Subject 2
Mean accuracy (orig): 0.74

Subject 3
Mean accuracy (orig): 0.802

Subject 4
Mean accuracy (orig): 0.708

Subject 5
Mean accuracy (orig): 0.818

Subject 6
Mean accuracy (orig): 0.667
```
Using whole-brain data with feature selection¶
```python
data_key = 'brain'
accs_wholebrain, _ = run_cv(
    shuffle=False,
    data_key=data_key,
    condmeans=condmeans,
    varthresh=0,
    nfeatures=500,
)
results_df[f'SGDL2_{data_key}'] = [accs_wholebrain[i]['orig'] for i in accs_wholebrain.keys()]
```
```
Running cross-validation with SGDClassifier()
Selecting best 500 features: 
Performing variance thresholding with threshold 0
Subject 1
Mean accuracy (orig): 0.833

Subject 2
Mean accuracy (orig): 0.729

Subject 3
Mean accuracy (orig): 0.854

Subject 4
Mean accuracy (orig): 0.76

Subject 5
Mean accuracy (orig): 0.875

Subject 6
Mean accuracy (orig): 0.76
```
Multi-layer perceptron¶
Now we will use a nonlinear model: a multi-layer perceptron with a single hidden layer of 128 units. Note that even a single hidden layer involves two sets of modifiable weights (input to hidden, and hidden to output), as sketched below.
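As a rough back-of-the-envelope sketch (assuming the 1404-voxel VT mask as input and the 8 stimulus categories as output, which matches the data loaded above), the number of trainable parameters in this architecture can be counted as follows:

```python
# hypothetical parameter count for a single-hidden-layer MLP:
# input -> hidden weights plus hidden biases, then hidden -> output weights plus output biases
n_in, n_hidden, n_out = 1404, 128, 8
n_params = (n_in * n_hidden + n_hidden) + (n_hidden * n_out + n_out)
print(f'approximate number of trainable parameters: {n_params}')
```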
```python
data_key = 'vtmask'
accs_MLP, coefs_MLP = run_cv(
    shuffle=False,
    data_key=data_key,
    condmeans=condmeans,
    clf=MLPClassifier(hidden_layer_sizes=(128), max_iter=500),
    varthresh=None,
    nfeatures=None,
)
results_df[f'MLP_{data_key}'] = [accs_MLP[i]['orig'] for i in accs_MLP.keys()]
```
```
Running cross-validation with MLPClassifier(hidden_layer_sizes=128, max_iter=500)
Subject 1
Mean accuracy (orig): 0.708

Subject 2
Mean accuracy (orig): 0.771

Subject 3
Mean accuracy (orig): 0.708

Subject 4
Mean accuracy (orig): 0.604

Subject 5
Mean accuracy (orig): 0.852

Subject 6
Mean accuracy (orig): 0.688
```
```python
results_df
```
|   | SGDL2_vtmask | SGDL1_vtmask | SGDL2_difumo | SGDL2_brain | MLP_vtmask |
|---|---|---|---|---|---|
| 0 | 0.802083 | [0.7604166666666666] | [0.8125] | [0.8333333333333334] | [0.7083333333333334] |
| 1 | 0.645833 | [0.6666666666666666] | [0.7395833333333334] | [0.7291666666666666] | [0.7708333333333334] |
| 2 | 0.729167 | [0.71875] | [0.8020833333333334] | [0.8541666666666666] | [0.7083333333333334] |
| 3 | 0.635417 | [0.65625] | [0.7083333333333334] | [0.7604166666666666] | [0.6041666666666666] |
| 4 | 0.875000 | [0.7954545454545454] | [0.8181818181818182] | [0.875] | [0.8522727272727273] |
| 5 | 0.729167 | [0.7083333333333334] | [0.6666666666666666] | [0.7604166666666666] | [0.6875] |
```python
sns.boxplot(data=results_df)
```
<Axes: >
Searchlight decoding using surface data¶
Based on the nilearn surface searchlight decoding tutorial.
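A searchlight on the surface needs, for each vertex, the set of neighboring vertices within some radius; nilearn's `search_light` accepts this as a sparse adjacency matrix. The sketch below shows how such an adjacency can be built from vertex coordinates with scikit-learn's `radius_neighbors_graph`, using toy coordinates rather than the fsaverage mesh:

```python
import numpy as np
from sklearn import neighbors

# toy "mesh": 5 vertices in 3D space
coords = np.array([
    [0.0, 0.0, 0.0],
    [1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0],
    [5.0, 5.0, 5.0],
    [5.0, 5.0, 6.0],
])

# vertices within radius 2 of each vertex form its searchlight neighborhood
nn = neighbors.NearestNeighbors(radius=2.0)
adjacency = nn.fit(coords).radius_neighbors_graph(coords).tolil()
print(adjacency.toarray().astype(int))
```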
```python
def get_subject_surface_data(
    subject,
    h5_file,
    bids_dir,
    condmeans=True,
    hemispheres=None,
    condition_subset=None,
):
    if hemispheres is None:
        hemispheres = ['L', 'R']
    # replace left/right with L/R
    hemispheres = [h if h in ['L', 'R'] else h[0].upper() for h in hemispheres]
    meanstr = 'mean_' if condmeans else ''
    runs = get_subject_runs(subject, bids_dir)
    X, metadata_df = None, None
    with h5py.File(h5_file, 'r') as hf:
        for run in runs:
            # concatenate the requested hemispheres for this run
            data = [
                hf[f'sub-{subject}/run-{run}/{meanstr + h}'][:]
                for h in hemispheres
            ]
            combined_data = np.hstack(data)
            if X is None:
                X = combined_data
            else:
                X = np.vstack((X, combined_data))
            # build up the metadata (condition labels and run numbers)
            conditions = [
                i.decode('utf-8')
                for i in hf[f'sub-{subject}/run-{run}/{meanstr}conditions'][:]
            ]
            run_df = pd.DataFrame({'conditions': conditions, 'run': run})
            if metadata_df is None:
                metadata_df = run_df
            else:
                metadata_df = pd.concat([metadata_df, run_df])

    # reset the index so that condition subsetting selects the correct rows of X
    metadata_df = metadata_df.reset_index(drop=True)

    if condition_subset is not None:
        metadata_df = metadata_df[
            metadata_df['conditions'].isin(condition_subset)
        ]
        X = X[metadata_df.index]

    assert X.shape[0] == metadata_df.shape[0]
    return X, metadata_df


h5_surface_file = os.path.join(
    base_dir, 'derivatives/cleaned/haxby_surface_data_cleaned.h5'
)

X, metadata_df = get_subject_surface_data(
    1,
    h5_surface_file,
    base_dir,
    condmeans=True,
    hemispheres=['L'],
    # condition_subset=["face", "house"]
)
print(X.shape)
metadata_df.conditions.unique()
```
(96, 10242)
array(['bottle', 'cat', 'chair', 'face', 'house', 'scissors', 'scrambledpix', 'shoe'], dtype=object)
```python
def searchlight_adjacency(hemi, radius=3, surf='infl'):
    assert hemi in ['left', 'right']
    fsaverage = nilearn.datasets.fetch_surf_fsaverage(mesh="fsaverage5")
    infl_mesh = fsaverage[f"{surf}_{hemi}"]
    coords, _ = nilearn.surface.load_surf_mesh(infl_mesh)
    nn = neighbors.NearestNeighbors(radius=radius)
    return nn.fit(coords).radius_neighbors_graph(coords).tolil()


def run_searchlight(subject, h5_surface_file, base_dir, radius=3,
                    condition_subset=None, condmeans=True):
    scores = {}
    for hemis in ['left', 'right']:
        adjacency = searchlight_adjacency(hemis, radius)

        X, metadata_df = get_subject_surface_data(
            subject,
            h5_surface_file,
            base_dir,
            condmeans=condmeans,
            hemispheres=[hemis],
            condition_subset=condition_subset,
        )

        # Simple linear estimator preceded by a normalization step
        estimator = make_pipeline(StandardScaler(), SGDClassifier())

        # Define cross-validation scheme: with shuffle turned off and the 12 runs
        # stored in order, each of the 6 folds holds out two complete runs,
        # giving a run-wise (leave-two-runs-out) cross-validation
        cv = KFold(n_splits=6, shuffle=False)

        # Cross-validated searchlight
        scores[hemis] = search_light(
            X,
            metadata_df.conditions.values,
            estimator,
            adjacency,
            cv=cv,
            n_jobs=12,
        )
    return scores
```
```python
fsaverage = nilearn.datasets.fetch_surf_fsaverage(mesh="fsaverage5")

for subject in range(1, 7):
    scores = run_searchlight(
        subject, h5_surface_file, base_dir, condition_subset=condition_subset
    )
    fig, axes = plt.subplots(subplot_kw={'projection': '3d'}, nrows=2, ncols=2)
    axes = axes.flatten()
    surf = 'infl'
    ctr = 0
    threshold = 0.2
    for hemis in ['left', 'right']:
        for i, view in enumerate(['lateral', 'medial']):
            infl_mesh = fsaverage[f"{surf}_{hemis}"]
            nilearn.plotting.plot_surf_stat_map(
                infl_mesh,
                scores[hemis],
                view=view,
                colorbar=True,
                threshold=threshold,
                bg_map=fsaverage[f"sulc_{hemis}"],
                title=f"Sub {subject}, {hemis} hemis",
                axes=axes[ctr],
            )
            ctr += 1
```