Leakage of test set information¶
1 2 3 4 5 6 7 8 | import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from sklearn.feature_selection import SelectKBest from sklearn.linear_model import SGDClassifier from sklearn.metrics import accuracy_score import seaborn as sns |
def mk_data(nobs=1000, nfeatures=1000, p=0.5, seed=None):
    """Generate pure-noise data: features unrelated to the target.

    Parameters
    ----------
    nobs : int
        Number of observations (rows).
    nfeatures : int
        Number of features (columns).
    p : float
        Probability that a target label equals 1.
    seed : int or None
        Seed handed to ``np.random.RandomState``.

    Returns
    -------
    X : ndarray of shape (nobs, nfeatures)
        Independent draws from a standard normal distribution.
    y : ndarray of shape (nobs,)
        Binary labels drawn independently of ``X``.
    """
    random_state = np.random.RandomState(seed)
    # features are drawn first, then the labels, from the same generator
    features = random_state.normal(size=(nobs, nfeatures))
    labels = random_state.binomial(n=1, p=p, size=nobs)
    return features, labels


# many more features than observations
nobs = 100
nfeatures = 10000
X, y = mk_data(nobs=nobs, nfeatures=nfeatures)

# sanity-check the shapes of the generated arrays
assert X.shape == (nobs, nfeatures)
assert y.shape == (nobs,)
Proper cross-validation¶
# generate pure-noise data: the features carry no information about the target
X, y = mk_data(nobs, nfeatures, seed=1234)

# repeat the whole train/evaluate procedure over many random splits
scores = []
nsplits = 500
n_selected_features = 50
for i in range(nsplits):
    # seed each split with its index so the experiment is reproducible
    # (the data above is seeded too)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i)

    # scaling statistics are estimated on the training set only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # select the best features using training set only
    selector = SelectKBest(k=n_selected_features)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    # fit a classifier; seeded so that its internal shuffling is repeatable
    clf = SGDClassifier(random_state=i)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    scores.append(score)

print('accuracy:', np.mean(scores))
accuracy: 0.4904666666666667
Feature selection on the entire dataset¶
X, y = mk_data(nobs, nfeatures, seed=1234)

# do feature selection outside the loop - induces data leakage:
# the selector sees the labels of observations that later land in the test set
X = SelectKBest(k=n_selected_features).fit_transform(X, y)

scores_bad_fsel = []
for i in range(nsplits):
    # seed each split with its index so the experiment is reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i)

    # scaling statistics are estimated on the training set only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # fit a classifier; seeded so that its internal shuffling is repeatable
    clf = SGDClassifier(random_state=i)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    scores_bad_fsel.append(score)

print('accuracy:', np.mean(scores_bad_fsel))
accuracy: 0.9241333333333335
import matplotlib.pyplot as plt

# one column of split scores per method ...
df_scores = pd.DataFrame({'proper_fsel': scores, 'bad_fsel': scores_bad_fsel})
# ... reshaped to long format: one row per (method, score) pair
df_scores = df_scores.melt(var_name='method', value_name='score')

# kernel density estimate of the scores, one curve per method
sns.displot(data=df_scores, x='score', hue='method', kind='kde')

# dashed reference line at 0.5, the accuracy expected by chance for balanced labels
plt.axvline(x=0.5, color='blue', linestyle='--')
<matplotlib.lines.Line2D at 0x34edf9c70>
Hyperparameter selection¶
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# regularisation strengths tried for both kernels
c_values = [.0001, .001, .01, 0.1, 1, 10, 100, 1000]
# kernel widths tried for the RBF kernel only
gamma_values = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]

param_grid = [
    {'C': list(c_values), 'kernel': ['linear']},
    {'C': list(c_values), 'gamma': list(gamma_values), 'kernel': ['rbf']},
]

X, y = mk_data(nobs, nfeatures, seed=1234)

# 5-fold cross-validated search over every combination in the grid
gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
gs.fit(X, y)

# best_score_ is the maximum over all candidates, so on pure-noise data
# it is an optimistic estimate of the accuracy
print(gs.best_params_)
print(gs.best_score_)
{'C': 0.001, 'kernel': 'linear'} 0.5900000000000001
# eight log-spaced values from 1e-4 to 1e3
10.0 ** np.arange(-4, 4)
array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# seed NumPy's global generator (presumably so the unseeded estimators
# below behave repeatably) and draw a fresh noise data set
np.random.seed(14567)
X, y = mk_data(nobs, nfeatures, seed=14567)

# single-step pipeline; the 'clf' slot is a placeholder filled in by the grid
pipeline = Pipeline(steps=[('clf', None)])

# log-spaced strengths 1e-4 ... 1e3, used for both estimator families
log_grid = 10.**np.arange(-4, 4, 1)

parameters = [
    {
        'clf': (SGDClassifier(),),
        'clf__alpha': log_grid,
        'clf__penalty': ('l1', 'l2', 'elasticnet'),
    },
    {
        'clf': (SVC(),),
        'clf__C': log_grid,
        'clf__kernel': ('linear', 'rbf'),
    },
]

# search over estimator type and its hyperparameters at the same time
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
)
grid_search.fit(X, y)

print(grid_search.best_params_)
print(grid_search.best_score_)
{'clf': SGDClassifier(), 'clf__alpha': np.float64(10.0), 'clf__penalty': 'l2'} 0.64
# number of hyperparameter combinations that were evaluated
mean_scores = grid_search.cv_results_['mean_test_score']
len(mean_scores)
40
# histogram of the mean cross-validated accuracy of every candidate setting
cv_means = grid_search.cv_results_['mean_test_score']
sns.displot(cv_means, bins=10)
<seaborn.axisgrid.FacetGrid at 0x34f24b860>
1 |