{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Leakage of test set information\n", "\n" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.linear_model import SGDClassifier\n", "from sklearn.metrics import accuracy_score\n", "import seaborn as sns\n" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# generate data with no relations between features and target\n", "\n", "def mk_data(nobs=1000, nfeatures=1000, p=0.5, seed=None):\n", " rng = np.random.RandomState(seed)\n", " X = rng.normal(size=(nobs, nfeatures))\n", " y = rng.binomial(1, p, size=nobs)\n", " return X, y\n", "\n", "nobs = 100\n", "nfeatures = 10000\n", "X, y = mk_data(nobs, nfeatures)\n", "assert X.shape == (nobs, nfeatures)\n", "assert y.shape == (nobs,)\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Proper crossvalidation" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy: 0.4904666666666667\n" ] } ], "source": [ "# split data into training and test sets\n", "\n", "X, y = mk_data(nobs, nfeatures, seed=1234)\n", "\n", "# do 100 shuffle splits\n", "scores = []\n", "nsplits = 500\n", "n_selected_features = 50\n", "\n", "for i in range(nsplits):\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n", " scaler = StandardScaler()\n", " X_train = scaler.fit_transform(X_train)\n", " X_test = scaler.transform(X_test)\n", " # select the best features using training set only\n", " selector = SelectKBest(k=n_selected_features)\n", " X_train = selector.fit_transform(X_train, y_train)\n", " X_test = selector.transform(X_test)\n", " # fit a classifier\n", " clf = SGDClassifier()\n", " clf.fit(X_train, y_train)\n", " score = clf.score(X_test, y_test)\n", " scores.append(score)\n", "\n", "print('accuracy:', np.mean(scores))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Feature selection on the entire dataset" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy: 0.9241333333333335\n" ] } ], "source": [ "X, y = mk_data(nobs, nfeatures, seed=1234)\n", "\n", "# do feature selection outside the loop - induces data leakage\n", "X = SelectKBest(k=n_selected_features).fit_transform(X, y)\n", "\n", "scores_bad_fsel = []\n", "\n", "for i in range(nsplits):\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n", " scaler = StandardScaler()\n", " X_train = scaler.fit_transform(X_train)\n", " X_test = scaler.transform(X_test)\n", " # fit a classifier\n", " clf = SGDClassifier()\n", " clf.fit(X_train, y_train)\n", " score = clf.score(X_test, y_test)\n", " scores_bad_fsel.append(score)\n", "\n", "print('accuracy:', np.mean(scores_bad_fsel))\n" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_scores = pd.DataFrame({'proper_fsel': scores, 'bad_fsel': scores_bad_fsel})\n", "# gather the data in a long format\n", "df_scores = pd.melt(df_scores, var_name='method', value_name='score')\n", "\n", "sns.displot(data=df_scores, x='score', hue='method', kind='kde')\n", "\n", "import matplotlib.pyplot as plt\n", "plt.axvline(0.5, color='blue', linestyle='--')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Hyperparameter selection" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'C': 0.001, 'kernel': 'linear'}\n", "0.5900000000000001\n" ] } ], "source": [ "param_grid = [\n", " {'C': [.0001, .001, .01, 0.1, 1, 10, 100, 1000], 'kernel': ['linear']},\n", " {'C': [.0001, .001, .01, 0.1, 1, 10, 100, 1000], 'gamma': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},\n", " ]\n", "\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.svm import SVC\n", "\n", "X, y = mk_data(nobs, nfeatures, seed=1234)\n", "gs = GridSearchCV(SVC(), param_grid, cv=5)\n", "gs.fit(X, y)\n", "\n", "print(gs.best_params_)\n", "print(gs.best_score_)\n" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "10.**np.arange(-4, 4, 1)" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'clf': SGDClassifier(), 'clf__alpha': np.float64(10.0), 'clf__penalty': 'l2'}\n", "0.64\n" ] } ], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.svm import LinearSVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.neural_network import MLPClassifier\n", "\n", "np.random.seed(14567)\n", "X, y = mk_data(nobs, nfeatures, seed=14567)\n", "\n", "pipeline = Pipeline([\n", " ('clf', None),\n", "])\n", "parameters = [\n", " {\n", " 'clf': (SGDClassifier(),),\n", " 'clf__alpha': 10.**np.arange(-4, 4, 1),\n", " 'clf__penalty': ('l1', 'l2', 'elasticnet'),\n", " }, \n", " {\n", " 'clf': (SVC(),),\n", " 'clf__C': 10.**np.arange(-4, 4, 1),\n", " 'clf__kernel': ('linear', 'rbf'),\n", " },\n", "\n", "]\n", "grid_search = GridSearchCV(pipeline, parameters,\n", " scoring='accuracy')\n", "grid_search.fit(X, y)\n", "\n", "print(grid_search.best_params_)\n", "print(grid_search.best_score_)" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "40" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(grid_search.cv_results_['mean_test_score'])\n" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.displot(grid_search.cv_results_['mean_test_score'], bins=10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }