{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to Python and Scikit-Learn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Mainly inspired by https://scikit-learn.org/stable/modules/tree.html."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A useful package\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load a dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From the library sklearn (Scikit-learn), in the part datasets\n",
    "# I want to load the function 'load_breast_cancer'\n",
    "from sklearn.datasets import load_breast_cancer\n",
    "\n",
    "# Create the input and output data\n",
    "# I call the function 'load_breast_cancer'...\n",
    "X, y = load_breast_cancer(return_X_y=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_breast_cancer?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Divide this dataset into a learning set and a test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size of my data\n",
    "print(X.shape)\n",
    "print(y.shape)\n",
    "\n",
    "# Number of samples\n",
    "print(X.shape[0])\n",
    "# Number of attributes\n",
    "print(X.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "number_of_ls = 250\n",
    "# Shuffle the samples before splitting, so that the learning set is not\n",
    "# simply the first rows in the dataset's original order.\n",
    "# The seed is fixed to make the split reproducible.\n",
    "random_state = 0\n",
    "from sklearn.utils import check_random_state\n",
    "random_state = check_random_state(random_state)\n",
    "a_permutation = random_state.permutation(np.arange(X.shape[0]))\n",
    "print(\"New order: {}\".format(a_permutation))\n",
    "# Apply the same permutation to X and y to keep samples and labels aligned\n",
    "X = X[a_permutation, :]\n",
    "y = y[a_permutation]\n",
    "\n",
    "# Now I can divide the dataset\n",
    "X_train, X_test = X[:number_of_ls,:], X[number_of_ls:,:]\n",
    "y_train, y_test = y[:number_of_ls], y[number_of_ls:]\n",
    "\n",
    "print('LS size = {}'.format(X_train.shape))\n",
    "print('TS size = {}'.format(X_test.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load an algorithm and learn a model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# or load the full module\n",
    "from sklearn import tree\n",
    "clf = tree.DecisionTreeClassifier()\n",
    "\n",
    "# or only load the DecisionTreeClassifier...\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "clf = DecisionTreeClassifier()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What are the parameters of a model?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Learn a model = fit the model parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "plt.figure(figsize=(25,15))\n",
    "tree.plot_tree(clf, filled=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same plot; the trailing ';' hides the text output of the last expression\n",
    "plt.figure(figsize=(25,15))\n",
    "tree.plot_tree(clf, filled=True);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Make a prediction?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate those predictions (we have the solution 'y_test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add noisy features to make the problem harder.\n",
    "# The result goes into a new array (X_noisy) instead of overwriting X,\n",
    "# so re-running this cell does not keep appending more noise columns.\n",
    "random_state = np.random.RandomState(0)\n",
    "n_samples, n_features = X.shape\n",
    "X_noisy = np.c_[X, random_state.randn(n_samples, 20 * n_features)]\n",
    "X_train, X_test = X_noisy[:number_of_ls,:], X_noisy[number_of_ls:,:]\n",
    "y_train, y_test = y[:number_of_ls], y[number_of_ls:]\n",
    "\n",
    "# Learn the model again and predict on the noisy data\n",
    "clf.fit(X_train, y_train)\n",
    "predictions = clf.predict(X_test)\n",
    "# One option: every classifier/regressor has a \"score\" method\n",
    "clf.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# But you can use other metrics (depending on what you want to evaluate, see Lecture 6)\n",
    "from sklearn.metrics import accuracy_score\n",
    "accuracy_score(y_test, predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# More evaluation: predict the probability for each class\n",
    "probas = clf.predict_proba(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score,roc_curve\n",
    "# Column 1 of probas is used as the score for the ROC curve and its area\n",
    "fpr, tpr, thresholds = roc_curve(y_test, probas[:,1])\n",
    "score = roc_auc_score(y_test,probas[:,1])\n",
    "\n",
    "plt.figure()\n",
    "lw = 2\n",
    "plt.plot(fpr, tpr, color='darkorange',\n",
    "         lw=lw, label='ROC curve (area = {})'.format(score))\n",
    "# Dashed diagonal from (0, 0) to (1, 1) as a visual reference\n",
    "plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n",
    "plt.xlim([0.0, 1.0])\n",
    "plt.ylim([0.0, 1.05])\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.title('Receiver operating characteristic example')\n",
    "plt.legend(loc=\"lower right\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}