{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to Python and Scikit-Learn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Mainly inspired from https://scikit-learn.org/stable/modules/tree.html."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A useful package\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load a dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From the library sklearn (Scikit-learn), in the part datasets\n",
    "# I want to load the fonction 'load_iris'\n",
    "from sklearn.datasets import load_breast_cancer\n",
    "\n",
    "# Create the input and output data\n",
    "# I call the function 'load_iris'...\n",
    "X, y = load_breast_cancer(return_X_y=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_breast_cancer?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Divide this dataset into a learning set and a test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size of my data\n",
    "print(X.shape)\n",
    "print(y.shape)\n",
    "\n",
    "# Number of samples\n",
    "print(X.shape[0])\n",
    "# Number of attributes\n",
    "print(X.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "number_of_ls = 250\n",
    "## randomise a bit the data\n",
    "random_state = 0\n",
    "from sklearn.utils import check_random_state\n",
    "random_state = check_random_state(random_state)\n",
    "a_permutation = random_state.permutation(np.arange(X.shape[0]))\n",
    "print(\"New order: {}\".format(a_permutation))\n",
    "X = X[a_permutation, :]\n",
    "y = y[a_permutation]\n",
    "\n",
    "# Now I can divide the dataset\n",
    "X_train, X_test = X[:number_of_ls,:], X[number_of_ls:,:]\n",
    "y_train, y_test = y[:number_of_ls], y[number_of_ls:]\n",
    "\n",
    "print('LS size = {}'.format(X_train.shape))\n",
    "print('TS size = {}'.format(X_train.shape)) #"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load an algorithm and learn a model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# or load the full module \n",
    "from sklearn import tree\n",
    "clf = tree.DecisionTreeClassifier()\n",
    "\n",
    "# or only load the DecisionTreeClassifier...\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "clf = DecisionTreeClassifier()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What are the parameters of a model? "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Learn a model = fit the model parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "plt.figure(figsize=(25,15))\n",
    "tree.plot_tree(clf, filled=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(25,15))\n",
    "tree.plot_tree(clf, filled=True);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Make a prediction?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate those prediction (we have the solution 'y_test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add noisy features to make the problem harder\n",
    "random_state = np.random.RandomState(0)\n",
    "n_samples, n_features = X.shape\n",
    "X = np.c_[X, random_state.randn(X.shape[0], 20 * X.shape[1])]\n",
    "X_train, X_test = X[:number_of_ls,:], X[number_of_ls:,:]\n",
    "y_train, y_test = y[:number_of_ls], y[number_of_ls:]\n",
    "\n",
    "clf.fit(X_train, y_train)\n",
    "predictions = clf.predict(X_test)\n",
    "# A solution, each classifier/regressor have a \"score\" property\n",
    "clf.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# But you can use other metrics (depending on what you want to evaluate, see Lecture 6)\n",
    "from sklearn.metrics import accuracy_score\n",
    "accuracy_score(y_test, predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# More evaluation: predict the probability for each class\n",
    "probas = clf.predict_proba(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score,roc_curve\n",
    "fpr, tpr, thresholds = roc_curve(y_test, probas[:,1])\n",
    "score = roc_auc_score(y_test,probas[:,1])\n",
    "\n",
    "plt.figure()\n",
    "lw = 2\n",
    "plt.plot(fpr, tpr, color='darkorange',\n",
    "         lw=lw, label='ROC curve (area = {})'.format(score))\n",
    "plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n",
    "plt.xlim([0.0, 1.0])\n",
    "plt.ylim([0.0, 1.05])\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.title('Receiver operating characteristic example')\n",
    "plt.legend(loc=\"lower right\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}