# Introduction to Python and Scikit-Learn

Mainly inspired by https://scikit-learn.org/stable/modules/tree.html.

In [None]:
# A useful package
import numpy as np

## Load a dataset

In [None]:
# From the library sklearn (scikit-learn), in the sub-module `datasets`,
# import the function `load_breast_cancer`
from sklearn.datasets import load_breast_cancer

# Create the input and output data:
# call `load_breast_cancer` with return_X_y=True and unpack the result
# into the inputs X and the outputs y
X, y = load_breast_cancer(return_X_y=True)

In [None]:
# IPython/Jupyter help syntax (not plain Python): a trailing `?` shows the documentation of the object
load_breast_cancer?

### Divide this dataset into a learning set and a test set

In [None]:
# Dimensions of the input matrix X and of the output vector y, one per line
print(X.shape, y.shape, sep="\n")

# First axis of X = number of samples, second axis = number of attributes
n_rows, n_columns = X.shape[0], X.shape[1]
print(n_rows)
print(n_columns)

In [None]:
# Number of samples kept for the learning set (LS); the rest forms the test set (TS)
number_of_ls = 250
# Shuffle the samples before splitting, so that the learning set is not
# simply the first rows in the order the dataset is stored.
random_state = 0
from sklearn.utils import check_random_state
# Turn the integer seed into a random number generator: the shuffle is
# therefore the same at every run started from the freshly loaded data.
random_state = check_random_state(random_state)
a_permutation = random_state.permutation(np.arange(X.shape[0]))
print("New order: {}".format(a_permutation))
# Apply the SAME permutation to X and y to keep inputs and outputs aligned
X = X[a_permutation, :]
y = y[a_permutation]

# Now I can divide the dataset:
# first `number_of_ls` shuffled samples -> learning set, remaining ones -> test set
X_train, X_test = X[:number_of_ls,:], X[number_of_ls:,:]
y_train, y_test = y[:number_of_ls], y[number_of_ls:]

print('LS size = {}'.format(X_train.shape))
# Report the shape of the test set (the original printed X_train.shape here by mistake)
print('TS size = {}'.format(X_test.shape))

## Load an algorithm and learn a model

In [None]:
# Two equivalent ways to create the same kind of classifier are shown below.
# Option 1: import the whole `tree` module and access the class through it
# (the `tree` name is also used further down for `tree.plot_tree`)
from sklearn import tree
clf = tree.DecisionTreeClassifier()

# Option 2: import only the DecisionTreeClassifier class.
# This rebinds `clf` to a new, not yet fitted, classifier with default parameters.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

### What are the parameters of a model? 

In [None]:
# IPython/Jupyter help syntax (not plain Python): a trailing `?` shows the documentation of `clf`
clf?

### Learn a model = fit the model parameters

In [None]:
# Learn the decision tree from the learning set only (the test set is kept aside for evaluation).
# The value returned by `fit` is bound back to `clf`.
clf = clf.fit(X_train, y_train)

In [None]:
from matplotlib import pyplot as plt
# Large figure so that the nodes of the tree remain readable
plt.figure(figsize=(25,15))
# Draw the fitted tree with coloured nodes (filled=True).
# NOTE: no trailing `;` here, so the notebook presumably also echoes the value
# returned by `plot_tree` under the cell -- compare with the next cell.
tree.plot_tree(clf, filled=True)

In [None]:
# Same plot as in the previous cell
plt.figure(figsize=(25,15))
# The trailing `;` suppresses the notebook's echo of the returned value, leaving only the figure
tree.plot_tree(clf, filled=True);

## Make a prediction

In [None]:
# Use the fitted model to predict an output for every sample of the test set
predictions = clf.predict(X_test)

### Evaluate those predictions (we have the solution 'y_test')

In [None]:
# Make the problem harder: append noise columns drawn with `randn`
# to the real attributes (20 noise columns per original attribute)
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
noise = random_state.randn(n_samples, 20 * n_features)
X = np.c_[X, noise]

# Same split point as before, applied to the widened matrix
y_train, y_test = y[:number_of_ls], y[number_of_ls:]
X_train, X_test = X[:number_of_ls,:], X[number_of_ls:,:]

# Re-learn the model on the noisy learning set and predict the test set again
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# One way to evaluate: every classifier/regressor has a `score` method
clf.score(X_test, y_test)

In [None]:
# Other metrics are available too (depending on what you want to evaluate, see Lecture 6)
from sklearn.metrics import accuracy_score
# Compare the true outputs of the test set with the predictions made above
accuracy_score(y_test, predictions)

In [None]:
# More evaluation: predict the probability of each class for every test sample
# (the result is indexed below as probas[:, 1], i.e. rows = samples, columns = classes)
probas = clf.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Second column of `probas`: used as the score of the positive class
# (presumably the class labelled 1 -- check `clf.classes_`)
positive_scores = probas[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
score = roc_auc_score(y_test, positive_scores)

lw = 2
plt.figure()
# ROC curve of the model, with its area under the curve in the legend
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=lw,
    label='ROC curve (area = {})'.format(score),
)
# Dashed diagonal drawn as a visual reference
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

# Axes limits, labels, title and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
