Machine learning Chapter 3 Classification

Time:2020-6-22

Machine learning practice: notes based on scikit learn and tensorflow

Reference: the author’s jupyter notebook
Chapter 3 – Classification

  1. Get the code of MNIST dataset:

    def sort_by_target(mnist, split=60000):
        """Sort each half of the MNIST dataset by target label, in place.

        fetch_openml() returns the samples unsorted; this reorders the first
        `split` rows (the training part) and the remaining rows (the test
        part) so each part is sorted by label, preserving the train/test
        boundary.

        Args:
            mnist: object with NumPy-array attributes ``data`` and ``target``.
            split: index separating the training part from the test part
                (default 60000, the standard MNIST split).
        """
        # Sorting (label, original_index) pairs yields a stable sort by label;
        # column 1 of the sorted array is the permutation of row indices.
        reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:split])]))[:, 1]
        reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[split:])]))[:, 1]
        # Fancy indexing on the right-hand side copies first, so the in-place
        # assignment is safe even though source and destination overlap.
        mnist.data[:split] = mnist.data[reorder_train]
        mnist.target[:split] = mnist.target[reorder_train]
        mnist.data[split:] = mnist.data[reorder_test + split]
        mnist.target[split:] = mnist.target[reorder_test + split]
    # Download MNIST from OpenML (cached locally after the first fetch).
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings
    sort_by_target(mnist) # fetch_openml() returns an unsorted dataset
  2. View these arrays

    #print(mnist["data"], mnist["target"])
    #print(mnist.data.shape)
    # X: one 784-pixel row per image; y: the corresponding digit labels.
    X, y = mnist["data"], mnist["target"]
    #print(X.shape)
    #print(y.shape)

    # Grab one sample and display it as a 28x28 grayscale image.
    # NOTE(review): X[36000] assumes X is a NumPy array indexed by position —
    # newer fetch_openml returns a DataFrame unless as_frame=False; confirm.
    some_digit = X[36000]
    some_digit_image = some_digit.reshape(28, 28)
    plt.imshow(some_digit_image, cmap = mpl.cm.binary,
            interpolation="nearest")
    plt.axis("off")
    #plt.show()
    #print(y[36000])
  3. Some digital images in MNIST dataset

    def plot_digits(instances, images_per_row=10, **options):
        """Draw a grid of 28x28 digit images on the current matplotlib axes."""
        size = 28
        images_per_row = min(len(instances), images_per_row)
        n_rows = (len(instances) - 1) // images_per_row + 1
        # Reshape each flat 784-vector into a 28x28 image.
        images = [instance.reshape(size, size) for instance in instances]
        # Pad with one empty strip so the final row concatenates evenly.
        n_empty = n_rows * images_per_row - len(instances)
        images.append(np.zeros((size, size * n_empty)))
        row_images = [
            np.concatenate(images[r * images_per_row:(r + 1) * images_per_row], axis=1)
            for r in range(n_rows)
        ]
        image = np.concatenate(row_images, axis=0)
        plt.imshow(image, cmap=mpl.cm.binary, **options)
        plt.axis("off")
    
    # Plot a sample grid of digits drawn from across the (label-sorted) set.
    plt.figure(figsize=(9,9))
    example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]
    plot_digits(example_images, images_per_row=10)
    #save_fig("more_digits_plot")
    #plt.show()
  4. Shuffle data sets

    # MNIST's conventional split: first 60,000 samples train, rest test.
    X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
    # Shuffle the training set so cross-validation folds contain all digits
    # (the dataset was sorted by label above).
    shuffle_index = np.random.permutation(60000)
    X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
  5. Train a binary classifier to create a target vector for this classification task:

    # Binary target vectors for the "is this digit a 5?" task.
    y_train_5 = (y_train == 5)  # True for all 5s, False for all other digits.
    y_test_5 = (y_test == 5)
  6. Create an SGDClassifier (stochastic gradient descent classifier) and train it on the whole training set:

    from sklearn.linear_model import SGDClassifier
    # Stochastic-gradient-descent linear classifier; random_state=42 fixes the
    # shuffling seed for reproducible results. tol=-np.inf disables early
    # stopping so training always runs the full max_iter epochs.
    # (np.infty was removed in NumPy 2.0; np.inf is the portable spelling.)
    sgd_clf = SGDClassifier(max_iter=5, tol=-np.inf, random_state=42)
    sgd_clf.fit(X_train, y_train_5)
    #print(sgd_clf.fit(X_train, y_train_5))
    # Now use it to detect images of the digit 5:
    sgd_clf.predict([some_digit])
    #print(sgd_clf.predict([some_digit]))
  7. Cross validation

    from sklearn.model_selection import cross_val_score
    # Accuracy via 3-fold cross-validation. (The original also evaluated the
    # same call once more on its own line and discarded the result — removed.)
    print(cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy"))

    # The following loop does roughly what cross_val_score() does above and
    # prints a comparable accuracy for each fold.
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    # shuffle=True is required here: modern scikit-learn raises a ValueError
    # when random_state is set while shuffle is False (the seed would
    # otherwise have no effect).
    skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    for train_index, test_index in skfolds.split(X_train, y_train_5):
        clone_clf = clone(sgd_clf)          # fresh, unfitted copy per fold
        X_train_folds = X_train[train_index]
        y_train_folds = y_train_5[train_index]
        X_test_fold = X_train[test_index]
        y_test_fold = y_train_5[test_index]

        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))      # accuracy of this fold
  8. A dumb baseline classifier that classifies every image as "non-5":

    from sklearn.base import BaseEstimator
    class Never5Classifier(BaseEstimator):
        """Dummy baseline that predicts "not a 5" for every sample.

        Demonstrates that plain accuracy is misleading on skewed classes:
        always answering False already scores high simply because 5s are a
        minority of the digits.
        """
        def fit(self, X, y=None):
            # Nothing to learn. Return self per the scikit-learn estimator
            # contract (fit must return the estimator so calls can chain).
            return self
        def predict(self, X):
            # One False per sample; shape (n_samples, 1) as in the original.
            return np.zeros((len(X), 1), dtype=bool)
    # Cross-validated accuracy of the always-False baseline: expected to be
    # high anyway, since 5s are only a small fraction of the training labels.
    never_5_clf = Never5Classifier()
    print(cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy"))
  9. Confusion matrix: a better way to evaluate classifier performance is the confusion matrix.

    from sklearn.model_selection import cross_val_predict
    # cross_val_predict() performs k-fold cross-validation like
    # cross_val_score(), but instead of scores it returns the out-of-fold
    # prediction for every training instance — a "clean" prediction made by
    # a model that never saw that instance during training.
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
    from sklearn.metrics import confusion_matrix
    #confusion_matrix(y_train_5, y_train_pred)
    #print(confusion_matrix(y_train_5, y_train_pred))
    # A perfect classifier's predictions would equal the labels themselves:
    y_train_perfect_predictions = y_train_5
    #print(confusion_matrix(y_train_5, y_train_perfect_predictions))
  10. Precision and recall

    # Precision = TP / (TP + FP): TP = true positives, FP = false positives.
    # Recall    = TP / (TP + FN): FN = false negatives.
    from sklearn.metrics import precision_score, recall_score
    # (The next two lines were garbled into invalid syntax in the original —
    # spaces inside identifiers and unbalanced parentheses — repaired here.)
    print(precision_score(y_train_5, y_train_pred))  # precision: 4344 / (4344 + 1307)
    print(recall_score(y_train_5, y_train_pred))     # recall:    4344 / (4344 + 1077)

    # F1 = harmonic mean: 2 / (1/precision + 1/recall) = TP / (TP + (FN + FP) / 2)
    from sklearn.metrics import f1_score
    print(f1_score(y_train_5, y_train_pred))
  11. Precision / recall tradeoff: threshold

    # decision_function() exposes the raw score the classifier thresholds;
    # the default prediction is (score > 0).
    y_scores = sgd_clf.decision_function([some_digit])
    print(y_scores)
    threshold = 0
    y_some_digit_pred = (y_scores > threshold)
    print(y_some_digit_pred)
    # Raising the threshold trades recall for precision.
    threshold = 200000
    y_some_digit_pred_a = (y_scores > threshold)
    print(y_some_digit_pred_a)
  12. Decide what threshold to use

    # Get decision scores (not predictions) for every training instance.
    y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
    # Compute precision and recall for all possible thresholds.
    from sklearn.metrics import precision_recall_curve
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    #Use Matplotlib to plot the function of precision and recall rate relative to threshold
    def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
        """Plot precision and recall as functions of the decision threshold."""
        # precision_recall_curve returns one more precision/recall entry than
        # thresholds, so drop the last value of each to align the arrays.
        for values, fmt, name in ((precisions, "b--", "Precision"),
                                  (recalls, "g-", "Recall")):
            plt.plot(thresholds, values[:-1], fmt, label=name)
        plt.xlabel("Threshold")
        plt.legend(loc="upper left")
        plt.ylim([0, 1])
    plt.figure(figsize=(8, 4))
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plt.xlim([-700000, 700000])
    plt.show()
    #print((y_train_pred == (y_scores > 0)).all())
    # Target ~90% precision by raising the decision threshold to 70000.
    y_train_pred_90 = (y_scores > 70000)
    from sklearn.metrics import precision_score, recall_score
    # (The next two lines were garbled into invalid syntax in the original —
    # repaired here.)
    print(precision_score(y_train_5, y_train_pred_90))  # precision
    print(recall_score(y_train_5, y_train_pred_90))     # recall
  13. Function diagram PR of precision and recall rate

    def plot_precision_vs_recall(precisions, recalls):
        """Plot the PR curve: precision on the y-axis against recall on the x-axis."""
        label_size = 16
        plt.plot(recalls, precisions, "b-", linewidth=2)
        plt.xlabel("Recall", fontsize=label_size)
        plt.ylabel("Precision", fontsize=label_size)
        plt.axis([0, 1, 0, 1])
    
    # Draw the PR curve for the SGD classifier.
    plt.figure(figsize=(8, 6))
    plot_precision_vs_recall(precisions, recalls)
    plt.show()
  14. ROC curve (receiver operating characteristic): true positive rate versus false positive rate

    from sklearn.metrics import roc_curve
    # False positive rate, true positive rate and thresholds for the ROC curve.
    fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
    def plot_roc_curve(fpr, tpr, label=None):
        """Plot a ROC curve together with the diagonal of a random classifier."""
        axis_font = 16
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--')   # random-guess reference line
        plt.axis([0, 1, 0, 1])
        plt.xlabel('False Positive Rate', fontsize=axis_font)
        plt.ylabel('True Positive Rate', fontsize=axis_font)
    # (A stray ''' here previously opened an unterminated string literal,
    # which would silently swallow the rest of the script — removed.)
    plt.figure(figsize=(8, 6))
    plot_roc_curve(fpr, tpr)
    plt.show()
    # AUC (area under the ROC curve): 1.0 is perfect, 0.5 is random guessing.
    from sklearn.metrics import roc_auc_score
    print(roc_auc_score(y_train_5, y_scores))
  15. A randomforest classifier is trained and its ROC curve and ROC AUC score are compared with sgdclassifier classifier.

    from sklearn.ensemble import RandomForestClassifier
    forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
    # RandomForestClassifier has no decision_function(); use the predicted
    # probability of the positive class as the score instead.
    y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
    y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)
    # Overlay both ROC curves to compare SGD against the random forest.
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
    plt.legend(loc="lower right", fontsize=16)
    plt.show()
    from sklearn.metrics import roc_auc_score
    print(roc_auc_score(y_train_5, y_scores_forest))
  16. Multi class classifier, try sgdclassifier

    # Try SGDClassifier on the full 10-class problem; scikit-learn detects the
    # multiclass target and handles the one-vs-rest decomposition internally.
    sgd_clf.fit(X_train, y_train)
    sgd_clf.predict([some_digit])
    #print(sgd_clf.predict([some_digit]))
    # One score per class; the predicted class is the argmax of these scores.
    some_digit_scores = sgd_clf.decision_function([some_digit])
    #print(some_digit_scores)
    #print(np.argmax(some_digit_scores))
    #print(sgd_clf.classes_)
    #print(sgd_clf.classes_[5])
    
    # Force the OvO (one-versus-one) strategy on top of SGDClassifier,
    # training one binary classifier per pair of classes.
    from sklearn.multiclass import OneVsOneClassifier
    # np.infty was removed in NumPy 2.0; np.inf is the portable spelling and
    # keeps the original behavior (early stopping disabled).
    ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=-np.inf, random_state=42))
    ovo_clf.fit(X_train, y_train)
    ovo_clf.predict([some_digit])
    len(ovo_clf.estimators_)   # 45 pairwise classifiers for 10 classes (10*9/2)
    #print(ovo_clf.predict([some_digit]))
    #print(len(ovo_clf.estimators_))
  17. Train randomforestclassifier

    from sklearn.model_selection import cross_val_score
    forest_clf.fit(X_train, y_train)
    #print(forest_clf.predict([some_digit]))
    #print(forest_clf.predict_proba([some_digit]))  # list of per-class probabilities
    #print(cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy"))
    # Scale the inputs to zero mean / unit variance per feature —
    # gradient-descent-based models are sensitive to feature scale.
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    #print(cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy"))
  18. Use the matshow() function of Matplotlib to view the confusion matrix

    # Out-of-fold predictions for the 10-class problem, then the 10x10
    # confusion matrix: rows = actual classes, columns = predicted classes.
    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    #print(conf_mx)
    # Image view of the matrix: a bright diagonal means mostly-correct predictions.
    #plt.matshow(conf_mx, cmap=plt.cm.gray)
    #save_fig("confusion_matrix_plot", tight_layout=False)
  19. You need to divide each value in the confusion matrix by the number of pictures in the corresponding category, so you are comparing the error rate rather than the absolute value of the error

    # Divide each row by the class's image count so cells compare error *rates*
    # rather than absolute error counts.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    # Zero the diagonal to keep only the errors, then redraw:
    np.fill_diagonal(norm_conf_mx, 0)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    #save_fig("confusion_matrix_errors_plot", tight_layout=False)
  20. Look at the examples of numbers 3 and 5:

    # Error analysis for the 3/5 confusion: split samples into the four
    # actual/predicted combinations.
    cl_a, cl_b = 3, 5
    X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]  # 3 predicted as 3
    X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]  # 3 predicted as 5
    X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]  # 5 predicted as 3
    X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]  # 5 predicted as 5

    # 2x2 grid: correct classifications on the diagonal, errors off-diagonal.
    plt.figure(figsize=(8,8))
    plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
    plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
    plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
    plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
    #save_fig("error_analysis_digits_plot")
  21. Multi label classification

    # Build a multilabel target y_multilabel with two labels per image:
    # first whether the digit is large (7, 8 or 9), second whether it is odd.
    from sklearn.neighbors import KNeighborsClassifier
    y_train_large = (y_train >= 7)
    y_train_odd = (y_train % 2 == 1)
    y_multilabel = np.c_[y_train_large, y_train_odd]

    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_multilabel)
    #print(knn_clf.fit(X_train, y_multilabel))
    # KNeighborsClassifier supports multilabel classification (not all
    # classifiers do). Predicting for the digit 5 yields both labels.
    # (This line was garbled into invalid syntax in the original — repaired.)
    knn_clf.predict([some_digit])  # 5 is not large (False) but is odd (True)
    #print(knn_clf.predict([some_digit]))
  22. The following code calculates the average F1 score for all tags:

    from sklearn.metrics import f1_score
    # Out-of-fold multilabel predictions (n_jobs=-1 uses all CPU cores), then
    # the F1 score averaged over the labels with equal weight ("macro").
    y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)
    f1_score(y_multilabel, y_train_knn_pred, average="macro")
    #print(f1_score(y_multilabel, y_train_knn_pred, average="macro"))
  23. Multi output classification (multi output multi category classification)

    # Multioutput task: add random pixel noise with numpy's randint(); the
    # input is the noisy image and the target is the original clean image
    # (one output per pixel).
    noise = np.random.randint(0, 100, (len(X_train), 784))
    X_train_mod = X_train + noise
    noise = np.random.randint(0, 100, (len(X_test), 784))
    X_test_mod = X_test + noise
    y_train_mod = X_train   # targets are the clean images
    y_test_mod = X_test

    some_index = 5500
    #plt.subplot(121); plot_digit(X_test_mod[some_index])
    #plt.subplot(122); plot_digit(y_test_mod[some_index])
    #save_fig("noisy_digit_example_plot")
  24. Clean this picture:

    # Train KNN to map noisy images to clean ones, then denoise one test image.
    knn_clf.fit(X_train_mod, y_train_mod)
    clean_digit = knn_clf.predict([X_test_mod[some_index]])
    # NOTE(review): plot_digit() and save_fig() are not defined in this file —
    # presumably helpers from the author's notebook; confirm before running.
    plot_digit(clean_digit)
    save_fig("cleaned_digit_example_plot")
    plt.show()