Machine learning Chapter 4 training model

Time:2020-5-18

Reference: the author’s jupyter notebook
Chapter 2 – End-to-end Machine Learning project

  1. Generate picture and save
    from __future__ import division, print_function, unicode_literals
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import os
    np.random.seed(42)
    
    mpl.rc('axes', labelsize=14)
    mpl.rc('xtick', labelsize=12)
    mpl.rc('ytick', labelsize=12)
    
    # Where to save the figures
    PROJECT_ROOT_DIR = "images"
    CHAPTER_ID = "traininglinearmodels"
    
    def save_fig(fig_id, tight_layout=True):
        path = os.path.join(PROJECT_ROOT_DIR, CHAPTER_ID, fig_id + ".png")
        print("Saving figure", fig_id)
        if tight_layout:
            plt.tight_layout()
        plt.savefig(path, format='png', dpi=600)

linear regression

  1. Generate some linear data to test the formula (standard equation)

    import numpy as np
    X = 2 * np.random.rand(100, 1)
    y = 4 + 3 * X + np.random.randn(100, 1)
    
    plt.plot(X, y, "b.")
    plt.xlabel("$x_1$", fontsize=18)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.axis([0, 2, 0, 15])
    #save_fig("generated_data_plot")
  2. Linear algebra module using numpy( np.linalg )The inv() function in is used to inverse the matrix, and the inner product of the matrix is calculated by the dot() method

    X_b = np.c_[np.ones((100, 1)), X]  # add x0 = 1 to each instance
    theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
    #print(theta_best)
  3. You can use * to make predictions:

    X_new = np.array([[0], [2]])
    X_new_b = np.c_[np.ones((2, 1)), X_new]  # add x0 = 1 to each instance
    y_predict = X_new_b.dot(theta_best)
    #print(y_predict)
    
    #Draw the prediction results of the model
    plt.plot(X_new, y_predict, "r-")
    plt.plot(X, y, "b.")
    plt.axis([0, 2, 0, 15])
    plt.plot(X_new, y_predict, "r-", linewidth=2, label="Predictions")
    plt.plot(X, y, "b.")
    plt.xlabel("$x_1$", fontsize=18)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.legend(loc="upper left", fontsize=14)
    plt.axis([0, 2, 0, 15])
    #save_fig("linear_model_predictions")
    #plt.show()
  4. The equivalent code for scikit learn is as follows

    from sklearn.linear_model import LinearRegression
    lin_reg = LinearRegression()
    lin_reg.fit(X, y)
    print(lin_reg.intercept_, lin_reg.coef_)
    print(lin_reg.predict(X_new))
    theta_best_svd, residuals, rank, s = np.linalg.lstsq(X_b, y, rcond=1e-6)
    print(theta_best_svd)
    print(np.linalg.pinv(X_b).dot(y))

gradient descent

The central idea of gradient descent is to adjust the parameters iteratively to minimize the cost function. If the learning rate is too low, the algorithm needs a lot of iterations to converge, which will take a long time. If the learning rate is too high, you may cross the valley directly to the other side of the mountain, or even higher than the previous starting point. This will lead to algorithm divergence, increasing value, and finally unable to find a good solution

  1. Batch gradient descent: three formulas, fast implementation of this algorithm:

    eta = 0.1
    n_iterations = 1000
    m = 100
    theta = np.random.randn(2,1)
    
    for iteration in range(n_iterations):
        gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
        theta = theta - eta * gradients
    #print(theta)
    #print(X_new_b.dot(theta))
  2. When using three different learning rates, the first ten steps of gradient decline are as follows:

    theta_path_bgd = []
    def plot_gradient_descent(theta, eta, theta_path=None):
        m = len(X_b)
        plt.plot(X, y, "b.")
        n_iterations = 1000
        for iteration in range(n_iterations):
            if iteration < 10:
                y_predict = X_new_b.dot(theta)
                style = "b-" if iteration > 0 else "r--"
                plt.plot(X_new, y_predict, style)
            gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
            theta = theta - eta * gradients
            if theta_path is not None:
                theta_path.append(theta)
        plt.xlabel("$x_1$", fontsize=18)
        plt.axis([0, 2, 0, 15])
        plt.title(r"$\eta = {}$".format(eta), fontsize=16)
    
    np.random.seed(42)
    theta = np.random.randn(2,1)  # random initialization
    plt.figure(figsize=(10,4))
    plt.subplot(131); plot_gradient_descent(theta, eta=0.02)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)
    plt.subplot(133); plot_gradient_descent(theta, eta=0.5)
    #save_fig("gradient_descent_plot")
    #plt.show()
  3. The following code uses a simple learning plan to achieve random gradient descent:

    theta_path_sgd = []
    m = len(X_b)
    np.random.seed(42)
    
    n_epochs = 50
    t0, t1 = 5, 50  # learning schedule hyperparameters
    
    def learning_schedule(t):
        return t0 / (t + t1)
    
    theta = np.random.randn(2,1)  # random initialization
    
    for epoch in range(n_epochs):
        for i in range(m):
            if epoch == 0 and i < 20:                    # not shown in the book
                y_predict = X_new_b.dot(theta)           # not shown
                style = "b-" if i > 0 else "r--"         # not shown
                plt.plot(X_new, y_predict, style)        # not shown
            random_index = np.random.randint(m)
            xi = X_b[random_index:random_index+1]
            yi = y[random_index:random_index+1]
            gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
            eta = learning_schedule(epoch * m + i)
            theta = theta - eta * gradients
            theta_path_sgd.append(theta)                 # not shown
    
    plt.plot(X, y, "b.")                                 # not shown
    plt.xlabel("$x_1$", fontsize=18)                     # not shown
    plt.ylabel("$y$", rotation=0, fontsize=18)           # not shown
    plt.axis([0, 2, 0, 15])                              # not shown
    #save_fig("sgd_plot")                                 # not shown
    #plt.show()
    
    from sklearn.linear_model import SGDRegressor
    sgd_reg = SGDRegressor(n_iter=50, penalty=None, eta0=0.1)
    sgd_reg.fit(X, y.ravel())
    print(sgd_reg.intercept_, sgd_reg.coef_)
  4. Small batch gradient decline

    theta_path_bgd = []
    theta_path_sgd = []
    theta_path_mgd = []
    m = 100
    n_iterations = 50
    minibatch_size = 20
    
    np.random.seed(42)
    theta = np.random.randn(2,1)  # random initialization
    
    t0, t1 = 200, 1000
    def learning_schedule(t):
        return t0 / (t + t1)
    
    t = 0
    for epoch in range(n_iterations):
        shuffled_indices = np.random.permutation(m)
        X_b_shuffled = X_b[shuffled_indices]
        y_shuffled = y[shuffled_indices]
        for i in range(0, m, minibatch_size):
            t += 1
            xi = X_b_shuffled[i:i+minibatch_size]
            yi = y_shuffled[i:i+minibatch_size]
            gradients = 2/minibatch_size * xi.T.dot(xi.dot(theta) - yi)
            eta = learning_schedule(t)
            theta = theta - eta * gradients
            theta_path_mgd.append(theta)
    
    theta_path_bgd = np.array(theta_path_bgd)
    theta_path_sgd = np.array(theta_path_sgd)
    theta_path_mgd = np.array(theta_path_mgd)
    
    plt.figure(figsize=(7,4))
    plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], "r-s", linewidth=1, label="Stochastic")
    plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], "g-+", linewidth=2, label="Mini-batch")
    plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], "b-o", linewidth=3, label="Batch")
    plt.legend(loc="upper left", fontsize=16)
    plt.xlabel(r"$\theta_0$", fontsize=20)
    plt.ylabel(r"$\theta_1$   ", fontsize=20, rotation=0)
    plt.axis([2.5, 4.5, 2.3, 3.9])
    save_fig("gradient_descent_paths_plot")
    plt.show()

polynomial regression

  1. Based on simple quadratic equation (Note: the form of quadratic equation is y = AX2 + BX + C), some nonlinear data (adding random noise) are produced

    import numpy as np
    import numpy.random as rnd
    np.random.seed(42)
    
    m = 100
    X = 6 * np.random.rand(m, 1) - 3
    y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)
    
    plt.plot(X, y, "b.")
    plt.xlabel("$x_1$", fontsize=18)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.axis([-3, 3, 0, 10])
    save_fig("quadratic_data_plot")
    plt.show()
  2. Use scikit learn’s polynomialfeatures class to transform training data

    from sklearn.preprocessing import PolynomialFeatures
    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly_features.fit_transform(X)
    #print(X[0])
    #print(X_poly[0])
  3. Match a linearregression model to this extended training set

    from sklearn.linear_model import LinearRegression
    lin_reg = LinearRegression()
    lin_reg.fit(X_poly, y)
    #print(lin_reg.intercept_, lin_reg.coef_)
    
    X_new=np.linspace(-3, 3, 100).reshape(100, 1)
    X_new_poly = poly_features.transform(X_new)
    y_new = lin_reg.predict(X_new_poly)
    plt.plot(X, y, "b.")
    plt.plot(X_new, y_new, "r-", linewidth=2, label="Predictions")
    plt.xlabel("$x_1$", fontsize=18)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.legend(loc="upper left", fontsize=14)
    plt.axis([-3, 3, 0, 10])
    #Save ﹐ fig ("polynomial ﹐ predictions ﹐ plot")
    #plt.show()

learning curve

  1. Higher order polynomial regression

    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    
    for style, width, degree in (("g-", 1, 300), ("b--", 2, 2), ("r-+", 2, 1)):
        polybig_features = PolynomialFeatures(degree=degree, include_bias=False)
        std_scaler = StandardScaler()
        lin_reg = LinearRegression()
        polynomial_regression = Pipeline([
                ("poly_features", polybig_features),
                ("std_scaler", std_scaler),
                ("lin_reg", lin_reg),
            ])
        polynomial_regression.fit(X, y)
        y_newbig = polynomial_regression.predict(X_new)
        plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)
    
    plt.plot(X, y, "b.", linewidth=3)
    plt.legend(loc="upper left")
    plt.xlabel("$x_1$", fontsize=18)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.axis([-3, 3, 0, 10])
    #Save fig (high degree polynomials plot)
    #plt.show()
  2. Learning curve of pure linear regression model

    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    
    def plot_learning_curves(model, X, y):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
        train_errors, val_errors = [], []
        for m in range(1, len(X_train)):
            model.fit(X_train[:m], y_train[:m])
            y_train_predict = model.predict(X_train[:m])
            y_val_predict = model.predict(X_val)
            train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
            val_errors.append(mean_squared_error(y_val, y_val_predict))
    
        plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
        plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
        plt.legend(loc="upper right", fontsize=14)   # not shown in the book
        plt.xlabel("Training set size", fontsize=14) # not shown
        plt.ylabel("RMSE", fontsize=14)              # not shown
    '''
    '''
    lin_reg = LinearRegression()
    plot_learning_curves(lin_reg, X, y)
    plt.axis([0, 80, 0, 3])                         # not shown in the book
    #Save ﹐ fig ("under fitting ﹐ learning ﹐ curves ﹐ plot) ﹐ not show
    #plt.show()                                      # not shown
  3. The 10th order learning curve of polynomial regression model

    polynomial_regression = Pipeline([
            ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
            ("lin_reg", LinearRegression()),
        ])
    
    plot_learning_curves(polynomial_regression, X, y)
    plt.axis([0, 80, 0, 3])           # not shown
    Save ("learning curves" not show
    plt.show()

Regular linear model

  1. Ridge regression (also called jihonov regularization) is a regularized version of linear regression

    from sklearn.linear_model import Ridge
    
    np.random.seed(42)
    m = 20
    X = 3 * np.random.rand(m, 1)
    y = 1 + 0.5 * X + np.random.randn(m, 1) / 1.5
    X_new = np.linspace(0, 3, 100).reshape(100, 1)
    
    def plot_model(model_class, polynomial, alphas, **model_kargs):
        for alpha, style in zip(alphas, ("b-", "g--", "r:")):
            model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()
            if polynomial:
                model = Pipeline([
                        ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
                        ("std_scaler", StandardScaler()),
                        ("regul_reg", model),
                    ])
            model.fit(X, y)
            y_new_regul = model.predict(X_new)
            lw = 2 if alpha > 0 else 1
            plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r"$\alpha = {}$".format(alpha))
        plt.plot(X, y, "b.", linewidth=3)
        plt.legend(loc="upper left", fontsize=15)
        plt.xlabel("$x_1$", fontsize=18)
        plt.axis([0, 3, 0, 4])
    
    plt.figure(figsize=(8,4))
    plt.subplot(121)
    plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.subplot(122)
    plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)
    
    Save ﹐ fig ("ridge ﹐ region ﹐ plot")
    plt.show()
  2. Using scikit learn to perform ridge regression of closed form solution

    from sklearn.linear_model import Ridge
    ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
    ridge_reg.fit(X, y)
    ridge_reg.predict([[1.5]])
    
    ridge_reg = Ridge(alpha=1, solver="sag", random_state=42)
    ridge_reg.fit(X, y)
    ridge_reg.predict([[1.5]])
    
    #Using random gradient descent
    from sklearn.linear_model import SGDRegressor
    sgd_reg = SGDRegressor(max_iter=50, tol=-np.infty, penalty="l2", random_state=42)
    sgd_reg.fit(X, y.ravel())
    sgd_reg.predict([[1.5]])
  3. Lasso regression and lasso regression. Another regularization of linear regression is called least absolute shrinkage and selection operator regression.

    from sklearn.linear_model import Lasso
    
    plt.figure(figsize=(8,4))
    plt.subplot(121)
    plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)
    plt.ylabel("$y$", rotation=0, fontsize=18)
    plt.subplot(122)
    plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), tol=1, random_state=42)
    #Save "fig (" lasso "region" plot)
    #plt.show()
  4. A small example of lasso class using scikit learn.

    from sklearn.linear_model import ElasticNet
    elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
    elastic_net.fit(X, y)
    #print(elastic_net.predict([[1.5]]))
  5. Elastic network, a small example of elasticnet using scikit learn

    from sklearn.linear_model import ElasticNet
    elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
    elastic_net.fit(X, y)
    #print(elastic_net.predict([[1.5]]))
  6. Early stop method

    from sklearn.linear_model import SGDRegressor
    np.random.seed(42)
    m = 100
    X = 6 * np.random.rand(m, 1) - 3
    y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)
    
    X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)
    
    poly_scaler = Pipeline([
            ("poly_features", PolynomialFeatures(degree=90, include_bias=False)),
            ("std_scaler", StandardScaler()),
        ])
    
    X_train_poly_scaled = poly_scaler.fit_transform(X_train)
    X_val_poly_scaled = poly_scaler.transform(X_val)
    
    sgd_reg = SGDRegressor(max_iter=1,
                        tol=-np.infty,
                        penalty=None,
                        eta0=0.0005,
                        warm_start=True,
                        learning_rate="constant",
                        random_state=42)
    
    n_epochs = 500
    train_errors, val_errors = [], []
    for epoch in range(n_epochs):
        sgd_reg.fit(X_train_poly_scaled, y_train)
        y_train_predict = sgd_reg.predict(X_train_poly_scaled)
        y_val_predict = sgd_reg.predict(X_val_poly_scaled)
        train_errors.append(mean_squared_error(y_train, y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    
    best_epoch = np.argmin(val_errors)
    best_val_rmse = np.sqrt(val_errors[best_epoch])
    
    plt.annotate('Best model',
                xy=(best_epoch, best_val_rmse),
                xytext=(best_epoch, best_val_rmse + 1),
                ha="center",
                arrowprops=dict(facecolor='black', shrink=0.05),
                fontsize=16,
                )
    
    best_val_rmse -= 0.03  # just to make the graph look better
    plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], "k:", linewidth=2)
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="Validation set")
    plt.plot(np.sqrt(train_errors), "r--", linewidth=2, label="Training set")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Epoch", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
    Save fig ("early stop")
    plt.show()
  7. Lasso regression and ridge regression

    t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5
    
    # ignoring bias term
    t1s = np.linspace(t1a, t1b, 500)
    t2s = np.linspace(t2a, t2b, 500)
    t1, t2 = np.meshgrid(t1s, t2s)
    T = np.c_[t1.ravel(), t2.ravel()]
    Xr = np.array([[-1, 1], [-0.3, -1], [1, 0.1]])
    yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]
    
    J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)
    
    N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)
    N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)
    
    t_min_idx = np.unravel_index(np.argmin(J), J.shape)
    t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]
    
    t_init = np.array([[0.25], [-1]])
    
    def bgd_path(theta, X, y, l1, l2, core = 1, eta = 0.1, n_iterations = 50):
        path = [theta]
        for iteration in range(n_iterations):
            gradients = core * 2/len(X) * X.T.dot(X.dot(theta) - y) + l1 * np.sign(theta) + 2 * l2 * theta
    
            theta = theta - eta * gradients
            path.append(theta)
        return np.array(path)
    
    plt.figure(figsize=(12, 8))
    for i, N, l1, l2, title in ((0, N1, 0.5, 0, "Lasso"), (1, N2, 0,  0.1, "Ridge")):
        JR = J + l1 * N1 + l2 * N2**2
        
        tr_min_idx = np.unravel_index(np.argmin(JR), JR.shape)
        t1r_min, t2r_min = t1[tr_min_idx], t2[tr_min_idx]
    
        levelsJ=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(J) - np.min(J)) + np.min(J)
        levelsJR=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(JR) - np.min(JR)) + np.min(JR)
        levelsN=np.linspace(0, np.max(N), 10)
        
        path_J = bgd_path(t_init, Xr, yr, l1=0, l2=0)
        path_JR = bgd_path(t_init, Xr, yr, l1, l2)
        path_N = bgd_path(t_init, Xr, yr, np.sign(l1)/3, np.sign(l2), core=0)
    
        plt.subplot(221 + i * 2)
        plt.grid(True)
        plt.axhline(y=0, color='k')
        plt.axvline(x=0, color='k')
        plt.contourf(t1, t2, J, levels=levelsJ, alpha=0.9)
        plt.contour(t1, t2, N, levels=levelsN)
        plt.plot(path_J[:, 0], path_J[:, 1], "w-o")
        plt.plot(path_N[:, 0], path_N[:, 1], "y-^")
        plt.plot(t1_min, t2_min, "rs")
        plt.title(r"$\ell_{}$ penalty".format(i + 1), fontsize=16)
        plt.axis([t1a, t1b, t2a, t2b])
        if i == 1:
            plt.xlabel(r"$\theta_1$", fontsize=20)
        plt.ylabel(r"$\theta_2$", fontsize=20, rotation=0)
    
        plt.subplot(222 + i * 2)
        plt.grid(True)
        plt.axhline(y=0, color='k')
        plt.axvline(x=0, color='k')
        plt.contourf(t1, t2, JR, levels=levelsJR, alpha=0.9)
        plt.plot(path_JR[:, 0], path_JR[:, 1], "w-o")
        plt.plot(t1r_min, t2r_min, "rs")
        plt.title(title, fontsize=16)
        plt.axis([t1a, t1b, t2a, t2b])
        if i == 1:
            plt.xlabel(r"$\theta_1$", fontsize=20)
    
    save_fig("lasso_vs_ridge_plot")
    plt.show()
  8. logistic regression

    #Logical function
    t = np.linspace(-10, 10, 100)
    sig = 1 / (1 + np.exp(-t))
    plt.figure(figsize=(9, 3))
    plt.plot([-10, 10], [0, 0], "k-")
    plt.plot([-10, 10], [0.5, 0.5], "k:")
    plt.plot([-10, 10], [1, 1], "k:")
    plt.plot([0, 0], [-1.1, 1.1], "k-")
    plt.plot(t, sig, "b-", linewidth=2, label=r"$\sigma(t) = \frac{1}{1 + e^{-t}}$")
    plt.xlabel("t")
    plt.legend(loc="upper left", fontsize=20)
    plt.axis([-10, 10, -0.1, 1.1])
    save_fig("logistic_function_plot")
    plt.show()
  9. Decision boundary

    #Create a classifier to detect Virginia iris.
    from sklearn import datasets
    iris = datasets.load_iris()
    list(iris.keys())
    #print(list(iris.keys()))
    #print(iris.DESCR)
    
    X = iris["data"][:, 3:]  # petal width
    y = (iris["target"] == 2).astype(np.int)  # 1 if Iris-Virginica, else 0
  10. Training logistic regression model

    from sklearn.linear_model import LogisticRegression
    log_reg = LogisticRegression(solver="liblinear", random_state=42)
    log_reg.fit(X, y)
    
    #Reduced Edition
    X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
    y_proba = log_reg.predict_proba(X_new)
    
    plt.plot(X_new, y_proba[:, 1], "g-", linewidth=2, label="Iris-Virginica")
    plt.plot(X_new, y_proba[:, 0], "b--", linewidth=2, label="Not Iris-Virginica")
    plt.show()
    
    #Full version
    X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
    y_proba = log_reg.predict_proba(X_new)
    decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]
    
    plt.figure(figsize=(8, 3))
    plt.plot(X[y==0], y[y==0], "bs")
    plt.plot(X[y==1], y[y==1], "g^")
    plt.plot([decision_boundary, decision_boundary], [-1, 2], "k:", linewidth=2)
    plt.plot(X_new, y_proba[:, 1], "g-", linewidth=2, label="Iris-Virginica")
    plt.plot(X_new, y_proba[:, 0], "b--", linewidth=2, label="Not Iris-Virginica")
    plt.text(decision_boundary+0.02, 0.15, "Decision  boundary", fontsize=14, color="k", ha="center")
    plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')
    plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')
    plt.xlabel("Petal width (cm)", fontsize=14)
    plt.ylabel("Probability", fontsize=14)
    plt.legend(loc="center left", fontsize=14)
    plt.axis([0, 3, -0.02, 1.02])
    #Save "fig (" logistic "region" plot)
    #plt.show()
    print(decision_boundary)
    print(log_reg.predict([[1.7], [1.5]]))
  11. Softmax regression multiple logistic regression

    from sklearn.linear_model import LogisticRegression
    
    X = iris["data"][:, (2, 3)]  # petal length, petal width
    y = (iris["target"] == 2).astype(np.int)
    
    log_reg = LogisticRegression(solver="liblinear", C=10**10, random_state=42)
    log_reg.fit(X, y)
    
    x0, x1 = np.meshgrid(
            np.linspace(2.9, 7, 500).reshape(-1, 1),
            np.linspace(0.8, 2.7, 200).reshape(-1, 1),
        )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    
    y_proba = log_reg.predict_proba(X_new)
    
    plt.figure(figsize=(10, 4))
    plt.plot(X[y==0, 0], X[y==0, 1], "bs")
    plt.plot(X[y==1, 0], X[y==1, 1], "g^")
    
    zz = y_proba[:, 1].reshape(x0.shape)
    contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)
    
    
    left_right = np.array([2.9, 7])
    boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]
    
    plt.clabel(contour, inline=1, fontsize=12)
    plt.plot(left_right, boundary, "k--", linewidth=3)
    plt.text(3.5, 1.5, "Not Iris-Virginica", fontsize=14, color="b", ha="center")
    plt.text(6.5, 2.3, "Iris-Virginica", fontsize=14, color="g", ha="center")
    plt.xlabel("Petal length", fontsize=14)
    plt.ylabel("Petal width", fontsize=14)
    plt.axis([2.9, 7, 0.8, 2.7])
    save_fig("logistic_regression_contour_plot")
    plt.show()
    
    X = iris["data"][:, (2, 3)]  # petal length, petal width
    y = iris["target"]
    
    softmax_reg = LogisticRegression(multi_class="multinomial",solver="lbfgs", C=10, random_state=42)
    softmax_reg.fit(X, y)
    
    x0, x1 = np.meshgrid(
            np.linspace(0, 8, 500).reshape(-1, 1),
            np.linspace(0, 3.5, 200).reshape(-1, 1),
        )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    
    y_proba = softmax_reg.predict_proba(X_new)
    y_predict = softmax_reg.predict(X_new)
    
    zz1 = y_proba[:, 1].reshape(x0.shape)
    zz = y_predict.reshape(x0.shape)
    
    plt.figure(figsize=(10, 4))
    plt.plot(X[y==2, 0], X[y==2, 1], "g^", label="Iris-Virginica")
    plt.plot(X[y==1, 0], X[y==1, 1], "bs", label="Iris-Versicolor")
    plt.plot(X[y==0, 0], X[y==0, 1], "yo", label="Iris-Setosa")
    
    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
    contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)
    plt.clabel(contour, inline=1, fontsize=12)
    plt.xlabel("Petal length", fontsize=14)
    plt.ylabel("Petal width", fontsize=14)
    plt.legend(loc="center left", fontsize=14)
    plt.axis([0, 7, 0, 3.5])
    save_fig("softmax_regression_contour_plot")
    plt.show()
    
    print(softmax_reg.predict([[5, 2]]))
    print(softmax_reg.predict_proba([[5, 2]]))