Machine Learning Chapter 2: End-to-End Machine Learning Project

Time: 2020-6-24

Machine learning in practice: notes based on Scikit-Learn and TensorFlow

Reference: the author’s Jupyter notebook
Chapter 2 – End-to-end Machine Learning project

  1. Download data

    • Open VS Code, create a new Python file, and enter the following code. It downloads housing.tgz and extracts housing.csv into the datasets/housing directory
    import os
    import tarfile
    from six.moves import urllib
    
    DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
    HOUSING_PATH = "datasets/housing"
    HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
    
    def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
        if not os.path.isdir(housing_path):
            os.makedirs(housing_path)
        tgz_path = os.path.join(housing_path, "housing.tgz")
        urllib.request.urlretrieve(housing_url, tgz_path)
        housing_tgz = tarfile.open(tgz_path)
        housing_tgz.extractall(path=housing_path)
        housing_tgz.close()
    
    fetch_housing_data()

    After downloading, you can comment out the call to fetch_housing_data()

  2. Quick view of data structure

    • Using pandas to load data
    import pandas as pd
    def load_housing_data(housing_path=HOUSING_PATH):
        csv_path = os.path.join(housing_path, "housing.csv")
        return pd.read_csv(csv_path)

    The function returns a pandas DataFrame containing all the data

    • Call the DataFrame’s head() method to view the first five rows of data (because VS Code is used, the display differs from the book), then comment the call out
    housing = load_housing_data()
    print(housing.head())

    There are 10 attributes in total

    • With the info() method you get a quick description of the dataset, in particular the total number of rows, each attribute’s type, and the number of non-null values
      print(housing.info())

    • Use the value_counts() method to see what categories exist and how many districts belong to each category
      print(housing["ocean_proximity"].value_counts())

    • The describe() method displays a summary of the numerical attributes
      print(housing.describe())

    • Call the hist() method on the whole dataset to plot a histogram for each attribute

    import matplotlib.pyplot as plt
    housing.hist(bins=50, figsize=(50,15))
    plt.show()
  3. Create test set

    • In theory, creating a test set is simple: randomly pick some instances, typically 20% of the dataset, and set them aside:
    import numpy as np
    def split_train_test(data, test_ratio):
        shuffled_indices = np.random.permutation(len(data))
        test_set_size = int(len(data) * test_ratio)
        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]
        return data.iloc[train_indices], data.iloc[test_indices]
    
    train_set, test_set = split_train_test(housing, 0.2)
    print(len(train_set), "train +", len(test_set), "test")
    • But this is not perfect: run it again and it produces a different test set! Over time, you (or your machine learning algorithm) will get to see the whole dataset, which is exactly what you want to avoid. A common solution is to use each instance’s identifier to decide whether it goes into the test set (assuming each instance has a unique and immutable identifier)
    import hashlib
    def test_set_check(identifier,test_ratio, hash):
        return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
    
    def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
        ids = data[id_column]
        in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
        return data.loc[~in_test_set], data.loc[in_test_set]
    
    #housing_with_id = housing.reset_index()
    #housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
    #train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
    from sklearn.model_selection import train_test_split
    train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
    • Stratified sampling
    housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
    housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
    from sklearn.model_selection import StratifiedShuffleSplit
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    print(housing["income_cat"].value_counts() / len(housing))
    for set_ in (strat_train_set, strat_test_set):
        set_.drop(["income_cat"], axis=1, inplace=True)
  4. Data exploration and visualization

    • Create a copy
    housing = strat_train_set.copy()
    • Visualize geographic data
    #housing.plot(kind="scatter", x="longitude", y="latitude")
    #housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
    housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                 s=housing["population"] / 100, label="population",
                 c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
    plt.legend()
    plt.show()
    • Looking for correlations
    #corr_matrix = housing.corr()
    #print(corr_matrix["median_house_value"].sort_values(ascending=False))
    from pandas.plotting import scatter_matrix   # different from the book: scatter_matrix now lives in pandas.plotting
    attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
    scatter_matrix(housing[attributes], figsize=(12, 8))
    housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
    plt.show()
  5. Experiment with combinations of different attributes

    housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
    housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
    housing["population_per_household"]=housing["population"]/housing["households"]
    corr_matrix = housing.corr()
    print(corr_matrix["median_house_value"].sort_values(ascending=False))
  6. Prepare the data for machine learning algorithms

    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()
  7. Data cleaning: choose one of four options

    #housing.dropna(subset=["total_bedrooms"])    # option 1
    #housing.drop("total_bedrooms", axis=1)       # option 2
    #median = housing["total_bedrooms"].median()
    #housing["total_bedrooms"].fillna(median)     # option 3
    
    #Option 4: Scikit-Learn provides an imputer; here we specify that each attribute's missing values should be replaced with that attribute's median
    from sklearn.impute import SimpleImputer   # different from the book: Imputer has been replaced by SimpleImputer
    imputer = SimpleImputer(strategy="median") # create a SimpleImputer instance
    housing_num = housing.drop("ocean_proximity", axis=1)  # copy of the data without the text attribute ocean_proximity
    imputer.fit(housing_num)                   # fit the imputer instance to the training data
    #print(imputer.statistics_)
    #print(housing_num.median().values)
    X = imputer.transform(housing_num)         # replace missing values with the learned medians
    housing_tr = pd.DataFrame(X, columns=housing_num.columns)  # put the result back into a pandas DataFrame
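    #Optional sanity check (a minimal sketch): the imputed DataFrame should contain no missing values
    #print(housing_tr.isnull().sum().sum())   # expected to print 0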
  8. Handling text and categorical attributes

    #First, convert these text labels to numbers. Scikit-Learn provides the LabelEncoder transformer for this task:
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    housing_cat = housing["ocean_proximity"]
    housing_cat_encoded = encoder.fit_transform(housing_cat)
    #print(housing_cat_encoded)
    #print(encoder.classes_)
    
    #Scikit-Learn provides a OneHotEncoder that converts integer categorical values into one-hot vectors
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder()
    housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
    #print(housing_cat_1hot.toarray())
    
    #Using the LabelBinarizer class, you can perform both transformations in one shot
    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
    housing_cat_1hot = encoder.fit_transform(housing_cat)
    print(housing_cat_1hot)
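    #Note (a minimal sketch; depends on a recent Scikit-Learn version): OneHotEncoder can also encode the text column directly, without the LabelEncoder step
    #encoder = OneHotEncoder()
    #housing_cat_1hot = encoder.fit_transform(housing[["ocean_proximity"]])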
  9. Custom transformers

    from sklearn.base import BaseEstimator, TransformerMixin
    rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
    class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
        def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
            self.add_bedrooms_per_room = add_bedrooms_per_room
        def fit(self, X, y=None):
            return self    #nothing else to do
        def transform(self, X, y=None):
            rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
            population_per_household = X[:, population_ix] / X[:, household_ix]
            if self.add_bedrooms_per_room:
                bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
                return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
            else:
                return np.c_[X, rooms_per_household, population_per_household]
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)
  10. Transformation pipelines

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
        ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)
    #print(housing_num_tr)
    
    from sklearn.compose import ColumnTransformer
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    
    housing_prepared = full_pipeline.fit_transform(housing)
    #print(housing_prepared)
    #print(housing_prepared.shape)
  11. Select and train a model

    • Train a linear regression model:
    from sklearn.linear_model import LinearRegression
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    #print(lin_reg)
    #Try the example
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    #print("Predictions:", lin_reg.predict(some_data_prepared))
    #print("Labels:", list(some_labels))
    #print(some_data_prepared)
    • Use Scikit-Learn’s mean_squared_error function to measure the regression model’s RMSE on the whole training set:
    from sklearn.metrics import mean_squared_error
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    #print(lin_rmse)
    from sklearn.metrics import mean_absolute_error
    lin_mae = mean_absolute_error(housing_labels, housing_predictions)
    #print(lin_mae)
    • Let’s train a decision tree regressor.
    from sklearn.tree import DecisionTreeRegressor
    tree_reg = DecisionTreeRegressor(random_state=42)
    tree_reg.fit(housing_prepared, housing_labels)
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    #print(tree_rmse)   # the model is likely overfitting the data badly
    • Use cross-validation for a better evaluation
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    
    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())
    #display_scores(tree_rmse_scores)
    • Calculate the score of the linear regression model
    lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    #display_scores(lin_rmse_scores)
    • Random forest model
    from sklearn.ensemble import RandomForestRegressor
    forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
    forest_reg.fit(housing_prepared, housing_labels)
    housing_predictions = forest_reg.predict(housing_prepared)
    forest_mse = mean_squared_error(housing_labels, housing_predictions)
    forest_rmse = np.sqrt(forest_mse)
    #print(forest_rmse)
    from sklearn.model_selection import cross_val_score
    forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    #display_scores(forest_rmse_scores)
    scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    #print(pd.Series(np.sqrt(-scores)).describe())
  12. Fine-tune the model

  13. Grid search

    #You can use Scikit-Learn's GridSearchCV to do the exploring for you. All you need to do is tell it which hyperparameters you want to experiment with and which values to try; it will use cross-validation to evaluate all possible combinations of hyperparameter values.
    #The following code searches for the best combination of hyperparameter values for RandomForestRegressor:
    #When you have no idea what value a hyperparameter should take, a simple approach is to try consecutive powers of 10
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV
    param_grid = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, # try 12 (3×4) combinations of hyperparameters
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}, # then try 6 (2×3) combinations with bootstrap set as False
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(housing_prepared, housing_labels)
    #print(grid_search.best_params_)
    #print(grid_search.best_estimator_)
    
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)
    print(pd.DataFrame(grid_search.cv_results_))
    #Random search (a sketch follows below)
    #Ensemble methods
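    #For the random search mentioned above, a minimal sketch using Scikit-Learn's RandomizedSearchCV
    #(the parameter distributions here are illustrative assumptions, not values from these notes):
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }
    rnd_search = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                                    param_distributions=param_distribs,
                                    n_iter=10, cv=5, scoring='neg_mean_squared_error',
                                    random_state=42)
    rnd_search.fit(housing_prepared, housing_labels)
    #print(rnd_search.best_params_)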
  14. Analyzing the best model and its errors

    feature_importances = grid_search.best_estimator_.feature_importances_
    #print(feature_importances)
    #Display these importance scores next to the corresponding attribute names:
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    #cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    sorted(zip(feature_importances, attributes), reverse=True)
    #print(sorted(zip(feature_importances, attributes), reverse=True))
    #Evaluate the system on the test set
    from sklearn.metrics import mean_squared_error
    final_model = grid_search.best_estimator_
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    #print(final_rmse)
  15. Launch, monitor, and maintain the system
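
    • One common step when launching the system is to persist the fine-tuned model so the serving code can load it later; a minimal sketch using joblib (the filename is an arbitrary placeholder):
    import joblib
    joblib.dump(final_model, "my_california_housing_model.pkl")    # save the trained model to disk
    #loaded_model = joblib.load("my_california_housing_model.pkl") # reload it, e.g. in the production service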