juanse77.github.io

Predictor de precios inmobiliarios:¶

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LassoCV

import mlflow

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV

import pickle
from sklearn.metrics import RocCurveDisplay, roc_curve
In [2]:
mlflow_client = mlflow.client.MlflowClient("http://127.0.0.1:5000")
mlflow.set_tracking_uri("http://127.0.0.1:5000")
In [3]:
mlflow.set_experiment("house-sales-2.0")
Out[3]:
<Experiment: artifact_location='./mlruns/4', creation_time=1664730900366, experiment_id='4', last_update_time=1664730900366, lifecycle_stage='active', name='house-sales-2.0', tags={}>

Exploración de datos:¶

In [4]:
df = pd.read_csv("train.csv")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB

Existen varios campos con datos perdidos, aunque al explorar la descripción de los datos nos encontramos con que algunos datos perdidos tienen significado; como por ejemplo PoolQC, en el que el campo NA significa que el inmueble no tiene piscina. Por este motivo se recodificarán los campos de este tipo con el valor por defecto 'no_item' en el momento de las imputaciones. En los otros casos imputaremos a la moda los campos categoricos y a la mediana los campos numéricos.

In [5]:
seed = np.random.randint(1, 10000)

Imputaciones:¶

  • El campo MSSubClass lo imputamos a la moda.
  • El campo MSZoning lo imputaremos a la moda.
  • El campo LotFrontage lo imputaremos a la mediana.
  • El campo LotArea lo imputaremos a la mediana.
  • El campo Street lo imputaremos a la moda.
  • El campo Alley lo imputaremos a no_item.
  • El campo LotShape lo imputaremos a la moda.
  • El campo LandContour lo imputaremos a la moda.
  • El campo Utilities lo imputaremos a la moda.
  • El campo LotConfig lo imputaremos a la moda.
  • El campo LandSlope lo imputamos a la moda.
  • El campo Neighborhood lo imputamos a la moda.
  • El campo Condition1 lo imputamos a la moda.
  • El campo Condition2 lo imputamos a la moda.
  • El campo BldgType lo imputamos a la moda.
  • El campo HouseStyle lo imputmos a la moda.
  • El campo OverallQual lo imputamos a la mediana.
  • El campo OverallCond lo imputamos a la mediana.
  • El campo YearBuilt lo imputamos a la mediana.
  • El campo YearRemodAdd lo imputamos a la mediana.
  • El campo RoofStyle lo imputamos a moda.
  • El campo RoofMatl lo imputamos a moda.
  • El campo Exterior1st lo imputamos a la moda.
  • El campo Exterior2nd lo imputamos a la moda.
  • El campo MasVnrType lo imputaremos a la moda.
  • El campo MasVnrArea lo imputaremos a la mediana.
  • El campo ExterQual lo imputamos a la moda.
  • El campo ExterCond lo imputamos a la moda.
  • El campo Foundation lo imputamos a la moda.
  • El campo BsmtQual lo imputaremos a no_item.
  • El campo BsmtCond lo imputaremos a no_item.
  • El campo BsmtExposure lo imputaremos a no_item.
  • El campo BsmtFinType1 lo imputaremos a no_item.
  • El campo BsmtFinSF1 lo imputamos a la mediana.
  • El campo BsmtFinType2 lo imputaremos a no_item.
  • El campo BsmtFinSF2 lo imputamos a la mediana.
  • El campo BsmtUnfSF lo imputamos a la mediana.
  • El campo TotalBsmtSF lo imputamos a la mediana.
  • El campo Heating lo imputamos a la moda.
  • El campo HeatingQC lo imputamos a la moda.
  • El campo CentralAir lo imputamos a la moda.
  • El campo Electrical lo imputaremos a la moda.
  • El campo 1stFlrSF lo imputamos a la mediana.
  • El campo 2ndFlrSF lo imputamos a la mediana.
  • El campo LowQualFinSF lo imputamos a la mediana.
  • El campo GrLivArea lo imputamos a la mediana.
  • El campo BsmtFullBath lo imputamos a la mediana.
  • El campo BsmtHalfBath lo imputamos a la mediana.
  • El campo FullBath lo imputamos a la mediana.
  • El campo HalfBath lo imputamos a la mediana.
  • El campo BedroomAbvGr lo imputamos a la mediana.
  • El campo KitchenAbvGr lo imputamos a la mediana.
  • El campo KitchenQual lo imputamos a la moda.
  • El campo TotRmsAbvGrd lo imputamos a la mediana.
  • El campo Functional lo imputamos a moda.
  • El campo Fireplaces lo imputamos a la mediana.
  • El campo FireplaceQu lo imputaremos a no_item.
  • El campo GarageType lo imputaremos a no_item.
  • El campo GarageYrBlt lo imputaremos a la mediana.
  • El campo GarageFinish lo imputaremos a no_item.
  • El campo GarageQual lo imputaremos a no_item.
  • El campo GarageCars lo imputeremos a la mediana.
  • El campo GarageArea lo imputamos a la mediana.
  • El campo GarageQual lo imputamos a no_item.
  • El campo GarageCond lo imputaremos a no_item.
  • El campo PavedDrive lo imputamos a la moda.
  • El campo WookDeckSF lo imputamos a la mediana.
  • El campo OpenPorchSF lo imputamos la mediana.
  • El campo EnclosedPorch lo imputamos a la mediana.
  • El campo 3SsnPorch lo imputamos a la mediana.
  • El campo ScreenPorch lo imputamos a la mediana.
  • El campo PoolArea lo imputamos a la mediana.
  • El campo PoolQC lo imputamos a no_item.
  • El campo Fence lo imputamos a no_item.
  • El campo MiscFeature lo imputamos a no_item.
  • El campo MiscVal lo imputamos a la mediana.
  • El campo MoSold lo imputamos a la moda.
  • El campo YrSold lo imputamos a la mediana.
  • El campo SaleType lo imputamos a la moda.
  • El campo SaleCondition lo imputamos a la moda.
In [6]:
y = df.pop('SalePrice')

X = df
X = X.drop("Id", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
In [7]:
modas = ['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
         'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 
         'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir',
         'Electrical', 'KitchenQual', 'Functional', 'PavedDrive', 'MoSold', 'SaleType', 'SaleCondition']

medianas = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
            'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
            'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',  'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
            'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold']

constantes = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
              'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
In [8]:
class CustomImputer(BaseEstimator, TransformerMixin):
    
    def __init__(self, fields):
        self.fields = fields
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return self.__impute(X)
    
    def fit_transform(self, X, y=None):
        return self.__impute(X)
    
    def __impute(self, X):
        X_out = X.copy()
        
        s_imp = SimpleImputer(strategy='most_frequent')
        X_out.loc[:, self.fields['modas']] = s_imp.fit_transform(X.loc[:, self.fields['modas']])
        
        s_imp = SimpleImputer(strategy='median')
        X_out.loc[:, self.fields['medianas']] = s_imp.fit_transform(X.loc[:, self.fields['medianas']])
        
        s_imp = SimpleImputer(strategy='constant', fill_value='no-item')        
        X_out.loc[:, self.fields['constantes']] = s_imp.fit_transform(X.loc[:, self.fields['constantes']])
        
        return X_out
In [9]:
def check_all_fields(fields, columns):
    
    error_fields = [f for f in fields if f not in columns]
    error_fields.extend([f for f in columns if f not in fields])
    
    if len(error_fields) > 0:
        print(error_fields)
    else:
        print("Fields ok")
In [10]:
num_fields = list(X_train.select_dtypes(exclude=['object']).columns)
str_fields = list(X_train.select_dtypes(include=['object']).columns)

num_fields.remove('MSSubClass')
str_fields.append('MSSubClass')

idx_num_fields = [i for i, f in enumerate(X_train.columns) if f in num_fields]
idx_str_fields = [i for i, f in enumerate(X_train.columns) if f in str_fields]
In [11]:
fields_ct_impute = modas + medianas + constantes
fields_ct_transform = num_fields + str_fields

check_all_fields(fields_ct_impute, X_train.columns)
check_all_fields(fields_ct_transform, X_train.columns)
Fields ok
Fields ok
In [12]:
ct_transform = ColumnTransformer([
    ("ohe", OneHotEncoder(handle_unknown='ignore'), str_fields),
    ("nums", MinMaxScaler(), num_fields),
])
In [13]:
params = {
    "n_estimators": 500,
    "max_depth": 10,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    "random_state": seed,
}
In [14]:
pipeline = Pipeline([
    ('imp', CustomImputer({"medianas": medianas, "modas": modas, "constantes": constantes})),
    ('transform', ct_transform),
    ('selection', SelectKBest(score_func=chi2, k=200)),
    ('model', GradientBoostingRegressor(**params)),
])
In [15]:
pipeline
Out[15]:
Pipeline(steps=[('imp',
                 CustomImputer(fields={'constantes': ['Alley', 'BsmtQual',
                                                      'BsmtCond',
                                                      'BsmtExposure',
                                                      'BsmtFinType1',
                                                      'BsmtFinType2',
                                                      'FireplaceQu',
                                                      'GarageType',
                                                      'GarageFinish',
                                                      'GarageQual',
                                                      'GarageCond', 'PoolQC',
                                                      'Fence', 'MiscFeature'],
                                       'medianas': ['LotFrontage', 'LotArea',
                                                    'OverallQual',
                                                    'OverallCond', 'YearBuilt',
                                                    'YearRemodAdd',
                                                    'MasVnrArea', 'BsmtFinSF1'...
                                                   'KitchenAbvGr',
                                                   'TotRmsAbvGrd', 'Fireplaces',
                                                   'GarageYrBlt', 'GarageCars',
                                                   'GarageArea', 'WoodDeckSF',
                                                   'OpenPorchSF',
                                                   'EnclosedPorch', '3SsnPorch', ...])])),
                ('selection',
                 SelectKBest(k=200,
                             score_func=<function chi2 at 0x000002C7871D3130>)),
                ('model',
                 GradientBoostingRegressor(learning_rate=0.01, max_depth=10,
                                           min_samples_split=5,
                                           n_estimators=500,
                                           random_state=4209))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('imp',
                 CustomImputer(fields={'constantes': ['Alley', 'BsmtQual',
                                                      'BsmtCond',
                                                      'BsmtExposure',
                                                      'BsmtFinType1',
                                                      'BsmtFinType2',
                                                      'FireplaceQu',
                                                      'GarageType',
                                                      'GarageFinish',
                                                      'GarageQual',
                                                      'GarageCond', 'PoolQC',
                                                      'Fence', 'MiscFeature'],
                                       'medianas': ['LotFrontage', 'LotArea',
                                                    'OverallQual',
                                                    'OverallCond', 'YearBuilt',
                                                    'YearRemodAdd',
                                                    'MasVnrArea', 'BsmtFinSF1'...
                                                   'KitchenAbvGr',
                                                   'TotRmsAbvGrd', 'Fireplaces',
                                                   'GarageYrBlt', 'GarageCars',
                                                   'GarageArea', 'WoodDeckSF',
                                                   'OpenPorchSF',
                                                   'EnclosedPorch', '3SsnPorch', ...])])),
                ('selection',
                 SelectKBest(k=200,
                             score_func=<function chi2 at 0x000002C7871D3130>)),
                ('model',
                 GradientBoostingRegressor(learning_rate=0.01, max_depth=10,
                                           min_samples_split=5,
                                           n_estimators=500,
                                           random_state=4209))])
CustomImputer(fields={'constantes': ['Alley', 'BsmtQual', 'BsmtCond',
                                     'BsmtExposure', 'BsmtFinType1',
                                     'BsmtFinType2', 'FireplaceQu',
                                     'GarageType', 'GarageFinish', 'GarageQual',
                                     'GarageCond', 'PoolQC', 'Fence',
                                     'MiscFeature'],
                      'medianas': ['LotFrontage', 'LotArea', 'OverallQual',
                                   'OverallCond', 'YearBuilt', 'YearRemodAdd',
                                   'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                   'BsmtUnfS...
                      'modas': ['MSSubClass', 'MSZoning', 'Street', 'LotShape',
                                'LandContour', 'Utilities', 'LotConfig',
                                'LandSlope', 'Neighborhood', 'Condition1',
                                'Condition2', 'BldgType', 'HouseStyle',
                                'RoofStyle', 'RoofMatl', 'Exterior1st',
                                'Exterior2nd', 'MasVnrType', 'ExterQual',
                                'ExterCond', 'Foundation', 'Heating',
                                'HeatingQC', 'CentralAir', 'Electrical',
                                'KitchenQual', 'Functional', 'PavedDrive',
                                'MoSold', 'SaleType', ...]})
ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                 ['MSZoning', 'Street', 'Alley', 'LotShape',
                                  'LandContour', 'Utilities', 'LotConfig',
                                  'LandSlope', 'Neighborhood', 'Condition1',
                                  'Condition2', 'BldgType', 'HouseStyle',
                                  'RoofStyle', 'RoofMatl', 'Exterior1st',
                                  'Exterior2nd', 'MasVnrType', 'ExterQual',
                                  'ExterCond', 'Foundation', 'BsmtQual',
                                  'Bsmt...
                                  'OverallCond', 'YearBuilt', 'YearRemodAdd',
                                  'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                  'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
                                  '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
                                  'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                                  'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                                  'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
                                  'GarageCars', 'GarageArea', 'WoodDeckSF',
                                  'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', ...])])
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass']
OneHotEncoder(handle_unknown='ignore')
['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
MinMaxScaler()
SelectKBest(k=200, score_func=<function chi2 at 0x000002C7871D3130>)
GradientBoostingRegressor(learning_rate=0.01, max_depth=10, min_samples_split=5,
                          n_estimators=500, random_state=4209)
In [16]:
pipeline.get_params()
Out[16]:
{'memory': None,
 'steps': [('imp',
   CustomImputer(fields={'constantes': ['Alley', 'BsmtQual', 'BsmtCond',
                                        'BsmtExposure', 'BsmtFinType1',
                                        'BsmtFinType2', 'FireplaceQu',
                                        'GarageType', 'GarageFinish', 'GarageQual',
                                        'GarageCond', 'PoolQC', 'Fence',
                                        'MiscFeature'],
                         'medianas': ['LotFrontage', 'LotArea', 'OverallQual',
                                      'OverallCond', 'YearBuilt', 'YearRemodAdd',
                                      'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                      'BsmtUnfS...
                         'modas': ['MSSubClass', 'MSZoning', 'Street', 'LotShape',
                                   'LandContour', 'Utilities', 'LotConfig',
                                   'LandSlope', 'Neighborhood', 'Condition1',
                                   'Condition2', 'BldgType', 'HouseStyle',
                                   'RoofStyle', 'RoofMatl', 'Exterior1st',
                                   'Exterior2nd', 'MasVnrType', 'ExterQual',
                                   'ExterCond', 'Foundation', 'Heating',
                                   'HeatingQC', 'CentralAir', 'Electrical',
                                   'KitchenQual', 'Functional', 'PavedDrive',
                                   'MoSold', 'SaleType', ...]})),
  ('transform',
   ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                    ['MSZoning', 'Street', 'Alley', 'LotShape',
                                     'LandContour', 'Utilities', 'LotConfig',
                                     'LandSlope', 'Neighborhood', 'Condition1',
                                     'Condition2', 'BldgType', 'HouseStyle',
                                     'RoofStyle', 'RoofMatl', 'Exterior1st',
                                     'Exterior2nd', 'MasVnrType', 'ExterQual',
                                     'ExterCond', 'Foundation', 'BsmtQual',
                                     'Bsmt...
                                     'OverallCond', 'YearBuilt', 'YearRemodAdd',
                                     'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                     'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
                                     '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
                                     'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                                     'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                                     'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
                                     'GarageCars', 'GarageArea', 'WoodDeckSF',
                                     'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', ...])])),
  ('selection',
   SelectKBest(k=200, score_func=<function chi2 at 0x000002C7871D3130>)),
  ('model',
   GradientBoostingRegressor(learning_rate=0.01, max_depth=10, min_samples_split=5,
                             n_estimators=500, random_state=4209))],
 'verbose': False,
 'imp': CustomImputer(fields={'constantes': ['Alley', 'BsmtQual', 'BsmtCond',
                                      'BsmtExposure', 'BsmtFinType1',
                                      'BsmtFinType2', 'FireplaceQu',
                                      'GarageType', 'GarageFinish', 'GarageQual',
                                      'GarageCond', 'PoolQC', 'Fence',
                                      'MiscFeature'],
                       'medianas': ['LotFrontage', 'LotArea', 'OverallQual',
                                    'OverallCond', 'YearBuilt', 'YearRemodAdd',
                                    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                    'BsmtUnfS...
                       'modas': ['MSSubClass', 'MSZoning', 'Street', 'LotShape',
                                 'LandContour', 'Utilities', 'LotConfig',
                                 'LandSlope', 'Neighborhood', 'Condition1',
                                 'Condition2', 'BldgType', 'HouseStyle',
                                 'RoofStyle', 'RoofMatl', 'Exterior1st',
                                 'Exterior2nd', 'MasVnrType', 'ExterQual',
                                 'ExterCond', 'Foundation', 'Heating',
                                 'HeatingQC', 'CentralAir', 'Electrical',
                                 'KitchenQual', 'Functional', 'PavedDrive',
                                 'MoSold', 'SaleType', ...]}),
 'transform': ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                  ['MSZoning', 'Street', 'Alley', 'LotShape',
                                   'LandContour', 'Utilities', 'LotConfig',
                                   'LandSlope', 'Neighborhood', 'Condition1',
                                   'Condition2', 'BldgType', 'HouseStyle',
                                   'RoofStyle', 'RoofMatl', 'Exterior1st',
                                   'Exterior2nd', 'MasVnrType', 'ExterQual',
                                   'ExterCond', 'Foundation', 'BsmtQual',
                                   'Bsmt...
                                   'OverallCond', 'YearBuilt', 'YearRemodAdd',
                                   'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                   'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
                                   '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
                                   'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                                   'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                                   'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
                                   'GarageCars', 'GarageArea', 'WoodDeckSF',
                                   'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', ...])]),
 'selection': SelectKBest(k=200, score_func=<function chi2 at 0x000002C7871D3130>),
 'model': GradientBoostingRegressor(learning_rate=0.01, max_depth=10, min_samples_split=5,
                           n_estimators=500, random_state=4209),
 'imp__fields': {'medianas': ['LotFrontage',
   'LotArea',
   'OverallQual',
   'OverallCond',
   'YearBuilt',
   'YearRemodAdd',
   'MasVnrArea',
   'BsmtFinSF1',
   'BsmtFinSF2',
   'BsmtUnfSF',
   'TotalBsmtSF',
   '1stFlrSF',
   '2ndFlrSF',
   'LowQualFinSF',
   'GrLivArea',
   'BsmtFullBath',
   'BsmtHalfBath',
   'FullBath',
   'HalfBath',
   'BedroomAbvGr',
   'KitchenAbvGr',
   'TotRmsAbvGrd',
   'Fireplaces',
   'GarageYrBlt',
   'GarageCars',
   'GarageArea',
   'WoodDeckSF',
   'OpenPorchSF',
   'EnclosedPorch',
   '3SsnPorch',
   'ScreenPorch',
   'PoolArea',
   'MiscVal',
   'YrSold'],
  'modas': ['MSSubClass',
   'MSZoning',
   'Street',
   'LotShape',
   'LandContour',
   'Utilities',
   'LotConfig',
   'LandSlope',
   'Neighborhood',
   'Condition1',
   'Condition2',
   'BldgType',
   'HouseStyle',
   'RoofStyle',
   'RoofMatl',
   'Exterior1st',
   'Exterior2nd',
   'MasVnrType',
   'ExterQual',
   'ExterCond',
   'Foundation',
   'Heating',
   'HeatingQC',
   'CentralAir',
   'Electrical',
   'KitchenQual',
   'Functional',
   'PavedDrive',
   'MoSold',
   'SaleType',
   'SaleCondition'],
  'constantes': ['Alley',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'FireplaceQu',
   'GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']},
 'transform__n_jobs': None,
 'transform__remainder': 'drop',
 'transform__sparse_threshold': 0.3,
 'transform__transformer_weights': None,
 'transform__transformers': [('ohe',
   OneHotEncoder(handle_unknown='ignore'),
   ['MSZoning',
    'Street',
    'Alley',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'ExterQual',
    'ExterCond',
    'Foundation',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Heating',
    'HeatingQC',
    'CentralAir',
    'Electrical',
    'KitchenQual',
    'Functional',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PavedDrive',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'SaleType',
    'SaleCondition',
    'MSSubClass']),
  ('nums',
   MinMaxScaler(),
   ['LotFrontage',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'MoSold',
    'YrSold'])],
 'transform__verbose': False,
 'transform__verbose_feature_names_out': True,
 'transform__ohe': OneHotEncoder(handle_unknown='ignore'),
 'transform__nums': MinMaxScaler(),
 'transform__ohe__categories': 'auto',
 'transform__ohe__drop': None,
 'transform__ohe__dtype': numpy.float64,
 'transform__ohe__handle_unknown': 'ignore',
 'transform__ohe__max_categories': None,
 'transform__ohe__min_frequency': None,
 'transform__ohe__sparse': True,
 'transform__nums__clip': False,
 'transform__nums__copy': True,
 'transform__nums__feature_range': (0, 1),
 'selection__k': 200,
 'selection__score_func': <function sklearn.feature_selection._univariate_selection.chi2(X, y)>,
 'model__alpha': 0.9,
 'model__ccp_alpha': 0.0,
 'model__criterion': 'friedman_mse',
 'model__init': None,
 'model__learning_rate': 0.01,
 'model__loss': 'squared_error',
 'model__max_depth': 10,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 5,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 500,
 'model__n_iter_no_change': None,
 'model__random_state': 4209,
 'model__subsample': 1.0,
 'model__tol': 0.0001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}
In [17]:
search = GridSearchCV(
    estimator=pipeline, 
    param_grid={
        'selection__k': [120, 160, 200, 240],
        'model__max_depth': [6, 8, 10, 12],
        'model__min_samples_split': [2, 4, 6],
        'model__n_estimators': [200, 400, 600]
    },
    verbose=3
)
search.fit(X_train, y_train)
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Out[17]:
GridSearchCV(estimator=Pipeline(steps=[('imp',
                                        CustomImputer(fields={'constantes': ['Alley',
                                                                             'BsmtQual',
                                                                             'BsmtCond',
                                                                             'BsmtExposure',
                                                                             'BsmtFinType1',
                                                                             'BsmtFinType2',
                                                                             'FireplaceQu',
                                                                             'GarageType',
                                                                             'GarageFinish',
                                                                             'GarageQual',
                                                                             'GarageCond',
                                                                             'PoolQC',
                                                                             'Fence',
                                                                             'MiscFeature'],
                                                              'medianas': ['LotFrontage',
                                                                           'LotArea',
                                                                           'OverallQual',
                                                                           'OverallCond',
                                                                           'YearBuilt',
                                                                           'YearRemodAdd',
                                                                           'M...
                                        SelectKBest(k=200,
                                                    score_func=<function chi2 at 0x000002C7871D3130>)),
                                       ('model',
                                        GradientBoostingRegressor(learning_rate=0.01,
                                                                  max_depth=10,
                                                                  min_samples_split=5,
                                                                  n_estimators=500,
                                                                  random_state=4209))]),
             param_grid={'model__max_depth': [6, 8, 10, 12],
                         'model__min_samples_split': [2, 4, 6],
                         'model__n_estimators': [200, 400, 600],
                         'selection__k': [120, 160, 200, 240]},
             verbose=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=Pipeline(steps=[('imp',
                                        CustomImputer(fields={'constantes': ['Alley',
                                                                             'BsmtQual',
                                                                             'BsmtCond',
                                                                             'BsmtExposure',
                                                                             'BsmtFinType1',
                                                                             'BsmtFinType2',
                                                                             'FireplaceQu',
                                                                             'GarageType',
                                                                             'GarageFinish',
                                                                             'GarageQual',
                                                                             'GarageCond',
                                                                             'PoolQC',
                                                                             'Fence',
                                                                             'MiscFeature'],
                                                              'medianas': ['LotFrontage',
                                                                           'LotArea',
                                                                           'OverallQual',
                                                                           'OverallCond',
                                                                           'YearBuilt',
                                                                           'YearRemodAdd',
                                                                           'M...
                                        SelectKBest(k=200,
                                                    score_func=<function chi2 at 0x000002C7871D3130>)),
                                       ('model',
                                        GradientBoostingRegressor(learning_rate=0.01,
                                                                  max_depth=10,
                                                                  min_samples_split=5,
                                                                  n_estimators=500,
                                                                  random_state=4209))]),
             param_grid={'model__max_depth': [6, 8, 10, 12],
                         'model__min_samples_split': [2, 4, 6],
                         'model__n_estimators': [200, 400, 600],
                         'selection__k': [120, 160, 200, 240]},
             verbose=3)
Pipeline(steps=[('imp',
                 CustomImputer(fields={'constantes': ['Alley', 'BsmtQual',
                                                      'BsmtCond',
                                                      'BsmtExposure',
                                                      'BsmtFinType1',
                                                      'BsmtFinType2',
                                                      'FireplaceQu',
                                                      'GarageType',
                                                      'GarageFinish',
                                                      'GarageQual',
                                                      'GarageCond', 'PoolQC',
                                                      'Fence', 'MiscFeature'],
                                       'medianas': ['LotFrontage', 'LotArea',
                                                    'OverallQual',
                                                    'OverallCond', 'YearBuilt',
                                                    'YearRemodAdd',
                                                    'MasVnrArea', 'BsmtFinSF1'...
                                                   'KitchenAbvGr',
                                                   'TotRmsAbvGrd', 'Fireplaces',
                                                   'GarageYrBlt', 'GarageCars',
                                                   'GarageArea', 'WoodDeckSF',
                                                   'OpenPorchSF',
                                                   'EnclosedPorch', '3SsnPorch', ...])])),
                ('selection',
                 SelectKBest(k=200,
                             score_func=<function chi2 at 0x000002C7871D3130>)),
                ('model',
                 GradientBoostingRegressor(learning_rate=0.01, max_depth=10,
                                           min_samples_split=5,
                                           n_estimators=500,
                                           random_state=4209))])
CustomImputer(fields={'constantes': ['Alley', 'BsmtQual', 'BsmtCond',
                                     'BsmtExposure', 'BsmtFinType1',
                                     'BsmtFinType2', 'FireplaceQu',
                                     'GarageType', 'GarageFinish', 'GarageQual',
                                     'GarageCond', 'PoolQC', 'Fence',
                                     'MiscFeature'],
                      'medianas': ['LotFrontage', 'LotArea', 'OverallQual',
                                   'OverallCond', 'YearBuilt', 'YearRemodAdd',
                                   'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                   'BsmtUnfS...
                      'modas': ['MSSubClass', 'MSZoning', 'Street', 'LotShape',
                                'LandContour', 'Utilities', 'LotConfig',
                                'LandSlope', 'Neighborhood', 'Condition1',
                                'Condition2', 'BldgType', 'HouseStyle',
                                'RoofStyle', 'RoofMatl', 'Exterior1st',
                                'Exterior2nd', 'MasVnrType', 'ExterQual',
                                'ExterCond', 'Foundation', 'Heating',
                                'HeatingQC', 'CentralAir', 'Electrical',
                                'KitchenQual', 'Functional', 'PavedDrive',
                                'MoSold', 'SaleType', ...]})
ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                 ['MSZoning', 'Street', 'Alley', 'LotShape',
                                  'LandContour', 'Utilities', 'LotConfig',
                                  'LandSlope', 'Neighborhood', 'Condition1',
                                  'Condition2', 'BldgType', 'HouseStyle',
                                  'RoofStyle', 'RoofMatl', 'Exterior1st',
                                  'Exterior2nd', 'MasVnrType', 'ExterQual',
                                  'ExterCond', 'Foundation', 'BsmtQual',
                                  'Bsmt...
                                  'OverallCond', 'YearBuilt', 'YearRemodAdd',
                                  'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                  'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
                                  '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
                                  'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                                  'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                                  'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
                                  'GarageCars', 'GarageArea', 'WoodDeckSF',
                                  'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', ...])])
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass']
OneHotEncoder(handle_unknown='ignore')
['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
MinMaxScaler()
SelectKBest(k=200, score_func=<function chi2 at 0x000002C7871D3130>)
GradientBoostingRegressor(learning_rate=0.01, max_depth=10, min_samples_split=5,
                          n_estimators=500, random_state=4209)
In [18]:
train_score = search.score(X_train, y_train)
train_score
Out[18]:
0.9591508955664606
In [19]:
test_score = search.score(X_test, y_test)
test_score
Out[19]:
0.7827827279310412
In [20]:
mlflow.log_metric("train_score", train_score)
mlflow.log_metric("test_score", test_score)
In [22]:
best_params = search.best_params_
best_params
Out[22]:
{'model__max_depth': 6,
 'model__min_samples_split': 4,
 'model__n_estimators': 600,
 'selection__k': 240}
In [23]:
mlflow.log_params(best_params)
mlflow.log_param("sel_func", "chi2")
Out[23]:
'chi2'
In [24]:
file_name = "House-sales.pkl"
with open(file_name, "wb") as f:
    pickle.dump(search.best_estimator_, f)
    
mlflow.log_artifact(file_name, "model")
In [26]:
import sklearn

with open("versions.txt", "w") as f:
    f.writelines([
        f"La versión de pandas es: {pd.__version__}",
        f"La versión de sklearn es: {sklearn.__version__}",
        f"La versión de mlflow es: {mlflow.__version__}"
    ])
    
mlflow.log_artifact("versions.txt")
In [27]:
mlflow.end_run()
In [ ]: