Catboost + merge models 1.5562589231189894

ottogin · Sep 25, 2021 · 39213fc · 39213fc
1 parent 0dfd16b
commit 39213fc
Show file tree

Hide file tree

Showing 4 changed files with 193 additions and 38 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,6 @@
 *.pyc
 train.log
+data/train.csv
+data/train_trunc.csv
+catboost_info/
+raif_hack.zip
diff --git a/raif_hack/model.py b/raif_hack/model.py
@@ -5,6 +5,7 @@
 import logging
 
 from lightgbm import LGBMRegressor
+from catboost import CatBoostRegressor
 
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
@@ -92,7 +93,6 @@ def fit(
         y_offer: pd.Series,
         X_manual: pd.DataFrame,
         y_manual: pd.Series,
-        weight,
     ):
         """Обучение модели.
         ML модель обучается на данных по предложениям на рынке (цены из объявления)
@@ -108,8 +108,7 @@ def fit(
             X_offer,
             y_offer,
             model__feature_name=[f"{i}" for i in range(70)],
-            model__categorical_feature=["67", "68", "69"], 
-            model__sample_weight=weight
+            model__categorical_feature=["67", "68", "69"],
         )
         logger.info("Find corr coefficient")
         self._find_corr_coefficient(X_manual, y_manual)
@@ -154,7 +153,7 @@ def load(self, path: str):
         return model
 
 
-class TwoStepBenchmarkModel:
+class TwoStepBenchmarkModel(BenchmarkModel):
     """
     Модель представляет из себя sklearn pipeline. Пошаговый алгоритм:
       1) в качестве обучения выбираются все данные с price_type=0
@@ -210,8 +209,12 @@ def __init__(
             ]
         )
 
-        self.model =  LGBMRegressor(**model_params)
-        self.model2 = LGBMRegressor(**model_params)
+        # params =
+
+        # logger.info("Init with ")
+
+        self.model = CatBoostRegressor()
+        self.model2 = CatBoostRegressor()
 
         self.pipeline = Pipeline(
             steps=[("preprocessor", self.preprocessor), ("model", self.model)]
@@ -236,8 +239,11 @@ def fit(
         self.pipeline.fit(
             X_offer,
             y_offer,
-            model__feature_name=NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES,
-            model__categorical_feature=CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES,
+            # model__feature_name=NUM_FEATURES
+            # + CATEGORICAL_OHE_FEATURES
+            # + CATEGORICAL_STE_FEATURES,
+            # model__categorical_feature=CATEGORICAL_OHE_FEATURES
+            # + CATEGORICAL_STE_FEATURES,
         )
 
         killer_f = self.pipeline.predict(X_manual)
@@ -247,8 +253,12 @@ def fit(
         self.pipeline2.fit(
             X_manual,
             y_manual,
-            model__feature_name=NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES + ["killer_f"],
-            model__categorical_feature=CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES,
+            # model__feature_name=NUM_FEATURES
+            # + CATEGORICAL_OHE_FEATURES
+            # + CATEGORICAL_STE_FEATURES
+            # + ["killer_f"],
+            # model__categorical_feature=CATEGORICAL_OHE_FEATURES
+            # + CATEGORICAL_STE_FEATURES,
         )
 
         self.__is_fitted = True
@@ -273,21 +283,160 @@ def predict(self, X: pd.DataFrame) -> np.array:
                 )
             )
 
-    def save(self, path: str):
-        """Сериализует модель в pickle.
 
-        :param path: str, путь до файла
-        """
-        with open(path, "wb") as f:
-            pickle.dump(self, f)
+class WeightedTwoStepModel(BenchmarkModel):
+    """
+    Модель представляет из себя sklearn pipeline. Пошаговый алгоритм:
+      1) в качестве обучения выбираются все данные с price_type=0
+      1) все фичи делятся на три типа (numerical_features, ohe_categorical_features, ste_categorical_features):
+          1.1) numerical_features - применяется StandardScaler
+          1.2) ohe_categorical_featires - кодируются через one hot encoding
+          1.3) ste_categorical_features - кодируются через SmoothedTargetEncoder
+      2) после этого все полученные фичи конкатенируются в одно пространство фичей и подаются на вход модели Lightgbm
+      3) делаем предикт на данных с price_type=1, считаем среднее отклонение реальных значений от предикта. Вычитаем это отклонение на финальном шаге (чтобы сместить отклонение к 0)
 
-    @classmethod
-    def load(self, path: str):
-        """Сериализует модель в pickle.
+    :param numerical_features: list, список численных признаков из датафрейма
+    :param ohe_categorical_features: list, список категориальных признаков для one hot encoding
+    :param ste_categorical_features, list, список категориальных признаков для smoothed target encoding.
+                                     Можно кодировать сразу несколько полей (например объединять категориальные признаки)
+    :
+    """
 
-        :param path: str, путь до файла
-        :return: Модель
+    def __init__(
+        self,
+        numerical_features: typing.List[str],
+        ohe_categorical_features: typing.List[str],
+        ste_categorical_features: typing.List[typing.Union[str, typing.List[str]]],
+        model_params: typing.Dict[str, typing.Union[str, int, float]],
+    ):
+        """Build the two preprocessing + CatBoost pipelines.
+
+        :param numerical_features: numeric columns, scaled with StandardScaler
+        :param ohe_categorical_features: categorical columns, one-hot encoded
+        :param ste_categorical_features: categorical columns, ordinal-encoded
+            (categories unseen during fit are mapped to -1)
+        :param model_params: accepted so the constructor signature matches the
+            other models; it is NOT forwarded -- both CatBoost regressors are
+            created with their default parameters
+        """
+        self.num_features = numerical_features
+        self.ohe_cat_features = ohe_categorical_features
+        self.ste_cat_features = ste_categorical_features
+
+        # First-stage preprocessing: raw feature columns only.
+        self.preprocessor = ColumnTransformer(
+            transformers=[
+                ("num", StandardScaler(), self.num_features),
+                ("ohe", OneHotEncoder(), self.ohe_cat_features),
+                (
+                    "ste",
+                    OrdinalEncoder(
+                        handle_unknown="use_encoded_value", unknown_value=-1
+                    ),
+                    self.ste_cat_features,
+                ),
+            ]
+        )
+        # Second-stage preprocessing: same columns plus "killer_f", the
+        # first-stage prediction that fit()/predict() add to the frame.
+        self.preprocessor2 = ColumnTransformer(
+            transformers=[
+                ("num", StandardScaler(), self.num_features + ["killer_f"]),
+                ("ohe", OneHotEncoder(), self.ohe_cat_features),
+                (
+                    "ste",
+                    OrdinalEncoder(
+                        handle_unknown="use_encoded_value", unknown_value=-1
+                    ),
+                    self.ste_cat_features,
+                ),
+            ]
+        )
+
+        # Previous LightGBM configuration, kept for reference only.
+        # self.model = LGBMRegressor(
+        #     n_estimators=1000,
+        #     learning_rate=0.01,
+        #     reg_alpha=1,
+        #     num_leaves=40,
+        #     min_child_samples=5,
+        #     importance_type="gain",
+        #     n_jobs=4,
+        #     random_state=563,
+        # )
+        # self.model2 = LGBMRegressor(
+        #     n_estimators=1000,
+        #     learning_rate=0.01,
+        #     reg_alpha=1,
+        #     num_leaves=40,
+        #     min_child_samples=5,
+        #     importance_type="gain",
+        #     n_jobs=4,
+        #     random_state=213,
+        # )
+
+        self.model = CatBoostRegressor()
+        self.model2 = CatBoostRegressor()
+
+        self.pipeline = Pipeline(
+            steps=[("preprocessor", self.preprocessor), ("model", self.model)]
+        )
+
+        self.pipeline2 = Pipeline(
+            steps=[("preprocessor", self.preprocessor2), ("model", self.model2)]
+        )
+
+        # Must be spelled with two underscores: fit() and predict() use
+        # self.__is_fitted, which Python name-mangles per class. Initialising
+        # "_is_fitted" instead left the mangled attribute undefined, so
+        # predict() before fit() raised AttributeError, not NotFittedError.
+        self.__is_fitted = False
+        self.corr_coef = 0
+
+    def fit(
+        self,
+        X_offer: pd.DataFrame,
+        y_offer: pd.Series,
+        X_manual: pd.DataFrame,
+        y_manual: pd.Series,
+        offer_weight: float = 0.05,
+    ):
+        """Train both stages of the model.
+
+        Stage 1 is fitted on offer and manual rows together, with per-row
+        sample weights: ``offer_weight`` for offer rows and
+        ``1 - offer_weight`` for manual rows. Stage 2 is fitted on the manual
+        rows only, with the stage-1 prediction added as the ``killer_f``
+        column.
+
+        :param X_offer: pd.DataFrame, features of rows with offer prices
+        :param y_offer: pd.Series, targets for ``X_offer``
+        :param X_manual: pd.DataFrame, features of rows with manual prices
+        :param y_manual: pd.Series, targets for ``X_manual``
+        :param offer_weight: float, sample weight given to every offer row;
+            manual rows get ``1 - offer_weight`` (default 0.05)
+        """
+        logger.info("Fit catboost")
+
+        X = pd.concat([X_offer, X_manual])
+        y = pd.concat([y_offer, y_manual])
+
+        weight = np.full(len(y), offer_weight, dtype=float)
+        # Manual rows are the tail of the concatenation. Index from the front:
+        # a negative slice such as weight[-len(y_manual):] degenerates to
+        # weight[0:] (every row) when y_manual is empty.
+        weight[len(y_offer) :] = 1 - offer_weight
+
+        self.pipeline.fit(
+            X,
+            y,
+            model__sample_weight=weight,
+        )
+
+        # NOTE(review): X_manual was part of the stage-1 training data, so
+        # these are in-sample predictions -- confirm this is intended.
+        killer_f = self.pipeline.predict(X_manual)
+        # Copy so the caller's frame does not gain the extra column.
+        X_manual = X_manual.copy()
+        X_manual["killer_f"] = killer_f
+
+        logger.info("Fit catboost 2")
+
+        self.pipeline2.fit(
+            X_manual,
+            y_manual,
+        )
+
+        self.__is_fitted = True
+
+    def predict(self, X: pd.DataFrame) -> np.array:
+        """Predict with the two-stage model.
+
+        The first pipeline's prediction is added to a copy of ``X`` as the
+        ``killer_f`` column; the second pipeline's prediction on that frame
+        is returned unchanged.
+
+        :param X: pd.DataFrame
+        :return: np.array, output of the second pipeline
+        :raises NotFittedError: if the fitted flag is falsy
         """
-        with open(path, "rb") as f:
-            model = pickle.load(f)
-        return model
+        # NOTE(review): __is_fitted is name-mangled per class; the lookup
+        # raises AttributeError if nothing has assigned that exact name yet.
+        if self.__is_fitted:
+            killer_f = self.pipeline.predict(X)
+            X = X.copy()
+            X["killer_f"] = killer_f
+            price = self.pipeline2.predict(X)
+            return price
+        else:
+            raise NotFittedError(
+                "This {} instance is not fitted yet! Call 'fit' with appropriate arguments before predict".format(
+                    type(self).__name__
+                )
+            )
diff --git a/raif_hack/settings.py b/raif_hack/settings.py
@@ -83,7 +83,7 @@
     num_leaves=40,
     min_child_samples=5,
     importance_type="gain",
-    n_jobs=1,
+    n_jobs=4,
     random_state=563,
 )
 

diff --git a/train.py b/train.py
@@ -4,7 +4,7 @@
 import numpy as np
 from traceback import format_exc
 
-from raif_hack.model import BenchmarkModel, TwoStepBenchmarkModel
+from raif_hack.model import *
 from raif_hack.settings import (
     MODEL_PARAMS,
     LOGGING_CONFIG,
@@ -35,7 +35,6 @@ def parse_args():
         formatter_class=argparse.RawTextHelpFormatter,
     )
 
-
     parser.add_argument("--val", action="store_true")
 
     parser.add_argument(
@@ -53,40 +52,44 @@ def parse_args():
 if __name__ == "__main__":
 
     try:
-        logger.info("START train.py")
+        # for w in [0.0001, 0.0002, 0.0003, 0.0004, 0.0005,
+        #       0.001, 0.002, 0.003, 0.004, 0.005,
+        #       0.01, 0.02, 0.03, 0.04, 0.05,
+        #       0.1, 0.2, 0.3, 0.4]:
+        logger.info("START train.py with")
         args = vars(parse_args())
 
-        train_path = 'data/train.csv'
+        train_path = "data/train.csv"
         if args["val"]:
-            train_path = 'data/train_trunc.csv'
+            train_path = "data/train_trunc.csv"
 
         logger.info("Load train df from %s" % train_path)
         train_df = pd.read_csv(train_path)
         logger.info(f"Input shape: {train_df.shape}")
         train_df = prepare_categorical(train_df)
 
-        X_offer = train_df[NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
-        y_offer = train_df[TARGET]
-        weight = np.ones_like(train_df['price_type'].values)*0.9
-        weight[train_df['price_type'].values == 0] = 0.1
+        X_offer = train_df[train_df.price_type == PriceTypeEnum.OFFER_PRICE][
+            NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES
+        ]
+        y_offer = train_df[train_df.price_type == PriceTypeEnum.OFFER_PRICE][TARGET]
+
         X_manual = train_df[train_df.price_type == PriceTypeEnum.MANUAL_PRICE][
             NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES
         ]
         y_manual = train_df[train_df.price_type == PriceTypeEnum.MANUAL_PRICE][TARGET]
         logger.info(
             f"X_offer {X_offer.shape}  y_offer {y_offer.shape}\tX_manual {X_manual.shape} y_manual {y_manual.shape}"
         )
-        model = TwoStepBenchmarkModel(
+        model = WeightedTwoStepModel(
             numerical_features=NUM_FEATURES,
             ohe_categorical_features=CATEGORICAL_OHE_FEATURES,
             ste_categorical_features=CATEGORICAL_STE_FEATURES,
             model_params=MODEL_PARAMS,
         )
         logger.info("Fit model")
-        model.fit(X_offer, y_offer, X_manual, y_manual, weight)
+        model.fit(X_offer, y_offer, X_manual, y_manual)
         logger.info("Save model")
         model.save(args["mp"])
-
         # predictions_offer = model.predict(X_offer)
         # metrics = metrics_stat(
         #     y_offer.values, predictions_offer / (1 + model.corr_coef)
@@ -108,7 +111,6 @@ def parse_args():
         metrics = metrics_stat(y_manual_val.values, predictions_manual_val)
         logger.info(f"Metrics stat for validation data with manual prices: {metrics}")
 
-
     except Exception as e:
         err = format_exc()
         logger.error(err)