Skip to content

Commit

Permalink
Catboost + merge models 1.5562589231189894
Browse files Browse the repository at this point in the history
  • Loading branch information
ottogin committed Sep 25, 2021
1 parent 0dfd16b commit 39213fc
Show file tree
Hide file tree
Showing 4 changed files with 193 additions and 38 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
*.pyc
train.log
data/train.csv
data/train_trunc.csv
catboost_info/
raif_hack.zip
197 changes: 173 additions & 24 deletions raif_hack/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
Expand Down Expand Up @@ -92,7 +93,6 @@ def fit(
y_offer: pd.Series,
X_manual: pd.DataFrame,
y_manual: pd.Series,
weight,
):
"""Обучение модели.
ML модель обучается на данных по предложениям на рынке (цены из объявления)
Expand All @@ -108,8 +108,7 @@ def fit(
X_offer,
y_offer,
model__feature_name=[f"{i}" for i in range(70)],
model__categorical_feature=["67", "68", "69"],
model__sample_weight=weight
model__categorical_feature=["67", "68", "69"],
)
logger.info("Find corr coefficient")
self._find_corr_coefficient(X_manual, y_manual)
Expand Down Expand Up @@ -154,7 +153,7 @@ def load(self, path: str):
return model


class TwoStepBenchmarkModel:
class TwoStepBenchmarkModel(BenchmarkModel):
"""
Модель представляет из себя sklearn pipeline. Пошаговый алгоритм:
1) в качестве обучения выбираются все данные с price_type=0
Expand Down Expand Up @@ -210,8 +209,12 @@ def __init__(
]
)

self.model = LGBMRegressor(**model_params)
self.model2 = LGBMRegressor(**model_params)
# params =

# logger.info("Init with ")

self.model = CatBoostRegressor()
self.model2 = CatBoostRegressor()

self.pipeline = Pipeline(
steps=[("preprocessor", self.preprocessor), ("model", self.model)]
Expand All @@ -236,8 +239,11 @@ def fit(
self.pipeline.fit(
X_offer,
y_offer,
model__feature_name=NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES,
model__categorical_feature=CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES,
# model__feature_name=NUM_FEATURES
# + CATEGORICAL_OHE_FEATURES
# + CATEGORICAL_STE_FEATURES,
# model__categorical_feature=CATEGORICAL_OHE_FEATURES
# + CATEGORICAL_STE_FEATURES,
)

killer_f = self.pipeline.predict(X_manual)
Expand All @@ -247,8 +253,12 @@ def fit(
self.pipeline2.fit(
X_manual,
y_manual,
model__feature_name=NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES + ["killer_f"],
model__categorical_feature=CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES,
# model__feature_name=NUM_FEATURES
# + CATEGORICAL_OHE_FEATURES
# + CATEGORICAL_STE_FEATURES
# + ["killer_f"],
# model__categorical_feature=CATEGORICAL_OHE_FEATURES
# + CATEGORICAL_STE_FEATURES,
)

self.__is_fitted = True
Expand All @@ -273,21 +283,160 @@ def predict(self, X: pd.DataFrame) -> np.array:
)
)

def save(self, path: str):
"""Сериализует модель в pickle.

:param path: str, путь до файла
"""
with open(path, "wb") as f:
pickle.dump(self, f)
class WeightedTwoStepModel(BenchmarkModel):
    """Two-stage CatBoost regressor trained on weighted offer and manual prices.

    Stage 1 fits a pipeline on the concatenation of the offer rows and the
    manual rows. Offer rows receive sample weight ``offer_weight`` and manual
    rows receive ``1 - offer_weight``.

    Stage 2 fits a second pipeline on the manual rows only, with the stage-1
    prediction appended as an extra numerical feature named ``killer_f``.

    Both pipelines preprocess the columns the same way:

    * numerical features are scaled with ``StandardScaler``;
    * ``ohe_categorical_features`` are encoded with ``OneHotEncoder``;
    * ``ste_categorical_features`` are encoded with ``OrdinalEncoder``
      (categories unseen during fitting are mapped to ``-1``).

    :param numerical_features: names of the numerical columns.
    :param ohe_categorical_features: names of the categorical columns to
        one-hot encode.
    :param ste_categorical_features: names of the categorical columns to
        ordinal-encode.
    :param model_params: accepted so the constructor can be called with the
        same keyword arguments as the other models; it is not used here,
        both regressors are created as ``CatBoostRegressor()`` with the
        library defaults.
    :param offer_weight: sample weight given to offer rows in stage 1;
        manual rows get ``1 - offer_weight``. Defaults to ``0.05``.
    """

    # Class-level default so that instances created before ``offer_weight``
    # existed (e.g. restored from an older pickle) still resolve the attribute.
    offer_weight: float = 0.05

    def __init__(
        self,
        numerical_features: typing.List[str],
        ohe_categorical_features: typing.List[str],
        ste_categorical_features: typing.List[typing.Union[str, typing.List[str]]],
        model_params: typing.Dict[str, typing.Union[str, int, float]],
        *,
        offer_weight: float = 0.05,
    ):
        self.num_features = numerical_features
        self.ohe_cat_features = ohe_categorical_features
        self.ste_cat_features = ste_categorical_features
        self.offer_weight = offer_weight

        def build_preprocessor(num_columns):
            # A fresh set of transformers per pipeline: the two stages are
            # fitted on different data and must not share fitted state.
            return ColumnTransformer(
                transformers=[
                    ("num", StandardScaler(), num_columns),
                    ("ohe", OneHotEncoder(), self.ohe_cat_features),
                    (
                        "ste",
                        OrdinalEncoder(
                            handle_unknown="use_encoded_value", unknown_value=-1
                        ),
                        self.ste_cat_features,
                    ),
                ]
            )

        self.preprocessor = build_preprocessor(self.num_features)
        # Stage 2 additionally scales the stage-1 prediction column.
        self.preprocessor2 = build_preprocessor(self.num_features + ["killer_f"])

        self.model = CatBoostRegressor()
        self.model2 = CatBoostRegressor()

        self.pipeline = Pipeline(
            steps=[("preprocessor", self.preprocessor), ("model", self.model)]
        )
        self.pipeline2 = Pipeline(
            steps=[("preprocessor", self.preprocessor2), ("model", self.model2)]
        )

        # Must be the same (name-mangled) attribute that ``fit`` sets and
        # ``predict`` reads; otherwise ``predict`` on an unfitted instance
        # fails with AttributeError instead of NotFittedError.
        self.__is_fitted = False
        self.corr_coef = 0

    def fit(
        self,
        X_offer: pd.DataFrame,
        y_offer: pd.Series,
        X_manual: pd.DataFrame,
        y_manual: pd.Series,
    ):
        """Fit both stages.

        :param X_offer: features of the rows with offer prices.
        :param y_offer: targets of the rows with offer prices.
        :param X_manual: features of the rows with manual prices.
        :param y_manual: targets of the rows with manual prices.
        """
        # Log text kept as in the original, although the estimator is CatBoost.
        logger.info("Fit lightgbm")

        X = pd.concat([X_offer, X_manual])
        y = pd.concat([y_offer, y_manual])

        # Offer rows come first in the concatenation, manual rows after them.
        # Indexing from ``len(y_offer)`` (rather than ``-len(y_manual)``)
        # stays correct when there are no manual rows.
        weight = np.full(len(y), self.offer_weight, dtype=float)
        weight[len(y_offer):] = 1 - self.offer_weight

        self.pipeline.fit(
            X,
            y,
            model__sample_weight=weight,
        )

        # Stage-1 prediction becomes an input feature of stage 2. Work on a
        # copy so the caller's frame is not modified.
        stage1_prediction = self.pipeline.predict(X_manual)
        X_manual = X_manual.copy()
        X_manual["killer_f"] = stage1_prediction

        logger.info("Fit lightgbm 2")

        self.pipeline2.fit(
            X_manual,
            y_manual,
        )

        self.__is_fitted = True

    def predict(self, X: pd.DataFrame) -> np.array:
        """Predict with the two-stage model.

        :param X: feature frame with the columns used at fit time.
        :return: np.array, output of the stage-2 pipeline.
        :raises NotFittedError: if ``fit`` has not been called yet.
        """
        if not self.__is_fitted:
            raise NotFittedError(
                "This {} instance is not fitted yet! Call 'fit' with appropriate arguments before predict".format(
                    type(self).__name__
                )
            )

        stage1_prediction = self.pipeline.predict(X)
        X = X.copy()
        X["killer_f"] = stage1_prediction
        price = self.pipeline2.predict(X)
        return price
2 changes: 1 addition & 1 deletion raif_hack/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
num_leaves=40,
min_child_samples=5,
importance_type="gain",
n_jobs=1,
n_jobs=4,
random_state=563,
)

Expand Down
28 changes: 15 additions & 13 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from traceback import format_exc

from raif_hack.model import BenchmarkModel, TwoStepBenchmarkModel
from raif_hack.model import *
from raif_hack.settings import (
MODEL_PARAMS,
LOGGING_CONFIG,
Expand Down Expand Up @@ -35,7 +35,6 @@ def parse_args():
formatter_class=argparse.RawTextHelpFormatter,
)


parser.add_argument("--val", action="store_true")

parser.add_argument(
Expand All @@ -53,40 +52,44 @@ def parse_args():
if __name__ == "__main__":

try:
logger.info("START train.py")
# for w in [0.0001, 0.0002, 0.0003, 0.0004, 0.0005,
# 0.001, 0.002, 0.003, 0.004, 0.005,
# 0.01, 0.02, 0.03, 0.04, 0.05,
# 0.1, 0.2, 0.3, 0.4]:
logger.info("START train.py with")
args = vars(parse_args())

train_path = 'data/train.csv'
train_path = "data/train.csv"
if args["val"]:
train_path = 'data/train_trunc.csv'
train_path = "data/train_trunc.csv"

logger.info("Load train df from %s" % train_path)
train_df = pd.read_csv(train_path)
logger.info(f"Input shape: {train_df.shape}")
train_df = prepare_categorical(train_df)

X_offer = train_df[NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
y_offer = train_df[TARGET]
weight = np.ones_like(train_df['price_type'].values)*0.9
weight[train_df['price_type'].values == 0] = 0.1
X_offer = train_df[train_df.price_type == PriceTypeEnum.OFFER_PRICE][
NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES
]
y_offer = train_df[train_df.price_type == PriceTypeEnum.OFFER_PRICE][TARGET]

X_manual = train_df[train_df.price_type == PriceTypeEnum.MANUAL_PRICE][
NUM_FEATURES + CATEGORICAL_OHE_FEATURES + CATEGORICAL_STE_FEATURES
]
y_manual = train_df[train_df.price_type == PriceTypeEnum.MANUAL_PRICE][TARGET]
logger.info(
f"X_offer {X_offer.shape} y_offer {y_offer.shape}\tX_manual {X_manual.shape} y_manual {y_manual.shape}"
)
model = TwoStepBenchmarkModel(
model = WeightedTwoStepModel(
numerical_features=NUM_FEATURES,
ohe_categorical_features=CATEGORICAL_OHE_FEATURES,
ste_categorical_features=CATEGORICAL_STE_FEATURES,
model_params=MODEL_PARAMS,
)
logger.info("Fit model")
model.fit(X_offer, y_offer, X_manual, y_manual, weight)
model.fit(X_offer, y_offer, X_manual, y_manual)
logger.info("Save model")
model.save(args["mp"])

# predictions_offer = model.predict(X_offer)
# metrics = metrics_stat(
# y_offer.values, predictions_offer / (1 + model.corr_coef)
Expand All @@ -108,7 +111,6 @@ def parse_args():
metrics = metrics_stat(y_manual_val.values, predictions_manual_val)
logger.info(f"Metrics stat for validation data with manual prices: {metrics}")


except Exception as e:
err = format_exc()
logger.error(err)
Expand Down

0 comments on commit 39213fc

Please sign in to comment.