# playground-series-s3e18.py
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import (
RandomForestClassifier,
VotingClassifier,
GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
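# The script trains a soft-voting ensemble for each of two binary targets
# (EC1 and EC2), scores each with cross-validated ROC AUC, and writes the
# test-set class probabilities to a submission file.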
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")
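# (paths assume a Kaggle-style layout with the competition CSVs under ./input)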
# Identify features shared by train and test (this drops the targets, which
# exist only in train); sort them so the column order is deterministic across runs
common_features = sorted((set(train_data.columns) & set(test_data.columns)) - {"id"})
# Prepare the data
X_train = train_data[common_features]
y_train_EC1 = train_data["EC1"]
y_train_EC2 = train_data["EC2"]
X_test = test_data[common_features]
# Initialize StratifiedKFold for cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
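# Stratification keeps the positive/negative ratio roughly constant in every
# fold, which stabilizes the per-fold AUC estimates.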
# Initialize individual models
model1_EC1 = LGBMClassifier(random_state=42)
model1_EC2 = LGBMClassifier(random_state=42)
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
model3 = LogisticRegression(max_iter=1000)
model4 = GradientBoostingClassifier(random_state=42)
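# Four model families (LightGBM boosting, bagged trees, a linear model, and
# sklearn's gradient boosting) give the ensemble diverse errors to average.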
# Define the parameter grid for GradientBoostingClassifier
param_grid = {
"n_estimators": [100, 200],
"learning_rate": [0.05, 0.1],
"max_depth": [3, 5],
}
# Tune GradientBoostingClassifier via grid search; we tune against EC1 only
# and reuse the parameters for EC2, assuming both targets favor similar settings
grid_search = GridSearchCV(model4, param_grid, cv=skf, scoring="roc_auc")
grid_search.fit(X_train, y_train_EC1)
best_params = grid_search.best_params_
# Update the GradientBoostingClassifier with the best parameters
model4 = GradientBoostingClassifier(random_state=42, **best_params)
# Combine models into a VotingClassifier with soft voting
voting_clf_EC1 = VotingClassifier(
estimators=[("lgbm", model1_EC1), ("rf", model2), ("lr", model3), ("gbc", model4)],
voting="soft",
)
voting_clf_EC2 = VotingClassifier(
estimators=[("lgbm", model1_EC2), ("rf", model2), ("lr", model3), ("gbc", model4)],
voting="soft",
)
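# VotingClassifier clones its estimators when fit, so reusing the same
# model2/model3/model4 instances in both ensembles does not share state.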
# Train and evaluate the ensemble model for EC1
cv_scores_EC1 = cross_val_score(
voting_clf_EC1, X_train, y_train_EC1, cv=skf, scoring="roc_auc"
)
auc_EC1 = np.mean(cv_scores_EC1)
# Train and evaluate the ensemble model for EC2
cv_scores_EC2 = cross_val_score(
voting_clf_EC2, X_train, y_train_EC2, cv=skf, scoring="roc_auc"
)
auc_EC2 = np.mean(cv_scores_EC2)
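# cross_val_score fits fresh clones on each fold, so the mean over 10 folds is
# an out-of-fold estimate untouched by the final full-data fits below.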
# Print the evaluation metric for each target
print(f"Validation AUC for EC1: {auc_EC1}")
print(f"Validation AUC for EC2: {auc_EC2}")
print(f"Average Validation AUC: {(auc_EC1 + auc_EC2) / 2}")
# Fit the ensemble models on the entire training set
voting_clf_EC1.fit(X_train, y_train_EC1)
voting_clf_EC2.fit(X_train, y_train_EC2)
# Predict probabilities for the test set
test_data["EC1"] = voting_clf_EC1.predict_proba(X_test)[:, 1]
test_data["EC2"] = voting_clf_EC2.predict_proba(X_test)[:, 1]
# Prepare the submission file
submission = test_data[["id", "EC1", "EC2"]]
submission.to_csv("./working/submission.csv", index=False)
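# (assumes a ./working output directory exists alongside ./input)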