# playground-series-s3e18.py
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import (
RandomForestClassifier,
VotingClassifier,
GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
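# The script trains a soft-voting ensemble for each of two binary targets
# (EC1 and EC2), scores each with cross-validated ROC AUC, and writes the
# test-set class probabilities to a submission file.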
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")
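# (paths assume a Kaggle-style layout with the competition CSVs under ./input)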
# Identify features shared by train and test (this drops the targets, which
# exist only in train); sort them so the column order is deterministic across runs
common_features = sorted((set(train_data.columns) & set(test_data.columns)) - {"id"})
# Prepare the data
X_train = train_data[common_features]
y_train_EC1 = train_data["EC1"]
y_train_EC2 = train_data["EC2"]
X_test = test_data[common_features]
# Initialize StratifiedKFold for cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
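# Stratification keeps the positive/negative ratio roughly constant in every
# fold, which stabilizes the per-fold AUC estimates.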
# Initialize individual models
model1_EC1 = LGBMClassifier(random_state=42)
model1_EC2 = LGBMClassifier(random_state=42)
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
model3 = LogisticRegression(max_iter=1000)
model4 = GradientBoostingClassifier(random_state=42)
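# Four model families (LightGBM boosting, bagged trees, a linear model, and
# sklearn's gradient boosting) give the ensemble diverse errors to average.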
# Define the parameter grid for GradientBoostingClassifier
param_grid = {
"n_estimators": [100, 200],
"learning_rate": [0.05, 0.1],
"max_depth": [3, 5],
}
# Tune GradientBoostingClassifier via grid search; we tune against EC1 only
# and reuse the parameters for EC2, assuming both targets favor similar settings
grid_search = GridSearchCV(model4, param_grid, cv=skf, scoring="roc_auc")
grid_search.fit(X_train, y_train_EC1)
best_params = grid_search.best_params_
# Update the GradientBoostingClassifier with the best parameters
model4 = GradientBoostingClassifier(random_state=42, **best_params)
# Combine models into a VotingClassifier with soft voting
voting_clf_EC1 = VotingClassifier(
estimators=[("lgbm", model1_EC1), ("rf", model2), ("lr", model3), ("gbc", model4)],
voting="soft",
)
voting_clf_EC2 = VotingClassifier(
estimators=[("lgbm", model1_EC2), ("rf", model2), ("lr", model3), ("gbc", model4)],
voting="soft",
)
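# VotingClassifier clones its estimators when fit, so reusing the same
# model2/model3/model4 instances in both ensembles does not share state.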
# Train and evaluate the ensemble model for EC1
cv_scores_EC1 = cross_val_score(
voting_clf_EC1, X_train, y_train_EC1, cv=skf, scoring="roc_auc"
)
auc_EC1 = np.mean(cv_scores_EC1)
# Train and evaluate the ensemble model for EC2
cv_scores_EC2 = cross_val_score(
voting_clf_EC2, X_train, y_train_EC2, cv=skf, scoring="roc_auc"
)
auc_EC2 = np.mean(cv_scores_EC2)
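# cross_val_score fits fresh clones on each fold, so the mean over 10 folds is
# an out-of-fold estimate untouched by the final full-data fits below.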
# Print the evaluation metric for each target
print(f"Validation AUC for EC1: {auc_EC1}")
print(f"Validation AUC for EC2: {auc_EC2}")
print(f"Average Validation AUC: {(auc_EC1 + auc_EC2) / 2}")
# Fit the ensemble models on the entire training set
voting_clf_EC1.fit(X_train, y_train_EC1)
voting_clf_EC2.fit(X_train, y_train_EC2)
# Predict probabilities for the test set
test_data["EC1"] = voting_clf_EC1.predict_proba(X_test)[:, 1]
test_data["EC2"] = voting_clf_EC2.predict_proba(X_test)[:, 1]
# Prepare the submission file
submission = test_data[["id", "EC1", "EC2"]]
submission.to_csv("./working/submission.csv", index=False)
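# (assumes a ./working output directory exists alongside ./input)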