ML - AllPurposeKaggleFile

{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"ML - AllPurposeKaggleFile","provenance":[],"collapsed_sections":["605Z6AM7AzVu","TTbZQHZYBJ7h","s--KVoPPtNjw","t6g1xMqtBywn","4wUwHKT-0vJo","iFrNrFxY8aDB","dmvkwdb5u-dl","UGowwqQA5UbR","u99e1rNz-uDU","kc1d8UtFtP9E","amPtDnnGAGsq","HrEYIZ4aBrPz","eQwvPCh-E5ud","Y8qbE9FpZ8Ta","T83mLyjyDHDm","tQ8ko2lY6nB4","6UdSLAUYDwwm","b5Nk7Md5EGWc","KMeR6CfaEkC6","T0XgtPMrFYCb","d_xAvCcQFp9e","lHCkkJlI_HiH","qOM_XYlzGLAq","R6KSlgc4GnXJ","2tfXwbGQHAn1","6LlUdUzjHOy-","4MQPfrEvHli4","1RDQQ9HDH59u","pZ9GqMBpIMmL","_9rtvPh4IZXJ","XKY-OsaYI4y_","dT12glwMYB75","fJ6F2KpWJMVa","UMWIKpdPJRo5"],"toc_visible":true,"authorship_tag":"ABX9TyOwuBPLBatBXFp2WocRsPKk"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"EdQIiERNAHHa"},"source":["import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","%matplotlib inline"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"RmZuBm_FAs7V"},"source":["# Placeholder paths: append the competition's file names before running\n","train=pd.read_csv('/kaggle/input/')\n","test=pd.read_csv('/kaggle/input/')"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"605Z6AM7AzVu"},"source":["## Information about Data"]},{"cell_type":"code","metadata":{"id":"LjncRvNFA5Kn"},"source":["train.info()"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"JJE3XF6FBAn1"},"source":["test.info()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"TTbZQHZYBJ7h"},"source":["## Unique Values"]},{"cell_type":"code","metadata":{"id":"Ry_KHoquBLQp"},"source":["train.nunique()"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"UfKw4C_IBSsV"},"source":["test.nunique()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Changing Infinite to NaN"],"metadata":{"id":"OBUjxp2GPV9o"}},{"cell_type":"code","source":["pd.set_option('mode.use_inf_as_na', 
True)"],"metadata":{"id":"0YlgQauOP_2u"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Col with Null Values"],"metadata":{"id":"s--KVoPPtNjw"}},{"cell_type":"code","source":["null_train=train.columns[train.isnull().any()]\n","null_train"],"metadata":{"id":"edj5E5shtYpt"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["null_test=test.columns[test.isnull().any()]\n","null_test"],"metadata":{"id":"MrJMRTvbta6L"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"t6g1xMqtBywn"},"source":["## Dropping Cols"]},{"cell_type":"code","metadata":{"id":"mbJuX8LQB32h"},"source":["train=train.drop(columns=[], axis=1)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"hsObDn7pCLus"},"source":["test=test.drop(columns=[], axis=1)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Histogram \n","Plotting Histogram to see distribution of SOME Features"],"metadata":{"id":"4wUwHKT-0vJo"}},{"cell_type":"code","source":["import plotly.express as px\n","for col in train.columns: \n","    fig = px.histogram(train, x=train[col],marginal=\"rug\")\n","    fig.show()"],"metadata":{"id":"5fsF45bG03gd"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import plotly.express as px\n","for col in test.columns: \n","    fig = px.histogram(test, x=test[col],marginal=\"rug\")\n","    fig.show()"],"metadata":{"id":"-2f4UOM_1Rpk"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Distplot\n","DistPlot to check Distribution of EACH Features"],"metadata":{"id":"iFrNrFxY8aDB"}},{"cell_type":"code","source":["import plotly.figure_factory as ff\n","fig = ff.create_distplot([train[c] for c in train.columns], train.columns, bin_size=0.5,curve_type='normal')\n","fig.show()"],"metadata":{"id":"zDgSZmfS8kyq"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import plotly.figure_factory as ff\n","fig = 
ff.create_distplot([test[c] for c in test.columns], test.columns, bin_size=0.5,curve_type='normal')\n","fig.show()"],"metadata":{"id":"B3CXr7yU-Ksu"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Boxplot\n","\n","Visualize the Distribution of Features"],"metadata":{"id":"dmvkwdb5u-dl"}},{"cell_type":"code","source":["import plotly.express as px\n","fig = px.box(train, y=\"<feature>\", color=\"<target>\",points=\"all\")\n","fig.show()"],"metadata":{"id":"WI5lqpsWvJ9V"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import plotly.express as px\n","fig = px.box(test, y=\"<feature>\", color=\"<target>\",points=\"all\")\n","fig.show()"],"metadata":{"id":"IJozbcF-vkVt"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## JointPlot"],"metadata":{"id":"UGowwqQA5UbR"}},{"cell_type":"code","source":["import plotly.express as px\n","fig = px.scatter(train, x='<feature1>', y='<feature2>', marginal_y=\"rug\", marginal_x=\"histogram\")\n","fig.show()"],"metadata":{"id":"p5HgI6pE5bl6"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import plotly.express as px\n","fig = px.scatter(test, x='<feature1>', y='<feature2>', marginal_y=\"rug\", marginal_x=\"histogram\")\n","fig.show()"],"metadata":{"id":"Uo87I8086Umv"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## 2D Histogram\n","\n","To see relation between EACH Feature and Target Variable"],"metadata":{"id":"u99e1rNz-uDU"}},{"cell_type":"code","source":["import plotly.express as px\n","\n","# \"<target>\" is a placeholder: replace it with the name of the target column\n","for col in train.columns:\n","    fig = px.density_heatmap(train, x=col, y=\"<target>\", marginal_x=\"histogram\", marginal_y=\"histogram\")\n","    fig.show()"],"metadata":{"id":"A_ceOht--23z"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import plotly.express as px\n","\n","for col in test.columns:\n","    fig = px.density_heatmap(test, x=col, y=\"<target>\", marginal_x=\"histogram\", 
marginal_y=\"histogram\")\n","    fig.show()"],"metadata":{"id":"Sr-Vv_lD_xHp"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## ScatterPlot\n","\n","Plots the relation between EACH Pair"],"metadata":{"id":"kc1d8UtFtP9E"}},{"cell_type":"code","source":["import plotly.express as px\n","\n","# One panel per pair of columns (plotting a column against itself only draws the identity line)\n","fig = px.scatter_matrix(train)\n","fig.show()"],"metadata":{"id":"3rXfyeETtrkE"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import plotly.express as px\n","\n","# One panel per pair of columns\n","fig = px.scatter_matrix(test)\n","fig.show()"],"metadata":{"id":"erqDrl1Zt0_c"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Density Heatmap"],"metadata":{"id":"amPtDnnGAGsq"}},{"cell_type":"code","source":["import plotly.express as px\n","fig = px.density_heatmap(train, x=\"<feature1>\", y=\"<feature2>\", text_auto=True,histfunc=\"avg\")\n","fig.show()"],"metadata":{"id":"UTkNQg4RANxQ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import plotly.express as px\n","fig = px.density_heatmap(test, x=\"<feature1>\", y=\"<feature2>\", text_auto=True,histfunc=\"avg\")\n","fig.show()"],"metadata":{"id":"Gklt-YzIA14Q"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Heatmap\n","Checks the Correlation among ALL Features"],"metadata":{"id":"HrEYIZ4aBrPz"}},{"cell_type":"code","source":["import plotly.express as px\n","# Correlation is only defined for numeric columns; recent pandas raises on object columns\n","fig = px.imshow(train.select_dtypes(include='number').corr(), text_auto=True,aspect=\"auto\")\n","fig.show()"],"metadata":{"id":"gCBu7e-sChlj"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import plotly.express as px\n","# Correlation is only defined for numeric columns; recent pandas raises on object columns\n","fig = px.imshow(test.select_dtypes(include='number').corr(), text_auto=True,aspect=\"auto\")\n","fig.show()"],"metadata":{"id":"VhmNAuCjC5rq"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"eQwvPCh-E5ud"},"source":["## Handling 
Multi-Collinearity\n","\n","Multicollinearity occurs when there is a high correlation between the independent variables in the regression analysis which impacts the overall interpretation of the results. It reduces the power of coefficients and weakens the statistical measure to trust the p-values to identify the significant independent variables. \n","\n","A correlation plot can be used to identify the correlation or bivariate relationship between two independent variables whereas VIF is used to identify the correlation of one independent variable with a group of other variables. Hence, it is preferred to use VIF for better understanding."]},{"cell_type":"code","metadata":{"id":"PsWMEltKk2sJ"},"source":["import plotly.figure_factory as ff\n","fig = ff.create_dendrogram(train,labels=train.columns, orientation='left', leaf_font_size=16)\n","fig.update_layout(width=800, height=500)\n","fig.show()"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"JuhK2YPnlI3a"},"source":["import plotly.figure_factory as ff\n","fig = ff.create_dendrogram(test,labels=test.columns, orientation='left', leaf_font_size=16)\n","fig.update_layout(width=800, height=500)\n","fig.show()"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"w8kFVEsRFGZ8"},"source":["from sklearn.decomposition import PCA\n","pca = PCA(n_components = 0.5*num_train.shape[1])\n","train[num_train] = pca.fit_transform(num_train)\n"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"t1-MjPg5FM4b"},"source":["from sklearn.decomposition import PCA\n","pca = PCA(n_components = 0.5*num_test.shape[1])\n","test[num_test] = pca.fit_transform(num_test)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"Y8qbE9FpZ8Ta"},"source":["## Checking whether the 'dtype' of column matches with 'dtype' of data"]},{"cell_type":"code","metadata":{"id":"5wFMCFYoZ30X"},"source":["#Iterate through each column and each row, Assign Most 
Frequent on mismatch\n","#Note: astype is used to assign types\n","\n","def fix_mixed_types(df):\n","    \"\"\"Replace values whose Python type differs from their column's dominant type\n","    with that column's most frequent well-typed value; missing values are kept.\"\"\"\n","    for col in df.columns:\n","        present = df[col].notna()\n","        if not present.any():\n","            continue\n","        kinds = df[col].map(type)\n","        expected = kinds[present].value_counts().idxmax()\n","        odd = present & kinds.map(lambda kind: kind is not expected)\n","        if odd.any():\n","            # .loc with a boolean mask avoids chained assignment (df[col][row] = ...)\n","            df.loc[odd, col] = df.loc[present & ~odd, col].value_counts().idxmax()\n","    return df\n","\n","train = fix_mixed_types(train)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"9SSFQIpKd7aM"},"source":["#Iterate through each column and each row, Assign Most Frequent on mismatch\n","#Note: astype is used to assign types\n","\n","test = fix_mixed_types(test)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"T83mLyjyDHDm"},"source":["## Separate Cols w.r.t Datatypes\n"]},{"cell_type":"code","metadata":{"id":"_1aILCy1DQC6"},"source":["num_train = train.select_dtypes(include=['int64','float64','UInt32'])\n","cat_train = train.select_dtypes(include=['object','string'])\n","date_train = train.select_dtypes(include='datetime64')\n","num_train"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"_DuYQjsDDj1r"},"source":["num_test = test.select_dtypes(include=['int64','float64','UInt32'])\n","cat_test = test.select_dtypes(include=['object','string'])\n","date_test = test.select_dtypes(include='datetime64')\n","num_test"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Encoding\n","\n","Typically any standard workflow in feature engineering involves some form of transformation of these categorical values into numeric labels and then applying some encoding scheme on these values.\n","\n","* Nominal attributes consist of discrete categorical values with no notion or sense of order amongst them. 
\n","* Ordinal attributes are categorical attributes with a sense of order amongst the values."],"metadata":{"id":"tQ8ko2lY6nB4"}},{"cell_type":"code","source":["# Label Encoding - This transformer should be used to encode 1 COLUMN at a Time\n","from sklearn.preprocessing import LabelEncoder\n","\n","le = LabelEncoder()\n","\n","for cat in cat_train:\n","    train[cat]=le.fit_transform(train[cat])\n","    \n","train"],"metadata":{"id":"szWnBpUtHr9w"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["for cat in cat_test:\n","    test[cat]=le.fit_transform(test[cat])\n","test"],"metadata":{"id":"8_FJPRt9HwMn"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6UdSLAUYDwwm"},"source":["## Separate Cols having\n","* Continuous values\n","* Discrete values\n","*  Date/Time Values\n"]},{"cell_type":"code","metadata":{"id":"XjPjPXMwD34v"},"source":["# > 25 unique values counts as continuous, <= 25 as discrete, so every numeric column lands in exactly one list\n","con_train =[col for col in num_train if train[col].nunique()>25]\n","dis_train =[col for col in num_train if train[col].nunique()<=25]\n","yea_train =[col for col in train.columns if 'yr' in str(col).lower() or 'year' in str(col).lower()]\n"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"dimBSvwcD_ob"},"source":["# > 25 unique values counts as continuous, <= 25 as discrete, so every numeric column lands in exactly one list\n","con_test =[col for col in num_test if test[col].nunique()>25]\n","dis_test =[col for col in num_test if test[col].nunique()<=25]\n","yea_test =[col for col in test.columns if 'yr' in str(col).lower() or 'year' in str(col).lower()]\n"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"b5Nk7Md5EGWc"},"source":["## Impute\n","\n","Missing values are one of the most common problems you can encounter when you try to prepare your data for machine learning. The reason for the missing values might be human errors, interruptions in the data flow, privacy concerns, and so on. 
Whatever is the reason, missing values affect the performance of the machine learning models."]},{"cell_type":"code","metadata":{"id":"1XfBPfZWENu7"},"source":["from sklearn.impute import SimpleImputer\n","\n","imp = SimpleImputer(strategy='most_frequent')\n","\n","train[null_train] = imp.fit_transform(train[null_train])\n","\n","train.isnull().sum().sum()"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"Trmh9262Eh8_"},"source":["test[null_test] = imp.fit_transform(test[null_test])\n","test.isnull().sum().sum()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"KMeR6CfaEkC6"},"source":["##Log Transformation - Continuous Data\n","\n","Log transform helps in handling the skewed data, and it makes the distribution more approximate to normal after transformation. It also reduces the effects of outliers on the data, as because of the normalization of magnitude differences, a model becomes much robust.\n","\n","Log transforms are useful when applied to skewed distributions as they tend to expand the values which fall in the range of lower magnitudes and tend to compress or reduce the values which fall in the range of higher magnitudes."]},{"cell_type":"code","metadata":{"id":"FDqHdlhVEpcM"},"source":["train[con_train]=np.log1p(train[con_train])\n","train"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"pyKakxReEyVR"},"source":["test[con_test]=np.log1p(test[con_test])\n","test"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"T0XgtPMrFYCb"},"source":["## Transforming Dates\n","\n","Though date columns usually provide valuable information about the model target, they are neglected as an input or used nonsensically for the machine learning algorithms.Building an ordinal relationship between the values is very challenging for a machine learning algorithm if you leave the date columns without 
manipulation."]},{"cell_type":"code","metadata":{"id":"VnaO9dcgFdOD"},"source":["# 'date' is a placeholder column name: age of each record relative to today\n","train['date'] = pd.Timestamp.now().normalize() - train['date']\n","# year-like columns become \"years ago\"\n","train[yea_train]=pd.Timestamp.now().year - train[yea_train]\n","train"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"Qm1nRWJlFnJr"},"source":["# same transforms as for train, so both frames stay comparable\n","test['date'] = pd.Timestamp.now().normalize() - test['date']\n","test[yea_test]=pd.Timestamp.now().year - test[yea_test]\n"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"d_xAvCcQFp9e"},"source":["## Standardizing - Discrete Values\n","\n","Standardization (z-score normalization) rescales each feature to zero mean and unit variance; it does not bound the values to a fixed range such as 0 to 1 (that is min-max normalization). Because the mean and the standard deviation are themselves pulled by outliers, the cells below use RobustScaler, which centers each feature on its median and scales it by its interquartile range. This reduces the effect of the outliers in the features."]},{"cell_type":"code","metadata":{"id":"2_vZ0hVeFvKR"},"source":["# RobustScaler\n","from sklearn.preprocessing import RobustScaler\n","\n","rs = RobustScaler()\n","\n","train[dis_train]= rs.fit_transform(train[dis_train])\n","train"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"khcoigs2F6ja"},"source":["test[dis_test]= rs.fit_transform(test[dis_test])\n","test"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Box Cox - Both Discrete & Continuous Values\n","\n","A Box Cox transformation is a transformation of non-normal dependent variables into a normal shape. 
Normality is an important assumption for many statistical techniques; if your data isn’t normal, applying a Box-Cox means that you are able to run a broader number of tests."],"metadata":{"id":"lHCkkJlI_HiH"}},{"cell_type":"code","source":["from scipy.special import boxcox1p\n","from scipy.stats import boxcox_normmax\n","\n","# Fixing Skewness\n","for feat in num_train:\n","        train[feat] = boxcox1p(train[feat], boxcox_normmax(train[feat] + 1))\n","train"],"metadata":{"id":"iKaddryFHzyR"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from scipy.special import boxcox1p\n","from scipy.stats import boxcox_normmax\n","\n","# Fixing Skewness\n","for feat in num_test:\n","        test[feat] = boxcox1p(test[feat], boxcox_normmax(test[feat] + 1))\n","test"],"metadata":{"id":"dbgEKN6qIfuI"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"qOM_XYlzGLAq"},"source":["## Removing Outliers\n","\n","In statistics, an outlier is a data point that differs significantly from other observations. An outlier may be due to variability in the measurement or it may indicate experimental error; the latter are sometimes excluded from the data set. 
An outlier can cause serious problems in statistical analyses."]},{"cell_type":"code","metadata":{"id":"05IZQgun_xnZ"},"source":["plt.boxplot(train,vert=True,patch_artist=True)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"KK4BzJlTAFBc"},"source":["plt.boxplot(test,vert=True,patch_artist=True)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"K-dFro9kBklt"},"source":["If you have multiple columns in your dataframe and would like to remove all rows that have outliers in at least one column, the following expression would do that in one shot.\n"]},{"cell_type":"code","metadata":{"id":"qktrcwobGMPK"},"source":["# Using Isolation Forest\n","from sklearn.ensemble import IsolationForest\n","# fixed seed so that the same rows are flagged on every run\n","iso = IsolationForest(contamination=0.3, random_state=42)\n","\n","out = iso.fit_predict(train)\n","\n","# keep only the rows that are not flagged as outliers (fit_predict marks outliers with -1)\n","train = train[out != -1]\n","train"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"daHWdnO4GTZm"},"source":["out = iso.fit_predict(test)\n","\n","# inspect only: every test row must be kept so that it gets a prediction in the submission\n","test[out == -1]"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"R6KSlgc4GnXJ"},"source":["## Get_Dummies()\n","\n","Machine learning models require all input and output variables to be numeric. This means that if your data contains categorical data, you must encode it to numbers before you can fit and evaluate a model.\n"]},{"cell_type":"code","metadata":{"id":"7gdTGQwkGtS5"},"source":["# columns= expects column names, not a DataFrame\n","train1= pd.get_dummies(train, columns=list(cat_train), drop_first= True)\n"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"YWDFomnzG09E"},"source":["# columns= expects column names, not a DataFrame\n","test1= pd.get_dummies(test, columns=list(cat_test), drop_first= True)\n"],"execution_count":null,"outputs":[]},
{"cell_type":"markdown","metadata":{"id":"2tfXwbGQHAn1"},"source":["## Concatenating\n","\n","When `columns=` is given, Get_Dummies() returns the whole DataFrame with the listed columns already replaced by their dummies, so nothing has to be concatenated back (concatenating would duplicate every other column; MOST People get this wrong)\n"]},{"cell_type":"code","metadata":{"id":"Ttr54LNiHB3A"},"source":["train2=train1.copy()\n"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"_0WYz4G3HM1x"},"source":["test2=test1.copy()\n"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6LlUdUzjHOy-"},"source":["## Dropping any original categorical columns left after Get_Dummies()\n"]},{"cell_type":"code","metadata":{"id":"UcEFGwUcHTsl"},"source":["train=train2.drop(columns=list(cat_train),errors='ignore')\n"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"-OjG9TCgHdFO"},"source":["test=test2.drop(columns=list(cat_test),errors='ignore')\n"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"4MQPfrEvHli4"},"source":["## Splitting X & y\n"]},{"cell_type":"code","metadata":{"id":"8EHScNvhHwcK"},"source":["y=train['<target>']\n","# print(y)\n","X=train.drop(['<target>'],axis=1)\n","X"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1RDQQ9HDH59u"},"source":["## Train Test Split\n","\n","It splits the train data into 4 parts, X_train, X_test, y_train, y_test.\n","\n","* X_train, y_train first used to train the algorithm.\n","* X_test is used in that trained algorithms to predict outcomes.\n","* Once we get the outcomes, we compare it with y_test\n"]},{"cell_type":"code","metadata":{"id":"gdDdAWJvIAOS"},"source":["from sklearn.model_selection import train_test_split\n","# stratify on the target so that both splits keep the same class balance\n","X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,shuffle=True,stratify=y)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"pZ9GqMBpIMmL"},"source":["## Train\n"]},{"cell_type":"code","metadata":{"id":"kq760huRIRQA"},"source":["from xgboost import XGBClassifier\n","model = XGBClassifier(\n","    booster='gbtree', \n","    objective='binary:logistic', \n","    eval_metric='logloss',\n","    n_estimators=1000,\n","    max_depth=15,\n","    
min_split_loss=0.1,\n","    base_score=0.5,\n","    learning_rate=0.08,\n","    reg_alpha=0.5,\n","    reg_lambda=0.5,\n","    gamma=0.2)\n","\n","model.fit(X_train, y_train)\n","\n","model.get_params()"],"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# plot\n","import matplotlib.pyplot as plt\n","from xgboost import plot_importance\n","plot_importance(model)\n","plt.show()"],"metadata":{"id":"hWyB2xxbXpDm"},"execution_count":null,"outputs":[]},
{"cell_type":"markdown","metadata":{"id":"_9rtvPh4IZXJ"},"source":["## Predict"]},{"cell_type":"code","metadata":{"id":"CdPGbO3nIiJt"},"source":["predict= model.predict(X_test)\n","predict"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XKY-OsaYI4y_"},"source":["## Score - Regression"]},{"cell_type":"code","source":["# plot\n","import matplotlib.pyplot as plt\n","from xgboost import plot_importance\n","plot_importance(model)\n","plt.show()"],"metadata":{"id":"05Mcp-WaX1St"},"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"3WBpINK5JAmf"},"source":["from sklearn.metrics import mean_squared_log_error\n","# square root of MSLE; avoids the squared=False flag, which newer scikit-learn releases no longer accept\n","rmsle=np.sqrt(mean_squared_log_error(y_test, predict))\n","rmsle"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Score - Classification"],"metadata":{"id":"dT12glwMYB75"}},{"cell_type":"code","source":["from sklearn.metrics import confusion_matrix\n","cm = confusion_matrix(y_test,predict)\n","\n","import plotly.express as px\n","fig = px.imshow(cm,text_auto=True,color_continuous_scale='RdBu_r')\n","fig.show()                "],"metadata":{"id":"kkd0I3v5YQq3"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from sklearn.metrics import f1_score\n","f1_score(y_test, predict)"],"metadata":{"id":"5VtB-9GXYZ-J"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"fJ6F2KpWJMVa"},"source":["## Suggestions:-\n","* Kaggle - https://www.kaggle.com/pythonkumar\n","* GitHub - https://github.com/KumarPython\n","* Twitter - https://twitter.com/KumarPython\n","* LinkedIn - https://www.linkedin.com/in/kumarpython/"]},{"cell_type":"markdown","metadata":{"id":"UMWIKpdPJRo5"},"source":["## Submission\n"]},{"cell_type":"code","metadata":{"id":"Le2_7CI6JXPt"},"source":["# predictions for the real test set (assumes test went through the same preprocessing as X)\n","sub = model.predict(test)\n","submission=pd.DataFrame({'PassengerId': test1.PassengerId,\n","                         'Transported' : sub\n","                        })\n","#  
submission\n","submission.to_csv('submission.csv', index=False)"],"execution_count":null,"outputs":[]}]}