diff --git a/AliMehroze_lhr_assignment_1_Python.ipynb b/AliMehroze_lhr_assignment_1_Python.ipynb new file mode 100644 index 0000000..cad99c5 --- /dev/null +++ b/AliMehroze_lhr_assignment_1_Python.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['age' 'bp' 'sg' 'al' 'su' 'rbc' 'pc' 'pcc' 'ba' 'bgr' 'bu' 'sc' 'sod'\n", + " 'pot' 'hemo' 'pcv' 'wbcc' 'rbcc' 'htn' 'dm' 'cad' 'appet' 'pe' 'ane'\n", + " 'class']\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib\n", + "import csv\n", + "\n", + "df=pd.DataFrame()\n", + "df=pd.read_csv(\"chronic_kidney_disease_updated.csv\")\n", + "\n", + "\n", + "print df.columns.values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebpsgalsurbcpcpccbabgr...pcvwbccrbcchtndmcadappetpeaneclass
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
148801.02010?normalnotpresentnotpresent121...4478005.2yesyesnogoodnonockd
27501.02040?normalnotpresentnotpresent?...386000?nononogoodnonockd
362801.01023normalnormalnotpresentnotpresent423...317500?noyesnopoornoyesckd
448701.00540normalabnormalpresentnotpresent117...3267003.9yesnonopooryesyesckd
\n", + "

5 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " age bp sg al su rbc pc pcc ba bgr \\\n", + "0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "1 48 80 1.020 1 0 ? normal notpresent notpresent 121 \n", + "2 7 50 1.020 4 0 ? normal notpresent notpresent ? \n", + "3 62 80 1.010 2 3 normal normal notpresent notpresent 423 \n", + "4 48 70 1.005 4 0 normal abnormal present notpresent 117 \n", + "\n", + " ... pcv wbcc rbcc htn dm cad appet pe ane class \n", + "0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "1 ... 44 7800 5.2 yes yes no good no no ckd \n", + "2 ... 38 6000 ? no no no good no no ckd \n", + "3 ... 31 7500 ? no yes no poor no yes ckd \n", + "4 ... 32 6700 3.9 yes no no poor yes yes ckd \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#first five rows\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "##A function to replace \"?\" with NaN, and trim other data\n", + "def cleanse(df):\n", + " df.replace(\"?\", np.nan, inplace=True)\n", + " df.replace(regex=True,inplace=True, to_replace=r'\\t',value=r'')\n", + " df.replace(regex=True,inplace=True, to_replace=r' ',value=r'')\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([nan, 'yes', 'no'], dtype=object)" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "##calling function\n", + "# df.age.replace(\"?\", np.nan, inplace=True)\n", + "cleanse(df)\n", + "##df.head(5)\n", + "df.dm.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nan 'yes' 'no']\n" + ] + } + ], + "source": [ + "print df.dm.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "no 261\n", + "yes 137\n", + "Name: dm, dtype: int64" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dm.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "('Unable to parse string', u'occurred at index age')", + "output_type": "error", + "traceback": [ + "\u001b[0;31m\u001b[0m", + "\u001b[0;31mValueError\u001b[0mTraceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'age'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'bp'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'bgr'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'bu'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sod'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'pot'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'hemo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'pcv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'wbcc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rbcc'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'age'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'bp'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'bgr'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'bu'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sod'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'pot'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'hemo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'pcv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'wbcc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rbcc'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_numeric\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m 4059\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4060\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4061\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4062\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4063\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m 4155\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4156\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4157\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4158\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4159\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/tools/util.pyc\u001b[0m in \u001b[0;36mto_numeric\u001b[0;34m(arg, errors)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 114\u001b[0m values = lib.maybe_convert_numeric(values, set(),\n\u001b[0;32m--> 115\u001b[0;31m coerce_numeric=coerce_numeric)\n\u001b[0m\u001b[1;32m 116\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'raise'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mpandas/src/inference.pyx\u001b[0m in \u001b[0;36mpandas.lib.maybe_convert_numeric (pandas/lib.c:53558)\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mpandas/src/inference.pyx\u001b[0m in \u001b[0;36mpandas.lib.maybe_convert_numeric (pandas/lib.c:53344)\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: ('Unable to parse string', u'occurred at index age')" + ] + } + ], + "source": [ + "df[['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']] = df[['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']].apply(pd.to_numeric)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#df\n", + "##matplotlib.bar(df['rbcc'],df['class'], width = 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().", + "output_type": "error", + "traceback": [ + "\u001b[0;31m\u001b[0m", + "\u001b[0;31mValueError\u001b[0mTraceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bp'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bp'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'class'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m'ckd'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc\u001b[0m in \u001b[0;36m__nonzero__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 890\u001b[0m raise ValueError(\"The truth value of a {0} is ambiguous. \"\n\u001b[1;32m 891\u001b[0m \u001b[0;34m\"Use a.empty, a.bool(), a.item(), a.any() or a.all().\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 892\u001b[0;31m .format(self.__class__.__name__))\n\u001b[0m\u001b[1;32m 893\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 894\u001b[0m \u001b[0m__bool__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m__nonzero__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()." + ] + } + ], + "source": [ + "(df['bp']==max(df['bp']) and df['class']=='ckd')==True" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df.to_csv(\"clean_chronic_kidney_disease.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + }, + "widgets": { + "state": {}, + "version": "1.1.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/DIH b/DIH new file mode 160000 index 0000000..c089def --- /dev/null +++ b/DIH @@ -0,0 +1 @@ +Subproject commit c089defeacf966815508d06dec5fe5cab65c2b0d