-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9e93b9f
commit b84e7e9
Showing
1 changed file
with
309 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,309 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"name": "email_analysis part II.ipynb", | ||
"provenance": [], | ||
"collapsed_sections": [] | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"name": "python3" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "EapyrOLUDsRb" | ||
}, | ||
"source": [ | ||
"## Getting the Data\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "KUVN3H6IANY_" | ||
}, | ||
"source": [ | ||
"\"\"\"Import Libraries\"\"\"\n", | ||
"import imaplib\n", | ||
"import email\n", | ||
"import getpass\n", | ||
"import pandas as pd" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "kk3Zkb43AQLt" | ||
}, | ||
"source": [ | ||
"\"\"\"\n", | ||
"Desired Email address is entered as a string\n", | ||
"Then login to the email server\n", | ||
"\"\"\"\n", | ||
"username = input(\"Enter your email address: \")\n", | ||
"password = getpass.getpass(\"Enter password: \")\n", | ||
"mail = imaplib.IMAP4_SSL('imap.gmail.com')\n" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "boGaCBiICFWC" | ||
}, | ||
"source": [ | ||
"mail.login(username, password)" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "ztTc5b1lAQNx" | ||
}, | ||
"source": [ | ||
"\"\"\"Checking the mailboxes and selecting one\"\"\"\n", | ||
"print(mail.list())\n", | ||
"mail.select(\"INBOX\")\n" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "ADl-R0_yAQP0" | ||
}, | ||
"source": [ | ||
"\"\"\"Searching for emails in the mailbox\"\"\"\n", | ||
"result, numbers = mail.search(None, '(FROM \"quincy@freecodecamp.org\")') \n", | ||
"uids1 = numbers[0].split()\n", | ||
"uids = [id.decode(\"utf-8\") for id in uids1 ]\n" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "6QcB095VAQRy" | ||
}, | ||
"source": [ | ||
"\"\"\"Fetching the desired content\"\"\"\n", | ||
"result, messages = mail.fetch(','.join(uids) ,'(RFC822)') " | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "OGBHYkBeKVdr" | ||
}, | ||
"source": [ | ||
"body_list =[]\n", | ||
"date_list = []\n", | ||
"from_list = [] \n", | ||
"subject_list = []\n", | ||
"for _, message in messages[::2]:\n", | ||
" email_message = email.message_from_bytes(message)\n", | ||
" email_subject = email.header.decode_header(email_message['Subject'])[0]\n", | ||
" for part in email_message.walk():\n", | ||
" print(part.get_content_type())\n", | ||
" if part.get_content_type() == \"text/plain\":\n", | ||
" body = part.get_payload(decode=True)\n", | ||
" body = body.decode(\"utf-8\")\n", | ||
" body_list.append(body)\n", | ||
" else:\n", | ||
" continue\n", | ||
" if isinstance(email_subject[0],bytes):\n", | ||
" decoded = email_subject[0].decode(errors=\"ignore\")\n", | ||
" subject_list.append(decoded)\n", | ||
" else:\n", | ||
" subject_list.append(email_subject[0])\n", | ||
" date_list.append(email_message.get('date'))\n", | ||
" fromlist = email_message.get('From')\n", | ||
" fromlist = fromlist.split(\"<\")[0].replace('\"', '')\n", | ||
" from_list.append(fromlist)\n" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "8LlsIlXSih0I" | ||
}, | ||
"source": [ | ||
"date_list = pd.to_datetime(date_list)\n", | ||
"date_list = [item.isoformat(' ')[:-6]for item in date_list]\n", | ||
"print(len(subject_text))\n", | ||
"print(len(from_list))\n", | ||
"print(len(date_list))\n", | ||
"print(len(body_list))\n", | ||
"data = pd.DataFrame(data={'Date':date_list,'Sender':from_list,'Subject':subject_list, 'Body':body_list})\n", | ||
"data.to_csv('emails.csv',index=False)" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "R2vg7Qf3cQF-" | ||
}, | ||
"source": [ | ||
"## DATA CLEANING" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/", | ||
"height": 204 | ||
}, | ||
"id": "l0PQkE7V6isA", | ||
"outputId": "a8f066dc-73cd-49f1-95be-a2c26ef8a540" | ||
}, | ||
"source": [ | ||
"data.head()" | ||
], | ||
"execution_count": null, | ||
"outputs": [ | ||
{ | ||
"output_type": "execute_result", | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>Date</th>\n", | ||
" <th>Sender</th>\n", | ||
" <th>Subject</th>\n", | ||
" <th>Body</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>2020-02-07 11:23:04</td>\n", | ||
" <td>Quincy Larson</td>\n", | ||
" <td>I analyzed the results of a new survey of 116,...</td>\n", | ||
" <td>Here are this week's five links that are worth...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>2020-02-14 11:11:52</td>\n", | ||
" <td>Quincy Larson</td>\n", | ||
" <td>How to get your first job as a self-taught dev...</td>\n", | ||
" <td>Here are this week's five links that are worth...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>2020-02-21 11:41:13</td>\n", | ||
" <td>Quincy Larson</td>\n", | ||
" <td>What I learned from 213,000 coding interview t...</td>\n", | ||
" <td>Here are this week's five links that are worth...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>2020-02-28 10:53:05</td>\n", | ||
" <td>Quincy Larson</td>\n", | ||
" <td>Resume tips from a developer who got job offer...</td>\n", | ||
" <td>Here are this week's five links that are worth...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>2020-03-06 10:28:17</td>\n", | ||
" <td>Quincy Larson</td>\n", | ||
" <td>freeCodeCamp just released a 7-hour Python Mac...</td>\n", | ||
" <td>Here are this week's five links that are worth...</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" Date ... Body\n", | ||
"0 2020-02-07 11:23:04 ... Here are this week's five links that are worth...\n", | ||
"1 2020-02-14 11:11:52 ... Here are this week's five links that are worth...\n", | ||
"2 2020-02-21 11:41:13 ... Here are this week's five links that are worth...\n", | ||
"3 2020-02-28 10:53:05 ... Here are this week's five links that are worth...\n", | ||
"4 2020-03-06 10:28:17 ... Here are this week's five links that are worth...\n", | ||
"\n", | ||
"[5 rows x 4 columns]" | ||
] | ||
}, | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"execution_count": 28 | ||
} | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "QQSMkpJrDnGV" | ||
}, | ||
"source": [ | ||
"def clean_data(data,column,i):\n", | ||
" data.loc[i, column] = data.loc[i, column].split(\"\\r\\n\")\n", | ||
" new_string = \" \".join(data.loc[i, column])\n", | ||
" new_string = new_string.split(\"'',\")\n", | ||
" data[column][i:i+1] = pd.DataFrame(data = new_string)\n", | ||
" return data" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "swyBdI7jDnI8" | ||
}, | ||
"source": [ | ||
"for n in range(len(data)):\n", | ||
" new_data = clean_data(data,\"Body\",n)" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "RZjC9bb2DnK6" | ||
}, | ||
"source": [ | ||
"new_data.head()" | ||
], | ||
"execution_count": null, | ||
"outputs": [] | ||
} | ||
] | ||
} |