Skip to content

Commit

Permalink
Email analysis part II
Browse files Browse the repository at this point in the history
  • Loading branch information
yomaokobiah authored Jun 29, 2021
1 parent 9e93b9f commit b84e7e9
Showing 1 changed file with 309 additions and 0 deletions.
309 changes: 309 additions & 0 deletions email_analysis_part_II.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,309 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "email_analysis part II.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "EapyrOLUDsRb"
},
"source": [
"## Getting the Data\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "KUVN3H6IANY_"
},
"source": [
"\"\"\"Import Libraries\"\"\"\n",
"import imaplib\n",
"import email\n",
"import getpass\n",
"import pandas as pd"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "kk3Zkb43AQLt"
},
"source": [
"\"\"\"\n",
"Desired Email address is entered as a string\n",
"Then login to the email server\n",
"\"\"\"\n",
"username = input(\"Enter your email address: \")\n",
"password = getpass.getpass(\"Enter password: \")\n",
"mail = imaplib.IMAP4_SSL('imap.gmail.com')\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "boGaCBiICFWC"
},
"source": [
"mail.login(username, password)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ztTc5b1lAQNx"
},
"source": [
"\"\"\"Checking the mailboxes and selecting one\"\"\"\n",
"print(mail.list())\n",
"mail.select(\"INBOX\")\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ADl-R0_yAQP0"
},
"source": [
"\"\"\"Searching for emails in the mailbox\"\"\"\n",
"result, numbers = mail.search(None, '(FROM \"quincy@freecodecamp.org\")') \n",
"uids1 = numbers[0].split()\n",
"uids = [id.decode(\"utf-8\") for id in uids1 ]\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "6QcB095VAQRy"
},
"source": [
"\"\"\"Fetching the desired content\"\"\"\n",
"result, messages = mail.fetch(','.join(uids) ,'(RFC822)') "
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "OGBHYkBeKVdr"
},
"source": [
"body_list =[]\n",
"date_list = []\n",
"from_list = [] \n",
"subject_list = []\n",
"for _, message in messages[::2]:\n",
" email_message = email.message_from_bytes(message)\n",
" email_subject = email.header.decode_header(email_message['Subject'])[0]\n",
" for part in email_message.walk():\n",
" print(part.get_content_type())\n",
" if part.get_content_type() == \"text/plain\":\n",
" body = part.get_payload(decode=True)\n",
" body = body.decode(\"utf-8\")\n",
" body_list.append(body)\n",
" else:\n",
" continue\n",
" if isinstance(email_subject[0],bytes):\n",
" decoded = email_subject[0].decode(errors=\"ignore\")\n",
" subject_list.append(decoded)\n",
" else:\n",
" subject_list.append(email_subject[0])\n",
" date_list.append(email_message.get('date'))\n",
" fromlist = email_message.get('From')\n",
" fromlist = fromlist.split(\"<\")[0].replace('\"', '')\n",
" from_list.append(fromlist)\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "8LlsIlXSih0I"
},
"source": [
"date_list = pd.to_datetime(date_list)\n",
"date_list = [item.isoformat(' ')[:-6]for item in date_list]\n",
"print(len(subject_text))\n",
"print(len(from_list))\n",
"print(len(date_list))\n",
"print(len(body_list))\n",
"data = pd.DataFrame(data={'Date':date_list,'Sender':from_list,'Subject':subject_list, 'Body':body_list})\n",
"data.to_csv('emails.csv',index=False)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "R2vg7Qf3cQF-"
},
"source": [
"## DATA CLEANING"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "l0PQkE7V6isA",
"outputId": "a8f066dc-73cd-49f1-95be-a2c26ef8a540"
},
"source": [
"data.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date</th>\n",
" <th>Sender</th>\n",
" <th>Subject</th>\n",
" <th>Body</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-02-07 11:23:04</td>\n",
" <td>Quincy Larson</td>\n",
" <td>I analyzed the results of a new survey of 116,...</td>\n",
" <td>Here are this week's five links that are worth...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2020-02-14 11:11:52</td>\n",
" <td>Quincy Larson</td>\n",
" <td>How to get your first job as a self-taught dev...</td>\n",
" <td>Here are this week's five links that are worth...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2020-02-21 11:41:13</td>\n",
" <td>Quincy Larson</td>\n",
" <td>What I learned from 213,000 coding interview t...</td>\n",
" <td>Here are this week's five links that are worth...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-02-28 10:53:05</td>\n",
" <td>Quincy Larson</td>\n",
" <td>Resume tips from a developer who got job offer...</td>\n",
" <td>Here are this week's five links that are worth...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2020-03-06 10:28:17</td>\n",
" <td>Quincy Larson</td>\n",
" <td>freeCodeCamp just released a 7-hour Python Mac...</td>\n",
" <td>Here are this week's five links that are worth...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Date ... Body\n",
"0 2020-02-07 11:23:04 ... Here are this week's five links that are worth...\n",
"1 2020-02-14 11:11:52 ... Here are this week's five links that are worth...\n",
"2 2020-02-21 11:41:13 ... Here are this week's five links that are worth...\n",
"3 2020-02-28 10:53:05 ... Here are this week's five links that are worth...\n",
"4 2020-03-06 10:28:17 ... Here are this week's five links that are worth...\n",
"\n",
"[5 rows x 4 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 28
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "QQSMkpJrDnGV"
},
"source": [
"def clean_data(data,column,i):\n",
" data.loc[i, column] = data.loc[i, column].split(\"\\r\\n\")\n",
" new_string = \" \".join(data.loc[i, column])\n",
" new_string = new_string.split(\"'',\")\n",
" data[column][i:i+1] = pd.DataFrame(data = new_string)\n",
" return data"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "swyBdI7jDnI8"
},
"source": [
"for n in range(len(data)):\n",
" new_data = clean_data(data,\"Body\",n)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "RZjC9bb2DnK6"
},
"source": [
"new_data.head()"
],
"execution_count": null,
"outputs": []
}
]
}

0 comments on commit b84e7e9

Please # to comment.