-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataCleaner.py
88 lines (70 loc) · 3.63 KB
/
DataCleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
from Locator import Locator
class DataCleaner:
    """Fill in missing locations and attach a ``Country`` column to two tables.

    Both DataFrames are modified in place. ``experiences`` is accessed through
    its ``id``, ``company`` and ``location`` columns; ``education`` through its
    ``school`` and ``location`` columns.
    """

    # Schools whose country is assigned by hand in clean_missing_countries().
    _SCHOOL_COUNTRY = {
        'Colegio Santa Maria': 'Brazil',
        'Sabanci University': 'Turkey',
        'Southeast University': 'China',
        'Thapar Institute of Engineering and Technology': 'India',
        'Tianjin University': 'China',
        'University of Tehran': 'Iran',
        'Sharif University of Technology': 'Iran',
        'PES institute of technology': 'India',
        'Seoul National University': 'South Korea',
        'National Institute of Technology Karnataka': 'India',
        'Narayana PU College, Bangalore': 'India',
        '浙江大学': 'China',
        'University of Seoul': 'South Korea',
        'Beihang University': 'China',
    }

    def __init__(self, education, experiences):
        """Store references to the two DataFrames (no copy is made)."""
        self.education = education
        self.experiences = experiences

    def clean(self):
        """Run every cleaning step and write both tables to CSV.

        Steps, in order:

        1. Fill missing ``experiences['location']`` values with the most
           frequent location of the same ``company``.
        2. Fill what is still missing with the most frequent location of the
           same ``id``.
        3. Add a ``Country`` column to both tables via ``Locator.add_country``.
        4. Apply the hand-written school -> country table.
        5. Write ``experiences.csv`` and ``education.csv`` (without the index)
           to the current working directory.
        """
        self._fill_missing_locations(by='company')
        self._fill_missing_locations(by='id')

        # NOTE: date-range parsing is currently disabled.
        # from_, to_ = self.clean_dates(dates=self.experiences['date_range'])
        # self.experiences['from'] = from_
        # self.experiences['to'] = to_
        # del self.experiences['date_range']

        locator = Locator()
        self._add_country(self.experiences, locator)
        self._add_country(self.education, locator)

        self.clean_missing_countries()

        self.experiences.to_csv('experiences.csv', index=False)
        self.education.to_csv('education.csv', index=False)

    def _fill_missing_locations(self, by):
        """Fill null ``experiences['location']`` with the modal location per ``by``.

        Rows whose ``by`` value has no known location stay null.
        """
        frame = self.experiences
        missing = frame['location'].isnull()
        if not missing.any():
            return

        # groupby drops null keys, so only rows with a known location are counted.
        counts = frame.groupby([by, 'location']).size().reset_index(name='n')
        # A stable sort keeps ties in (by, location) order, so the winner of a
        # tie is the same on every run.
        most_frequent = (
            counts.sort_values('n', ascending=False, kind='mergesort')
            .groupby(by)['location']
            .first()
        )
        frame.loc[missing, 'location'] = frame.loc[missing, by].map(most_frequent)

    @staticmethod
    def _add_country(frame, locator):
        """Set ``frame['Country']`` by looking up each distinct ``location`` once."""
        addresses = pd.Series(frame['location'].unique())
        # Assumes add_country returns a mapping/Series keyed by address, as
        # required by Series.map -- TODO confirm against Locator.
        address_country_map = locator.add_country(addresses=addresses)
        frame['Country'] = frame['location'].map(address_country_map)

    def clean_dates(self, dates, present='Jul 2020'):
        """Split ``"<from> – <to>"`` strings into two datetime Series.

        Args:
            dates: Series of date-range strings separated by an en dash.
            present: Date string substituted when the end of a range is
                exactly ``'Present'``. The default keeps the historical
                behaviour.

        Returns:
            ``(from_, to_)``: datetime Series indexed by the rows of ``dates``
            that could be split (null rows are omitted). ``to_`` is NaT where
            the string contains no separator.
        """
        parts = dates.str.split('–').dropna()
        from_ = parts.apply(lambda p: p[0].strip())
        to_ = parts.apply(lambda p: p[1].strip() if len(p) > 1 else None)
        to_.loc[to_ == 'Present'] = present
        return pd.to_datetime(from_), pd.to_datetime(to_)

    def clean_missing_countries(self):
        """Set ``education['Country']`` for every school in the hand-written table.

        The value is assigned for every matching row, whether or not the row
        already has a country.
        """
        school_country_map = self._SCHOOL_COUNTRY
        known = self.education['school'].isin(school_country_map)
        self.education.loc[known, 'Country'] = (
            self.education.loc[known, 'school'].map(school_country_map)
        )
def main():
    """Load both CSV files from the working directory and run the cleaner."""
    raw_education = pd.read_csv('education.csv')
    raw_experiences = pd.read_csv('experiences.csv')
    DataCleaner(experiences=raw_experiences, education=raw_education).clean()


# Only run the pipeline when executed as a script, not on import.
if __name__ == '__main__':
    main()