forked from kamilkuczynski/Why-nations-rise-and-fall
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3_data_wrangling.py
128 lines (91 loc) · 3.95 KB
/
3_data_wrangling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
def create_bar_chart(df, n, title, value_column):
# get the first n rows of the dataframe
df_first_n = df.head(n)
df_first_n = df_first_n.sort_values(value_column, ascending=True)
# plot the bar chart
fig, ax = plt.subplots(figsize=(18, 7.5))
# add bars
bars = plt.barh(df_first_n['Country'], df_first_n[value_column], color="grey")
bars[-1].set_color('black') # Change color of the first bar
# set y-axis
plt.gca().set_yticklabels(df_first_n["Country"])
# add title to a bar
plt.title(title, fontsize=24, y=1.05)
# rectangle first
plt.gca().add_patch(
plt.Rectangle(
(-0.05, .95), # location
0.0125, # width
-0.13, # height
facecolor='tab:red',
transform=fig.transFigure,
clip_on=False,
linewidth=0
)
)
plt.show()
file_name = "C:/Users/kuczy/Documents/GitHub/Why-nations-rise-and-fall/gdp_per_capita_ppp_constant_2017.csv"
df = pd.read_csv(file_name)
df.rename(columns={df.columns[0]: "Country"}, inplace= True)
duplicate_rows = df.duplicated()
print(f"This data contains {duplicate_rows.sum()} duplicated rows.\n")
print("Checking how many rows are there in the dataset?")
#print(len(df), '\n')
print("\n Find the missing values for all columns. \n")
missing_values_per_column = df.isnull().sum()
#print(missing_values_per_column)
print("Checking which columns has more than 100 mising values \n")
columns_with_more_than_100_missing_values = missing_values_per_column[missing_values_per_column > 100].index
#print(columns_with_more_than_100_missing_values)
print("Dropping columns that have more than 100 missing values \n")
df = df.drop(columns=columns_with_more_than_100_missing_values)
#print(df)
print("Checking which columns are not needed\n")
#print(df.columns)
df = df.drop(columns=['Indicator Name', 'Indicator Code'])
#print(df.columns)
mask = df['2021'].isnull()
#print(mask)
#print(df.loc[mask, :])
print("Removing countries where data for 2020 and 2021 are missing \n")
mask = df['2020'].isnull() & df['2021'].isnull()
#print(df.loc[mask, :]) #checking the data
# Drop countries without data
df.drop(df[mask].index, inplace = True)
df_2020_2021 = df.loc[:, ["Country", "2020", "2021"]] #checking data
#print(df_2020_2021)
# changing datatype from string to float
cols = [str(year) for year in range(1990, 2022)]
df[cols] = df[cols].apply(lambda x: x.str.replace(',', '.')).astype(float)
#save df to csv
df.to_csv("gdp_per_capita_ppp_constant_2017_modified.csv", index=False)
print("Count growth of all countries from 1990 to 2021 \n")
#fullfill empty series in column 1990 with .bfill() method, which fills missing values with the next non-null value in the column
df["1990"].bfill(inplace=True)
#add column with growth from 1990 to 20221
df['Growth'] = round(df["2021"]/df["1990"] - 1, 2)
# Sort values of the dataframe based on column 'Growth' in descending order
top_10 = df.sort_values(by='Growth', ascending=False, inplace=True)
print(top_10)
# Print 5 random rows from the sorted dataframe
#print(df.sample(5))
#plot a bar chrt for top 10 exonomies:
create_bar_chart(df=df, n=10, title="Top 10 best growing countries", value_column="Growth")
# Load the 'area.csv' file and skip first 4 rows
file_area = "area.csv"
df_area = pd.read_csv(file_area, skiprows=4, encoding='UTF-8')
# Print the loaded dataframe
#print(df_area)
# Rename the column '2020' to 'Area' in the dataframe 'df_area'
df_area.rename(columns={"2020" : "Area"}, inplace=True)
# Print the modified dataframe
#print(df_area)
# Merge the dataframes 'df' and 'df_area' on the common column 'Country Code'
# with the type of merge being 'left'
df = df.merge(df_area[["Country Code", 'Area']], on='Country Code', how='left')
# Save the merged dataframe to a csv file
df.to_csv("gdp_per_capita_ppp_constant_2017_modified_by-area.csv", index=False)