Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: Israel pipeline #518

Merged
merged 7 commits into from
Feb 22, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 100 additions & 20 deletions scripts/scripts/vaccinations/automations/batch/israel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,119 @@
import json
import requests
import pandas as pd
from utils.pipeline import enrich_total_vaccinations


def main():
def read(source: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download the JSON payload at ``source`` and load it into a DataFrame.

    Args:
        source: URL of the endpoint to query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame built from the decoded JSON records with
        ``pd.DataFrame.from_records``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Browser-like request headers.
    # NOTE(review): presumably the API rejects the default client headers -
    # confirm before simplifying.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:85.0) Gecko/20100101 Firefox/85.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
    }
    # Without a timeout a stalled server would block the job indefinitely.
    response = requests.get(source, headers=headers, timeout=timeout)
    # Fail loudly on an error status rather than parsing an error body as data.
    response.raise_for_status()
    data = json.loads(response.content)
    return pd.DataFrame.from_records(data)


def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the API field names mapped to pipeline names.

    Columns that are not listed in the mapping are left untouched.
    """
    column_map = {
        "Day_Date": "date",
        "vaccinated_cum": "people_vaccinated",
        "vaccinated_seconde_dose_cum": "people_fully_vaccinated",
    }
    return df.rename(columns=column_map)


def format_date(df: pd.DataFrame) -> pd.DataFrame:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps those should be decoupled into:

def format_date(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(date=df.date.str.slice(0, 10))

def filter_date(df: pd.DataFrame) -> pd.DataFrame:
    return df[df.date < str(datetime.date.today())]

This decouples the logic (cognitive load, reusability), and we don’t mutate inputs.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done! Thanks

return df.assign(date=df.date.str.slice(0, 10))


def filter_date(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose ``date`` is strictly before today.

    The comparison is done on strings: ``str(datetime.date.today())`` yields
    an ISO ``YYYY-MM-DD`` value, and the ``date`` column is compared against
    it lexicographically.
    """
    today = str(datetime.date.today())
    is_before_today = df.date < today
    return df[is_before_today]


def select_distinct(df: pd.DataFrame) -> pd.DataFrame:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let’s return directly without a temporary variable?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

return df.groupby(["people_vaccinated", "people_fully_vaccinated"], as_index=False).min()


url = "https://datadashboardapi.health.gov.il/api/queries/vaccinated"
def enrich_source(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a constant ``source_url`` column added."""
    source_url = "https://datadashboard.health.gov.il/COVID-19/general"
    return df.assign(source_url=source_url)

data = json.loads(requests.get(url).content)

df = pd.DataFrame.from_records(data)
def enrich_location(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a constant ``location`` column added."""
    location = "Israel"
    return df.assign(location=location)

df = df.rename(columns={
"Day_Date": "date",
"vaccinated_cum": "people_vaccinated",
"vaccinated_seconde_dose_cum": "people_fully_vaccinated"
})

df = df.groupby(["people_vaccinated", "people_fully_vaccinated"], as_index=False).min()
def enrich_vaccine(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``vaccine`` column derived from ``date``.

    Rows dated "2021-01-07" or later are labelled with both vaccines; earlier
    rows are labelled with Pfizer/BioNTech only.
    """

    def _vaccine_for(day: str) -> str:
        # Plain string comparison against the cut-over date.
        return "Moderna, Pfizer/BioNTech" if day >= "2021-01-07" else "Pfizer/BioNTech"

    vaccine = df.date.apply(_vaccine_for)
    return df.assign(vaccine=vaccine)

df["total_vaccinations"] = df["people_vaccinated"].add(df["people_fully_vaccinated"])
df["people_fully_vaccinated"] = df["people_fully_vaccinated"].replace(0, pd.NA)

df["date"] = df["date"].str.slice(0, 10)
df = df[df["date"] < str(datetime.date.today())]
def format_nulls_as_nans(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` where zero ``people_fully_vaccinated`` values become ``pd.NA``."""
    fully_vaccinated = df.people_fully_vaccinated.replace(0, pd.NA)
    return df.assign(people_fully_vaccinated=fully_vaccinated)

df = df[["date", "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"]]

df.loc[:, "location"] = "Israel"
df.loc[:, "source_url"] = "https://datadashboard.health.gov.il/COVID-19/general"
def select_output_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` restricted to the output columns, in output order."""
    output_columns = [
        "date",
        "total_vaccinations",
        "people_vaccinated",
        "people_fully_vaccinated",
        "location",
        "source_url",
        "vaccine",
    ]
    return df[output_columns]

df.loc[:, "vaccine"] = "Pfizer/BioNTech"
df.loc[df["date"] >= "2021-01-07", "vaccine"] = "Moderna, Pfizer/BioNTech"

df.to_csv("automations/output/Israel.csv", index=False)
def pre_process(df: pd.DataFrame) -> pd.DataFrame:
    """Apply the cleaning steps to ``df`` in order.

    The steps are ``rename_columns``, ``format_date``, ``filter_date`` and
    ``select_distinct``; each one receives the previous step's output.
    """
    steps = (rename_columns, format_date, filter_date, select_distinct)
    for step in steps:
        df = df.pipe(step)
    return df


def enrich(df: pd.DataFrame) -> pd.DataFrame:
    """Apply the enrichment steps to ``df`` in order.

    The steps are ``enrich_total_vaccinations``, ``enrich_location``,
    ``enrich_source`` and ``enrich_vaccine``; each one receives the previous
    step's output.
    """
    steps = (
        enrich_total_vaccinations,
        enrich_location,
        enrich_source,
        enrich_vaccine,
    )
    for step in steps:
        df = df.pipe(step)
    return df


def post_process(df: pd.DataFrame) -> pd.DataFrame:
    """Apply ``format_nulls_as_nans`` and then ``select_output_columns`` to ``df``."""
    with_nulls = df.pipe(format_nulls_as_nans)
    return with_nulls.pipe(select_output_columns)


def pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Run the three stages on ``df``: ``pre_process``, ``enrich``, ``post_process``."""
    stages = (pre_process, enrich, post_process)
    for stage in stages:
        df = df.pipe(stage)
    return df


def main():
    """Read the source data, run ``pipeline`` on it and write the result as CSV."""
    source = "https://datadashboardapi.health.gov.il/api/queries/vaccinated"
    destination = "automations/output/Israel.csv"

    raw = read(source)
    processed = raw.pipe(pipeline)
    processed.to_csv(destination, index=False)


if __name__ == "__main__":
Expand Down