Skip to content

Commit

Permalink
Add initial starter
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilwoodruff committed Sep 9, 2024
0 parents commit c0acc9e
Showing 20 changed files with 591 additions and 0 deletions.
115 changes: 115 additions & 0 deletions .github/workflows/ci_cd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Workflow entry point: runs for pull requests that target main and for
# pushes to main.
name: CI/CD

on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main

jobs:
# Builds the package and publishes it to PyPI on pushes to main.
publish-to-pypi:
  name: Publish to PyPI
  runs-on: ubuntu-latest
  # Gate the release on the lint and test jobs (same gating as the upload
  # job) so that unverified code is never published.
  needs: [lint, test]
  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Fetch all history for all tags and branches
        fetch-depth: 0
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        # Quoted so the version can never be re-typed as a float.
        python-version: "3.12"
    - name: Install package
      run: pip install -e ".[dev]"
    - name: Build package
      run: python -m build
    - name: Publish to PyPI
      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        user: __token__
        password: ${{ secrets.PYPI }}
        # Do not fail when this version has already been uploaded.
        skip-existing: true
# Checks that the code base is formatted with black at 79 columns.
lint:
  name: Lint
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install black
    - name: Check formatting
      run: |
        black . -l 79 --check

# Installs the package, fetches the private inputs, builds the datasets
# and runs the test suite.
test:
  name: Build and Test
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Fetch all history for all tags and branches
        fetch-depth: 0
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"
    - name: Install package
      run: |
        pip install -e ".[dev]"
    - name: Download data inputs
      # NOTE(review): the variable name says US although this is the UK
      # package; confirm it matches what the download helper reads.
      env:
        POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
      run: make download
    - name: Build datasets
      run: make data
    - name: Run tests
      run: pytest

# Builds the package image and pushes it to the GitHub Container Registry
# on pushes to main.
docker:
  name: Docker
  runs-on: ubuntu-latest
  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
  # GITHUB_TOKEN needs package write access to push to ghcr.io.
  permissions:
    contents: read
    packages: write
  env:
    # Image for this (UK) repository; the previous tag pointed at the US
    # data image.
    IMAGE: ghcr.io/policyengine/policyengine-uk-data:latest
  steps:
    - name: Checkout repo
      uses: actions/checkout@v4
    - name: Log in to the Container registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Build container
      # The Dockerfile shipped in this repository is
      # docker/policyengine_uk_data.Dockerfile.
      run: docker build . -f docker/policyengine_uk_data.Dockerfile -t "$IMAGE"
    - name: Push container
      run: docker push "$IMAGE"

# Rebuilds the datasets and uploads them, on pushes to main, once the lint
# and test jobs have succeeded.
upload:
  name: Upload Data
  runs-on: ubuntu-latest
  needs:
    - lint
    - test
  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Fetch all history for all tags and branches
        fetch-depth: 0
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"
    - name: Install package
      run: |
        pip install -e ".[dev]"
    - name: Download data inputs
      env:
        POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
      run: make download
    - name: Build datasets
      run: make data
    - name: Upload data
      env:
        POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
      run: make upload
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Python packaging and bytecode artefacts
*.egg-info
**/__pycache__

# macOS Finder metadata (the file is spelled ".DS_Store"; the upper-case
# spelling is kept for backward compatibility)
**/.DS_Store
**/.DS_STORE

# Notebooks and generated or downloaded data files
**/*.h5
*.ipynb
**/*.csv
**/*.zip

# CSV files that must stay under version control despite the rule above
!uprating_factors.csv
!uprating_growth_factors.csv
33 changes: 33 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Developer and CI entry points. Every target is a command, not a file.
.PHONY: all format test install download upload docker documentation data clean build publish

# Default target: build the datasets, then run the tests.
all: data test

# Reformat the code base with black at 79 columns.
format:
	black . -l 79

test:
	pytest

# Editable install including the development extras.
install:
	pip install -e ".[dev]"

# Fetch the private input files. The script lives in
# policyengine_uk_data/storage/.
download:
	python policyengine_uk_data/storage/download_private_prerequisites.py

# Upload the completed datasets. The script lives in
# policyengine_uk_data/storage/.
upload:
	python policyengine_uk_data/storage/upload_completed_datasets.py

docker:
	docker buildx build --platform linux/amd64 . -t policyengine-uk-data:latest

# Serve the Streamlit documentation app locally.
documentation:
	streamlit run docs/Home.py

# Placeholder: no datasets are built yet.
data:
	echo "Nothing right now."

build:
	python -m build

publish:
	twine upload dist/*
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# PolicyEngine UK Data
5 changes: 5 additions & 0 deletions docker/docs.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Image that serves the Streamlit documentation app on port 8080.
# Pinned to the Python version used in CI instead of the floating "latest".
FROM python:3.12
# Keep the project out of the filesystem root; later relative paths
# (COPY, make, docs/Home.py) resolve against this directory.
WORKDIR /app
COPY . .
RUN make install
EXPOSE 8080
ENTRYPOINT ["streamlit", "run", "docs/Home.py", "--server.port=8080", "--server.address=0.0.0.0"]
3 changes: 3 additions & 0 deletions docker/policyengine_uk_data.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Image with the package installed in editable mode (via `make install`).
# Pinned to the Python version used in CI instead of the floating "latest".
FROM python:3.12
# Keep the project out of the filesystem root.
WORKDIR /app
COPY . .
RUN make install
13 changes: 13 additions & 0 deletions docs/Home.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import streamlit as st

# Landing page of the Streamlit documentation app: a title, a one-paragraph
# introduction, and a "What does this repo do?" section.
PAGE_TITLE = "PolicyEngine-UK-Data"

INTRODUCTION = """PolicyEngine-UK-Data is a package to create representative microdata for the UK, designed for input in the PolicyEngine tax-benefit microsimulation model."""

PURPOSE_HEADING = "What does this repo do?"

# NOTE(review): the sources named below (Current Population Survey, IRS
# Public Use File) are US datasets, while this package ships FRS loaders;
# confirm the intended wording with the maintainers.
PURPOSE_TEXT = """This package creates a (partly synthetic) dataset of households (with incomes, demographics and more) that describes the U.K. household sector. This dataset synthesises multiple sources of data (the Current Population Survey, the IRS Public Use File, and administrative statistics) to improve upon the accuracy of **any** of them."""

st.title(PAGE_TITLE)
st.write(INTRODUCTION)

st.subheader(PURPOSE_HEADING)
st.write(PURPOSE_TEXT)
1 change: 1 addition & 0 deletions policyengine_uk_data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .datasets import *
1 change: 1 addition & 0 deletions policyengine_uk_data/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .frs import *
Empty file.
113 changes: 113 additions & 0 deletions policyengine_uk_data/datasets/frs/dwp_frs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from policyengine_core.data import Dataset
from pathlib import Path
import pandas as pd
import warnings
from typing import Type
from policyengine_uk_data.storage import STORAGE_FOLDER


class DWP_FRS(Dataset):
    """Base dataset that converts raw FRS ``.tab`` files into stored tables.

    Subclasses set ``folder`` to the location of the raw files, plus the
    usual dataset metadata (``name``, ``label``, ``file_path``).
    """

    data_format = Dataset.TABLES
    # Directory of ``.tab`` files, or a ``.zip`` archive of that directory.
    folder = None

    def generate(self):
        """Build the tables from the raw TAB files in ``self.folder``.

        Every ``*.tab`` file whose stem does not contain ``"frs"`` is read
        as a tab-separated table, coerced to numeric values, given
        upper-case column names and the derived ``person_id``,
        ``benunit_id`` and ``household_id`` columns, and finally passed to
        ``save_dataset``.

        Raises:
            ValueError: If ``folder`` has not been set by a subclass.
        """
        if self.folder is None:
            raise ValueError(
                f"{type(self).__name__}.folder is not set; subclasses must "
                "point it at the raw FRS folder or zip archive."
            )

        # Accepts both str and Path values.
        tab_folder = Path(self.folder)

        # Folder might be either a folder, or a zipped folder.
        if tab_folder.suffix == ".zip":
            import zipfile

            with zipfile.ZipFile(tab_folder, "r") as zip_ref:
                zip_ref.extractall(tab_folder.parent)

            # The extracted files are looked up in a sibling directory
            # named after the archive (e.g. frs_2020_21.zip -> frs_2020_21).
            tab_folder = tab_folder.parent / tab_folder.stem

        # Column names are upper-cased below, so the household serial
        # number column is always "SERNUM" regardless of the release.
        sernum = "SERNUM"

        # Load the data
        tables = {}
        for tab_file in tab_folder.glob("*.tab"):
            table_name = tab_file.stem
            if "frs" in table_name:
                continue
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                table = pd.read_csv(tab_file, delimiter="\t").apply(
                    pd.to_numeric, errors="coerce"
                )
                table.columns = table.columns.str.upper()
            tables[table_name] = table

            # Derived IDs: person = SERNUM*100 + BENUNIT*10 + PERSON,
            # benefit unit = SERNUM*100 + BENUNIT*10, household = SERNUM*100.
            # NOTE(review): values coerced to NaN above would make
            # ``astype(int)`` fail; confirm the key columns are always
            # numeric.
            if "PERSON" in table.columns:
                table["person_id"] = (
                    table[sernum] * 1e2 + table.BENUNIT * 1e1 + table.PERSON
                ).astype(int)

            if "BENUNIT" in table.columns:
                table["benunit_id"] = (
                    table[sernum] * 1e2 + table.BENUNIT * 1e1
                ).astype(int)

            if sernum in table.columns:
                table["household_id"] = (table[sernum] * 1e2).astype(int)
                # Index the entity tables by their own ID, keeping the ID
                # available as a regular column too.
                if table_name in ("adult", "child"):
                    table.set_index("person_id", inplace=True, drop=False)
                elif table_name == "benunit":
                    table.set_index("benunit_id", inplace=True, drop=False)
                elif table_name == "househol":
                    table.set_index("household_id", inplace=True, drop=False)

        # Keep only benefit units and households that contain an adult.
        tables["benunit"] = tables["benunit"][
            tables["benunit"].benunit_id.isin(tables["adult"].benunit_id)
        ]
        tables["househol"] = tables["househol"][
            tables["househol"].household_id.isin(tables["adult"].household_id)
        ]

        # Save the data
        self.save_dataset(tables)


class DWP_FRS_2020_21(DWP_FRS):
    """DWP FRS tables for 2020-21, read from ``frs_2020_21.zip``."""

    # Dataset identity
    name = "dwp_frs_2020_21"
    label = "DWP FRS (2020-21)"

    # Raw input archive and generated output file, both in STORAGE_FOLDER
    folder = STORAGE_FOLDER / "frs_2020_21.zip"
    file_path = STORAGE_FOLDER / "dwp_frs_2020_21.h5"


class DWP_FRS_2021_22(DWP_FRS):
    """DWP FRS tables for 2021-22, read from ``frs_2021_22.zip``."""

    # Dataset identity
    name = "dwp_frs_2021_22"
    label = "DWP FRS (2021-22)"

    # Raw input archive and generated output file, both in STORAGE_FOLDER
    folder = STORAGE_FOLDER / "frs_2021_22.zip"
    file_path = STORAGE_FOLDER / "dwp_frs_2021_22.h5"


class DWP_FRS_2022_23(DWP_FRS):
    """DWP FRS tables for 2022-23, read from ``frs_2022_23.zip``."""

    # Dataset identity
    name = "dwp_frs_2022_23"
    label = "DWP FRS (2022-23)"

    # Raw input archive and generated output file, both in STORAGE_FOLDER
    folder = STORAGE_FOLDER / "frs_2022_23.zip"
    file_path = STORAGE_FOLDER / "dwp_frs_2022_23.h5"
3 changes: 3 additions & 0 deletions policyengine_uk_data/storage/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from pathlib import Path

# Directory that contains this module; data files are addressed relative
# to it.
STORAGE_FOLDER = Path(__file__).parents[0]
26 changes: 26 additions & 0 deletions policyengine_uk_data/storage/download_private_prerequisites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from policyengine_uk_data.utils.github import download
from pathlib import Path

# Directory containing this script; the archives are saved next to it.
FOLDER = Path(__file__).parent

# Raw FRS archives to fetch, in download order.
FRS_ARCHIVES = (
    "frs_2020_21.zip",
    "frs_2021_22.zip",
    "frs_2022_23.zip",
)

# NOTE(review): arguments are presumably (organisation, repository, release
# tag, asset name, destination path); confirm against utils.github.download.
for archive_name in FRS_ARCHIVES:
    download(
        "PolicyEngine",
        "ukda",
        "release",
        archive_name,
        FOLDER / archive_name,
    )
4 changes: 4 additions & 0 deletions policyengine_uk_data/storage/upload_completed_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from policyengine_uk_data.utils.github import upload
from pathlib import Path

# Directory containing this script.
# NOTE(review): nothing is uploaded yet; `upload` is imported but never
# called in this file.
FOLDER = Path(__file__).parents[0]
Empty file.
2 changes: 2 additions & 0 deletions policyengine_uk_data/tests/test_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def test_import():
    """Smoke test: the top-level package can be imported without errors."""
    import importlib

    importlib.import_module("policyengine_uk_data")
4 changes: 4 additions & 0 deletions policyengine_uk_data/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .github import *
from .soi import *
from .uprating import *
from .loss import *
Loading

0 comments on commit c0acc9e

Please sign in to comment.