Skip to content

Commit

Permalink
Merge pull request #103 from FelipeSBarros/make_flatten_gdf
Browse files Browse the repository at this point in the history
Update flatten to accept DataFrame and GeoDataFrame
close #104 and close #96
  • Loading branch information
FelipeSBarros authored May 21, 2024
2 parents cb99ec1 + efd25bd commit 89a346a
Show file tree
Hide file tree
Showing 12 changed files with 662 additions and 120 deletions.
52 changes: 47 additions & 5 deletions .github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ name: 'Check test, code style and linter'
- push
- pull_request
jobs:
build:
build_with_basic_dependencies:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
name: Python ${{ matrix.python-version }}
name: (basic dependencies) Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v3
- name: Set up Python
Expand All @@ -17,9 +17,51 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install and configure Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
- name: Install basic dependencies
run: |
poetry install --all-extras
- name: Run tests
poetry install
- name: Run tests with basic dependencies
run: |
poetry run pytest
build_with_pandas_only:
needs: build_with_basic_dependencies
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
name: (Pandas only) Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install and configure Poetry
uses: snok/install-poetry@v1
- name: Install pandas dependencies
run: |
poetry install --extras "df"
- name: Run tests with pandas only
run: |
poetry run pytest
build_with_pandas_and_geopandas:
needs: [build_with_basic_dependencies, build_with_pandas_only]
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
name: (Pandas & GeoPandas) Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install and configure Poetry
uses: snok/install-poetry@v1
- name: Install pandas & geopandas dependencies
run: |
poetry install --extras "geodf"
- name: Run tests with pandas & geopandas
run: |
poetry run pytest
56 changes: 41 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ cities(format='df')

#### `Cities` parameters

| Name | Required | Description | Type | Default value | Example |
|---|---|---|---|---|---|
| `state_id` || ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` |
| `city_id` || ID of the city | string | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` |
| `city_name` || Name of the city | string | `None` | `'Rio de Janeiro'` |
| `format` || Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` |
| Name | Required | Description | Type | Default value | Example |
|-------------|----------|----------------------|--------|---------------|------------------------------------------|
| `state_id` | | ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` |
| `city_id` | | ID of the city | string | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` |
| `city_name` | | Name of the city | string | `None` | `'Rio de Janeiro'` |
| `format` | | Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` |


### Listing occurrences
Expand Down Expand Up @@ -114,17 +114,43 @@ occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef', format='geodf')

#### `Occurrences` parameters

| Name | Required | Description | Type | Default value | Example |
|---|---|---|---|---|--------------------------------------------------------------------------------------------------------------------------------|
| `id_state` || ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` |
| `id_cities` || ID of the city | string or list of strings | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` or `['88959ad9-b2f5-4a33-a8ec-ceff5a572ca5', '9d7b569c-ec84-4908-96ab-3706ec3bfc57']` |
| `type_occurrence` || Type of occurrence | string | `'all'` | `'all'`, `'withVictim'` or `'withoutVictim'` |
| `initial_date` || Initial date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` |
| `final_date` || Final date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` |
| `max_parallel_requests` || Maximum number of parallel requests to the API | int | `16` | `32` |
| `format` || Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` |
| Name | Required | Description | Type | Default value | Example |
|-------------------------|----------|------------------------------------------------|------------------------------|---------------|--------------------------------------------------------------------------------------------------------------------------------|
| `id_state` || ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` |
| `id_cities` || ID of the city | string or list of strings | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` or `['88959ad9-b2f5-4a33-a8ec-ceff5a572ca5', '9d7b569c-ec84-4908-96ab-3706ec3bfc57']` |
| `type_occurrence` || Type of occurrence | string | `'all'` | `'all'`, `'withVictim'` or `'withoutVictim'` |
| `initial_date` || Initial date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` |
| `final_date` || Final date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` |
| `max_parallel_requests` || Maximum number of parallel requests to the API | int | `16` | `32` |
| `format` || Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` |
| `flat` || Return nested columns as separate columns | bool | `False` | `True` or `False` |


##### About `flat` parameter

Occurrence data often contains nested information in several columns. By setting the parameter `flat=True`, you can simplify the analysis by separating nested data into individual columns. This feature is particularly useful for columns such as `contextInfo`, `state`, `region`, `city`, `neighborhood`, and `locality`.

For example, to access detailed information about the context of occurrences, such as identifying the main reason, you would typically need to access the `contextInfo` column and then look for the `mainReason` key. With the `flat=True` parameter, this nested information is automatically split into separate columns, making the data easier to work with.

When `flat=True` is set, the function returns occurrences with the flattened columns. Each new column retains the original column name as a prefix and the nested key as a suffix. For instance, the `contextInfo` column will be split into the following columns: `contextInfo_mainReason`, `contextInfo_complementaryReasons`, `contextInfo_clippings`, `contextInfo_massacre`, and `contextInfo_policeUnit`.


###### Example

```python
from crossfire import occurrences
from crossfire.clients.occurrences import flatten

occs = occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef')
occs[0].keys()
# dict_keys(['id', 'documentNumber', 'address', 'state', 'region', 'city', 'neighborhood', 'subNeighborhood', 'locality', 'latitude', 'longitude', 'date', 'policeAction', 'agentPresence', 'relatedRecord', 'contextInfo', 'transports', 'victims', 'animalVictims'])
flattened_occs = occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef', flat=True)
flattened_occs[0].keys()
# dict_keys(['id', 'documentNumber', 'address', 'state', 'region', 'city', 'neighborhood', 'subNeighborhood', 'locality', 'latitude', 'longitude', 'date', 'policeAction', 'agentPresence', 'relatedRecord', 'transports', 'victims', 'animalVictims', 'contextInfo', 'contextInfo_mainReason', 'contextInfo_complementaryReasons', 'contextInfo_clippings', 'contextInfo_massacre', 'contextInfo_policeUnit'])
```

By using the `flat=True` parameter, you ensure that all nested data is expanded into individual columns, simplifying data analysis and making it more straightforward to access specific details within your occurrence data.

### Custom client

If not using the environment variables for authentication, it is recommended to use a custom client:
Expand Down
27 changes: 2 additions & 25 deletions crossfire/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,6 @@
from functools import lru_cache

from crossfire.clients import AsyncClient, Client # noqa
from crossfire.errors import NestedColumnError

NESTED_COLUMNS = {
"contextInfo",
"state",
"region",
"city",
"neighborhood",
"locality",
}


@lru_cache(maxsize=1)
Expand All @@ -39,6 +29,7 @@ def occurrences(
final_date=None,
max_parallel_requests=None,
format=None,
flat=False,
):
return client().occurrences(
id_state,
Expand All @@ -48,19 +39,5 @@ def occurrences(
final_date=final_date,
max_parallel_requests=max_parallel_requests,
format=format,
flat=flat,
)


def flatten(data, nested_columns=None):
    """Expand nested dict values of *data* into top-level ``<col>_<key>`` keys.

    Args:
        data: List of occurrence dicts. Other values (e.g. ``None``) are
            returned untouched.
        nested_columns: Iterable with a subset of ``NESTED_COLUMNS`` to
            expand. Defaults to every known nested column.

    Returns:
        The same *data* object; lists of dicts are mutated in place, with
        each nested column removed and replaced by prefixed flat keys.

    Raises:
        NestedColumnError: If *nested_columns* names an unknown column.
    """
    nested_columns = set(nested_columns or NESTED_COLUMNS)
    if not nested_columns.issubset(NESTED_COLUMNS):
        raise NestedColumnError(nested_columns)
    if not data:
        return data
    if isinstance(data, list):
        # Keys are derived from the first item; assumes a homogeneous list
        # of dicts as returned by the API — TODO confirm against callers.
        keys = set(data[0].keys()) & nested_columns
        for item in data:
            for key in keys:
                # Pop defensively: a missing key or a None value previously
                # raised KeyError/AttributeError mid-iteration, leaving the
                # list partially flattened.
                nested = item.pop(key, None)
                if not nested:
                    continue
                item.update({f"{key}_{k}": v for k, v in nested.items()})
    return data
4 changes: 4 additions & 0 deletions crossfire/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ async def occurrences(
final_date=None,
max_parallel_requests=None,
format=None,
flat=False,
):
occurrences = Occurrences(
self,
Expand All @@ -125,6 +126,7 @@ async def occurrences(
max_parallel_requests=max_parallel_requests
or self.max_parallel_requests,
format=format,
flat=flat,
)
return await occurrences()

Expand Down Expand Up @@ -164,6 +166,7 @@ def occurrences(
final_date=None,
max_parallel_requests=None,
format=None,
flat=False,
):
loop = get_event_loop()
occurrences = loop.run_until_complete(
Expand All @@ -175,6 +178,7 @@ def occurrences(
final_date=final_date,
max_parallel_requests=max_parallel_requests,
format=format,
flat=flat,
)
)
return occurrences
112 changes: 108 additions & 4 deletions crossfire/clients/occurrences.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,22 @@
from httpx import ReadTimeout
from tqdm import tqdm

from crossfire.errors import NestedColumnError

try:
from pandas import concat
from pandas import DataFrame, Series, concat

HAS_PANDAS = True
except ImportError:
pass
HAS_PANDAS = False


try:
from geopandas import GeoDataFrame

HAS_GEOPANDAS = True
except ImportError:
pass
HAS_GEOPANDAS = False

from crossfire.errors import (
CrossfireError,
Expand All @@ -28,6 +35,14 @@

# Occurrence filters accepted by the API's typeOccurrence parameter.
TYPE_OCCURRENCES = {"all", "withVictim", "withoutVictim"}
# Matches any non-digit character; raw string avoids the invalid "\D"
# escape-sequence SyntaxWarning raised by Python 3.12+.
NOT_NUMBER = re.compile(r"\D")
# Columns whose values are nested dicts that flatten() knows how to expand.
NESTED_COLUMNS = {
    "contextInfo",
    "state",
    "region",
    "city",
    "neighborhood",
    "locality",
}


def date_formatter(date_parameter):
Expand Down Expand Up @@ -65,12 +80,14 @@ def __init__(
final_date=None,
max_parallel_requests=None,
format=None,
flat=False,
):
if type_occurrence not in TYPE_OCCURRENCES:
raise UnknownTypeOccurrenceError(type_occurrence)

self.client = client
self.format = format
self.flat = flat
self.params = {"idState": id_state, "typeOccurrence": type_occurrence}
if id_cities:
self.params["idCities"] = id_cities
Expand Down Expand Up @@ -132,6 +149,8 @@ async def __call__(self):
pages = await gather(*requests)
data.merge(*pages)

if self.flat:
return flatten(data())
return data()


Expand All @@ -142,7 +161,7 @@ def __init__(self):

def save_first(self, *pages):
    """Store the first page as the accumulator, then merge any remaining pages.

    Returns self when there is nothing left to merge, otherwise the result
    of merging the remaining pages.
    """
    first, *rest = pages
    self.data = first
    # Track whether we are accumulating GeoDataFrames — presumably so the
    # final result can be rebuilt as a GeoDataFrame; verify against __call__.
    if HAS_GEOPANDAS and isinstance(first, GeoDataFrame):
        self.is_gdf = True
    if not rest:
        return self
    return self.merge(rest)

Expand All @@ -164,3 +183,88 @@ def __call__(self):
return GeoDataFrame(self.data)

return self.data


def _flatten_df(data, nested_columns):
def _flatten_col(row, column_name):
column_data = row[column_name]
if not column_data:
return Series()

flatenned_series = Series(
{
f"{column_name}_{key}": value
for key, value in column_data.items()
}
)
for key, value in column_data.items():
if isinstance(value, dict):
flatenned_series = concat(
[
flatenned_series,
Series(
{
f"{column_name}_{key}_{subkey}": v
for subkey, v in value.items()
},
),
],
axis=0,
)
return flatenned_series

keys = set(data.columns) & nested_columns
if not keys:
return data
for key in keys:
data = concat(
[
data,
data.apply(_flatten_col, args=(key,), axis=1),
],
axis=1,
)
return data


def _flatten_list(data, nested_columns):
keys = set(data[0].keys()) & nested_columns
for item in data:
for key in keys:
if key not in item:
return data
value = item.get(key)
if not value:
return data

item.update({f"{key}_{k}": v for k, v in value.items() if v})
for k, v in value.items():
if isinstance(v, dict):
item.update(
{
f"{key}_{k}_{subkey}": v
for subkey, v in v.items()
if v
}
)
return data


def is_empty(data):
    """Return True when *data* holds no records.

    Uses DataFrame.empty for pandas objects (covers GeoDataFrame, a
    DataFrame subclass); falls back to plain truthiness for lists/None.
    """
    dataframe_like = HAS_PANDAS and isinstance(data, DataFrame)
    return data.empty if dataframe_like else not data


def flatten(data, nested_columns=None):
    """Expand nested dict columns/keys of *data* into flat prefixed ones.

    Dispatches to _flatten_df for pandas objects (GeoDataFrame included,
    being a DataFrame subclass) and to _flatten_list for lists of dicts.

    Args:
        data: A DataFrame/GeoDataFrame or a list of occurrence dicts.
        nested_columns: Iterable with a subset of NESTED_COLUMNS to expand;
            defaults to every known nested column.

    Returns:
        The flattened data, or *data* unchanged when it is empty.

    Raises:
        NestedColumnError: If *nested_columns* names an unknown column.
    """
    nested_columns = set(nested_columns or NESTED_COLUMNS)
    if not nested_columns.issubset(NESTED_COLUMNS):
        raise NestedColumnError(nested_columns)
    if is_empty(data):
        return data
    if HAS_PANDAS and isinstance(data, DataFrame):
        return _flatten_df(data, nested_columns)
    return _flatten_list(data, nested_columns)
Loading

0 comments on commit 89a346a

Please # to comment.