Skip to content

Commit

Permalink
Merge pull request #103 from FelipeSBarros/make_flatten_gdf
Browse files Browse the repository at this point in the history
Update flatten to accept DataFrame and GeoDataFrame
close #104 and close #96
  • Loading branch information
FelipeSBarros authored May 21, 2024
2 parents cb99ec1 + efd25bd commit 89a346a
Show file tree
Hide file tree
Showing 12 changed files with 662 additions and 120 deletions.
52 changes: 47 additions & 5 deletions .github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ name: 'Check test, code style and linter'
- push
- pull_request
jobs:
build:
build_with_basic_dependencies:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
name: Python ${{ matrix.python-version }}
name: (basic dependencies) Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v3
- name: Set up Python
Expand All @@ -17,9 +17,51 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install and configure Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
- name: Install basic dependencies
run: |
poetry install --all-extras
- name: Run tests
poetry install
- name: Run tests with basic dependencies
run: |
poetry run pytest
build_with_pandas_only:
needs: build_with_basic_dependencies
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
name: (Pandas only) Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install and configure Poetry
uses: snok/install-poetry@v1
- name: Install pandas dependencies
run: |
poetry install --extras "df"
- name: Run tests with pandas only
run: |
poetry run pytest
build_with_pandas_and_geopandas:
needs: [build_with_basic_dependencies, build_with_pandas_only]
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
name: (Pandas & GeoPandas) Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install and configure Poetry
uses: snok/install-poetry@v1
- name: Install pandas & geopandas dependencies
run: |
poetry install --extras "geodf"
- name: Run tests with pandas & geopandas
run: |
poetry run pytest
56 changes: 41 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ cities(format='df')

#### `Cities` parameters

| Name | Required | Description | Type | Default value | Example |
|---|---|---|---|---|---|
| `state_id` || ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` |
| `city_id` || ID of the city | string | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` |
| `city_name` || Name of the city | string | `None` | `'Rio de Janeiro'` |
| `format` || Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` |
| Name | Required | Description | Type | Default value | Example |
|-------------|----------|----------------------|--------|---------------|------------------------------------------|
| `state_id` | | ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` |
| `city_id` | | ID of the city | string | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` |
| `city_name` | | Name of the city | string | `None` | `'Rio de Janeiro'` |
| `format` | | Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` |


### Listing occurrences
Expand Down Expand Up @@ -114,17 +114,43 @@ occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef', format='geodf')

#### `Occurrences` parameters

| Name | Required | Description | Type | Default value | Example |
|---|---|---|---|---|--------------------------------------------------------------------------------------------------------------------------------|
| `id_state` || ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` |
| `id_cities` || ID of the city | string or list of strings | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` or `['88959ad9-b2f5-4a33-a8ec-ceff5a572ca5', '9d7b569c-ec84-4908-96ab-3706ec3bfc57']` |
| `type_occurrence` || Type of occurrence | string | `'all'` | `'all'`, `'withVictim'` or `'withoutVictim'` |
| `initial_date` || Initial date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` |
| `final_date` || Final date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` |
| `max_parallel_requests` || Maximum number of parallel requests to the API | int | `16` | `32` |
| `format` || Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` |
| Name | Required | Description | Type | Default value | Example |
|-------------------------|----------|------------------------------------------------|------------------------------|---------------|--------------------------------------------------------------------------------------------------------------------------------|
| `id_state` || ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` |
| `id_cities` || ID of the city | string or list of strings | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` or `['88959ad9-b2f5-4a33-a8ec-ceff5a572ca5', '9d7b569c-ec84-4908-96ab-3706ec3bfc57']` |
| `type_occurrence` || Type of occurrence | string | `'all'` | `'all'`, `'withVictim'` or `'withoutVictim'` |
| `initial_date` || Initial date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` |
| `final_date` || Final date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` |
| `max_parallel_requests` || Maximum number of parallel requests to the API | int | `16` | `32` |
| `format` || Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` |
| `flat` || Return nested columns as separate columns | bool | `False` | `True` or `False` |


##### About `flat` parameter

Occurrence data often contains nested information in several columns. By setting the parameter `flat=True`, you can simplify the analysis by separating nested data into individual columns. This feature is particularly useful for columns such as `contextInfo`, `state`, `region`, `city`, `neighborhood`, and `locality`.

For example, to access detailed information about the context of occurrences, such as identifying the main reason, you would typically need to access the `contextInfo` column and then look for the `mainReason` key. With the `flat=True` parameter, this nested information is automatically split into separate columns, making the data easier to work with.

When `flat=True` is set, the function returns occurrences with the flattened columns. Each new column retains the original column name as a prefix and the nested key as a suffix. For instance, the `contextInfo` column will be split into the following columns: `contextInfo_mainReason`, `contextInfo_complementaryReasons`, `contextInfo_clippings`, `contextInfo_massacre`, and `contextInfo_policeUnit`.


###### Example

```python
from crossfire import occurrences
from crossfire.clients.occurrences import flatten

occs = occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef')
occs[0].keys()
# dict_keys(['id', 'documentNumber', 'address', 'state', 'region', 'city', 'neighborhood', 'subNeighborhood', 'locality', 'latitude', 'longitude', 'date', 'policeAction', 'agentPresence', 'relatedRecord', 'contextInfo', 'transports', 'victims', 'animalVictims'])
flattened_occs = occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef', flat=True)
flattened_occs[0].keys()
# dict_keys(['id', 'documentNumber', 'address', 'state', 'region', 'city', 'neighborhood', 'subNeighborhood', 'locality', 'latitude', 'longitude', 'date', 'policeAction', 'agentPresence', 'relatedRecord', 'transports', 'victims', 'animalVictims', 'contextInfo', 'contextInfo_mainReason', 'contextInfo_complementaryReasons', 'contextInfo_clippings', 'contextInfo_massacre', 'contextInfo_policeUnit'])
```

By using the `flat=True` parameter, you ensure that all nested data is expanded into individual columns, simplifying data analysis and making it more straightforward to access specific details within your occurrence data.

### Custom client

If not using the environment variables for authentication, it is recommended to use a custom client:
Expand Down
27 changes: 2 additions & 25 deletions crossfire/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,6 @@
from functools import lru_cache

from crossfire.clients import AsyncClient, Client # noqa
from crossfire.errors import NestedColumnError

NESTED_COLUMNS = {
"contextInfo",
"state",
"region",
"city",
"neighborhood",
"locality",
}


@lru_cache(maxsize=1)
Expand All @@ -39,6 +29,7 @@ def occurrences(
final_date=None,
max_parallel_requests=None,
format=None,
flat=False,
):
return client().occurrences(
id_state,
Expand All @@ -48,19 +39,5 @@ def occurrences(
final_date=final_date,
max_parallel_requests=max_parallel_requests,
format=format,
flat=flat,
)


def flatten(data, nested_columns=None):
    """Expand nested dict values of *data* into top-level ``<col>_<key>`` keys.

    Args:
        data: List of occurrence dicts. Other values (e.g. ``None``) are
            returned untouched.
        nested_columns: Iterable with a subset of ``NESTED_COLUMNS`` to
            expand. Defaults to every known nested column.

    Returns:
        The same *data* object; lists of dicts are mutated in place, with
        each nested column removed and replaced by prefixed flat keys.

    Raises:
        NestedColumnError: If *nested_columns* names an unknown column.
    """
    nested_columns = set(nested_columns or NESTED_COLUMNS)
    if not nested_columns.issubset(NESTED_COLUMNS):
        raise NestedColumnError(nested_columns)
    if not data:
        return data
    if isinstance(data, list):
        # Keys are derived from the first item; assumes a homogeneous list
        # of dicts as returned by the API — TODO confirm against callers.
        keys = set(data[0].keys()) & nested_columns
        for item in data:
            for key in keys:
                # Pop defensively: a missing key or a None value previously
                # raised KeyError/AttributeError mid-iteration, leaving the
                # list partially flattened.
                nested = item.pop(key, None)
                if not nested:
                    continue
                item.update({f"{key}_{k}": v for k, v in nested.items()})
    return data
4 changes: 4 additions & 0 deletions crossfire/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ async def occurrences(
final_date=None,
max_parallel_requests=None,
format=None,
flat=False,
):
occurrences = Occurrences(
self,
Expand All @@ -125,6 +126,7 @@ async def occurrences(
max_parallel_requests=max_parallel_requests
or self.max_parallel_requests,
format=format,
flat=flat,
)
return await occurrences()

Expand Down Expand Up @@ -164,6 +166,7 @@ def occurrences(
final_date=None,
max_parallel_requests=None,
format=None,
flat=False,
):
loop = get_event_loop()
occurrences = loop.run_until_complete(
Expand All @@ -175,6 +178,7 @@ def occurrences(
final_date=final_date,
max_parallel_requests=max_parallel_requests,
format=format,
flat=flat,
)
)
return occurrences
112 changes: 108 additions & 4 deletions crossfire/clients/occurrences.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,22 @@
from httpx import ReadTimeout
from tqdm import tqdm

from crossfire.errors import NestedColumnError

try:
from pandas import concat
from pandas import DataFrame, Series, concat

HAS_PANDAS = True
except ImportError:
pass
HAS_PANDAS = False


try:
from geopandas import GeoDataFrame

HAS_GEOPANDAS = True
except ImportError:
pass
HAS_GEOPANDAS = False

from crossfire.errors import (
CrossfireError,
Expand All @@ -28,6 +35,14 @@

# Occurrence filters accepted by the API's typeOccurrence parameter.
TYPE_OCCURRENCES = {"all", "withVictim", "withoutVictim"}
# Matches any non-digit character; raw string avoids the invalid "\D"
# escape-sequence SyntaxWarning raised by Python 3.12+.
NOT_NUMBER = re.compile(r"\D")
# Columns whose values are nested dicts that flatten() knows how to expand.
NESTED_COLUMNS = {
    "contextInfo",
    "state",
    "region",
    "city",
    "neighborhood",
    "locality",
}


def date_formatter(date_parameter):
Expand Down Expand Up @@ -65,12 +80,14 @@ def __init__(
final_date=None,
max_parallel_requests=None,
format=None,
flat=False,
):
if type_occurrence not in TYPE_OCCURRENCES:
raise UnknownTypeOccurrenceError(type_occurrence)

self.client = client
self.format = format
self.flat = flat
self.params = {"idState": id_state, "typeOccurrence": type_occurrence}
if id_cities:
self.params["idCities"] = id_cities
Expand Down Expand Up @@ -132,6 +149,8 @@ async def __call__(self):
pages = await gather(*requests)
data.merge(*pages)

if self.flat:
return flatten(data())
return data()


Expand All @@ -142,7 +161,7 @@ def __init__(self):

def save_first(self, *pages):
    """Store the first page as the accumulator, then merge any remaining pages.

    Returns self when there is nothing left to merge, otherwise the result
    of merging the remaining pages.
    """
    first, *rest = pages
    self.data = first
    # Track whether we are accumulating GeoDataFrames — presumably so the
    # final result can be rebuilt as a GeoDataFrame; verify against __call__.
    if HAS_GEOPANDAS and isinstance(first, GeoDataFrame):
        self.is_gdf = True
    if not rest:
        return self
    return self.merge(rest)

Expand All @@ -164,3 +183,88 @@ def __call__(self):
return GeoDataFrame(self.data)

return self.data


def _flatten_df(data, nested_columns):
def _flatten_col(row, column_name):
column_data = row[column_name]
if not column_data:
return Series()

flatenned_series = Series(
{
f"{column_name}_{key}": value
for key, value in column_data.items()
}
)
for key, value in column_data.items():
if isinstance(value, dict):
flatenned_series = concat(
[
flatenned_series,
Series(
{
f"{column_name}_{key}_{subkey}": v
for subkey, v in value.items()
},
),
],
axis=0,
)
return flatenned_series

keys = set(data.columns) & nested_columns
if not keys:
return data
for key in keys:
data = concat(
[
data,
data.apply(_flatten_col, args=(key,), axis=1),
],
axis=1,
)
return data


def _flatten_list(data, nested_columns):
keys = set(data[0].keys()) & nested_columns
for item in data:
for key in keys:
if key not in item:
return data
value = item.get(key)
if not value:
return data

item.update({f"{key}_{k}": v for k, v in value.items() if v})
for k, v in value.items():
if isinstance(v, dict):
item.update(
{
f"{key}_{k}_{subkey}": v
for subkey, v in v.items()
if v
}
)
return data


def is_empty(data):
    """Return True when *data* holds no records.

    Uses DataFrame.empty for pandas objects (covers GeoDataFrame, a
    DataFrame subclass); falls back to plain truthiness for lists/None.
    """
    dataframe_like = HAS_PANDAS and isinstance(data, DataFrame)
    return data.empty if dataframe_like else not data


def flatten(data, nested_columns=None):
    """Expand nested dict columns/keys of *data* into flat prefixed ones.

    Dispatches to _flatten_df for pandas objects (GeoDataFrame included,
    being a DataFrame subclass) and to _flatten_list for lists of dicts.

    Args:
        data: A DataFrame/GeoDataFrame or a list of occurrence dicts.
        nested_columns: Iterable with a subset of NESTED_COLUMNS to expand;
            defaults to every known nested column.

    Returns:
        The flattened data, or *data* unchanged when it is empty.

    Raises:
        NestedColumnError: If *nested_columns* names an unknown column.
    """
    nested_columns = set(nested_columns or NESTED_COLUMNS)
    if not nested_columns.issubset(NESTED_COLUMNS):
        raise NestedColumnError(nested_columns)
    if is_empty(data):
        return data
    if HAS_PANDAS and isinstance(data, DataFrame):
        return _flatten_df(data, nested_columns)
    return _flatten_list(data, nested_columns)
Loading

0 comments on commit 89a346a

Please # to comment.