diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 658f675..28f9039 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -3,12 +3,12 @@ name: 'Check test, code style and linter' - push - pull_request jobs: - build: + build_with_basic_dependencies: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12"] - name: Python ${{ matrix.python-version }} + name: (basic dependencies) Python ${{ matrix.python-version }} steps: - uses: actions/checkout@v3 - name: Set up Python @@ -17,9 +17,51 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install and configure Poetry uses: snok/install-poetry@v1 - - name: Install dependencies + - name: Install basic dependencies run: | - poetry install --all-extras - - name: Run tests + poetry install + - name: Run tests with basic dependencies + run: | + poetry run pytest + build_with_pandas_only: + needs: build_with_basic_dependencies + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + name: (Pandas only) Python ${{ matrix.python-version }} + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install and configure Poetry + uses: snok/install-poetry@v1 + - name: Install pandas dependencies + run: | + poetry install --extras "df" + - name: Run tests with pandas only + run: | + poetry run pytest + build_with_pandas_and_geopandas: + needs: [build_with_basic_dependencies, build_with_pandas_only] + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + name: (Pandas & GeoPandas) Python ${{ matrix.python-version }} + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install and configure Poetry + uses: 
snok/install-poetry@v1 + - name: Install pandas & geopandas dependencies + run: | + poetry install --extras "geodf" + - name: Run tests with pandas & geopandas run: | poetry run pytest diff --git a/README.md b/README.md index fab2cfc..90654e8 100644 --- a/README.md +++ b/README.md @@ -81,12 +81,12 @@ cities(format='df') #### `Cities` parameters -| Name | Required | Description | Type | Default value | Example | -|---|---|---|---|---|---| -| `state_id` | ❌ | ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` | -| `city_id` | ❌ | ID of the city | string | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` | -| `city_name` | ❌ | Name of the city | string | `None` | `'Rio de Janeiro'` | -| `format` | ❌ | Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` | +| Name | Required | Description | Type | Default value | Example | +|-------------|----------|----------------------|--------|---------------|------------------------------------------| +| `state_id` | ❌ | ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` | +| `city_id` | ❌ | ID of the city | string | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` | +| `city_name` | ❌ | Name of the city | string | `None` | `'Rio de Janeiro'` | +| `format` | ❌ | Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` | ### Listing occurrences @@ -114,17 +114,43 @@ occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef', format='geodf') #### `Occurrences` parameters -| Name | Required | Description | Type | Default value | Example | -|---|---|---|---|---|--------------------------------------------------------------------------------------------------------------------------------| -| `id_state` | ✅ | ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` | -| `id_cities` | ❌ | ID of the city | string or list of strings | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` or `['88959ad9-b2f5-4a33-a8ec-ceff5a572ca5', 
'9d7b569c-ec84-4908-96ab-3706ec3bfc57']` | -| `type_occurrence` | ❌ | Type of occurrence | string | `'all'` | `'all'`, `'withVictim'` or `'withoutVictim'` | -| `initial_date` | ❌ | Initial date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` | -| `final_date` | ❌ | Final date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` | -| `max_parallel_requests` | ❌ | Maximum number of parallel requests to the API | int | `16` | `32` | -| `format` | ❌ | Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` | +| Name | Required | Description | Type | Default value | Example | +|-------------------------|----------|------------------------------------------------|------------------------------|---------------|--------------------------------------------------------------------------------------------------------------------------------| +| `id_state` | ✅ | ID of the state | string | `None` | `'b112ffbe-17b3-4ad0-8f2a-2038745d1d14'` | +| `id_cities` | ❌ | ID of the city | string or list of strings | `None` | `'88959ad9-b2f5-4a33-a8ec-ceff5a572ca5'` or `['88959ad9-b2f5-4a33-a8ec-ceff5a572ca5', '9d7b569c-ec84-4908-96ab-3706ec3bfc57']` | +| `type_occurrence` | ❌ | Type of occurrence | string | `'all'` | `'all'`, `'withVictim'` or `'withoutVictim'` | +| `initial_date` | ❌ | Initial date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` | +| `final_date` | ❌ | Final date of the occurrences | string, `date` or `datetime` | `None` | `'2020-01-01'`, `'2020/01/01'`, `'20200101'`, `datetime.datetime(2023, 1, 1)` or `datetime.date(2023, 1, 1)` | +| `max_parallel_requests` | ❌ | Maximum number of parallel requests 
to the API | int | `16` | `32` | +| `format` | ❌ | Format of the result | string | `'dict'` | `'dict'`, `'df'` or `'geodf'` | +| `flat` | ❌ | Return nested columns as separate columns | bool | `False` | `True` or `False` | +##### About `flat` parameter + +Occurrence data often contains nested information in several columns. By setting the parameter `flat=True`, you can simplify the analysis by separating nested data into individual columns. This feature is particularly useful for columns such as `contextInfo`, `state`, `region`, `city`, `neighborhood`, and `locality`. + +For example, to access detailed information about the context of occurrences, such as identifying the main reason, you would typically need to access the `contextInfo` column and then look for the mainReason key. With the `flat=True` parameter, this nested information is automatically split into separate columns, making the data easier to work with. + +When `flat=True` is set, the function returns occurrences with the flattened columns. Each new column retains the original column name as a prefix and the nested key as a suffix. For instance, the `contextInfo` column will be split into the following columns: `contextInfo_mainReason`, `contextInfo_complementaryReasons`, `contextInfo_clippings`, `contextInfo_massacre`, and `contextInfo_policeUnit`. 
+ + +###### Example + +```python +from crossfire import occurrences +from crossfire.clients.occurrences import flatten + +occs = occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef') +occs[0].keys() +# dict_keys(['id', 'documentNumber', 'address', 'state', 'region', 'city', 'neighborhood', 'subNeighborhood', 'locality', 'latitude', 'longitude', 'date', 'policeAction', 'agentPresence', 'relatedRecord', 'contextInfo', 'transports', 'victims', 'animalVictims']) +flattened_occs = occurrences('813ca36b-91e3-4a18-b408-60b27a1942ef', flat=True) +flattened_occs[0].keys() +# dict_keys(['id', 'documentNumber', 'address', 'state', 'region', 'city', 'neighborhood', 'subNeighborhood', 'locality', 'latitude', 'longitude', 'date', 'policeAction', 'agentPresence', 'relatedRecord', 'transports', 'victims', 'animalVictims', 'contextInfo', 'contextInfo_mainReason', 'contextInfo_complementaryReasons', 'contextInfo_clippings', 'contextInfo_massacre', 'contextInfo_policeUnit']) +``` + +By using the `flat=True` parameter, you ensure that all nested data is expanded into individual columns, simplifying data analysis and making it more straightforward to access specific details within your occurrence data. 
+ ### Custom client If not using the environment variables for authentication, it is recommended to use a custom client: diff --git a/crossfire/__init__.py b/crossfire/__init__.py index f20004e..57c7478 100644 --- a/crossfire/__init__.py +++ b/crossfire/__init__.py @@ -4,16 +4,6 @@ from functools import lru_cache from crossfire.clients import AsyncClient, Client # noqa -from crossfire.errors import NestedColumnError - -NESTED_COLUMNS = { - "contextInfo", - "state", - "region", - "city", - "neighborhood", - "locality", -} @lru_cache(maxsize=1) @@ -39,6 +29,7 @@ def occurrences( final_date=None, max_parallel_requests=None, format=None, + flat=False, ): return client().occurrences( id_state, @@ -48,19 +39,5 @@ def occurrences( final_date=final_date, max_parallel_requests=max_parallel_requests, format=format, + flat=flat, ) - - -def flatten(data, nested_columns=None): - nested_columns = set(nested_columns or NESTED_COLUMNS) - if not nested_columns.issubset(NESTED_COLUMNS): - raise NestedColumnError(nested_columns) - if not data: - return data - if isinstance(data, list): - keys = set(data[0].keys()) & nested_columns - for item in data: - for key in keys: - item.update({f"{key}_{k}": v for k, v in item.get(key).items()}) - item.pop(key) - return data diff --git a/crossfire/clients/__init__.py b/crossfire/clients/__init__.py index 9224e16..3dd5d51 100644 --- a/crossfire/clients/__init__.py +++ b/crossfire/clients/__init__.py @@ -114,6 +114,7 @@ async def occurrences( final_date=None, max_parallel_requests=None, format=None, + flat=False, ): occurrences = Occurrences( self, @@ -125,6 +126,7 @@ async def occurrences( max_parallel_requests=max_parallel_requests or self.max_parallel_requests, format=format, + flat=flat, ) return await occurrences() @@ -164,6 +166,7 @@ def occurrences( final_date=None, max_parallel_requests=None, format=None, + flat=False, ): loop = get_event_loop() occurrences = loop.run_until_complete( @@ -175,6 +178,7 @@ def occurrences( 
final_date=final_date, max_parallel_requests=max_parallel_requests, format=format, + flat=flat, ) ) return occurrences diff --git a/crossfire/clients/occurrences.py b/crossfire/clients/occurrences.py index 4b6089a..8be9745 100644 --- a/crossfire/clients/occurrences.py +++ b/crossfire/clients/occurrences.py @@ -6,15 +6,22 @@ from httpx import ReadTimeout from tqdm import tqdm +from crossfire.errors import NestedColumnError + try: - from pandas import concat + from pandas import DataFrame, Series, concat + + HAS_PANDAS = True except ImportError: - pass + HAS_PANDAS = False + try: from geopandas import GeoDataFrame + + HAS_GEOPANDAS = True except ImportError: - pass + HAS_GEOPANDAS = False from crossfire.errors import ( CrossfireError, @@ -28,6 +35,14 @@ TYPE_OCCURRENCES = {"all", "withVictim", "withoutVictim"} NOT_NUMBER = re.compile("\D") +NESTED_COLUMNS = { + "contextInfo", + "state", + "region", + "city", + "neighborhood", + "locality", +} def date_formatter(date_parameter): @@ -65,12 +80,14 @@ def __init__( final_date=None, max_parallel_requests=None, format=None, + flat=False, ): if type_occurrence not in TYPE_OCCURRENCES: raise UnknownTypeOccurrenceError(type_occurrence) self.client = client self.format = format + self.flat = flat self.params = {"idState": id_state, "typeOccurrence": type_occurrence} if id_cities: self.params["idCities"] = id_cities @@ -132,6 +149,8 @@ async def __call__(self): pages = await gather(*requests) data.merge(*pages) + if self.flat: + return flatten(data()) return data() @@ -142,7 +161,7 @@ def __init__(self): def save_first(self, *pages): self.data, *remaining = pages - if isinstance(self.data, GeoDataFrame): + if HAS_GEOPANDAS and isinstance(self.data, GeoDataFrame): self.is_gdf = True return self if not remaining else self.merge(remaining) @@ -164,3 +183,88 @@ def __call__(self): return GeoDataFrame(self.data) return self.data + + +def _flatten_df(data, nested_columns): + def _flatten_col(row, column_name): + column_data = 
row[column_name] + if not column_data: + return Series() + + flatenned_series = Series( + { + f"{column_name}_{key}": value + for key, value in column_data.items() + } + ) + for key, value in column_data.items(): + if isinstance(value, dict): + flatenned_series = concat( + [ + flatenned_series, + Series( + { + f"{column_name}_{key}_{subkey}": v + for subkey, v in value.items() + }, + ), + ], + axis=0, + ) + return flatenned_series + + keys = set(data.columns) & nested_columns + if not keys: + return data + for key in keys: + data = concat( + [ + data, + data.apply(_flatten_col, args=(key,), axis=1), + ], + axis=1, + ) + return data + + +def _flatten_list(data, nested_columns): + keys = set(data[0].keys()) & nested_columns + for item in data: + for key in keys: + if key not in item: + return data + value = item.get(key) + if not value: + return data + + item.update({f"{key}_{k}": v for k, v in value.items() if v}) + for k, v in value.items(): + if isinstance(v, dict): + item.update( + { + f"{key}_{k}_{subkey}": v + for subkey, v in v.items() + if v + } + ) + return data + + +def is_empty(data): + if HAS_PANDAS and isinstance(data, DataFrame): + return data.empty + return not data + + +def flatten(data, nested_columns=None): + nested_columns = set(nested_columns or NESTED_COLUMNS) + if not nested_columns.issubset(NESTED_COLUMNS): + raise NestedColumnError(nested_columns) + if is_empty(data): + return data + if HAS_PANDAS and isinstance(data, DataFrame): + data = _flatten_df(data, nested_columns) + return data + + data = _flatten_list(data, nested_columns) + return data diff --git a/poetry.lock b/poetry.lock index a3fe348..7fde554 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,9 +1,10 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. 
[[package]] name = "anyio" version = "4.1.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -25,6 +26,7 @@ trio = ["trio (>=0.23)"] name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" +category = "main" optional = true python-versions = ">=3.7" files = [ @@ -43,6 +45,7 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte name = "certifi" version = "2023.11.17" description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -54,6 +57,7 @@ files = [ name = "click" version = "8.1.7" description = "Composable command line interface toolkit" +category = "main" optional = true python-versions = ">=3.7" files = [ @@ -68,6 +72,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "click-plugins" version = "1.1.1" description = "An extension module for click to enable registering CLI commands via setuptools entry-points." +category = "main" optional = true python-versions = "*" files = [ @@ -85,6 +90,7 @@ dev = ["coveralls", "pytest (>=3.6)", "pytest-cov", "wheel"] name = "cligj" version = "0.7.2" description = "Click params for commmand line interfaces to GeoJSON" +category = "main" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4" files = [ @@ -102,6 +108,7 @@ test = ["pytest-cov"] name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." 
+category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -113,6 +120,7 @@ files = [ name = "exceptiongroup" version = "1.2.0" description = "Backport of PEP 654 (exception groups)" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -127,6 +135,7 @@ test = ["pytest (>=6)"] name = "fiona" version = "1.9.5" description = "Fiona reads and writes spatial data files" +category = "main" optional = true python-versions = ">=3.7" files = [ @@ -176,6 +185,7 @@ test = ["Fiona[s3]", "pytest (>=7)", "pytest-cov", "pytz"] name = "geopandas" version = "0.13.2" description = "Geographic pandas extensions" +category = "main" optional = true python-versions = ">=3.8" files = [ @@ -194,6 +204,7 @@ shapely = ">=1.7.1" name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -205,6 +216,7 @@ files = [ name = "httpcore" version = "1.0.2" description = "A minimal low-level HTTP client." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -219,13 +231,14 @@ h11 = ">=0.13,<0.15" [package.extras] asyncio = ["anyio (>=4.0,<5.0)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] +socks = ["socksio (>=1.0.0,<2.0.0)"] trio = ["trio (>=0.22.0,<0.23.0)"] [[package]] name = "httpx" version = "0.25.2" description = "The next generation HTTP client." 
+category = "main" optional = false python-versions = ">=3.8" files = [ @@ -236,20 +249,21 @@ files = [ [package.dependencies] anyio = "*" certifi = "*" -httpcore = "==1.*" +httpcore = ">=1.0.0,<2.0.0" idna = "*" sniffio = "*" [package.extras] brotli = ["brotli", "brotlicffi"] -cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] +socks = ["socksio (>=1.0.0,<2.0.0)"] [[package]] name = "idna" version = "3.6" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -261,6 +275,7 @@ files = [ name = "importlib-metadata" version = "7.0.0" description = "Read metadata from Python packages" +category = "main" optional = true python-versions = ">=3.8" files = [ @@ -280,6 +295,7 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -302,6 +318,7 @@ files = [ name = "numpy" version = "1.26.2" description = "Fundamental package for array computing in Python" +category = "main" optional = true python-versions = ">=3.9" files = [ @@ -347,6 +364,7 @@ files = [ name = "packaging" version = "23.2" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -356,36 +374,41 @@ files = [ [[package]] name = "pandas" -version = "2.1.3" +version = "2.2.1" description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" optional = true python-versions = ">=3.9" files = [ - {file = "pandas-2.1.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:acf08a73b5022b479c1be155d4988b72f3020f308f7a87c527702c5f8966d34f"}, - {file = 
"pandas-2.1.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3cc4469ff0cf9aa3a005870cb49ab8969942b7156e0a46cc3f5abd6b11051dfb"}, - {file = "pandas-2.1.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35172bff95f598cc5866c047f43c7f4df2c893acd8e10e6653a4b792ed7f19bb"}, - {file = "pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59dfe0e65a2f3988e940224e2a70932edc964df79f3356e5f2997c7d63e758b4"}, - {file = "pandas-2.1.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0296a66200dee556850d99b24c54c7dfa53a3264b1ca6f440e42bad424caea03"}, - {file = "pandas-2.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:465571472267a2d6e00657900afadbe6097c8e1dc43746917db4dfc862e8863e"}, - {file = "pandas-2.1.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:04d4c58e1f112a74689da707be31cf689db086949c71828ef5da86727cfe3f82"}, - {file = "pandas-2.1.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7fa2ad4ff196768ae63a33f8062e6838efed3a319cf938fdf8b95e956c813042"}, - {file = "pandas-2.1.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4441ac94a2a2613e3982e502ccec3bdedefe871e8cea54b8775992485c5660ef"}, - {file = "pandas-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5ded6ff28abbf0ea7689f251754d3789e1edb0c4d0d91028f0b980598418a58"}, - {file = "pandas-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fca5680368a5139d4920ae3dc993eb5106d49f814ff24018b64d8850a52c6ed2"}, - {file = "pandas-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:de21e12bf1511190fc1e9ebc067f14ca09fccfb189a813b38d63211d54832f5f"}, - {file = "pandas-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a5d53c725832e5f1645e7674989f4c106e4b7249c1d57549023ed5462d73b140"}, - {file = "pandas-2.1.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7cf4cf26042476e39394f1f86868d25b265ff787c9b2f0d367280f11afbdee6d"}, - {file = 
"pandas-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72c84ec1b1d8e5efcbff5312abe92bfb9d5b558f11e0cf077f5496c4f4a3c99e"}, - {file = "pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f539e113739a3e0cc15176bf1231a553db0239bfa47a2c870283fd93ba4f683"}, - {file = "pandas-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fc77309da3b55732059e484a1efc0897f6149183c522390772d3561f9bf96c00"}, - {file = "pandas-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:08637041279b8981a062899da0ef47828df52a1838204d2b3761fbd3e9fcb549"}, - {file = "pandas-2.1.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b99c4e51ef2ed98f69099c72c75ec904dd610eb41a32847c4fcbc1a975f2d2b8"}, - {file = "pandas-2.1.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f7ea8ae8004de0381a2376662c0505bb0a4f679f4c61fbfd122aa3d1b0e5f09d"}, - {file = "pandas-2.1.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fcd76d67ca2d48f56e2db45833cf9d58f548f97f61eecd3fdc74268417632b8a"}, - {file = "pandas-2.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1329dbe93a880a3d7893149979caa82d6ba64a25e471682637f846d9dbc10dd2"}, - {file = "pandas-2.1.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:321ecdb117bf0f16c339cc6d5c9a06063854f12d4d9bc422a84bb2ed3207380a"}, - {file = "pandas-2.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:11a771450f36cebf2a4c9dbd3a19dfa8c46c4b905a3ea09dc8e556626060fe71"}, - {file = "pandas-2.1.3.tar.gz", hash = "sha256:22929f84bca106921917eb73c1521317ddd0a4c71b395bcf767a106e3494209f"}, + {file = "pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88"}, + {file = "pandas-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944"}, + {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359"}, + {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51"}, + {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06"}, + {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9"}, + {file = "pandas-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0"}, + {file = "pandas-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b"}, + {file = "pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a"}, + {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02"}, + {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403"}, + {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd"}, + {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7"}, + {file = "pandas-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e"}, + {file = "pandas-2.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c"}, + {file = "pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee"}, + {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2"}, + {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0"}, + {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc"}, + {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89"}, + {file = "pandas-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb"}, + {file = "pandas-2.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397"}, + {file = "pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16"}, + {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019"}, + {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df"}, + {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6"}, + {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be"}, + {file = "pandas-2.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab"}, + {file = "pandas-2.2.1.tar.gz", hash = "sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572"}, ] 
[package.dependencies] @@ -396,36 +419,38 @@ numpy = [ ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" -tzdata = ">=2022.1" +tzdata = ">=2022.7" [package.extras] -all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] -aws = ["s3fs (>=2022.05.0)"] -clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] -compression = ["zstandard (>=0.17.0)"] -computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = 
["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] consortium-standard = ["dataframe-api-compat (>=0.1.7)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"] -feather = ["pyarrow (>=7.0.0)"] -fss = ["fsspec (>=2022.05.0)"] -gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"] -hdf5 = ["tables (>=3.7.0)"] -html = ["beautifulsoup4 (>=4.11.1)", "html5lib (>=1.1)", "lxml (>=4.8.0)"] -mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"] -parquet = ["pyarrow (>=7.0.0)"] -performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"] -plot = ["matplotlib (>=3.6.1)"] -postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] -spss = ["pyreadstat (>=1.1.5)"] -sql-other = ["SQLAlchemy (>=1.4.36)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.8.0)"] +xml = ["lxml 
(>=4.9.2)"] [[package]] name = "pluggy" version = "1.3.0" description = "plugin and hook calling mechanisms for python" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -441,6 +466,7 @@ testing = ["pytest", "pytest-benchmark"] name = "pyproj" version = "3.6.1" description = "Python interface to PROJ (cartographic projections and coordinate transformations library)" +category = "main" optional = true python-versions = ">=3.9" files = [ @@ -480,6 +506,7 @@ certifi = "*" name = "pytest" version = "7.4.3" description = "pytest: simple powerful testing with Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -502,6 +529,7 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "pytest-asyncio" version = "0.21.1" description = "Pytest support for asyncio" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -520,6 +548,7 @@ testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy name = "pytest-ruff" version = "0.2.1" description = "pytest plugin to check ruff requirements." +category = "dev" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -534,6 +563,7 @@ ruff = ">=0.0.242" name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" +category = "main" optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -548,6 +578,7 @@ six = ">=1.5" name = "python-decouple" version = "3.8" description = "Strict separation of settings from code." +category = "main" optional = false python-versions = "*" files = [ @@ -559,6 +590,7 @@ files = [ name = "pytz" version = "2023.3.post1" description = "World timezone definitions, modern and historical" +category = "main" optional = true python-versions = "*" files = [ @@ -570,6 +602,7 @@ files = [ name = "ruff" version = "0.1.7" description = "An extremely fast Python linter and code formatter, written in Rust." 
+category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -596,6 +629,7 @@ files = [ name = "setuptools" version = "69.0.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" optional = true python-versions = ">=3.8" files = [ @@ -612,6 +646,7 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar name = "shapely" version = "2.0.2" description = "Manipulation and analysis of geometric objects" +category = "main" optional = true python-versions = ">=3.7" files = [ @@ -662,13 +697,14 @@ files = [ numpy = ">=1.14" [package.extras] -docs = ["matplotlib", "numpydoc (==1.1.*)", "sphinx", "sphinx-book-theme", "sphinx-remove-toctrees"] +docs = ["matplotlib", "numpydoc (>=1.1.0,<1.2.0)", "sphinx", "sphinx-book-theme", "sphinx-remove-toctrees"] test = ["pytest", "pytest-cov"] [[package]] name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" +category = "main" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -680,6 +716,7 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -691,6 +728,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -702,6 +740,7 @@ files = [ name = "tqdm" version = "4.66.1" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -722,6 +761,7 @@ telegram = ["requests"] name = "tzdata" version = "2023.3" description = "Provider of IANA time zone data" +category = "main" optional = true python-versions = ">=2" files = [ @@ -733,6 +773,7 @@ files = [ name = "zipp" version = "3.17.0" description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" optional 
= true python-versions = ">=3.8" files = [ diff --git a/tests/conftest.py b/tests/conftest.py index 420b418..df9bd81 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -97,6 +97,7 @@ def occurrences_client_and_get_mock(client_with_token): "id": "a7bfebed-ce9c-469d-a656-924ed8248e95", "latitude": "-8.1576367000", "longitude": "-34.9696372000", + "contextInfo": {"context1": "info1", "context2": "info2"}, }, ], } diff --git a/tests/test_client.py b/tests/test_client.py index 1c82a71..74fb02b 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,3 +1,4 @@ +import importlib from datetime import datetime, timedelta from time import sleep from unittest.mock import patch @@ -15,6 +16,14 @@ ) from crossfire.parser import UnknownFormatError +if importlib.util.find_spec("pandas"): + HAS_PANDAS = True +else: + HAS_PANDAS = False +skip_if_pandas_not_installed = mark.skipif( + not HAS_PANDAS, reason="pandas is not installed" +) + def test_client_initiates_with_proper_credentials(client): assert client.credentials["email"] == "email" @@ -155,6 +164,7 @@ async def test_async_client_load_states(state_client_and_get_mock): assert not metadata.has_next_page +@skip_if_pandas_not_installed @mark.asyncio async def test_async_client_load_states_as_df(state_client_and_get_mock): client, mock = state_client_and_get_mock @@ -337,4 +347,5 @@ def test_client_load_occurrences(): final_date=None, max_parallel_requests=None, format=None, + flat=False, ) diff --git a/tests/test_flatten.py b/tests/test_flatten.py index 403db53..49baa66 100644 --- a/tests/test_flatten.py +++ b/tests/test_flatten.py @@ -1,6 +1,31 @@ -from pytest import raises +from copy import deepcopy +from unittest.mock import Mock, patch -from crossfire import NestedColumnError, flatten +try: + from geopandas import GeoDataFrame + from shapely import Point + + HAS_GEOPANDAS = True +except ModuleNotFoundError: + HAS_GEOPANDAS = False +try: + from pandas import DataFrame, Series + from pandas.testing import 
assert_frame_equal + + HAS_PANDAS = True +except ModuleNotFoundError: + HAS_PANDAS = False +from pytest import mark, raises + +from crossfire.clients.occurrences import flatten +from crossfire.errors import NestedColumnError + +skip_if_pandas_not_installed = mark.skipif( + not HAS_PANDAS, reason="pandas is not installed" +) +skip_if_geopandas_not_installed = mark.skipif( + not HAS_GEOPANDAS, reason="geopandas is not installed" +) DICT_DATA = [ { @@ -8,26 +33,242 @@ "contextInfo": {"context1": "info1", "context2": "info2"}, } ] +DICT_DATA_MISSING_NESTED_VALUE = [ + { + "answer": 42, + "contextInfo": {"context1": "info1"}, + }, + { + "answer": 42, + "contextInfo": None, + }, +] +DICT_DATA_ALL_ROWS_MISSING_NESTED_VALUE = [ + { + "answer": 42, + "contextInfo": None, + }, + { + "answer": 42, + "contextInfo": None, + }, +] +DICT_DATA_WITH_NESTED_VALUES_IN_NESTED_COLUMNS = [ + { + "answer": 42, + "contextInfo": { + "mainReason": {"mainReason1": "info1", "mainReason2": "info2"} + }, + }, + {"answer": 42, "contextInfo": {"mainReason": "mainReason1"}}, + {"answer": 42, "contextInfo": None}, +] +EXPECTED_DICT_RETURN = [ + { + "answer": 42, + "contextInfo": {"context1": "info1", "context2": "info2"}, + "contextInfo_context1": "info1", + "contextInfo_context2": "info2", + } +] +EXPECTED_DICT_MISSING_NESTED_VALUE_RETURN = [ + { + "answer": 42, + "contextInfo": {"context1": "info1"}, + "contextInfo_context1": "info1", + }, + { + "answer": 42, + "contextInfo": None, + }, +] +EXPECTED_DICT_RETURN_WITH_NESTED_VALUES_IN_NESTED_COLUMNS = [ + { + "answer": 42, + "contextInfo": { + "mainReason": {"mainReason1": "info1", "mainReason2": "info2"} + }, + "contextInfo_mainReason": { + "mainReason1": "info1", + "mainReason2": "info2", + }, + "contextInfo_mainReason_mainReason1": "info1", + "contextInfo_mainReason_mainReason2": "info2", + }, + { + "answer": 42, + "contextInfo": {"mainReason": "mainReason1"}, + "contextInfo_mainReason": "mainReason1", + }, + { + "answer": 42, + "contextInfo": 
None, + }, +] + +if HAS_PANDAS: + PD_DATA = DataFrame(DICT_DATA) + PD_DATA_MISSING_NESTED_VALUE = DataFrame(DICT_DATA_MISSING_NESTED_VALUE) + PD_DATA_ALL_ROWS_MISSING_NESTED_VALUE = DataFrame( + DICT_DATA_ALL_ROWS_MISSING_NESTED_VALUE + ) + EXPECTED_PD_RETURN = DataFrame(EXPECTED_DICT_RETURN) + EXPECTED_PD_MISSING_NESTED_VALUE_RETURN = DataFrame( + EXPECTED_DICT_MISSING_NESTED_VALUE_RETURN + ) +if HAS_GEOPANDAS: + GEOMETRY = [Point(4, 2)] + GEOPD_DATA = GeoDataFrame(DICT_DATA, crs="EPSG:4326", geometry=GEOMETRY) + EXPECTED_GEOPD_RETURN = GeoDataFrame( + [ + { + "answer": 42, + "contextInfo": {"context1": "info1", "context2": "info2"}, + "contextInfo_context1": "info1", + "contextInfo_context2": "info2", + } + ], + crs="EPSG:4326", + geometry=GEOMETRY, + ).reindex( + columns=( + "answer", + "contextInfo", + "geometry", + "contextInfo_context1", + "contextInfo_context2", + ) + ) -def teste_flatten_wrong_nested_columns_value_error(): +def test_flatten_wrong_nested_columns_value_error(): with raises(NestedColumnError): flatten(DICT_DATA, nested_columns=["wrong"]) -def teste_flatten_with_emptylist(): +def test_flatten_with_empty_list(): assert flatten([]) == [] -# test the flatten function with a dictionary mocking it to assert _flatten_dict function is being called +@skip_if_pandas_not_installed +def test_flatten_with_empty_data_frame(): + with patch("crossfire.clients.occurrences._flatten_df") as mock_flatten_df: + flatten(DataFrame(), nested_columns=["contextInfo"]) + + mock_flatten_df.assert_not_called() + + def test_flatten_dict(): flattened_dict = flatten( DICT_DATA, nested_columns=["contextInfo", "neighborhood"] ) - assert flattened_dict == [ - { - "answer": 42, - "contextInfo_context1": "info1", - "contextInfo_context2": "info2", - } - ] + assert flattened_dict == EXPECTED_DICT_RETURN + + +@skip_if_pandas_not_installed +def test_flatten_pd(): + flattened_pd = flatten( + PD_DATA, nested_columns=["contextInfo", "neighborhood"] + ) + 
assert_frame_equal(flattened_pd, EXPECTED_PD_RETURN) + + +@skip_if_pandas_not_installed +def test_flatten_df_is_called(): + # There is a bug on Pandas that makes apply fails when called from Series with the default MagicMock + # more info: https://github.com/pandas-dev/pandas/issues/45298 + with patch( + "crossfire.clients.occurrences._flatten_df", new_callable=Mock + ) as mock_flatten_df: + mock_flatten_df.return_value = Series( + { + "contextInfo_context1": "info1", + "contextInfo_context2": "info2", + } + ) + + flatten(PD_DATA, nested_columns=["contextInfo"]) + + mock_flatten_df.assert_called_once() + + +@skip_if_geopandas_not_installed +def test_flatten_gpd(): + flattened_pd = flatten( + GEOPD_DATA, nested_columns=["contextInfo", "neighborhood"] + ) + assert_frame_equal(flattened_pd, EXPECTED_GEOPD_RETURN) + + +def test_flatten_list_is_called(): + with patch( + "crossfire.clients.occurrences._flatten_list" + ) as mock_flatten_list: + flatten(DICT_DATA, nested_columns=["contextInfo", "neighborhood"]) + + mock_flatten_list.assert_called_once() + + +def test_flatten_list(): + result = EXPECTED_DICT_RETURN + assert ( + flatten(DICT_DATA, nested_columns=["contextInfo", "neighborhood"]) + == result + ) + + +def test_flatten_dict_with_rows_missing_nested_values(): + assert ( + flatten(DICT_DATA_MISSING_NESTED_VALUE, nested_columns=["contextInfo"]) + == EXPECTED_DICT_MISSING_NESTED_VALUE_RETURN + ) + + +def test_flatten_dict_with_all_rows_missing_nested_values(): + assert ( + flatten( + DICT_DATA_ALL_ROWS_MISSING_NESTED_VALUE, + nested_columns=["contextInfo"], + ) + == DICT_DATA_ALL_ROWS_MISSING_NESTED_VALUE + ) + + +@skip_if_pandas_not_installed +def test_flatten_pd_with_missing_nested_values(): + flattened_pd = flatten( + PD_DATA_MISSING_NESTED_VALUE, nested_columns=["contextInfo"] + ) + assert_frame_equal(flattened_pd, EXPECTED_PD_MISSING_NESTED_VALUE_RETURN) + + +@skip_if_pandas_not_installed +def test_flatten_df_with_all_rows_missing_nested_values(): + 
flattened_pd = flatten( + PD_DATA_ALL_ROWS_MISSING_NESTED_VALUE, nested_columns=["contextInfo"] + ) + assert_frame_equal(flattened_pd, PD_DATA_ALL_ROWS_MISSING_NESTED_VALUE) + + +def test_flatten_list_dicts_with_nested_columns_with_nested_values(): + data = [deepcopy(d) for d in DICT_DATA_WITH_NESTED_VALUES_IN_NESTED_COLUMNS] + assert ( + flatten( + data, + nested_columns=["contextInfo"], + ) + == EXPECTED_DICT_RETURN_WITH_NESTED_VALUES_IN_NESTED_COLUMNS + ) + + +@skip_if_pandas_not_installed +def test_flatten_pd_with_nested_columns_with_nested_values(): + data = [deepcopy(d) for d in DICT_DATA_WITH_NESTED_VALUES_IN_NESTED_COLUMNS] + + assert_frame_equal( + flatten( + DataFrame(data), + nested_columns=["contextInfo"], + ), + DataFrame(EXPECTED_DICT_RETURN_WITH_NESTED_VALUES_IN_NESTED_COLUMNS), + ) diff --git a/tests/test_init.py b/tests/test_init.py index 336eeb5..c3ed18c 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -51,6 +51,7 @@ def test_occurrences_with_default_args(): initial_date=None, final_date=None, format=None, + flat=False, ) @@ -64,6 +65,7 @@ def test_occurrences_with_custom_args(): final_date="2023-01-31", max_parallel_requests=10, format="df", + flat=True, ) mock.return_value.occurrences.assert_called_once_with( "42", @@ -73,4 +75,5 @@ def test_occurrences_with_custom_args(): final_date="2023-01-31", max_parallel_requests=10, format="df", + flat=True, ) diff --git a/tests/test_occurrences_client.py b/tests/test_occurrences_client.py index b8863c9..8fe6d1a 100644 --- a/tests/test_occurrences_client.py +++ b/tests/test_occurrences_client.py @@ -1,8 +1,24 @@ import datetime -from geopandas import GeoDataFrame -from pandas import DataFrame -from pandas.testing import assert_frame_equal +try: + from geopandas import GeoDataFrame + + HAS_GEOPANDAS = True +except ImportError: + HAS_GEOPANDAS = False + +try: + from pandas import DataFrame + + HAS_PANDAS = True +except ImportError: + HAS_PANDAS = False + +try: + from pandas.testing import 
assert_frame_equal +except ImportError: + pass + from pytest import mark, raises from crossfire.clients.occurrences import ( @@ -13,6 +29,13 @@ ) from crossfire.errors import DateFormatError, DateIntervalError +skip_if_pandas_not_installed = mark.skipif( + not HAS_PANDAS, reason="pandas is not installed" +) +skip_if_geopandas_not_installed = mark.skipif( + not HAS_GEOPANDAS, reason="geopandas is not installed" +) + def dummy_response(total_pages, last_page): if total_pages == 1: @@ -42,6 +65,7 @@ def test_occurrences_accumulator_for_lists(): assert accumulator() == [1, 2, 3] +@skip_if_pandas_not_installed def test_occurrences_accumulator_for_df(): accumulator = Accumulator() accumulator.merge(DataFrame([{"a": 1}])) @@ -49,6 +73,7 @@ def test_occurrences_accumulator_for_df(): assert_frame_equal(accumulator(), DataFrame([{"a": 1}, {"a": 2}, {"a": 3}])) +@skip_if_geopandas_not_installed def test_occurrences_accumulator_for_geodf(): accumulator = Accumulator() accumulator.merge(GeoDataFrame([{"a": 1}])) @@ -134,6 +159,7 @@ async def test_occurrences_without_victims(occurrences_client_and_get_mock): ) +@skip_if_pandas_not_installed @mark.asyncio async def test_occurrences_with_format_parameter( occurrences_client_and_get_mock, @@ -236,3 +262,47 @@ def test_date_formatter_with_python_datetime_format(): formatted_date = date_formatter(date) assert isinstance(formatted_date, datetime.date) assert str(formatted_date) == "2023-01-23" + + +@mark.asyncio +async def test_occurrences_as_list_dicts_with_flat_parameter( + occurrences_client_and_get_mock, +): + client_mock, _ = occurrences_client_and_get_mock + occurrences = Occurrences(client_mock, id_state=42, flat=True) + occs = await occurrences() + assert occs == [ + { + "id": "a7bfebed-ce9c-469d-a656-924ed8248e95", + "latitude": "-8.1576367000", + "longitude": "-34.9696372000", + "contextInfo": {"context1": "info1", "context2": "info2"}, + "contextInfo_context1": "info1", + "contextInfo_context2": "info2", + }, + ] + + 
+@skip_if_pandas_not_installed +@mark.asyncio +async def test_occurrences_as_df_with_flat_parameter( + occurrences_client_and_get_mock, +): + client_mock, _ = occurrences_client_and_get_mock + occurrences = Occurrences(client_mock, id_state=42, format="df", flat=True) + occs = await occurrences() + assert_frame_equal( + occs, + DataFrame( + [ + { + "id": "a7bfebed-ce9c-469d-a656-924ed8248e95", + "latitude": "-8.1576367000", + "longitude": "-34.9696372000", + "contextInfo": {"context1": "info1", "context2": "info2"}, + "contextInfo_context1": "info1", + "contextInfo_context2": "info2", + }, + ] + ), + ) diff --git a/tests/test_parser.py b/tests/test_parser.py index fe38056..d09d50c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,9 +1,21 @@ from json import dumps from unittest.mock import Mock, patch -from geopandas import GeoDataFrame -from pandas import DataFrame -from pytest import raises +try: + from geopandas import GeoDataFrame + + HAS_GEOPANDAS = True +except ImportError: + HAS_GEOPANDAS = False + +try: + from pandas import DataFrame + + HAS_PANDAS = True +except ImportError: + HAS_PANDAS = False + +from pytest import mark, raises from crossfire.parser import ( IncompatibleDataError, @@ -14,6 +26,13 @@ DATA = [{"answer": 42}] GEODATA = [{"answer": 42, "latitude": 4, "longitude": 2}] +skip_if_pandas_not_installed = mark.skipif( + not HAS_PANDAS, reason="pandas is not installed" +) +skip_if_geopandas_not_installed = mark.skipif( + not HAS_GEOPANDAS, reason="geopandas is not installed" +) + class DummyError(Exception): pass @@ -70,16 +89,19 @@ def test_parse_response_handles_metadata(): assert metadata.page_count == 1 +@skip_if_pandas_not_installed def test_parse_response_uses_dataframe_when_specified(): data, _ = parse_response(create_response(), format="df") assert isinstance(data, DataFrame) +@skip_if_geopandas_not_installed def test_parse_response_uses_geodataframe_when_specified(): data, _ = parse_response(create_response(True), 
format="geodf") assert isinstance(data, GeoDataFrame) +@skip_if_geopandas_not_installed def test_parse_response_raises_error_when_missing_coordinates(): with raises(IncompatibleDataError): parse_response(create_response(), format="geodf")