From c24cc8bdc6cf6db1bc5f02a7a3d1f4a468912a49 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Thu, 12 Sep 2024 10:39:45 +0100 Subject: [PATCH 01/13] Updated Readme --- README.md | 103 +++--------------------------------------------------- 1 file changed, 5 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index 300452f..65e2f20 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,9 @@ -# Python Project Template +# DRI IO -[![tests badge](https://github.com/NERC-CEH/python-template/actions/workflows/pipeline.yml/badge.svg)](https://github.com/NERC-CEH/python-template/actions) -[![docs badge](https://github.com/NERC-CEH/python-template/actions/workflows/deploy-docs.yml/badge.svg)](https://nerc-ceh.github.io/python-template/) +[![tests badge](https://github.com/NERC-CEH/dri-io/actions/workflows/pipeline.yml/badge.svg)](https://github.com/NERC-CEH/dri-io/actions) +[![docs badge](https://github.com/NERC-CEH/dri-io/actions/workflows/deploy-docs.yml/badge.svg)](https://nerc-ceh.github.io/dri-io/) -[Read the docs!](https://nerc-ceh.github.io/python-template) - -This repository is a template for a basic Python project. Included here is: - -* Example Python package -* Tests -* Documentation -* Automatic incremental versioning -* CI/CD - * Installs and tests the package - * Builds documentation on branches - * Deploys documentation on main branch - * Deploys docker image to AWS ECR -* Githook to ensure linting and code checking +This is a Python package that serves to hold commonly implemented Input/Output actions, typically reading and writing file ## Getting Started @@ -62,84 +49,4 @@ The docs, tests, and linter packages can be installed together with: ``` pip install -e .[dev] -``` - -### Making it Your Own - -This repo has a single package in the `./src/...` path called `driio` (creative I know). Change this to the name of your package and update it in: - -* `docs/conf.py` -* `src/**/*.py` -* `tests/**/*.py` -* `pyproject.toml` - -To make thing move a bit faster, use the script `./rename-package.sh` to rename all references of `driio` to whatever you like. For example: - -``` -./rename-package.sh "acoolnewname" -``` - -Will rename the package and all references to "acoolnewname" - -After doing this it is recommended to also run: - -``` -cd docs -make apidoc -``` - -To keep your documentation in sync with the package name. You may need to delete a file called `driio.rst` from `./docs/sources/...` - -### Deploying Docs to GitHub Pages - -If you want docs to be published to github pages automatically, go to your repo settings and enable docs from GitHub Actions and the workflows will do the rest. - -### Building Docs Locally - -The documentation is driven by [Sphinx](https://www.sphinx-doc.org/) an industry standard for documentation with a healthy userbase and lots of add-ons. It uses `sphinx-apidoc` to generate API documentation for the codebase from Python docstrings. - -To run `sphinx-apidoc` run: - -``` -# Install your package with optional dependencies for docs -pip install -e .[docs] - -cd docs -make apidoc -``` - -This will populate `./docs/sources/...` with `*.rst` files for each Python module, which may be included into the documentation. - -Documentation can then be built locally by running `make html`, or found on the [GitHub Deployment](https://nerc-ceh.github.io/python-template). - -### Run the Tests - -To run the tests run: - -``` -#Install package with optional dependencies for testing -pip install -e .[test] - -pytest -``` - -### Automatic Versioning - -This codebase is set up using [autosemver](https://autosemver.readthedocs.io/en/latest/usage.html#) a tool that uses git commit history to calculate the package version. Each time you make a commit, it increments the patch version by 1. You can increment by: - -* Normal commit. Use for bugfixes and small updates - * Increments patch version: `x.x.5 -> x.x.6` -* Commit starts with `* NEW:`. Use for new features - * Increments minor version `x.1.x -> x.2.x` -* Commit starts with `* INCOMPATIBLE:`. Use for API breaking changes - * Increments major version `2.x.x -> 3.x.x` - -### Docker and the ECR - -The python code is packaged into a docker image and pushed to the AWS ECR. For the deployment to succeed you must: - -* Add 2 secrets to the GitHub Actions: - * AWS_REGION: \ - * AWS_ROLE_ARN: \ -* Add a repository to the ECR with the same name as the GitHub repo - \ No newline at end of file +``` \ No newline at end of file From e5bae77f5c0d6ef92a46c8fbcee98704c363172e Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Thu, 12 Sep 2024 15:31:34 +0100 Subject: [PATCH 02/13] Implemented S3 reader for DuckDB --- pyproject.toml | 2 +- src/driio/__main__.py | 3 - src/driio/module.py | 15 ----- src/driio/read.py | 139 ++++++++++++++++++++++++++++++++++++++++++ src/driio/utils.py | 22 +++++++ 5 files changed, 162 insertions(+), 19 deletions(-) delete mode 100644 src/driio/__main__.py delete mode 100644 src/driio/module.py create mode 100644 src/driio/read.py create mode 100644 src/driio/utils.py diff --git a/pyproject.toml b/pyproject.toml index 31fb2ac..c309e8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = ["setuptools >= 61.0", "autosemver"] [project] requires-python = ">=3.12" -dependencies = ["autosemver"] +dependencies = ["autosemver", "duckdb"] name = "dri-io" dynamic = ["version"] authors = [{ name = "John Doe", email = "johdoe@ceh.ac.uk" }] diff --git a/src/driio/__main__.py b/src/driio/__main__.py deleted file mode 100644 index 38ec4a4..0000000 --- a/src/driio/__main__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""This file is run when `python -m driio` is called. Put your client code here if you have any.""" - -print("Hello World!") diff --git a/src/driio/module.py b/src/driio/module.py deleted file mode 100644 index 1e53591..0000000 --- a/src/driio/module.py +++ /dev/null @@ -1,15 +0,0 @@ -"""This module contains a single method, demonstrating the structure""" - - -def add_int(x: int, y: int) -> int: - """Adds two integers together - - Args: - x: The first number - y: The second number - - Returns: - int: The result - """ - - return x + y diff --git a/src/driio/read.py b/src/driio/read.py new file mode 100644 index 0000000..358ff79 --- /dev/null +++ b/src/driio/read.py @@ -0,0 +1,139 @@ +from abc import ABC, abstractmethod +from typing import Any, List, Optional + +import duckdb +from duckdb import DuckDBPyConnection + +from driio.utils import remove_protocol_from_url + + +class ReaderInterface(ABC): + """Abstract implementation for a IO reader""" + + @abstractmethod + def read(self, *args, **kwargs) -> Any: + """Reads data from a source""" + + +class DuckDBReader(ReaderInterface): + """Abstract implementation of a DuckDB Reader""" + + _connection: DuckDBPyConnection + """A connection to DuckDB""" + + def __enter__(self): + """Creates a connection when used in a context block""" + self._connection = duckdb.connect() + + def __exit__(self, *args) -> None: + """Closes the connection when exiting the context""" + self._connection.close() + + def __del__(self): + """Closes the connection when deleted""" + self._connection.close() + + def __init__(self, *args, **kwargs) -> None: + self._connection = duckdb.connect() + + +class DuckDBS3Reader(DuckDBReader): + """Concrete Implementation of a DuckDB reader for reading + data from an S3 endpoint""" + + def __init__( + self, + auth_type: str, + endpoint_url: Optional[str] = None, + ) -> None: + """Initializes + + Args: + endpoint_url: Custom s3 endpoint + """ + + self._connection = duckdb.connect() + + auth_type = auth_type.lower() + + VALID_AUTH_METHODS = ["auto", "sts", "custom_endpoint"] + if auth_type not in VALID_AUTH_METHODS: + raise ValueError(f"Invalid `auth_type`, must be one of {VALID_AUTH_METHODS}") + + self._connection.execute(""" + INSTALL httpfs; + LOAD httpfs; + SET force_download = true; + SET http_keep_alive = false; + """) + + if auth_type == "auto": + self._auto_auth() + elif auth_type == "sts": + self._sts_auth() + elif auth_type == "custom_endpoint": + self._custom_endpoint_auth(endpoint_url) + + def _auto_auth(self) -> None: + """Automatically authenticates using environment variables""" + + self._connection.execute(""" + INSTALL aws; + LOAD aws; + CREATE SECRET ( + TYPE S3, + PROVIDER CREDENTIAL_CHAIN + ); + """) + + def _sts_auth(self) -> None: + """Authenicates using assumed roles on AWS""" + + self._connection.execute(""" + CREATE SECRET ( + TYPE S3, + PROVIDER CREDENTIAL_CHAIN, + CHAIN 'sts' + ); + """) + + def _custom_endpoint_auth(self, endpoint_url: str, use_ssl: Optional[bool] = False) -> None: + """Authenticates to a custom endpoint + + Args: + endpoint_url: Endpoint to the s3 provider. + use_ssl: Flag for using ssl (https connections). + """ + + self._connection.execute(f""" + CREATE SECRET ( + TYPE S3, + ENDPOINT '{remove_protocol_from_url(endpoint_url)}', + URL_STYLE 'path', + USE_SSL 'false' + ); + """) + + def read(self, query: str, params: Optional[List] = None) -> Any: + """Requests to read a file + + Args: + query: The query to send. + params: The parameters to supplement the query. + """ + + return self._connection.execute(query, params) + + +if __name__ == "__main__": + # reader = DuckDBS3Reader("sts") + # query = "SELECT * FROM read_parquet('s3://ukceh-fdri-staging-timeseries-level-0/cosmos/PRECIP_1MIN_2024_LOOPED/2024-02/2024-02-14.parquet');" + # print(reader.read(query).pl()) + + endpoint = "http://localhost:4566" + file = "s3://ukceh-fdri-timeseries-level-0/cosmos/PRECIP_1MIN_2024_LOOPED/2024-01/2024-01-01.parquet" + query = f"SELECT * FROM read_parquet('{file}');" + print(query) + reader = DuckDBS3Reader("custom_endpoint", endpoint_url=endpoint) + + print(reader.read(query).pl()) diff --git a/src/driio/utils.py b/src/driio/utils.py new file mode 100644 index 0000000..82d7c90 --- /dev/null +++ b/src/driio/utils.py @@ -0,0 +1,22 @@ +"""Utility methods that don't belong elsewhere""" + +from urllib.parse import urlparse + + +def remove_protocol_from_url(url: str) -> str: + """Remove the protocol from a URL. + + Args: + url: URL to remove protocol from + + Returns: + URL with protocol removed + + Examples: + >>> remove_protocol_from_url("https://www.example.com") + "www.example.com" + """ + endpoint_url = urlparse(url) + # Remove the protocol scheme by setting it to an empty string + endpoint_url = "".join(endpoint_url[1:]) + return endpoint_url From 7b14c6f4282300cced5730fd9e5c02287d9d72f0 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Thu, 12 Sep 2024 16:24:45 +0100 Subject: [PATCH 03/13] Updated README --- README.md | 70 +++++++++++++++++++++++++++++++++++++- src/driio/read.py | 86 ++++++++++++++++++++++++----------------------- 2 files changed, 113 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 65e2f20..fadeb66 100644 --- a/README.md +++ b/README.md @@ -49,4 +49,72 @@ The docs, tests, and linter packages can be installed together with: ``` pip install -e .[dev] -``` \ No newline at end of file +``` + +## Readers + +### DuckDB Reader +The DuckDB classes use the duckdb python interface to read files from local documents or S3 object storage - this comes with the capacity to use custom s3 endpoints. + +To read a local file: +```python + +from driio.read import DuckDBFileReader + +reader = DuckDBFileReader() +query = "SELECT * FROM READ_PARQUET('myfile.parquet');" +result = reader.read(query) + +# Result will be a +# Get your desired format such as polars like: +df = result.pl() + +# Or pandas +df = result.df() + +# Close the connection +reader.close() +``` + +Alternatively, use a context manager to automatically close the connection: +```python +... + +with DuckDBFileReader() as reader: + df = reader.read(query, params).df() +``` + +To read from an S3 storage location there is a more configuration available and there is 3 use cases supported: + +* Automatic credential loading from current environment variables +* Automatic credential loading from an assumed role +* Authentication to a custom s3 endpoint, i.e. localstack. This currently assumes that credentials aren't needed (they aren't for now) + +The reader is instantiated like this: +```python +from driio.read import import DuckDBS3Reader + +# Automatic authentication from your environment +auto_auth_reader = DuckDBS3Reader("auto") + +# Automatic authentication from your assumed role +sts_auth_reader = DuckDBS3Reader("sts") + +# Custom url for localstack +endpoint = "http://localhost:" +custom_url_reader = DuckDBS3Reader( + "custom_endpoint", + endpoint_url=endpoint, + use_ssl=False + ) + +# Custom url using https protocol +endpoint = "https://a-real.s3.endpoint" +custom_url_reader = DuckDBS3Reader( + "custom_endpoint", + endpoint_url=endpoint, + use_ssl=True + ) +``` + +The `reader.read()` in the background forwards a DuckDB SQL query and parameters to fill arguments in the query with. \ No newline at end of file diff --git a/src/driio/read.py b/src/driio/read.py index 358ff79..45d002e 100644 --- a/src/driio/read.py +++ b/src/driio/read.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, List, Optional +from typing import Any, List, Optional, Self import duckdb from duckdb import DuckDBPyConnection @@ -10,6 +10,21 @@ class ReaderInterface(ABC): """Abstract implementation for a IO reader""" + _connection: Any + """Reference to the connection object""" + + def __enter__(self) -> Self: + """Creates a connection when used in a context block""" + return self + + def __exit__(self, *args) -> None: + """Closes the connection when exiting the context""" + self._connection.close() + + def __del__(self): + """Closes the connection when deleted""" + self._connection.close() + @abstractmethod def read(self, *args, **kwargs) -> Any: """Reads data from a source""" @@ -21,42 +36,44 @@ class DuckDBReader(ReaderInterface): _connection: DuckDBPyConnection """A connection to DuckDB""" - def __enter__(self): - """Creates a connection when used in a context block""" + def __init__(self, *args, **kwargs) -> None: self._connection = duckdb.connect() - def __exit__(self, *args) -> None: - """Closes the connection when exiting the context""" - self._connection.close() + def read(self, query: str, params: Optional[List] = None) -> DuckDBPyConnection: + """Requests to read a file - def __del__(self): - """Closes the connection when deleted""" - self._connection.close() + Args: + query: The query to send. + params: The parameters to supplement the query. + """ - def __init__(self, *args, **kwargs) -> None: - self._connection = duckdb.connect() + return self._connection.execute(query, params) + + def close(self) -> None: + """Close the connection""" + self._connection.close() class DuckDBS3Reader(DuckDBReader): """Concrete Implementation of a DuckDB reader for reading data from an S3 endpoint""" - def __init__( - self, - auth_type: str, - endpoint_url: Optional[str] = None, - ) -> None: + def __init__(self, auth_type: str, endpoint_url: Optional[str] = None, use_ssl: bool = True) -> None: """Initializes Args: + auth_type: The type of authentication to request. May + be one of ["auto", "sts", "custom_endpoint"] endpoint_url: Custom s3 endpoint + use_ssl: Flag for using ssl (https connections). """ - self._connection = duckdb.connect() + super().__init__() auth_type = auth_type.lower() VALID_AUTH_METHODS = ["auto", "sts", "custom_endpoint"] + if auth_type not in VALID_AUTH_METHODS: raise ValueError(f"Invalid `auth_type`, must be one of {VALID_AUTH_METHODS}") @@ -72,7 +89,10 @@ def __init__( elif auth_type == "sts": self._sts_auth() elif auth_type == "custom_endpoint": - self._custom_endpoint_auth(endpoint_url) + if not isinstance(endpoint_url, str): + endpoint_url = str(endpoint_url) + + self._custom_endpoint_auth(endpoint_url, use_ssl) def _auto_auth(self) -> None: """Automatically authenticates using environment variables""" @@ -90,6 +110,8 @@ def _sts_auth(self) -> None: """Authenicates using assumed roles on AWS""" self._connection.execute(""" + INSTALL aws; + LOAD aws; CREATE SECRET ( TYPE S3, PROVIDER CREDENTIAL_CHAIN, @@ -97,7 +119,7 @@ def _sts_auth(self) -> None: ); """) - def _custom_endpoint_auth(self, endpoint_url: str, use_ssl: Optional[bool] = False) -> None: + def _custom_endpoint_auth(self, endpoint_url: str, use_ssl: bool = True) -> None: """Authenticates to a custom endpoint Args: @@ -110,30 +132,10 @@ def _custom_endpoint_auth(self, endpoint_url: str, use_ssl: Optional[bool] = Fal TYPE S3, ENDPOINT '{remove_protocol_from_url(endpoint_url)}', URL_STYLE 'path', - USE_SSL 'false' + USE_SSL '{str(use_ssl).lower()}' ); """) - def read(self, query: str, params: Optional[List] = None) -> Any: - """Requests to read a file - - Args: - query: The query to send. - params: The parameters to supplement the query. - """ - - return self._connection.execute(query, params) - - -if __name__ == "__main__": - # reader = DuckDBS3Reader("sts") - # query = "SELECT * FROM read_parquet('s3://ukceh-fdri-staging-timeseries-level-0/cosmos/PRECIP_1MIN_2024_LOOPED/2024-02/2024-02-14.parquet');" - # print(reader.read(query).pl()) - - endpoint = "http://localhost:4566" - file = "s3://ukceh-fdri-timeseries-level-0/cosmos/PRECIP_1MIN_2024_LOOPED/2024-01/2024-01-01.parquet" - query = f"SELECT * FROM read_parquet('{file}');" - print(query) - reader = DuckDBS3Reader("custom_endpoint", endpoint_url=endpoint) - print(reader.read(query).pl()) +class DuckDBFileReader(DuckDBReader): + """DuckDB implementation for reading files""" From c270acc14c0315fb4d8480b4a85b58b4a6765466 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Fri, 13 Sep 2024 11:38:03 +0100 Subject: [PATCH 04/13] Renamed repo to dri-utils --- .github/workflows/pipeline.yml | 2 +- Dockerfile | 2 +- README.md | 29 +++++++++-- docs/Makefile | 2 +- docs/conf.py | 4 +- docs/sources/modules.rst | 4 +- docs/sources/mypackage.rst | 8 +-- pyproject.toml | 12 ++--- src/{driio => driutils}/__init__.py | 2 +- src/driutils/io/__init__.py | 0 src/{driio => driutils/io}/read.py | 2 +- src/driutils/io/write.py | 81 +++++++++++++++++++++++++++++ src/{driio => driutils}/utils.py | 0 tests/io/test_readers.py | 0 tests/io/test_writers.py | 0 tests/test_module.py | 27 ---------- tests/test_utils.py | 39 ++++++++++++++ 17 files changed, 163 insertions(+), 51 deletions(-) rename src/{driio => driutils}/__init__.py (86%) create mode 100644 src/driutils/io/__init__.py rename src/{driio => driutils/io}/read.py (98%) create mode 100644 src/driutils/io/write.py rename src/{driio => driutils}/utils.py (100%) create mode 100644 tests/io/test_readers.py create mode 100644 tests/io/test_writers.py delete mode 100644 tests/test_module.py create mode 100644 tests/test_utils.py diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml index 554d64e..51fe8fe 100644 --- a/.github/workflows/pipeline.yml +++ b/.github/workflows/pipeline.yml @@ -15,7 +15,7 @@ jobs: needs: [test-python] uses: NERC-CEH/dri-cicd/.github/workflows/build-test-deploy-docker.yml@main with: - package_name: driio + package_name: driutils secrets: AWS_REGION: ${{ secrets.AWS_REGION }} AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} diff --git a/Dockerfile b/Dockerfile index a577706..bc8676e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,4 +22,4 @@ COPY --chown=python:python tests/ /app/tests USER python ENV PATH="/app/.venv/bin:$PATH" ENV VIRTUAL_ENV="/app/.venv" -CMD ["python", "-m", "driio"] \ No newline at end of file +CMD ["python", "-m", "driutils"] \ No newline at end of file diff --git a/README.md b/README.md index fadeb66..dbf588a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # DRI IO -[![tests badge](https://github.com/NERC-CEH/dri-io/actions/workflows/pipeline.yml/badge.svg)](https://github.com/NERC-CEH/dri-io/actions) -[![docs badge](https://github.com/NERC-CEH/dri-io/actions/workflows/deploy-docs.yml/badge.svg)](https://nerc-ceh.github.io/dri-io/) +[![tests badge](https://github.com/NERC-CEH/dri-utils/actions/workflows/pipeline.yml/badge.svg)](https://github.com/NERC-CEH/dri-utils/actions) This is a Python package that serves to hold commonly implemented Input/Output actions, typically reading and writing file @@ -59,7 +58,7 @@ The DuckDB classes use the duckdb python interface to read files from local docu To read a local file: ```python -from driio.read import DuckDBFileReader +from driutils.read import DuckDBFileReader reader = DuckDBFileReader() query = "SELECT * FROM READ_PARQUET('myfile.parquet');" @@ -92,7 +91,7 @@ To read from an S3 storage location there is a more configuration available and The reader is instantiated like this: ```python -from driio.read import import DuckDBS3Reader +from driutils.read import import DuckDBS3Reader # Automatic authentication from your environment auto_auth_reader = DuckDBS3Reader("auto") @@ -117,4 +116,24 @@ custom_url_reader = DuckDBS3Reader( ) ``` -The `reader.read()` in the background forwards a DuckDB SQL query and parameters to fill arguments in the query with. \ No newline at end of file +The `reader.read()` in the background forwards a DuckDB SQL query and parameters to fill arguments in the query with. + +## Writers + +### S3 Object Writer + +The `S3Writer` uploads files to S3 using a pre-existing `S3Client` which is left to the user to resource, but is commonly implemented as: +```python + +import boto3 +from driutils.write import S3Writer + +s3_client = boto3.client('s3', endpoint_url="an_optional_url") +content = "Just a lil string" + +writer = S3Writer(s3_client) +writer.write( + bucket_name="target-bucket", + key="path/to/upload/destination", + body=content +) diff --git a/docs/Makefile b/docs/Makefile index dd5a6b7..8e4c30a 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -20,4 +20,4 @@ help: @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) apidoc: - @sphinx-apidoc -f ../src/driio -o sources + @sphinx-apidoc -f ../src/driutils -o sources diff --git a/docs/conf.py b/docs/conf.py index 229ba78..35344c3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -6,13 +6,13 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -import driio +import driutils project = 'My Project' copyright = '2024, UKCEH' author = 'UKCEH' -release = driio.__version__ +release = driutils.__version__ version = release # -- General configuration --------------------------------------------------- diff --git a/docs/sources/modules.rst b/docs/sources/modules.rst index 3c7da81..a63568e 100644 --- a/docs/sources/modules.rst +++ b/docs/sources/modules.rst @@ -1,7 +1,7 @@ -driio +driutils ========= .. toctree:: :maxdepth: 4 - driio + driutils diff --git a/docs/sources/mypackage.rst b/docs/sources/mypackage.rst index 9785bc2..d5b26e5 100644 --- a/docs/sources/mypackage.rst +++ b/docs/sources/mypackage.rst @@ -1,13 +1,13 @@ -driio package +driutils package ================= Submodules ---------- -driio.module module +driutils.module module ----------------------- -.. automodule:: driio.module +.. automodule:: driutils.module :members: :undoc-members: :show-inheritance: @@ -15,7 +15,7 @@ driio.module module Module contents --------------- -.. automodule:: driio +.. automodule:: driutils :members: :undoc-members: :show-inheritance: diff --git a/pyproject.toml b/pyproject.toml index c309e8f..4035f95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,8 @@ requires = ["setuptools >= 61.0", "autosemver"] [project] requires-python = ">=3.12" -dependencies = ["autosemver", "duckdb"] -name = "dri-io" +dependencies = ["autosemver", "duckdb", "boto3", "mypy_boto3_s3"] +name = "dri-utils" dynamic = ["version"] authors = [{ name = "John Doe", email = "johdoe@ceh.ac.uk" }] description = "A minimal setup for a template package." @@ -14,19 +14,19 @@ description = "A minimal setup for a template package." test = ["pytest", "pytest-cov", "parameterized"] docs = ["sphinx", "sphinx-copybutton", "sphinx-rtd-theme"] lint = ["ruff"] -dev = ["driio[test,docs,lint]"] +dev = ["dri-utils[test,docs,lint]"] [tool.setuptools.dynamic] -version = { attr = "driio.__version__" } +version = { attr = "driutils.__version__" } [tool.setuptools.packages.find] where = ["src"] -include = ["driio*"] +include = ["driutils*"] [tool.pytest.ini_options] -addopts = "--cov=driio" +addopts = "--cov=driutils" markers = ["slow: Marks slow tests"] filterwarnings = [ diff --git a/src/driio/__init__.py b/src/driutils/__init__.py similarity index 86% rename from src/driio/__init__.py rename to src/driutils/__init__.py index c46d6a1..46a678f 100644 --- a/src/driio/__init__.py +++ b/src/driutils/__init__.py @@ -1,6 +1,6 @@ import autosemver try: - __version__ = autosemver.packaging.get_current_version(project_name="driio") + __version__ = autosemver.packaging.get_current_version(project_name="driutils") except Exception: __version__ = "0.0.0" diff --git a/src/driutils/io/__init__.py b/src/driutils/io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/driio/read.py b/src/driutils/io/read.py similarity index 98% rename from src/driio/read.py rename to src/driutils/io/read.py index 45d002e..2fc8b45 100644 --- a/src/driio/read.py +++ b/src/driutils/io/read.py @@ -4,7 +4,7 @@ import duckdb from duckdb import DuckDBPyConnection -from driio.utils import remove_protocol_from_url +from driutils.utils import remove_protocol_from_url class ReaderInterface(ABC): diff --git a/src/driutils/io/write.py b/src/driutils/io/write.py new file mode 100644 index 0000000..2a0500d --- /dev/null +++ b/src/driutils/io/write.py @@ -0,0 +1,81 @@ +"""Module for handling data writing logic""" + +from abc import ABC, abstractmethod +from io import BytesIO +from typing import Any + +import polars as pl +from botocore.client import BaseClient +from mypy_boto3_s3.client import S3Client + + +class WriterInterface(ABC): + """Interface for defining parquet writing objects""" + + @abstractmethod + def write(self, *args, **kwargs) -> None: + """Abstract method for read operations""" + + +class S3Writer(WriterInterface): + """Writes to an S3 bucket""" + + s3_client: S3Client + """Handle to the the s3 client used to read data""" + + def __init__(self, s3_client: S3Client): + """Initializes the class + + Args: + s3_client: The s3 client used to retrieve data from + """ + + if not isinstance(s3_client, BaseClient): + raise TypeError(f"`s3_client` must be a `S3Client` not `{type(s3_client)}`") + + self.s3_client = s3_client + + @staticmethod + def _get_bytes(obj: Any) -> BytesIO: + """Converts an object to bytes + + Args: + obj: The object to convert. + Returns: + bytes representation of the object. + """ + + buffer = BytesIO() + + if hasattr(obj, "to_bytes") and callable(obj.to_bytes): + buffer.write(obj.to_bytes()) + elif isinstance(obj, str): + buffer.write(str.encode(obj)) + elif isinstance(obj, pl.dataframe.DataFrame): + obj.write_parquet(buffer) + else: + raise TypeError(f"Bytes conversion not supported for type: '{type(obj)}'") + + buffer.seek(0) + + return buffer + + def write(self, bucket_name: str, key: str, body: bytes) -> None: + """Uploads an object to an S3 bucket. + + This function attempts to upload a byte object to a specified S3 bucket + using the provided S3 client. If the upload fails, it logs an error + message and re-raises the exception. + + Args: + bucket_name: The name of the S3 bucket. + key: The key (path) of the object within the bucket. + body: data to write to s3 object + + Raises: + RuntimeError, ClientError + """ + if not isinstance(body, bytes): + body = self._get_bytes(body) + + self.s3_client.put_object(Bucket=bucket_name, Key=key, Body=body) diff --git a/src/driio/utils.py b/src/driutils/utils.py similarity index 100% rename from src/driio/utils.py rename to src/driutils/utils.py diff --git a/tests/io/test_readers.py b/tests/io/test_readers.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/io/test_writers.py b/tests/io/test_writers.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_module.py b/tests/test_module.py deleted file mode 100644 index 2ef7c3a..0000000 --- a/tests/test_module.py +++ /dev/null @@ -1,27 +0,0 @@ -import unittest -from driio.module import add_int -from parameterized import parameterized - -class TestModuleMethods(unittest.TestCase): - - def test_errors(self): - - with self.assertRaises(TypeError, msg="Expected integer argument for y."): - add_int(1, "3") - - with self.assertRaises(TypeError, msg="Expected integer argument for x."): - add_int({1,2}, 5) - - - @parameterized.expand([ - [1, 2, 3], - [-4, 10, 6], - [1000, 100, 1100] - ]) - def test_result(self, x, y, expected): - - self.assertEqual(add_int(x, y), expected) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..0457eda --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,39 @@ +import unittest + +from driutils.utils import remove_protocol_from_url + +class TestRemoveProtocolFromUrl(unittest.TestCase): + def test_https_url(self): + """Test removing protocol from an HTTPS URL.""" + url = "https://www.example.com" + expected = "www.example.com" + result = remove_protocol_from_url(url) + self.assertEqual(result, expected) + + def test_http_url(self): + """Test removing protocol from an HTTP URL.""" + url = "http://www.example.com" + expected = "www.example.com" + result = remove_protocol_from_url(url) + self.assertEqual(result, expected) + + def test_url_with_path(self): + """Test removing protocol from a URL with a path.""" + url = "https://www.example.com/path/to/resource" + expected = "www.example.com/path/to/resource" + result = remove_protocol_from_url(url) + self.assertEqual(result, expected) + + def test_url_with_port(self): + """Test removing protocol from a URL with a port.""" + url = "https://www.example.com:8080" + expected = "www.example.com:8080" + result = remove_protocol_from_url(url) + self.assertEqual(result, expected) + + def test_url_without_protocol(self): + """Test a URL that already has no protocol.""" + url = "www.example.com" + expected = "www.example.com" + result = remove_protocol_from_url(url) + self.assertEqual(result, expected) \ No newline at end of file From 573bc31459430533d79bee0403445d05d3243940 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Fri, 13 Sep 2024 15:51:11 +0100 Subject: [PATCH 05/13] Added datetime and other utilities --- pyproject.toml | 6 ++- src/driutils/datetime.py | 64 +++++++++++++++++++++++ src/driutils/utils.py | 22 ++++++++ tests/test_datetime.py | 109 +++++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 41 ++++++++++++++- 5 files changed, 238 insertions(+), 4 deletions(-) create mode 100644 src/driutils/datetime.py create mode 100644 tests/test_datetime.py diff --git a/pyproject.toml b/pyproject.toml index 4035f95..2c74639 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,9 @@ description = "A minimal setup for a template package." test = ["pytest", "pytest-cov", "parameterized"] docs = ["sphinx", "sphinx-copybutton", "sphinx-rtd-theme"] lint = ["ruff"] -dev = ["dri-utils[test,docs,lint]"] +datetime = ["isodate"] +all = ["dri-utils[datetime]"] +dev = ["dri-utils[all,test,docs,lint]"] [tool.setuptools.dynamic] version = { attr = "driutils.__version__" } @@ -26,7 +28,7 @@ include = ["driutils*"] [tool.pytest.ini_options] -addopts = "--cov=driutils" +addopts = "--cov=driutils --cov-report term-missing" markers = ["slow: Marks slow tests"] filterwarnings = [ diff --git a/src/driutils/datetime.py b/src/driutils/datetime.py new file mode 100644 index 0000000..a17c4da --- /dev/null +++ b/src/driutils/datetime.py @@ -0,0 +1,64 @@ +from datetime import date, datetime +from typing import Optional, Tuple, Union + + +def validate_iso8601_duration(duration: str) -> bool: + """Validate if the given string is a valid ISO 8601 duration. + + Args: + duration: The duration string to validate. + + Returns: + True if the duration is valid, False otherwise. + """ + + try: + import isodate + except ModuleNotFoundError: + raise ModuleNotFoundError( + ( + "Datetime utilities were not installed. Reinstall with", + " 'pip install driutils[datetime]' to use isodate functionality", + ) + ) + + try: + isodate.parse_duration(duration) + return True + except isodate.ISO8601Error: + return False + + +def steralize_dates( + start_date: Union[date, datetime], end_date: Optional[Union[date, datetime]] = None +) -> Tuple[Union[date, datetime], datetime]: + """ + Configures and validates start and end dates. + + Args: + start_date: The start date. + end_date: The end date. If None, defaults to start_date. + + Returns: + A tuple containing the start date and the end date. + + Raises: + UserWarning: If the start date is after the end date. + """ + # If end_date is not provided, set it to start_date + if end_date is None: + end_date = start_date + + # Ensure the start_date is not after the end_date + if start_date > end_date: + raise UserWarning(f"Start date must come before end date: {start_date} > {end_date}") + + # If start_date is of type date, convert it to datetime with time at start of the day + if isinstance(start_date, date) and not isinstance(start_date, datetime): + start_date = datetime.combine(start_date, datetime.min.time()) + + # If end_date is of type date, convert it to datetime to include the entire day + if isinstance(end_date, date) and not isinstance(end_date, datetime): + end_date = datetime.combine(end_date, datetime.max.time()) + + return start_date, end_date diff --git a/src/driutils/utils.py b/src/driutils/utils.py index 82d7c90..c544642 100644 --- a/src/driutils/utils.py +++ b/src/driutils/utils.py @@ -1,5 +1,6 @@ """Utility methods that don't belong elsewhere""" +from typing import List, Optional, Union from urllib.parse import urlparse @@ -20,3 +21,24 @@ def remove_protocol_from_url(url: str) -> str: # Remove the protocol scheme by setting it to an empty string endpoint_url = "".join(endpoint_url[1:]) return endpoint_url + + +def ensure_list(items: Optional[Union[str, List[str]]] = None) -> List[str]: + """ + Ensures that a list is always received + + Args: + items: A single string or a list of strings + If None, defaults to an empty list. + + Returns: + A list of strings. + """ + if items is None or items == "": + # If no items are provided, return an empty list + items = [] + elif isinstance(items, str): + # If a single string is provided, convert it to a list + items = [items] + + return items diff --git a/tests/test_datetime.py b/tests/test_datetime.py new file mode 100644 index 0000000..fb023e2 --- /dev/null +++ b/tests/test_datetime.py @@ -0,0 +1,109 @@ +import unittest +from unittest.mock import patch +from datetime import date, datetime + +from driutils.datetime import steralize_dates, validate_iso8601_duration + +class TestValidateISO8601Duration(unittest.TestCase): + @patch("builtins.__import__") + def test_error_if_isodate_not_installed(self, mock): + """Tests that isodate is installed""" + + mock.side_effect = ModuleNotFoundError + with self.assertRaises(ModuleNotFoundError): + duration = "P1Y2M3DT4H5M6S" + validate_iso8601_duration(duration) + + def test_valid_duration_full_format(self): + """Test a valid ISO 8601 duration in full format.""" + duration = "P1Y2M3DT4H5M6S" + self.assertTrue(validate_iso8601_duration(duration)) + + def test_valid_duration_days_only(self): + """Test a valid ISO 8601 duration with days only.""" + duration = "P3D" + self.assertTrue(validate_iso8601_duration(duration)) + + def test_valid_duration_hours_only(self): + """Test a valid ISO 8601 duration with hours only.""" + duration = "PT4H" + self.assertTrue(validate_iso8601_duration(duration)) + + def test_valid_duration_combination(self): + """Test a valid ISO 8601 duration with a combination of elements.""" + duration = "P2W" + self.assertTrue(validate_iso8601_duration(duration)) + + def test_invalid_duration_missing_p(self): + """Test an invalid ISO 8601 duration missing the 'P' character.""" + duration = "1Y2M3DT4H5M6S" + self.assertFalse(validate_iso8601_duration(duration)) + + def test_invalid_duration_wrong_format(self): + """Test an invalid ISO 8601 duration with a wrong format.""" + duration = "P1Y2M3D4H5M6S" + self.assertFalse(validate_iso8601_duration(duration)) + + def test_invalid_duration_non_iso_string(self): + """Test an invalid ISO 8601 duration with a non-ISO string.""" + duration = "This is not a duration" + self.assertFalse(validate_iso8601_duration(duration)) + + def test_empty_string(self): + """Test an invalid ISO 8601 duration with an empty string.""" + duration = "" + self.assertFalse(validate_iso8601_duration(duration)) + + +class TestSteralizeDates(unittest.TestCase): + def test_start_date_only(self): + """Test with only start_date provided as date that datetimes of start and end of that date are returned + """ + start = date(2023, 8, 1) + expected_start = datetime.combine(start, datetime.min.time()) + expected_end = datetime.combine(start, datetime.max.time()) + result = steralize_dates(start) + self.assertEqual(result, (expected_start, expected_end)) + + def test_start_date_and_end_date_as_dates(self): + """Test with both start_date and end_date provided as dates that datetimes are returned + """ + start = date(2023, 8, 1) + end = date(2023, 8, 10) + expected_start = datetime.combine(start, datetime.min.time()) + expected_end = datetime.combine(end, datetime.max.time()) + result = steralize_dates(start, end) + self.assertEqual(result, (expected_start, expected_end)) + + def test_start_date_after_end_date_error(self): + """Test with start_date after end_date, should raise UserWarning. + """ + start = date(2023, 8, 10) + end = date(2023, 8, 1) + with self.assertRaises(UserWarning): + steralize_dates(start, end) + + def test_start_date_equals_end_date(self): + """Test with start_date equal to end_date that datetimes of start and end of that date are returned. + """ + start = date(2023, 8, 1) + end = date(2023, 8, 1) + expected_start = datetime.combine(start, datetime.min.time()) + expected_end = datetime.combine(end, datetime.max.time()) + result = steralize_dates(start, end) + self.assertEqual(result, (expected_start, expected_end)) + + def test_datetime_input(self): + """Test with datetime inputs for both start_date and end_date.""" + start = datetime(2023, 8, 1, 12, 0) + end = datetime(2023, 8, 10, 18, 0) + result = steralize_dates(start, end) + self.assertEqual(result, (start, end)) + + def test_mixed_date_and_datetime(self): + """Test with start_date as date and end_date as datetime.""" + start = date(2023, 8, 1) + end = datetime(2023, 8, 10, 18, 0) + expected_start = datetime.combine(start, datetime.min.time()) + result = steralize_dates(expected_start, end) + self.assertEqual(result, (expected_start, end)) \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 0457eda..efa029d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,6 @@ import unittest -from driutils.utils import remove_protocol_from_url +from driutils.utils import remove_protocol_from_url, ensure_list class TestRemoveProtocolFromUrl(unittest.TestCase): def test_https_url(self): @@ -36,4 +36,41 @@ def test_url_without_protocol(self): url = "www.example.com" expected = "www.example.com" result = remove_protocol_from_url(url) - self.assertEqual(result, expected) \ No newline at end of file + self.assertEqual(result, expected) + +class TestSteralizeSiteIds(unittest.TestCase): + def test_none_input(self): + """Test with None as input, should return an empty list. + """ + result = ensure_list(None) + self.assertEqual(result, []) + + def test_no_input(self): + """Test with no input, should return an empty list. + """ + result = ensure_list() + self.assertEqual(result, []) + + def test_empty_string_input(self): + """Test with an empty string as input, should return an empty list. + """ + result = ensure_list('') + self.assertEqual(result, []) + + def test_single_string_input(self): + """Test with a single site ID as a string. + """ + result = ensure_list('site1') + self.assertEqual(result, ['site1']) + + def test_list_of_strings_input(self): + """Test with a list of site IDs. + """ + result = ensure_list(['site1', 'site2', 'site3']) + self.assertEqual(result, ['site1', 'site2', 'site3']) + + def test_empty_list_input(self): + """Test with an empty list, should return an empty list. + """ + result = ensure_list([]) + self.assertEqual(result, []) From f512810f3047a8e0ade4bd28038a2ea213de41e6 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Fri, 13 Sep 2024 15:59:33 +0100 Subject: [PATCH 06/13] Added logging formatter --- pyproject.toml | 2 +- src/driutils/logger.py | 88 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 src/driutils/logger.py diff --git a/pyproject.toml b/pyproject.toml index 2c74639..638952b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ filterwarnings = [ ] [tool.coverage.run] -omit = ["*__init__.py"] +omit = ["*__init__.py", "**/logger.py"] [tool.ruff] src = ["src", "tests"] diff --git a/src/driutils/logger.py b/src/driutils/logger.py new file mode 100644 index 0000000..b602654 --- /dev/null +++ b/src/driutils/logger.py @@ -0,0 +1,88 @@ +"""Module for formatting log messages""" + +import logging +import traceback +from datetime import datetime +from typing import Tuple, Type + + +class LogFormatter(logging.Formatter): + """ + A custom log formatter that provides a structured log format. + + This formatter creates log entries with a timestamp, log level, logger name, + and either the log message or exception traceback. + """ + + def format(self, record: logging.LogRecord) -> str: + """ + Format the specified log record as text. + + Args: + record: A LogRecord instance representing the event being logged. + + Returns: + str: A formatted string containing the log entry details. + """ + timestamp = self.formatTime(record) + level = record.levelname + logger_name = record.name + + log_entry = f"{timestamp} - {level} - {logger_name} - " + + if record.exc_info: + tb = self.formatException(record.exc_info) + # Replace newlines in traceback with pipe symbols + tb = " | ".join(line.strip() for line in tb.split("\n") if line.strip()) + log_entry += f" | Exception: {tb}" + else: + log_entry += record.getMessage() + + return log_entry + + def formatTime(self, record: logging.LogRecord) -> datetime: + """ + Format the creation time of the specified LogRecord. + + Args: + record: A LogRecord instance representing the event being logged. + + Returns: + datetime: A datetime object representing the record's creation time. + """ + return datetime.fromtimestamp(record.created) + + def formatException(self, exc_info: Tuple[Type[BaseException], BaseException]) -> str: + """ + Format the specified exception information as a string. + + Args: + exc_info: A tuple containing exception information. + + Returns: + str: A string representation of the exception traceback. + """ + return "".join(traceback.format_exception(*exc_info)).strip() + + +def setup_logging(level: int = logging.INFO) -> None: + """ + Set up basic logging configuration with a custom formatter. + + This function configures the root logger with a StreamHandler and + the custom LogFormatter. It removes any existing handlers before + adding the new one. + + Args: + level: The logging level to set for the root logger. Defaults to logging.INFO. + + Returns: + None + """ + root_logger = logging.getLogger() + root_logger.setLevel(level) + handler = logging.StreamHandler() + formatter = LogFormatter() + handler.setFormatter(formatter) + + root_logger.handlers = [handler] From e01102a8e3ac73011f121e5474075ac4682ecc39 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Fri, 13 Sep 2024 17:08:58 +0100 Subject: [PATCH 07/13] Tested S3Writer --- pyproject.toml | 9 +++- src/driutils/io/write.py | 32 ++------------ tests/__init__.py | 0 tests/io/__init__.py | 0 tests/io/base_test_case.py | 87 ++++++++++++++++++++++++++++++++++++++ tests/io/test_writers.py | 52 +++++++++++++++++++++++ 6 files changed, 150 insertions(+), 30 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/io/__init__.py create mode 100644 tests/io/base_test_case.py diff --git a/pyproject.toml b/pyproject.toml index 638952b..54c16a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,14 @@ requires = ["setuptools >= 61.0", "autosemver"] [project] requires-python = ">=3.12" -dependencies = ["autosemver", "duckdb", "boto3", "mypy_boto3_s3"] +dependencies = [ + "autosemver", + "duckdb", + "boto3", + "mypy_boto3_s3", + "moto", + "polars", +] name = "dri-utils" dynamic = ["version"] authors = [{ name = "John Doe", email = "johdoe@ceh.ac.uk" }] diff --git a/src/driutils/io/write.py b/src/driutils/io/write.py index 2a0500d..ae1c2f4 100644 --- a/src/driutils/io/write.py +++ b/src/driutils/io/write.py @@ -1,10 +1,7 @@ """Module for handling data writing logic""" from abc import ABC, abstractmethod -from io import BytesIO -from typing import Any -import polars as pl from botocore.client import BaseClient from mypy_boto3_s3.client import S3Client @@ -28,6 +25,8 @@ def __init__(self, s3_client: S3Client): Args: s3_client: The s3 client used to retrieve data from + Raises: + TypeError """ if not isinstance(s3_client, BaseClient): @@ -35,31 +34,6 @@ def __init__(self, s3_client: S3Client): self.s3_client = s3_client - @staticmethod - def _get_bytes(obj: Any) -> BytesIO: - """Converts an object to bytes - - Args: - obj: The object to convert. - Returns: - bytes representation of the object. - """ - - buffer = BytesIO() - - if hasattr(obj, "to_bytes") and callable(obj.to_bytes): - buffer.write(obj.to_bytes()) - elif isinstance(obj, str): - buffer.write(str.encode(obj)) - elif isinstance(obj, pl.dataframe.DataFrame): - obj.write_parquet(buffer) - else: - raise TypeError(f"Bytes conversion not supported for type: '{type(obj)}'") - - buffer.seek(0) - - return buffer - def write(self, bucket_name: str, key: str, body: bytes) -> None: """Uploads an object to an S3 bucket. @@ -76,6 +50,6 @@ def write(self, bucket_name: str, key: str, body: bytes) -> None: RuntimeError, ClientError """ if not isinstance(body, bytes): - body = self._get_bytes(body) + raise TypeError(f"'body' must be 'bytes', not '{type(body)}") self.s3_client.put_object(Bucket=bucket_name, Key=key, Body=body) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/io/__init__.py b/tests/io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/io/base_test_case.py b/tests/io/base_test_case.py new file mode 100644 index 0000000..2245391 --- /dev/null +++ b/tests/io/base_test_case.py @@ -0,0 +1,87 @@ +import io +import unittest +from datetime import datetime, timedelta +from unittest.mock import patch + +import boto3 +import polars as pl + + +class BaseTestCase(unittest.TestCase): + """ Unit testing of DuckDb requires connecting to the localstack server, rather than using Moto. + Whatever DuckDb does to connect to S3 is not handled within the Moto mocking setup, so you get the error: + "duckdb.duckdb.IOException: IO Error: Connection error for HTTP HEAD to http://localhost:5000/test-bucket..." + """ + @classmethod + def setUpClass(cls): + cls.s3_client = boto3.client("s3", endpoint_url="http://localhost:4566", region_name="eu-west-2") + + # Define some bucket names + cls.bucket_name = "test-bucket" + cls.empty_bucket_name = "empty-bucket" + + # First, clear any hangovers from previous tests that may not have cleared up properly + cls.clear_buckets() + + # Create a test bucket to store parquet files + cls.s3_client.create_bucket(Bucket=cls.bucket_name, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}) + + # Create a test empty bucket + cls.s3_client.create_bucket(Bucket=cls.empty_bucket_name, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}) + + # Add test Parquet files to the bucket + cls.create_and_upload_test_data() + + # Add some corrupt data for testing + cls.s3_client.put_object(Bucket=cls.bucket_name, Key="corrupted.parquet", Body=b"corrupted data") + + @classmethod + def tearDownClass(cls): + # Clear the buckets + cls.clear_buckets() + + @classmethod + def clear_buckets(cls): + # Clear the buckets from the localstack instance + buckets = (cls.bucket_name, cls.empty_bucket_name) + for bucket_name in buckets: + try: + # Delete all objects in the bucket + bucket = cls.s3_client.list_objects(Bucket=bucket_name) + if 'Contents' in bucket: + for obj in bucket['Contents']: + cls.s3_client.delete_object(Bucket=bucket_name, Key=obj['Key']) + # Delete the bucket + cls.s3_client.delete_bucket(Bucket=bucket_name) + except cls.s3_client.exceptions.NoSuchBucket: + pass + + @classmethod + def create_and_upload_test_data(cls): + # Create sample data that we have more control over for doing specific tests + start_date = datetime(2024, 1, 1) + end_date = datetime(2024, 1, 10) + current_date = start_date + + while current_date <= end_date: + # Create hourly data for the current date + data = { + 'time': [current_date + timedelta(hours=i) for i in range(24)] * 2, + 'SITE_ID': ['site1'] * 24 + ['site2'] * 24, + 'col1': list(range(48)), + 'col2': list(range(48, 96)) + } + df = pl.DataFrame(data) + + # Convert DataFrame to Parquet + parquet_buffer = io.BytesIO() + df.write_parquet(parquet_buffer) + parquet_buffer.seek(0) + + # Upload Parquet file to S3 bucket with date-stamped filename + file_key = f"TEST_CATEGORY/{current_date.strftime('%Y-%m')}/{current_date.strftime('%Y-%m-%d')}.parquet" + cls.s3_client.put_object(Bucket=cls.bucket_name, Key=file_key, Body=parquet_buffer.getvalue()) + + current_date += timedelta(days=1) diff --git a/tests/io/test_writers.py b/tests/io/test_writers.py index e69de29..fa1ae6b 100644 --- a/tests/io/test_writers.py +++ b/tests/io/test_writers.py @@ -0,0 +1,52 @@ +import unittest +from unittest.mock import MagicMock +from driutils.io.write import S3Writer +import boto3 +from botocore.client import BaseClient +from mypy_boto3_s3.client import S3Client +from parameterized import parameterized + +class TestS3Writer(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + + cls.s3_client: S3Client = boto3.client("s3", endpoint_url="http://localhost:4566") #type: ignore + + def test_s3_client_type(self): + """Returns an object if s3_client is of type `boto3.client.s3`, otherwise + raises an error""" + + # Happy path + writer = S3Writer(self.s3_client) + + self.assertIsInstance(writer.s3_client, BaseClient) + + # Bad path + + with self.assertRaises(TypeError): + S3Writer("not an s3 client") #type: ignore + + + @parameterized.expand([1, "body", 1.123, {"key": b"bytes"}]) + def test_error_raises_if_write_without_bytes(self, body): + """Tests that a type error is raised if the wrong type body used""" + + writer = S3Writer(self.s3_client) + writer.s3_client = MagicMock() + with self.assertRaises(TypeError): + writer.write("bucket", "key", body) + + writer.s3_client.put_object.assert_not_called() + + def test_write_called(self): + """Tests that the writer can be executed""" + + body = b"Test data" + + writer = S3Writer(self.s3_client) + writer.s3_client = MagicMock() + writer.write("bucket", "key", body) + + writer.s3_client.put_object.assert_called_once_with(Bucket="bucket", Key="key", Body=body) + \ No newline at end of file From f03b9a1f3af09a538937ca3c31d91fb44b35a6ac Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Fri, 13 Sep 2024 17:59:35 +0100 Subject: [PATCH 08/13] Layed out tests for readers --- src/driutils/io/read.py | 2 +- tests/io/test_readers.py | 60 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/driutils/io/read.py b/src/driutils/io/read.py index 2fc8b45..5223fd2 100644 --- a/src/driutils/io/read.py +++ b/src/driutils/io/read.py @@ -36,7 +36,7 @@ class DuckDBReader(ReaderInterface): _connection: DuckDBPyConnection """A connection to DuckDB""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self) -> None: self._connection = duckdb.connect() def read(self, query: str, params: Optional[List] = None) -> DuckDBPyConnection: diff --git a/tests/io/test_readers.py b/tests/io/test_readers.py index e69de29..f14e6f0 100644 --- a/tests/io/test_readers.py +++ b/tests/io/test_readers.py @@ -0,0 +1,60 @@ +import unittest +from unittest.mock import patch, MagicMock +from driutils.io.read import DuckDBFileReader +from duckdb import DuckDBPyConnection + +class TestDuckDBFileReader(unittest.TestCase): + + def test_initialization(self): + """Test that the class can be initialized""" + reader = DuckDBFileReader() + + self.assertIsInstance(reader._connection, DuckDBPyConnection) + + def test_context_manager_is_function(self): + """Should be able to use context manager to auto-close file connection""" + + mock = MagicMock() + + with DuckDBFileReader() as con: + self.assertIsInstance(con._connection, DuckDBPyConnection) + + con._connection = mock + + self.assertTrue(mock.close.called) + + def test_connection_closed_on_delete(self): + """Tests that duckdb connection is closed when object is deleted""" + assert False + + def test_close_method_closes_connection(self): + """Tests that the .close() method closes the connection""" + assert False + def test_read_executes_query(self): + """Tests that the .read() method executes a query""" + assert False + +class TestDuckDBS3Reader(unittest.TestCase): + + def test_value_error_if_invalid_auth_option(self): + """Test that a ValueError is raised if a bad auth option is selected""" + assert False + + def test_init_auto_authentication(self): + """Tests that the reader can use the 'auto' auth option""" + assert False + + def test_init_sts_authentication(self): + """Tests that the reader can use the 'sts' auth option""" + assert False + + def test_init_custom_endpoint_authentication_https(self): + """Tests that the reader can authenticate to a custom endpoint + with https protocol""" + assert False + + def test_init_custom_endpoint_authentication_http(self): + """Tests that the reader can authenticate to a custom endpoint + with http protocol""" + assert False + \ No newline at end of file From fc6deef0b891464f60021f3c6999ce52bc58b6b8 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Mon, 16 Sep 2024 12:00:37 +0100 Subject: [PATCH 09/13] Finished testing readers --- src/driutils/io/read.py | 44 +++++++++++-------- tests/io/test_readers.py | 94 +++++++++++++++++++++++++++++----------- 2 files changed, 95 insertions(+), 43 deletions(-) diff --git a/src/driutils/io/read.py b/src/driutils/io/read.py index 5223fd2..84f515b 100644 --- a/src/driutils/io/read.py +++ b/src/driutils/io/read.py @@ -19,10 +19,14 @@ def __enter__(self) -> Self: def __exit__(self, *args) -> None: """Closes the connection when exiting the context""" - self._connection.close() + self.close() def __del__(self): """Closes the connection when deleted""" + self.close() + + def close(self) -> None: + """Closes the connection""" self._connection.close() @abstractmethod @@ -49,10 +53,6 @@ def read(self, query: str, params: Optional[List] = None) -> DuckDBPyConnection: return self._connection.execute(query, params) - def close(self) -> None: - """Close the connection""" - self._connection.close() - class DuckDBS3Reader(DuckDBReader): """Concrete Implementation of a DuckDB reader for reading @@ -70,36 +70,46 @@ def __init__(self, auth_type: str, endpoint_url: Optional[str] = None, use_ssl: super().__init__() - auth_type = auth_type.lower() + auth_type = str(auth_type).lower() VALID_AUTH_METHODS = ["auto", "sts", "custom_endpoint"] if auth_type not in VALID_AUTH_METHODS: raise ValueError(f"Invalid `auth_type`, must be one of {VALID_AUTH_METHODS}") + self._connection.install_extension("httpfs") + self._connection.load_extension("httpfs") self._connection.execute(""" - INSTALL httpfs; - LOAD httpfs; SET force_download = true; SET http_keep_alive = false; """) - if auth_type == "auto": + self._authenticate(auth_type, endpoint_url, use_ssl) + + def _authenticate(self, method: str, endpoint_url: Optional[str] = None, use_ssl: Optional[bool] = None) -> None: + """Handles authentication selection + + Args: + method: method of authentication used + endpoint_url: Custom s3 endpoint + use_ssl: Flag for using ssl (https connections) + """ + if method == "auto": self._auto_auth() - elif auth_type == "sts": + elif method == "sts": self._sts_auth() - elif auth_type == "custom_endpoint": - if not isinstance(endpoint_url, str): - endpoint_url = str(endpoint_url) + elif method == "custom_endpoint": + if not endpoint_url: + raise ValueError("`endpoint_url` must be provided for `custom_endpoint` authentication") self._custom_endpoint_auth(endpoint_url, use_ssl) def _auto_auth(self) -> None: """Automatically authenticates using environment variables""" + self._connection.install_extension("aws") + self._connection.load_extension("aws") self._connection.execute(""" - INSTALL aws; - LOAD aws; CREATE SECRET ( TYPE S3, PROVIDER CREDENTIAL_CHAIN @@ -108,10 +118,10 @@ def _auto_auth(self) -> None: def _sts_auth(self) -> None: """Authenicates using assumed roles on AWS""" + self._connection.install_extension("aws") + self._connection.load_extension("aws") self._connection.execute(""" - INSTALL aws; - LOAD aws; CREATE SECRET ( TYPE S3, PROVIDER CREDENTIAL_CHAIN, diff --git a/tests/io/test_readers.py b/tests/io/test_readers.py index f14e6f0..ce7801b 100644 --- a/tests/io/test_readers.py +++ b/tests/io/test_readers.py @@ -1,7 +1,8 @@ import unittest from unittest.mock import patch, MagicMock -from driutils.io.read import DuckDBFileReader +from driutils.io.read import DuckDBFileReader, DuckDBS3Reader from duckdb import DuckDBPyConnection +from parameterized import parameterized class TestDuckDBFileReader(unittest.TestCase): @@ -11,50 +12,91 @@ def test_initialization(self): self.assertIsInstance(reader._connection, DuckDBPyConnection) - def test_context_manager_is_function(self): + @patch("driutils.io.read.DuckDBFileReader.close") + def test_context_manager_is_functional(self, mock): """Should be able to use context manager to auto-close file connection""" - mock = MagicMock() - with DuckDBFileReader() as con: self.assertIsInstance(con._connection, DuckDBPyConnection) - con._connection = mock - - self.assertTrue(mock.close.called) + mock.assert_called_once() - def test_connection_closed_on_delete(self): + @patch("driutils.io.read.DuckDBFileReader.close") + def test_connection_closed_on_delete(self, mock): """Tests that duckdb connection is closed when object is deleted""" - assert False + + reader = DuckDBFileReader() + del reader + mock.assert_called_once() def test_close_method_closes_connection(self): """Tests that the .close() method closes the connection""" - assert False + + reader = DuckDBFileReader() + reader._connection = MagicMock() + + reader.close() + + reader._connection.close.assert_called() + def test_read_executes_query(self): """Tests that the .read() method executes a query""" - assert False + + reader = DuckDBFileReader() + + reader._connection = MagicMock() + + query = "read this plz" + params = ["param1", "param2"] + + reader.read(query, params) + + reader._connection.execute.assert_called_once_with(query, params) class TestDuckDBS3Reader(unittest.TestCase): - def test_value_error_if_invalid_auth_option(self): + @parameterized.expand(["a", 1, "cutom_endpoint"]) + def test_value_error_if_invalid_auth_option(self, value): """Test that a ValueError is raised if a bad auth option is selected""" - assert False + + with self.assertRaises(ValueError): + DuckDBS3Reader(value) - def test_init_auto_authentication(self): + @parameterized.expand(["auto", "AUTO", "aUtO"]) + @patch("driutils.io.read.DuckDBS3Reader._authenticate") + def test_upper_or_lowercase_option_accepted(self, value, mock): + """Tests that the auth options can be provided in any case""" + DuckDBS3Reader(value) + + mock.assert_called_once() + + @patch.object(DuckDBS3Reader, "_auto_auth", side_effect=DuckDBS3Reader._auto_auth, autospec=True) + def test_init_auto_authentication(self, mock): """Tests that the reader can use the 'auto' auth option""" - assert False - def test_init_sts_authentication(self): - """Tests that the reader can use the 'sts' auth option""" - assert False + DuckDBS3Reader("auto") + mock.assert_called_once() - def test_init_custom_endpoint_authentication_https(self): + @patch.object(DuckDBS3Reader, "_sts_auth", side_effect=DuckDBS3Reader._sts_auth, autospec=True) + def test_init_sts_authentication(self, mock): + """Tests that the reader can use the 'sts' auth option""" + DuckDBS3Reader("sts") + mock.assert_called_once() + + @parameterized.expand([ + ["https://s3-a-real-endpoint", True], + ["http://localhost:8080", False] + ]) + @patch.object(DuckDBS3Reader, "_custom_endpoint_auth", wraps=DuckDBS3Reader._custom_endpoint_auth, autospec=True) + def test_init_custom_endpoint_authentication_https(self, url, ssl, mock): """Tests that the reader can authenticate to a custom endpoint with https protocol""" - assert False - - def test_init_custom_endpoint_authentication_http(self): - """Tests that the reader can authenticate to a custom endpoint - with http protocol""" - assert False - \ No newline at end of file + reader = DuckDBS3Reader("custom_endpoint", url, ssl) + mock.assert_called_once_with(reader, url, ssl) + + def test_error_if_custom_endpoint_not_provided(self): + """Test that an error is raised if custom_endpoint authentication used but + endpoint_url_not_given""" + + with self.assertRaises(ValueError): + DuckDBS3Reader("custom_endpoint") \ No newline at end of file From ecf62b8c1857459453889a42004220826cdf9783 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Mon, 16 Sep 2024 12:21:44 +0100 Subject: [PATCH 10/13] Updated README --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++ src/driutils/datetime.py | 2 +- tests/test_datetime.py | 14 +++++------ 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index dbf588a..e808c44 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,37 @@ The docs, tests, and linter packages can be installed together with: pip install -e .[dev] ``` +#### Other Optional Packages + +Some utilities need additional packages that aren't relevant to all projects. To install everything, run: + +``` +pip install -e .[all] +``` + +or to include datetime utilities: + +``` +pip install -e .[datetime] +``` + +#### A Note on Remote Installs + +You are likely including this on another project, in this case you should include the git url when installing. For manual installs: +``` +pip install "dri-utils[all] @ git+https://github.com/NERC-CEH/dri-utils.git" + +``` + +or if including it in your dependencies +``` +dependencies = [ + "another-package", + ... + "dri-utils[all] @ git+https://github.com/NERC-CEH/dri-utils.git" + ] +``` + ## Readers ### DuckDB Reader @@ -137,3 +168,23 @@ writer.write( key="path/to/upload/destination", body=content ) +``` + +## Logging + +There is a logging module here that defines the base logging format used for all projects, to use it add: + +```python + +from driutils import logger + +logger.setup_logging() +``` + +## Datetime Utilities + +The module `driutils.datetime` contains common utilities for working with dates and times in Python. The methods within are currently simple validation methods. Some of the methods require additional packages that are not needed for all projects, so ensure that the package is installed as `pip install .[datetime]` or `pip install .[all]` + +## General Utilities + +The module `driutils.utils` contains utility methods that didn't fit anywhere else and includes things such as ensuring that a list is always returned and removing protocols from URLs. \ No newline at end of file diff --git a/src/driutils/datetime.py b/src/driutils/datetime.py index a17c4da..5485931 100644 --- a/src/driutils/datetime.py +++ b/src/driutils/datetime.py @@ -29,7 +29,7 @@ def validate_iso8601_duration(duration: str) -> bool: return False -def steralize_dates( +def steralize_date_range( start_date: Union[date, datetime], end_date: Optional[Union[date, datetime]] = None ) -> Tuple[Union[date, datetime], datetime]: """ diff --git a/tests/test_datetime.py b/tests/test_datetime.py index fb023e2..cdb9637 100644 --- a/tests/test_datetime.py +++ b/tests/test_datetime.py @@ -2,7 +2,7 @@ from unittest.mock import patch from datetime import date, datetime -from driutils.datetime import steralize_dates, validate_iso8601_duration +from driutils.datetime import steralize_date_range, validate_iso8601_duration class TestValidateISO8601Duration(unittest.TestCase): @patch("builtins.__import__") @@ -62,7 +62,7 @@ def test_start_date_only(self): start = date(2023, 8, 1) expected_start = datetime.combine(start, datetime.min.time()) expected_end = datetime.combine(start, datetime.max.time()) - result = steralize_dates(start) + result = steralize_date_range(start) self.assertEqual(result, (expected_start, expected_end)) def test_start_date_and_end_date_as_dates(self): @@ -72,7 +72,7 @@ def test_start_date_and_end_date_as_dates(self): end = date(2023, 8, 10) expected_start = datetime.combine(start, datetime.min.time()) expected_end = datetime.combine(end, datetime.max.time()) - result = steralize_dates(start, end) + result = steralize_date_range(start, end) self.assertEqual(result, (expected_start, expected_end)) def test_start_date_after_end_date_error(self): @@ -81,7 +81,7 @@ def test_start_date_after_end_date_error(self): start = date(2023, 8, 10) end = date(2023, 8, 1) with self.assertRaises(UserWarning): - steralize_dates(start, end) + steralize_date_range(start, end) def test_start_date_equals_end_date(self): """Test with start_date equal to end_date that datetimes of start and end of that date are returned. @@ -90,14 +90,14 @@ def test_start_date_equals_end_date(self): end = date(2023, 8, 1) expected_start = datetime.combine(start, datetime.min.time()) expected_end = datetime.combine(end, datetime.max.time()) - result = steralize_dates(start, end) + result = steralize_date_range(start, end) self.assertEqual(result, (expected_start, expected_end)) def test_datetime_input(self): """Test with datetime inputs for both start_date and end_date.""" start = datetime(2023, 8, 1, 12, 0) end = datetime(2023, 8, 10, 18, 0) - result = steralize_dates(start, end) + result = steralize_date_range(start, end) self.assertEqual(result, (start, end)) def test_mixed_date_and_datetime(self): @@ -105,5 +105,5 @@ def test_mixed_date_and_datetime(self): start = date(2023, 8, 1) end = datetime(2023, 8, 10, 18, 0) expected_start = datetime.combine(start, datetime.min.time()) - result = steralize_dates(expected_start, end) + result = steralize_date_range(expected_start, end) self.assertEqual(result, (expected_start, end)) \ No newline at end of file From 88792ed2d0f5019a48c7233b72ea1ca8a67c6ad7 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Mon, 16 Sep 2024 14:26:05 +0100 Subject: [PATCH 11/13] Added optional dependencies --- .github/workflows/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml index 51fe8fe..74f293d 100644 --- a/.github/workflows/pipeline.yml +++ b/.github/workflows/pipeline.yml @@ -16,6 +16,7 @@ jobs: uses: NERC-CEH/dri-cicd/.github/workflows/build-test-deploy-docker.yml@main with: package_name: driutils + optional_dependencies: "[lint, test, all]" secrets: AWS_REGION: ${{ secrets.AWS_REGION }} AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} From 5ffa5bdd8b8ec90e29e8073b3256f91ac1302080 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Mon, 16 Sep 2024 14:34:24 +0100 Subject: [PATCH 12/13] Put deps on right job --- .github/workflows/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml index 74f293d..e95d13d 100644 --- a/.github/workflows/pipeline.yml +++ b/.github/workflows/pipeline.yml @@ -10,13 +10,14 @@ permissions: jobs: test-python: uses: NERC-CEH/dri-cicd/.github/workflows/test-python.yml@main + with: + optional_dependencies: "[lint, test, all]" build-test-deploy-docker: needs: [test-python] uses: NERC-CEH/dri-cicd/.github/workflows/build-test-deploy-docker.yml@main with: package_name: driutils - optional_dependencies: "[lint, test, all]" secrets: AWS_REGION: ${{ secrets.AWS_REGION }} AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} From c61b6d4a54763d1dfed23e98d8a687b846cf6c49 Mon Sep 17 00:00:00 2001 From: Lewis Chambers Date: Mon, 16 Sep 2024 14:35:47 +0100 Subject: [PATCH 13/13] No spaces --- .github/workflows/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml index e95d13d..4eab4c6 100644 --- a/.github/workflows/pipeline.yml +++ b/.github/workflows/pipeline.yml @@ -11,7 +11,7 @@ jobs: test-python: uses: NERC-CEH/dri-cicd/.github/workflows/test-python.yml@main with: - optional_dependencies: "[lint, test, all]" + optional_dependencies: "[lint,test,all]" build-test-deploy-docker: needs: [test-python]