From b445c32933236588359a45e6982c75c94f540d93 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 14:19:32 +0100 Subject: [PATCH 01/11] feat: Handle NotFound error deleting old resource --- ckanext/switzerland/harvester/base_sbb_harvester.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index e36100c8..f1a52167 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -947,7 +947,17 @@ def _import_stage(self, harvest_object): # noqa except NotFound: pass # Sometimes importing the data into the datastore fails - get_action("resource_delete")(context, {"id": old_resource_id}) + try: + get_action("resource_delete")(context, {"id": old_resource_id}) + except NotFound: + self._save_object_error( + f"Error deleting old resource {old_resource_id} for " + f"filename {file_name}: the resource was not found. " + f"This could be due to a failed connection to the database. " + f"{traceback.format_exc()}", + harvest_object, + stage, + ) Session.commit() From be986f510b39144e989e7de1e20dfffbb35a7cf9 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 14:47:08 +0100 Subject: [PATCH 02/11] feat: Log error when reordering resources fails --- .../switzerland/harvester/base_sbb_harvester.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index f1a52167..da97c88f 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -1041,7 +1041,21 @@ def finalize(self, harvest_object, harvest_object_data): return True - ordered_resources, unmatched_resources = self._get_ordered_resources(package) + try: + ordered_resources, unmatched_resources = self._get_ordered_resources( + package + ) + except NotFound: + self._save_object_error( + f"Error reordering resources for dataset " + f"{harvest_object_data['dataset']}. " + f"This could be due to a failed connection to the database. " + f"{traceback.format_exc()}", + harvest_object, + stage, + ) + + return False # ---------------------------------------------------------------------------- # delete old resources From 4f454faa225e8b11a5ebdc86d9adf588aba52573 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 14:53:16 +0100 Subject: [PATCH 03/11] fix: Record an error if finalizing job errored --- ckanext/switzerland/harvester/base_sbb_harvester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index da97c88f..7eaa7fd8 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -1039,7 +1039,7 @@ def finalize(self, harvest_object, harvest_object_data): log.exception(message) self._save_object_error(message, harvest_object, "Import") - return True + return False try: ordered_resources, unmatched_resources = self._get_ordered_resources( From ba868ff0db50f712e20d381eb672cf5395439594 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 17:02:34 +0100 Subject: [PATCH 04/11] feat: Use fresh context to delete each resource --- .../switzerland/harvester/base_sbb_harvester.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index 7eaa7fd8..356bc264 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -1006,7 +1006,8 @@ def _get_ordered_resources(self, package): return ordered_resources, unmatched_resources def finalize(self, harvest_object, harvest_object_data): - context = {"model": model, "session": Session, "user": self._get_user_name()} + user_name = self._get_user_name() + context = {"model": model, "session": Session, "user": user_name} stage = "Import" log.info("Running finalizing tasks:") @@ -1072,8 +1073,17 @@ def finalize(self, harvest_object, harvest_object_data): for resource in ordered_resources[max_resources:]: try: + # We need a new context each time, otherwise, if there is an + # exception deleting the resource, there will be auth data left in + # the context that won't get deleted. Then all subsequent calls to + # resource_delete will seem unauthorized and fail. + delete_context = { + "model": model, + "session": Session, + "user": user_name, + } self._delete_version( - context, package, resource_filename(resource["url"]) + delete_context, package, resource_filename(resource["url"]) ) except Exception as e: self._save_object_error( From e0603669f4729b9b083277005531ecad025a74bc Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 19:07:58 +0100 Subject: [PATCH 05/11] feat: Delete extra resources by id, not filename If we get into a state where we have duplicate resources for the same filename (e.g. because we created a new resource for the filename but deleting the old resource failed), we must not delete resources by filename in the finalize step! If we do, we will delete both the new and the old resource for the filename. This method is already a lot simpler than it used to be, since CKAN no longer stores lots of old versions of resources in the database, so it's safe to change it to delete resources by id. --- .../harvester/base_sbb_harvester.py | 48 +++++++++---------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index 356bc264..cc93d38a 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -1073,7 +1073,7 @@ def finalize(self, harvest_object, harvest_object_data): for resource in ordered_resources[max_resources:]: try: - # We need a new context each time, otherwise, if there is an + # We need a new context each time: otherwise, if there is an # exception deleting the resource, there will be auth data left in # the context that won't get deleted. Then all subsequent calls to # resource_delete will seem unauthorized and fail. @@ -1082,9 +1082,7 @@ def finalize(self, harvest_object, harvest_object_data): "session": Session, "user": user_name, } - self._delete_version( - delete_context, package, resource_filename(resource["url"]) - ) + self._fully_delete_resource(delete_context, resource) except Exception as e: self._save_object_error( "Error deleting resource {} in finalizing tasks: {}".format( @@ -1148,27 +1146,25 @@ def finalize(self, harvest_object, harvest_object_data): search.rebuild(package["id"]) - def _delete_version(self, context, package, filename): - """Fully delete the resource with the given filename""" - for resource in package["resources"]: - if resource_filename(resource["url"]) == filename: - log.debug( - "Deleting resource {} with filename {}".format( - resource["id"], filename - ) - ) - # delete the file from the filestore - path = uploader.ResourceUpload(resource).get_path(resource["id"]) - if os.path.exists(path): - os.remove(path) + def _fully_delete_resource(self, context, resource): + """Fully delete a resource and its file.""" + log.debug( + "Deleting resource {} with filename {}".format( + resource["id"], resource["url"] + ) + ) + # delete the file from the filestore + path = uploader.ResourceUpload(resource).get_path(resource["id"]) + if os.path.exists(path): + os.remove(path) - # delete the datastore table - try: - get_action("datastore_delete")( - context, {"resource_id": resource["id"], "force": True} - ) - except NotFound: - pass # Sometimes importing the data into the datastore fails + # delete the datastore table + try: + get_action("datastore_delete")( + context, {"resource_id": resource["id"], "force": True} + ) + except NotFound: + pass # Sometimes importing the data into the datastore fails - # delete the resource itself - get_action("resource_delete")(context, {"id": resource["id"]}) + # delete the resource itself + get_action("resource_delete")(context, {"id": resource["id"]}) From 495939a76befb9f543cc63218b808e901c93a835 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 19:12:47 +0100 Subject: [PATCH 06/11] refactor: Use method instead of duplicating logic This will also clean up the filestore of old versions of files that are otherwise not cleaned up. --- .../switzerland/harvester/base_sbb_harvester.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index cc93d38a..cd599d24 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -939,21 +939,13 @@ def _import_stage(self, harvest_object): # noqa if old_resource_id: log.info("Deleting old resource: %s", old_resource_id) - # delete the datastore table try: - get_action("datastore_delete")( - context, {"resource_id": old_resource_id, "force": True} - ) - except NotFound: - pass # Sometimes importing the data into the datastore fails - - try: - get_action("resource_delete")(context, {"id": old_resource_id}) + self._fully_delete_resource(context, old_resource_meta) except NotFound: self._save_object_error( f"Error deleting old resource {old_resource_id} for " - f"filename {file_name}: the resource was not found. " - f"This could be due to a failed connection to the database. " + f"filename {file_name}. This could be due to a failed " + f"connection to the database. " f"{traceback.format_exc()}", harvest_object, stage, From ac41a385ddee4c9d63d806d7e74c6796d7f70242 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 19:15:27 +0100 Subject: [PATCH 07/11] feat: Catch ValidationError when reordering resources --- .../harvester/base_sbb_harvester.py | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index cd599d24..58756ab6 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -1104,15 +1104,27 @@ def finalize(self, harvest_object, harvest_object_data): }, ) - # reorder resources - # not matched resources come first in the list, then the ordered - get_action("package_resource_reorder")( - context, - { - "id": package["id"], - "order": [r["id"] for r in unmatched_resources + ordered_resources], - }, - ) + try: + # reorder resources + # not matched resources come first in the list, then the ordered + get_action("package_resource_reorder")( + context, + { + "id": package["id"], + "order": [r["id"] for r in unmatched_resources + ordered_resources], + }, + ) + except ValidationError: + self._save_object_error( + f"Error reordering resources for dataset " + f"{harvest_object_data['dataset']}. " + f"This could be due to a failed connection to the database. " + f"{traceback.format_exc()}", + harvest_object, + stage, + ) + + return False from ckanext.harvest.model import harvest_object_table From 964bc35a6dcbe400a1615b3b6c6e2b70b58337de Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 19:18:01 +0100 Subject: [PATCH 08/11] style: Remove unused import --- ckanext/switzerland/harvester/base_sbb_harvester.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index 58756ab6..4b96a6ee 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -38,7 +38,6 @@ from ckanext.harvest.harvesters.base import HarvesterBase from ckanext.switzerland.harvester.storage_adapter_factory import StorageAdapterFactory -from ckanext.switzerland.helpers import resource_filename log = logging.getLogger(__name__) From 86f281ff93317b529ec77f5f7e50d7c223b76903 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 2 Dec 2024 19:22:56 +0100 Subject: [PATCH 09/11] style: Ignore complexity of some methods --- ckanext/switzerland/harvester/base_sbb_harvester.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index 4b96a6ee..f50a9c2a 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -444,7 +444,7 @@ def fetch_stage(self, harvest_object): ) return False - def _fetch_stage(self, harvest_object): # noqa + def _fetch_stage(self, harvest_object): # noqa: C901 """ Fetching of resources. Runs once for each gathered resource. @@ -602,7 +602,7 @@ def import_stage(self, harvest_object): ) return False - def _import_stage(self, harvest_object): # noqa + def _import_stage(self, harvest_object): # noqa: C901 """ Importing the fetched files into CKAN storage. Runs once for each fetched resource. @@ -996,7 +996,8 @@ def _get_ordered_resources(self, package): return ordered_resources, unmatched_resources - def finalize(self, harvest_object, harvest_object_data): + def finalize(self, harvest_object, harvest_object_data): # noqa: C901 + # TODO: Simplify this method. user_name = self._get_user_name() context = {"model": model, "session": Session, "user": user_name} stage = "Import" From 0b6203ac5650e3541b318969e5c46103ce6aa5cb Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Tue, 3 Dec 2024 17:27:57 +0100 Subject: [PATCH 10/11] tests: Pin ckanext-harvest to non-breaking version v1.6.0 included changes that required updates on our side, which were made in the migration branch of ckanext-switzerland. No point adding the same changes here as the migration will soon be finished and the migration branch will be merged into main. --- bin/install_test_requirements.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/install_test_requirements.sh b/bin/install_test_requirements.sh index 9b9d5144..b159b6f0 100755 --- a/bin/install_test_requirements.sh +++ b/bin/install_test_requirements.sh @@ -8,8 +8,8 @@ pip install -e /__w/ckanext-switzerland/ckanext-switzerland/ # Install ckanext dependencies pip install -e git+https://github.com/ckan/ckanext-dcat.git@v1.5.1#egg=ckanext-dcat pip install -r https://raw.githubusercontent.com/ckan/ckanext-dcat/v1.5.1/requirements.txt -pip install -e git+https://gitlab.liip.ch/odp_oev_schweiz/ckanext-harvest.git#egg=ckanext-harvest -pip install -r https://gitlab.liip.ch/odp_oev_schweiz/ckanext-harvest/-/raw/main/requirements.txt +pip install -e git+https://gitlab.liip.ch/odp_oev_schweiz/ckanext-harvest.git@v1.5.6#egg=ckanext-harvest +pip install -r https://gitlab.liip.ch/odp_oev_schweiz/ckanext-harvest/v1.5.6/requirements.txt pip install -e git+https://github.com/ckan/ckanext-scheming.git@release-3.0.0#egg=ckanext-scheming pip install -e git+https://github.com/ckan/ckanext-fluent.git#egg=ckanext-fluent pip install -r https://raw.githubusercontent.com/ckan/ckanext-fluent/master/requirements.txt From 74c191251515c678f83c96ddee2cc3b305fe1cf5 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Tue, 3 Dec 2024 17:44:09 +0100 Subject: [PATCH 11/11] tests: Fix urls for ckanext-harvest I forgot that we are using our own fork here instead of the official ckan one. --- bin/install_test_requirements.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/install_test_requirements.sh b/bin/install_test_requirements.sh index b159b6f0..57a92528 100755 --- a/bin/install_test_requirements.sh +++ b/bin/install_test_requirements.sh @@ -8,8 +8,8 @@ pip install -e /__w/ckanext-switzerland/ckanext-switzerland/ # Install ckanext dependencies pip install -e git+https://github.com/ckan/ckanext-dcat.git@v1.5.1#egg=ckanext-dcat pip install -r https://raw.githubusercontent.com/ckan/ckanext-dcat/v1.5.1/requirements.txt -pip install -e git+https://gitlab.liip.ch/odp_oev_schweiz/ckanext-harvest.git@v1.5.6#egg=ckanext-harvest -pip install -r https://gitlab.liip.ch/odp_oev_schweiz/ckanext-harvest/v1.5.6/requirements.txt +pip install -e git+https://gitlab.liip.ch/odp_oev_schweiz/ckanext-harvest.git@v1.0.2#egg=ckanext-harvest +pip install -r https://gitlab.liip.ch/odp_oev_schweiz/ckanext-harvest/-/raw/v1.0.2/requirements.txt pip install -e git+https://github.com/ckan/ckanext-scheming.git@release-3.0.0#egg=ckanext-scheming pip install -e git+https://github.com/ckan/ckanext-fluent.git#egg=ckanext-fluent pip install -r https://raw.githubusercontent.com/ckan/ckanext-fluent/master/requirements.txt