diff --git a/poetry.lock b/poetry.lock index d3a5121e..4f416228 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1011,7 +1011,7 @@ elasticsearch7 = ["invenio-search[elasticsearch7] (>=1.4.1,<1.5.0)"] files = ["invenio-files-rest (>=1.2.0,<1.3.0)", "invenio-iiif (>=1.1.0,<1.2.0)", "invenio-previewer (>=1.3.2,<1.4.0)", "invenio-records-files (>=1.2.1,<1.3.0)"] metadata = ["invenio-indexer (>=1.2.0,<1.3.0)", "invenio-jsonschemas (>=1.1.1,<1.2.0)", "invenio-oaiserver (>=1.2.0,<1.3.0)", "invenio-pidstore (>=1.2.1,<1.3.0)", "invenio-records-rest (>=1.8.0,<1.9.0)", "invenio-records-ui (>=1.2.0,<1.3.0)", "invenio-records (>=1.4.0,<1.6.0)", "invenio-search-ui (>=2.0.0,<2.1.0)"] mysql = ["invenio-db[mysql,versioning] (>=1.0.8,<1.1.0)"] -postgresql = ["invenio-db[versioning,postgresql] (>=1.0.8,<1.1.0)"] +postgresql = ["invenio-db[postgresql,versioning] (>=1.0.8,<1.1.0)"] sqlite = ["invenio-db[versioning] (>=1.0.8,<1.1.0)"] tests = ["pytest-invenio (>=1.4.0,<1.5.0)"] @@ -1033,7 +1033,7 @@ invenio-i18n = ">=1.2.0" all = ["Sphinx (>=3)", "cachelib (>=0.1)", "pytest-invenio (>=1.4.1)", "redis (>=2.10.5)"] docs = ["Sphinx (>=3)"] mysql = ["invenio-db[mysql,versioning] (>=1.0.8)"] -postgresql = ["invenio-db[versioning,postgresql] (>=1.0.8)"] +postgresql = ["invenio-db[postgresql,versioning] (>=1.0.8)"] sqlite = ["invenio-db[versioning,sqlite] (>=1.0.8)"] tests = ["cachelib (>=0.1)", "pytest-invenio (>=1.4.1)", "redis (>=2.10.5)"] @@ -1072,7 +1072,7 @@ admin = ["invenio-admin (>=1.2.1)"] all = ["invenio-admin (>=1.2.1)", "Sphinx (==4.2.0)", "pytest-invenio (>=1.4.2)"] docs = ["Sphinx (==4.2.0)"] mysql = ["invenio-db[mysql,versioning] (>=1.0.9)"] -postgresql = ["invenio-db[versioning,postgresql] (>=1.0.9)"] +postgresql = ["invenio-db[postgresql,versioning] (>=1.0.9)"] sqlite = ["invenio-db[versioning] (>=1.0.9)"] tests = ["pytest-invenio (>=1.4.2)"] @@ -1441,7 +1441,7 @@ admin = ["invenio-admin (>=1.2.1)"] all = ["invenio-admin (>=1.2.1)", "Sphinx (>=4.2.0)", "redis (>=2.10.5)", 
"pytest-invenio (>=1.4.0)"] docs = ["Sphinx (>=4.2.0)"] mysql = ["invenio-db[mysql,versioning] (>=1.0.9,<2.0.0)"] -postgresql = ["invenio-db[versioning,postgresql] (>=1.0.9,<2.0.0)"] +postgresql = ["invenio-db[postgresql,versioning] (>=1.0.9,<2.0.0)"] redis = ["redis (>=2.10.5)"] sqlite = ["invenio-db[versioning] (>=1.0.9,<2.0.0)"] tests = ["pytest-invenio (>=1.4.0)"] @@ -1525,7 +1525,7 @@ admin = ["invenio-admin (>=1.2.1)"] all = ["Sphinx (==4.2.0)", "invenio-admin (>=1.2.1)", "pytest-invenio (>=1.4.1)"] docs = ["Sphinx (==4.2.0)"] mysql = ["invenio-db[mysql,versioning] (>=1.0.9,<1.1.0)"] -postgresql = ["invenio-db[versioning,postgresql] (>=1.0.9,<1.1.0)"] +postgresql = ["invenio-db[postgresql,versioning] (>=1.0.9,<1.1.0)"] sqlite = ["invenio-db[versioning] (>=1.0.9,<1.1.0)"] tests = ["pytest-invenio (>=1.4.1)"] @@ -1574,9 +1574,9 @@ invenio-pidstore = ">=1.2.0" invenio-records = ">=1.0.0" [package.extras] -all = ["Sphinx (>=1.5.1)", "invenio-access (>=1.0.0)", "invenio-accounts (>=1.3.0)", "invenio-db[mysql,versioning,postgresql] (>=1.0.0)", "pytest-invenio (>=1.4.0)"] +all = ["Sphinx (>=1.5.1)", "invenio-access (>=1.0.0)", "invenio-accounts (>=1.3.0)", "invenio-db[postgresql,mysql,versioning] (>=1.0.0)", "pytest-invenio (>=1.4.0)"] docs = ["Sphinx (>=1.5.1)"] -tests = ["invenio-access (>=1.0.0)", "invenio-accounts (>=1.3.0)", "invenio-db[mysql,versioning,postgresql] (>=1.0.0)", "pytest-invenio (>=1.4.0)"] +tests = ["invenio-access (>=1.0.0)", "invenio-accounts (>=1.3.0)", "invenio-db[postgresql,mysql,versioning] (>=1.0.0)", "pytest-invenio (>=1.4.0)"] [[package]] name = "invenio-rest" diff --git a/pytest.ini b/pytest.ini index dde067a9..f4a9d5d0 100644 --- a/pytest.ini +++ b/pytest.ini @@ -17,7 +17,7 @@ [pytest] live_server_scope = module -addopts = --pycodestyle --pydocstyle --doctest-glob="*.rst" --doctest-modules --cov=rero_mef --cov-report=term-missing --ignore=setup.py --ignore=docs/conf.py +addopts = --pycodestyle --pydocstyle --doctest-glob="*.rst" 
--doctest-modules --cov=rero_mef --cov-report=term-missing --ignore=setup.py --ignore=docs/conf.py --color=yes testpaths = docs tests rero_mef # not displaying all the PendingDeprecationWarnings from invenio diff --git a/rero_mef/agents/__init__.py b/rero_mef/agents/__init__.py index 108727cd..d00b961e 100644 --- a/rero_mef/agents/__init__.py +++ b/rero_mef/agents/__init__.py @@ -15,4 +15,16 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -"""DOJSON transformations.""" +"""Agents.""" + +from .gnd.api import AgentGndIndexer, AgentGndRecord, AgentGndSearch +from .idref.api import AgentIdrefIndexer, AgentIdrefRecord, AgentIdrefSearch +from .mef.api import AgentMefIndexer, AgentMefRecord, AgentMefSearch +from .rero.api import AgentReroIndexer, AgentReroRecord, AgentReroSearch +from .viaf.api import AgentViafIndexer, AgentViafRecord, AgentViafSearch + +__all__ = ('AgentGndIndexer', 'AgentGndRecord', 'AgentGndSearch', + 'AgentIdrefIndexer', 'AgentIdrefRecord', 'AgentIdrefSearch', + 'AgentMefIndexer', 'AgentMefRecord', 'AgentMefSearch', + 'AgentReroIndexer', 'AgentReroRecord', 'AgentReroSearch', + 'AgentViafIndexer', 'AgentViafRecord', 'AgentViafSearch') diff --git a/rero_mef/agents/cli.py b/rero_mef/agents/cli.py index 6e710c5f..0a9cb089 100644 --- a/rero_mef/agents/cli.py +++ b/rero_mef/agents/cli.py @@ -56,14 +56,16 @@ def create_from_viaf(test_md5, enqueue, online, verbose, progress, wait, fg='green' ) counts = {} + unexisting_pids = {} agent_classes = get_entity_classes(without_mef_viaf=False) for name, agent_class in agent_classes.items(): counts[name] = {} counts[name]['old'] = agent_class.count() if missing: - missing_pids = AgentMefRecord.get_all_missing_viaf_pids( - verbose=progress or verbose - ) + missing_pids, unexisting_pids = AgentMefRecord.
\ + get_all_missing_viaf_pids( + verbose=progress or verbose + ) progress_bar = progressbar( items=missing_pids, length=len(missing_pids), @@ -94,6 +96,14 @@ def create_from_viaf(test_md5, enqueue, online, verbose, progress, wait, online=online, verbose=verbose ) + + if unexisting_pids: + click.echo( + f'Clean VIAF pids from MEF records: {len(unexisting_pids)}') + for pid, viaf_pid in unexisting_pids.items(): + # TODO: clean MEF records with unexisting VIAF pids: + pass + if wait: from ..cli import wait_empty_tasks wait_empty_tasks(delay=3, verbose=True) diff --git a/rero_mef/agents/mef/api.py b/rero_mef/agents/mef/api.py index 6325c098..d3f71c19 100644 --- a/rero_mef/agents/mef/api.py +++ b/rero_mef/agents/mef/api.py @@ -29,7 +29,7 @@ from .providers import MefProvider from ...api import ReroIndexer from ...api_mef import EntityMefRecord -from ...utils import progressbar +from ...utils import mef_get_all_missing_entity_pids, progressbar class AgentMefSearch(RecordsSearch): @@ -86,6 +86,7 @@ def get_all_missing_viaf_pids(cls, verbose=False): """ from ..viaf.api import AgentViafRecord missing_pids = {} + unexisting_pids = {} if verbose: click.echo('Get pids from VIAF ...') progress = progressbar( @@ -97,29 +98,53 @@ def get_all_missing_viaf_pids(cls, verbose=False): missing_pids[pid] = 1 if verbose: click.echo('Get pids from MEF and calculate missing ...') + query = cls.search().filter('exists', field='viaf_pid') progress = progressbar( - items=cls.search().filter('match_all').source().scan(), - length=cls.search().filter('match_all').source().count(), + items=query.source(['pid', 'viaf_pid']).scan(), + length=query.count(), verbose=True ) for hit in progress: - data = hit.to_dict() - viaf_pid = data.get('viaf_pid') - if viaf_pid: - missing_pids.pop(viaf_pid, None) - return missing_pids + if not missing_pids.pop(hit.viaf_pid, None): + unexisting_pids[hit.pid] = hit.viaf_pid + return [v for v in missing_pids], unexisting_pids + + @classmethod + def 
get_all_missing_agents_pids(cls, agent, verbose=False): + """Get all missing agent pids. + + :param agent: agent name to get the missing pids. + :param verbose: Verbose. + :returns: Missing agent pids, unexisting pids, MEF pids without pid. + """ + return mef_get_all_missing_entity_pids(mef_class=cls, entity=agent, + verbose=verbose) def replace_refs(self): """Replace $ref with real data.""" data = super().replace_refs() sources = [] for agent in ['rero', 'gnd', 'idref']: - if agent in data and data[agent]: - sources.append(agent) - metadata = data[agent].get('metadata') - if metadata: - data[agent] = metadata - data['type'] = metadata['bf:Agent'] + agent_data = data.get(agent) + if agent_data: + if agent_data.get('deleted'): + data.pop(agent) + current_app.logger.info( + f'MEF replace refs {data.get("pid")} {agent} deleted') + elif agent_data.get('status'): + data.pop(agent) + current_app.logger.error( + f'MEF replace refs {data.get("pid")} {agent}' + f' status: {agent_data.get("status")}' + f' {agent_data.get("message")}') + else: + sources.append(agent) + metadata = data[agent].get('metadata') + if metadata: + data[agent] = metadata + data['type'] = metadata['bf:Agent'] + else: + data['type'] = data[agent]['bf:Agent'] data['sources'] = sources return data diff --git a/rero_mef/agents/tasks.py b/rero_mef/agents/tasks.py index 8df99ffc..47c186ee 100644 --- a/rero_mef/agents/tasks.py +++ b/rero_mef/agents/tasks.py @@ -17,6 +17,7 @@ """Tasks used by RERO-MEF.""" +import click from celery import shared_task from .viaf.api import AgentViafRecord @@ -38,13 +39,17 @@ def task_create_mef_from_viaf_agent(pid, dbcommit=True, reindex=True, :returns: string with pid and actions """ viaf_record = AgentViafRecord.get_record_by_pid(pid) - actions = viaf_record.create_mef_and_agents( - dbcommit=dbcommit, - reindex=reindex, - test_md5=test_md5, - online=online, - verbose=verbose - ) + actions = 'NO VIAF' + if viaf_record: + actions = viaf_record.create_mef_and_agents( + dbcommit=dbcommit, + reindex=reindex, +
test_md5=test_md5, + online=online, + verbose=verbose + ) + else: + click.secho(f'{actions}: {pid}', fg='red') return actions diff --git a/rero_mef/agents/viaf/api.py b/rero_mef/agents/viaf/api.py index 96d41a1a..40e14936 100644 --- a/rero_mef/agents/viaf/api.py +++ b/rero_mef/agents/viaf/api.py @@ -309,9 +309,9 @@ def get_missing_agent_pids(cls, agent, verbose=False): ) for pid in progress: pids_db[pid] = 1 + agent_pid_name = f'{record_class.name}_pid' if verbose: - click.echo(f'Get pids from VIAF with {agent} ...') - agent_pid_name = f'{agent}_pid' + click.echo(f'Get pids from VIAF with {agent_pid_name} ...') query = AgentViafSearch() \ .filter('bool', should=[Q('exists', field=agent_pid_name)]) \ .source(['pid', agent_pid_name]) diff --git a/rero_mef/concepts/__init__.py b/rero_mef/concepts/__init__.py index f2ef5b22..a50ddfc5 100644 --- a/rero_mef/concepts/__init__.py +++ b/rero_mef/concepts/__init__.py @@ -15,4 +15,10 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -"""RERO.""" +"""Concepts.""" + +from .mef.api import ConceptMefIndexer, ConceptMefRecord, ConceptMefSearch +from .rero.api import ConceptReroIndexer, ConceptReroRecord, ConceptReroSearch + +__all__ = ('ConceptMefIndexer', 'ConceptMefRecord', 'ConceptMefSearch', + 'ConceptReroIndexer', 'ConceptReroRecord', 'ConceptReroSearch') diff --git a/rero_mef/concepts/mef/api.py b/rero_mef/concepts/mef/api.py index 7d45a67b..f9961c59 100644 --- a/rero_mef/concepts/mef/api.py +++ b/rero_mef/concepts/mef/api.py @@ -27,6 +27,7 @@ from .providers import ConceptMefProvider from ...api import ReroIndexer from ...api_mef import EntityMefRecord +from ...utils import mef_get_all_missing_entity_pids class ConceptMefSearch(RecordsSearch): @@ -76,6 +77,17 @@ def update_indexes(cls): 'ERROR flush and refresh: {err}'.format(err=err) ) + @classmethod + def get_all_missing_concepts_pids(cls, agent, verbose=False): + """Get all missing concept pids.
+ + :param agent: concept name to get the missing pids. + :param verbose: Verbose. + :returns: Missing concept pids, unexisting pids, MEF pids without pid. + """ + return mef_get_all_missing_entity_pids(mef_class=cls, entity=agent, + verbose=verbose) + def replace_refs(self): """Replace $ref with real data.""" data = super().replace_refs() diff --git a/rero_mef/concepts/utils.py b/rero_mef/concepts/utils.py new file mode 100644 index 00000000..d71358cd --- /dev/null +++ b/rero_mef/concepts/utils.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# +# RERO MEF +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see .
+ +"""Utilities.""" + +from flask import current_app + + +def get_concepts_endpoints(): + """Get all concepts endpoints from config.""" + concepts_endpoints = {} + concepts = current_app.config.get('CONCEPTS', []) + endpoints = current_app.config.get('RECORDS_REST_ENDPOINTS', {}) + for endpoint, data in endpoints.items(): + if endpoint in concepts: + concepts_endpoints[endpoint] = data + return concepts_endpoints diff --git a/rero_mef/marctojson/do_gnd_agent.py b/rero_mef/marctojson/do_gnd_agent.py index 617ee582..1cb5c227 100644 --- a/rero_mef/marctojson/do_gnd_agent.py +++ b/rero_mef/marctojson/do_gnd_agent.py @@ -49,16 +49,9 @@ def __init__(self, marc, logger=None, verbose=False, transform=True): def _transform(self): """Call the transformation functions.""" - # test if organisation or person but not family - is_organisation_person = False - is_family = False - for field_075 in self.marc.get_fields('075'): - for subfield_b in field_075.get_subfields('b'): - if subfield_b in ['b', 'f', 'p']: - is_organisation_person = True - if subfield_b == 'piz': - is_family = True - if is_organisation_person and not is_family: + if self.marc.get_fields('100') or \ + self.marc.get_fields('110') or \ + self.marc.get_fields('111'): for func in dir(self): if func.startswith('trans'): func = getattr(self, func) diff --git a/rero_mef/monitoring.py b/rero_mef/monitoring.py index 447e7bbc..9cba34f9 100644 --- a/rero_mef/monitoring.py +++ b/rero_mef/monitoring.py @@ -32,7 +32,7 @@ from redis import Redis from .permissions import monitoring_permission -from .utils import get_entity_class, progressbar +from .utils import get_entity_class, get_mefs_endpoints, progressbar api_blueprint = Blueprint( 'api_monitoring', @@ -147,6 +147,19 @@ def es_db_counts(): )}) +@api_blueprint.route('/mef_counts') +def mef_counts(): + """Display count for mef and documents.
+ + Displays for all document types defined in config.py the following: + - count of records in database + - count of records in MEF + - difference between the count in database and MEF + :return: jsonified count for MEF and documents + """ + return jsonify({'data': Monitoring.check_mef()}) + + @api_blueprint.route('/check_es_db_counts') def check_es_db_counts(): """Displays health status for elasticsearch and database counts. @@ -429,7 +442,7 @@ def info(cls, with_deleted=False, difference_db_es=False): Get count details for all records rest endpoints in JSON format. :param with_deleted: count also deleted items in database. - :return: dictionair with database, elasticsearch and databse minus + :return: dictionair with database, elasticsearch and database minus elasticsearch count informations. """ info = {} @@ -465,7 +478,7 @@ def check(cls, with_deleted=False, difference_db_es=False): :param with_deleted: count also deleted items in database. :return: dictionair with all document types with a difference in - databse and elasticsearch counts. + database and elasticsearch counts. """ checks = {} for info, data in cls.info( @@ -484,6 +497,28 @@ def check(cls, with_deleted=False, difference_db_es=False): checks[info]['es-'] = len(data.get('es-')) return checks + @classmethod + def check_mef(cls): + """Compare MEF and entities counts. + + :returns: MEF, entities and MEF-entities counts. + """ + checks = {} + for mef in get_mefs_endpoints(): + mef_search = mef['mef_class'].search + for entity in mef['endpoints']: + entity_class = get_entity_class(entity) + mef_count = mef_search() \ + .filter('exists', field=entity_class.name) \ + .count() + db_count = entity_class.count() + checks[entity] = { + 'mef': mef_count, + 'db': db_count, + 'mef-db': mef_count - db_count + } + return checks + @classmethod def missing(cls, doc_type, with_deleted=False): """Get missing pids.
@@ -576,6 +611,32 @@ def es_db_counts_cli(missing): mon.print_missing(missing_doc_type) +@monitoring.command('mef_counts') +@with_appcontext +def mef_counts_cli(): + """Print MEF counts. + + Prints a table representation of MEF counts. + Columns: + 1. MEF count minus database count + 2. document type + 3. database count + 4. MEF count + """ + mon = Monitoring() + msg_head = f'MEF - DB {"type":>6} {"DB":>10} {"MEF":>10}' + click.echo(msg_head) + for entity, data in mon.check_mef().items(): + mef_db = data.get('mef-db', '') + db = data.get('db', '') + mef = data.get('mef', '') + msg = f'{mef_db:>8} {entity:>6} {db:>10} {mef:>10}' + if mef_db not in [0, '']: + click.secho(msg, fg='red') + else: + click.echo(msg) + + @monitoring.command('es_db_missing') @click.argument('doc_type') @with_appcontext diff --git a/rero_mef/utils.py b/rero_mef/utils.py index abd91e40..d65e3030 100644 --- a/rero_mef/utils.py +++ b/rero_mef/utils.py @@ -1187,3 +1187,64 @@ def write(self, data): def close(self): """Close file.""" self.__del__() + + +def mef_get_all_missing_entity_pids(mef_class, entity, verbose=False): + """Get all missing entity pids. + + :param mef_class: MEF class to use. + :param entity: entity name to get the missing pids. + :param verbose: Verbose. + :returns: Missing entity pids, unexisting pids, MEF pids without pid.
+ """ + record_class = get_entity_class(entity) + missing_pids = {} + unexisting_pids = {} + no_pids = [] + if verbose: + click.echo(f'Get pids from {entity} ...') + progress = progressbar( + items=record_class.get_all_pids(), + length=record_class.count(), + verbose=verbose + ) + for pid in progress: + missing_pids[pid] = 1 + name = record_class.name + if verbose: + click.echo(f'Get pids for {name} from MEF and calculate missing ...') + query = mef_class.search().filter('exists', field=name) + progress = progressbar( + items=query.source(['pid', name]).scan(), + length=query.count(), + verbose=verbose + ) + for hit in progress: + data = hit.to_dict() + agent_pid = data.get(name, {}).get('pid') + if agent_pid: + res = missing_pids.pop(agent_pid, False) + if not res: + unexisting_pids[hit.pid] = agent_pid + else: + no_pids.append(hit.pid) + return [v for v in missing_pids], unexisting_pids, no_pids + + +def get_mefs_endpoints(): + """Get all endpoints for MEFs.""" + from .agents.mef.api import AgentMefRecord + from .agents.utils import get_agents_endpoints + from .concepts.mef.api import ConceptMefRecord + from .concepts.utils import get_concepts_endpoints + + mefs = [] + mefs.append({ + 'mef_class': AgentMefRecord, + 'endpoints': get_agents_endpoints() + }) + mefs.append({ + 'mef_class': ConceptMefRecord, + 'endpoints': get_concepts_endpoints() + }) + return mefs