diff --git a/poetry.lock b/poetry.lock
index d3a5121e..4f416228 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1011,7 +1011,7 @@ elasticsearch7 = ["invenio-search[elasticsearch7] (>=1.4.1,<1.5.0)"]
files = ["invenio-files-rest (>=1.2.0,<1.3.0)", "invenio-iiif (>=1.1.0,<1.2.0)", "invenio-previewer (>=1.3.2,<1.4.0)", "invenio-records-files (>=1.2.1,<1.3.0)"]
metadata = ["invenio-indexer (>=1.2.0,<1.3.0)", "invenio-jsonschemas (>=1.1.1,<1.2.0)", "invenio-oaiserver (>=1.2.0,<1.3.0)", "invenio-pidstore (>=1.2.1,<1.3.0)", "invenio-records-rest (>=1.8.0,<1.9.0)", "invenio-records-ui (>=1.2.0,<1.3.0)", "invenio-records (>=1.4.0,<1.6.0)", "invenio-search-ui (>=2.0.0,<2.1.0)"]
mysql = ["invenio-db[mysql,versioning] (>=1.0.8,<1.1.0)"]
-postgresql = ["invenio-db[versioning,postgresql] (>=1.0.8,<1.1.0)"]
+postgresql = ["invenio-db[postgresql,versioning] (>=1.0.8,<1.1.0)"]
sqlite = ["invenio-db[versioning] (>=1.0.8,<1.1.0)"]
tests = ["pytest-invenio (>=1.4.0,<1.5.0)"]
@@ -1033,7 +1033,7 @@ invenio-i18n = ">=1.2.0"
all = ["Sphinx (>=3)", "cachelib (>=0.1)", "pytest-invenio (>=1.4.1)", "redis (>=2.10.5)"]
docs = ["Sphinx (>=3)"]
mysql = ["invenio-db[mysql,versioning] (>=1.0.8)"]
-postgresql = ["invenio-db[versioning,postgresql] (>=1.0.8)"]
+postgresql = ["invenio-db[postgresql,versioning] (>=1.0.8)"]
sqlite = ["invenio-db[versioning,sqlite] (>=1.0.8)"]
tests = ["cachelib (>=0.1)", "pytest-invenio (>=1.4.1)", "redis (>=2.10.5)"]
@@ -1072,7 +1072,7 @@ admin = ["invenio-admin (>=1.2.1)"]
all = ["invenio-admin (>=1.2.1)", "Sphinx (==4.2.0)", "pytest-invenio (>=1.4.2)"]
docs = ["Sphinx (==4.2.0)"]
mysql = ["invenio-db[mysql,versioning] (>=1.0.9)"]
-postgresql = ["invenio-db[versioning,postgresql] (>=1.0.9)"]
+postgresql = ["invenio-db[postgresql,versioning] (>=1.0.9)"]
sqlite = ["invenio-db[versioning] (>=1.0.9)"]
tests = ["pytest-invenio (>=1.4.2)"]
@@ -1441,7 +1441,7 @@ admin = ["invenio-admin (>=1.2.1)"]
all = ["invenio-admin (>=1.2.1)", "Sphinx (>=4.2.0)", "redis (>=2.10.5)", "pytest-invenio (>=1.4.0)"]
docs = ["Sphinx (>=4.2.0)"]
mysql = ["invenio-db[mysql,versioning] (>=1.0.9,<2.0.0)"]
-postgresql = ["invenio-db[versioning,postgresql] (>=1.0.9,<2.0.0)"]
+postgresql = ["invenio-db[postgresql,versioning] (>=1.0.9,<2.0.0)"]
redis = ["redis (>=2.10.5)"]
sqlite = ["invenio-db[versioning] (>=1.0.9,<2.0.0)"]
tests = ["pytest-invenio (>=1.4.0)"]
@@ -1525,7 +1525,7 @@ admin = ["invenio-admin (>=1.2.1)"]
all = ["Sphinx (==4.2.0)", "invenio-admin (>=1.2.1)", "pytest-invenio (>=1.4.1)"]
docs = ["Sphinx (==4.2.0)"]
mysql = ["invenio-db[mysql,versioning] (>=1.0.9,<1.1.0)"]
-postgresql = ["invenio-db[versioning,postgresql] (>=1.0.9,<1.1.0)"]
+postgresql = ["invenio-db[postgresql,versioning] (>=1.0.9,<1.1.0)"]
sqlite = ["invenio-db[versioning] (>=1.0.9,<1.1.0)"]
tests = ["pytest-invenio (>=1.4.1)"]
@@ -1574,9 +1574,9 @@ invenio-pidstore = ">=1.2.0"
invenio-records = ">=1.0.0"
[package.extras]
-all = ["Sphinx (>=1.5.1)", "invenio-access (>=1.0.0)", "invenio-accounts (>=1.3.0)", "invenio-db[mysql,versioning,postgresql] (>=1.0.0)", "pytest-invenio (>=1.4.0)"]
+all = ["Sphinx (>=1.5.1)", "invenio-access (>=1.0.0)", "invenio-accounts (>=1.3.0)", "invenio-db[postgresql,mysql,versioning] (>=1.0.0)", "pytest-invenio (>=1.4.0)"]
docs = ["Sphinx (>=1.5.1)"]
-tests = ["invenio-access (>=1.0.0)", "invenio-accounts (>=1.3.0)", "invenio-db[mysql,versioning,postgresql] (>=1.0.0)", "pytest-invenio (>=1.4.0)"]
+tests = ["invenio-access (>=1.0.0)", "invenio-accounts (>=1.3.0)", "invenio-db[postgresql,mysql,versioning] (>=1.0.0)", "pytest-invenio (>=1.4.0)"]
[[package]]
name = "invenio-rest"
diff --git a/pytest.ini b/pytest.ini
index dde067a9..f4a9d5d0 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -17,7 +17,7 @@
[pytest]
live_server_scope = module
-addopts = --pycodestyle --pydocstyle --doctest-glob="*.rst" --doctest-modules --cov=rero_mef --cov-report=term-missing --ignore=setup.py --ignore=docs/conf.py
+addopts = --pycodestyle --pydocstyle --doctest-glob="*.rst" --doctest-modules --cov=rero_mef --cov-report=term-missing --ignore=setup.py --ignore=docs/conf.py --color=yes
testpaths = docs tests rero_mef
# not displaying all the PendingDeprecationWarnings from invenio
diff --git a/rero_mef/agents/__init__.py b/rero_mef/agents/__init__.py
index 108727cd..d00b961e 100644
--- a/rero_mef/agents/__init__.py
+++ b/rero_mef/agents/__init__.py
@@ -15,4 +15,16 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
-"""DOJSON transformations."""
+"""Agents."""
+
+from .gnd.api import AgentGndIndexer, AgentGndRecord, AgentGndSearch
+from .idref.api import AgentIdrefIndexer, AgentIdrefRecord, AgentIdrefSearch
+from .mef.api import AgentMefIndexer, AgentMefRecord, AgentMefSearch
+from .rero.api import AgentReroIndexer, AgentReroRecord, AgentReroSearch
+from .viaf.api import AgentViafIndexer, AgentViafRecord, AgentViafSearch
+
+__all__ = ('AgentGndIndexer', 'AgentGndRecord', 'AgentGndSearch',
+           'AgentIdrefIndexer', 'AgentIdrefRecord', 'AgentIdrefSearch',
+           'AgentMefIndexer', 'AgentMefRecord', 'AgentMefSearch',
+           'AgentReroIndexer', 'AgentReroRecord', 'AgentReroSearch',
+           'AgentViafIndexer', 'AgentViafRecord', 'AgentViafSearch')
diff --git a/rero_mef/agents/cli.py b/rero_mef/agents/cli.py
index 6e710c5f..0a9cb089 100644
--- a/rero_mef/agents/cli.py
+++ b/rero_mef/agents/cli.py
@@ -56,14 +56,16 @@ def create_from_viaf(test_md5, enqueue, online, verbose, progress, wait,
fg='green'
)
counts = {}
+ unexisting_pids = {}
agent_classes = get_entity_classes(without_mef_viaf=False)
for name, agent_class in agent_classes.items():
counts[name] = {}
counts[name]['old'] = agent_class.count()
if missing:
- missing_pids = AgentMefRecord.get_all_missing_viaf_pids(
- verbose=progress or verbose
- )
+ missing_pids, unexisting_pids = AgentMefRecord. \
+ get_all_missing_viaf_pids(
+ verbose=progress or verbose
+ )
progress_bar = progressbar(
items=missing_pids,
length=len(missing_pids),
@@ -94,6 +96,14 @@ def create_from_viaf(test_md5, enqueue, online, verbose, progress, wait,
online=online,
verbose=verbose
)
+
+ if unexisting_pids:
+ click.echo(
+ f'Clean VIAF pids from MEF records: {len(unexisting_pids)}')
+ for pid, viaf_pid in unexisting_pids.items():
+ # TODO: clean MEF records with unexisting VIAF pids:
+ pass
+
if wait:
from ..cli import wait_empty_tasks
wait_empty_tasks(delay=3, verbose=True)
diff --git a/rero_mef/agents/mef/api.py b/rero_mef/agents/mef/api.py
index 6325c098..d3f71c19 100644
--- a/rero_mef/agents/mef/api.py
+++ b/rero_mef/agents/mef/api.py
@@ -29,7 +29,7 @@
from .providers import MefProvider
from ...api import ReroIndexer
from ...api_mef import EntityMefRecord
-from ...utils import progressbar
+from ...utils import mef_get_all_missing_entity_pids, progressbar
class AgentMefSearch(RecordsSearch):
@@ -86,6 +86,7 @@ def get_all_missing_viaf_pids(cls, verbose=False):
"""
from ..viaf.api import AgentViafRecord
missing_pids = {}
+ unexisting_pids = {}
if verbose:
click.echo('Get pids from VIAF ...')
progress = progressbar(
@@ -97,29 +98,53 @@ def get_all_missing_viaf_pids(cls, verbose=False):
missing_pids[pid] = 1
if verbose:
click.echo('Get pids from MEF and calculate missing ...')
+ query = cls.search().filter('exists', field='viaf_pid')
progress = progressbar(
- items=cls.search().filter('match_all').source().scan(),
- length=cls.search().filter('match_all').source().count(),
+ items=query.source(['pid', 'viaf_pid']).scan(),
+ length=query.count(),
verbose=True
)
for hit in progress:
- data = hit.to_dict()
- viaf_pid = data.get('viaf_pid')
- if viaf_pid:
- missing_pids.pop(viaf_pid, None)
- return missing_pids
+ if not missing_pids.pop(hit.viaf_pid, None):
+ unexisting_pids[hit.pid] = hit.viaf_pid
+ return [v for v in missing_pids], unexisting_pids
+
+    @classmethod
+    def get_all_missing_agents_pids(cls, agent, verbose=False):
+        """Compare the pids of an agent with the pids referenced in MEF.
+
+        :param agent: agent name to get the missing pids.
+        :param verbose: Verbose.
+        :returns: tuple of missing pids, unexisting pids and no pids.
+        """
+        return mef_get_all_missing_entity_pids(
+            mef_class=cls, entity=agent, verbose=verbose)
def replace_refs(self):
"""Replace $ref with real data."""
data = super().replace_refs()
sources = []
for agent in ['rero', 'gnd', 'idref']:
- if agent in data and data[agent]:
- sources.append(agent)
- metadata = data[agent].get('metadata')
- if metadata:
- data[agent] = metadata
- data['type'] = metadata['bf:Agent']
+ agent_data = data.get(agent)
+ if agent_data:
+ if agent_data.get('deleted'):
+ data.pop(agent)
+ current_app.logger.info(
+ f'MEF replace refs {data.get("pid")} {agent} deleted')
+ elif agent_data.get('status'):
+ data.pop(agent)
+ current_app.logger.error(
+ f'MEF replace refs {data.get("pid")} {agent}'
+ f' status: {agent_data.get("status")}'
+ f' {agent_data.get("message")}')
+ else:
+ sources.append(agent)
+ metadata = data[agent].get('metadata')
+ if metadata:
+ data[agent] = metadata
+ data['type'] = metadata['bf:Agent']
+ else:
+ data['type'] = data[agent]['bf:Agent']
data['sources'] = sources
return data
diff --git a/rero_mef/agents/tasks.py b/rero_mef/agents/tasks.py
index 8df99ffc..47c186ee 100644
--- a/rero_mef/agents/tasks.py
+++ b/rero_mef/agents/tasks.py
@@ -17,6 +17,7 @@
"""Tasks used by RERO-MEF."""
+import click
from celery import shared_task
from .viaf.api import AgentViafRecord
@@ -38,13 +39,17 @@ def task_create_mef_from_viaf_agent(pid, dbcommit=True, reindex=True,
:returns: string with pid and actions
"""
viaf_record = AgentViafRecord.get_record_by_pid(pid)
- actions = viaf_record.create_mef_and_agents(
- dbcommit=dbcommit,
- reindex=reindex,
- test_md5=test_md5,
- online=online,
- verbose=verbose
- )
+    actions = 'NO VIAF'  # returned as is when the VIAF record is missing
+    if viaf_record:
+        actions = viaf_record.create_mef_and_agents(
+            dbcommit=dbcommit,
+            reindex=reindex,
+            test_md5=test_md5,
+            online=online,
+            verbose=verbose
+        )
+    else:
+        click.secho(f'{actions}: {pid}', fg='red')
return actions
diff --git a/rero_mef/agents/viaf/api.py b/rero_mef/agents/viaf/api.py
index 96d41a1a..40e14936 100644
--- a/rero_mef/agents/viaf/api.py
+++ b/rero_mef/agents/viaf/api.py
@@ -309,9 +309,9 @@ def get_missing_agent_pids(cls, agent, verbose=False):
)
for pid in progress:
pids_db[pid] = 1
+ agent_pid_name = f'{record_class.name}_pid'
if verbose:
- click.echo(f'Get pids from VIAF with {agent} ...')
- agent_pid_name = f'{agent}_pid'
+ click.echo(f'Get pids from VIAF with {agent_pid_name} ...')
query = AgentViafSearch() \
.filter('bool', should=[Q('exists', field=agent_pid_name)]) \
.source(['pid', agent_pid_name])
diff --git a/rero_mef/concepts/__init__.py b/rero_mef/concepts/__init__.py
index f2ef5b22..a50ddfc5 100644
--- a/rero_mef/concepts/__init__.py
+++ b/rero_mef/concepts/__init__.py
@@ -15,4 +15,10 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
-"""RERO."""
+"""Concepts."""
+
+from .mef.api import ConceptMefIndexer, ConceptMefRecord, ConceptMefSearch
+from .rero.api import ConceptReroIndexer, ConceptReroRecord, ConceptReroSearch
+
+__all__ = ('ConceptMefIndexer', 'ConceptMefRecord', 'ConceptMefSearch',
+           'ConceptReroIndexer', 'ConceptReroRecord', 'ConceptReroSearch')
diff --git a/rero_mef/concepts/mef/api.py b/rero_mef/concepts/mef/api.py
index 7d45a67b..f9961c59 100644
--- a/rero_mef/concepts/mef/api.py
+++ b/rero_mef/concepts/mef/api.py
@@ -27,6 +27,7 @@
from .providers import ConceptMefProvider
from ...api import ReroIndexer
from ...api_mef import EntityMefRecord
+from ...utils import mef_get_all_missing_entity_pids
class ConceptMefSearch(RecordsSearch):
@@ -76,6 +77,17 @@ def update_indexes(cls):
'ERROR flush and refresh: {err}'.format(err=err)
)
+    @classmethod
+    def get_all_missing_concepts_pids(cls, agent, verbose=False):
+        """Compare the pids of a concept with the pids referenced in MEF.
+
+        :param agent: concept name to get the missing pids.
+        :param verbose: Verbose.
+        :returns: tuple of missing pids, unexisting pids and no pids.
+        """
+        return mef_get_all_missing_entity_pids(
+            mef_class=cls, entity=agent, verbose=verbose)
+
def replace_refs(self):
"""Replace $ref with real data."""
data = super().replace_refs()
diff --git a/rero_mef/concepts/utils.py b/rero_mef/concepts/utils.py
new file mode 100644
index 00000000..d71358cd
--- /dev/null
+++ b/rero_mef/concepts/utils.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+#
+# RERO MEF
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+"""Utilities."""
+
+from flask import current_app
+
+
+def get_concepts_endpoints():
+    """Get the ``RECORDS_REST_ENDPOINTS`` entries listed in ``CONCEPTS``."""
+    config = current_app.config
+    concepts = config.get('CONCEPTS', [])
+    endpoints = config.get('RECORDS_REST_ENDPOINTS', {})
+    return {
+        name: data
+        for name, data in endpoints.items() if name in concepts
+    }
diff --git a/rero_mef/marctojson/do_gnd_agent.py b/rero_mef/marctojson/do_gnd_agent.py
index 617ee582..1cb5c227 100644
--- a/rero_mef/marctojson/do_gnd_agent.py
+++ b/rero_mef/marctojson/do_gnd_agent.py
@@ -49,16 +49,9 @@ def __init__(self, marc, logger=None, verbose=False, transform=True):
def _transform(self):
"""Call the transformation functions."""
- # test if organisation or person but not family
- is_organisation_person = False
- is_family = False
- for field_075 in self.marc.get_fields('075'):
- for subfield_b in field_075.get_subfields('b'):
- if subfield_b in ['b', 'f', 'p']:
- is_organisation_person = True
- if subfield_b == 'piz':
- is_family = True
- if is_organisation_person and not is_family:
+ if self.marc.get_fields('100') or \
+ self.marc.get_fields('110') or \
+ self.marc.get_fields('111'):
for func in dir(self):
if func.startswith('trans'):
func = getattr(self, func)
diff --git a/rero_mef/monitoring.py b/rero_mef/monitoring.py
index 447e7bbc..9cba34f9 100644
--- a/rero_mef/monitoring.py
+++ b/rero_mef/monitoring.py
@@ -32,7 +32,7 @@
from redis import Redis
from .permissions import monitoring_permission
-from .utils import get_entity_class, progressbar
+from .utils import get_entity_class, get_mefs_endpoints, progressbar
api_blueprint = Blueprint(
'api_monitoring',
@@ -147,6 +147,19 @@ def es_db_counts():
)})
+@api_blueprint.route('/mef_counts')
+def mef_counts():
+    """Display the MEF and database counts for every entity.
+
+    For each entity reported by ``Monitoring.check_mef()``:
+    - count of records in MEF
+    - count of records in database
+    - MEF count minus database count
+    :return: jsonified counts under the ``data`` key.
+    """
+    return jsonify(data=Monitoring.check_mef())
+
+
@api_blueprint.route('/check_es_db_counts')
def check_es_db_counts():
"""Displays health status for elasticsearch and database counts.
@@ -429,7 +442,7 @@ def info(cls, with_deleted=False, difference_db_es=False):
Get count details for all records rest endpoints in JSON format.
:param with_deleted: count also deleted items in database.
- :return: dictionair with database, elasticsearch and databse minus
+        :return: dictionary with database, elasticsearch and database minus
elasticsearch count informations.
"""
info = {}
@@ -465,7 +478,7 @@ def check(cls, with_deleted=False, difference_db_es=False):
:param with_deleted: count also deleted items in database.
:return: dictionair with all document types with a difference in
- databse and elasticsearch counts.
+ database and elasticsearch counts.
"""
checks = {}
for info, data in cls.info(
@@ -484,6 +497,28 @@ def check(cls, with_deleted=False, difference_db_es=False):
checks[info]['es-'] = len(data.get('es-'))
return checks
+    @classmethod
+    def check_mef(cls):
+        """Compare MEF and entity counts.
+
+        :returns: ``mef``, ``db`` and ``mef-db`` counts for each entity.
+        """
+        results = {}
+        for mef_info in get_mefs_endpoints():
+            search = mef_info['mef_class'].search
+            for entity in mef_info['endpoints']:
+                record_class = get_entity_class(entity)
+                exists_query = search().filter(
+                    'exists', field=record_class.name)
+                in_mef = exists_query.count()
+                in_db = record_class.count()
+                results[entity] = {
+                    'mef': in_mef,
+                    'db': in_db,
+                    'mef-db': in_mef - in_db
+                }
+        return results
+
@classmethod
def missing(cls, doc_type, with_deleted=False):
"""Get missing pids.
@@ -576,6 +611,32 @@ def es_db_counts_cli(missing):
mon.print_missing(missing_doc_type)
+@monitoring.command('mef_counts')
+@with_appcontext
+def mef_counts_cli():
+    """Print MEF counts.
+
+    Prints a table representation of MEF counts.
+    Columns:
+    1. MEF count minus database count
+    2. entity type
+    3. database count
+    4. MEF count
+    """
+    mon = Monitoring()
+    click.echo(f'MEF - DB {"type":>6} {"DB":>10} {"MEF":>10}')
+    for entity, counts in mon.check_mef().items():
+        diff = counts.get('mef-db', '')
+        in_db = counts.get('db', '')
+        in_mef = counts.get('mef', '')
+        row = f'{diff:>8} {entity:>6} {in_db:>10} {in_mef:>10}'
+        # a non-zero difference is highlighted
+        if diff in (0, ''):
+            click.echo(row)
+        else:
+            click.secho(row, fg='red')
+
+
@monitoring.command('es_db_missing')
@click.argument('doc_type')
@with_appcontext
diff --git a/rero_mef/utils.py b/rero_mef/utils.py
index abd91e40..d65e3030 100644
--- a/rero_mef/utils.py
+++ b/rero_mef/utils.py
@@ -1187,3 +1187,64 @@ def write(self, data):
def close(self):
"""Close file."""
self.__del__()
+
+
+def mef_get_all_missing_entity_pids(mef_class, entity, verbose=False):
+    """Compare the pids of an entity with the pids referenced in MEF.
+
+    :param mef_class: MEF class to use.
+    :param entity: entity name to get the missing pids.
+    :param verbose: Verbose.
+    :returns: list of entity pids not referenced in MEF, dict of MEF pid to
+        referenced entity pid not found among the entity pids and list of
+        MEF pids whose entity data has no pid.
+    """
+    record_class = get_entity_class(entity)
+    if verbose:
+        click.echo(f'Get pids from {entity} ...')
+    entity_pids = progressbar(
+        items=record_class.get_all_pids(),
+        length=record_class.count(),
+        verbose=verbose
+    )
+    # every entity pid counts as missing until a MEF record references it
+    missing_pids = dict.fromkeys(entity_pids, 1)
+    name = record_class.name
+    if verbose:
+        click.echo(f'Get pids for {name} from MEF and calculate missing ...')
+    query = mef_class.search().filter('exists', field=name)
+    mef_hits = progressbar(
+        items=query.source(['pid', name]).scan(),
+        length=query.count(),
+        verbose=True
+    )
+    unexisting_pids = {}
+    no_pids = []
+    for hit in mef_hits:
+        entity_pid = hit.to_dict().get(name, {}).get('pid')
+        if not entity_pid:
+            # the MEF record holds entity data without a pid
+            no_pids.append(hit.pid)
+        elif not missing_pids.pop(entity_pid, False):
+            # referenced by MEF but not (or no longer) among the entity pids
+            unexisting_pids[hit.pid] = entity_pid
+    return list(missing_pids), unexisting_pids, no_pids
+
+
+def get_mefs_endpoints():
+    """Get all endpoints for MEFs.
+
+    :returns: list of dicts with a ``mef_class`` and its ``endpoints``.
+    """
+    from .agents.mef.api import AgentMefRecord
+    from .agents.utils import get_agents_endpoints
+    from .concepts.mef.api import ConceptMefRecord
+    from .concepts.utils import get_concepts_endpoints
+
+    # agents first, then concepts
+    return [
+        {'mef_class': AgentMefRecord,
+         'endpoints': get_agents_endpoints()},
+        {'mef_class': ConceptMefRecord,
+         'endpoints': get_concepts_endpoints()},
+    ]