Skip to content
This repository has been archived by the owner on Jan 4, 2022. It is now read-only.

Commit

Permalink
Merge pull request #2 from ogdch/develop
Browse files Browse the repository at this point in the history
Release for fixes of latest support issues
  • Loading branch information
metaodi committed Nov 4, 2013
2 parents 74da6a3 + 92c08c2 commit 20e96cc
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 13 deletions.
127 changes: 127 additions & 0 deletions ckanext/fso/harvesters/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from ckanext.harvest.harvesters import HarvesterBase

from sqlalchemy.sql import update,and_, bindparam
from sqlalchemy.exc import InvalidRequestError

from ckan import plugins as p
from ckan import model
from ckan.model import Session
from ckan.logic import ValidationError, NotFound, get_action
from ckan.logic.schema import default_create_package_schema
from ckan.lib.navl.validators import ignore_missing,ignore
from ckan.lib.munge import munge_tag

import logging
log = logging.getLogger(__name__)

class OGDCHHarvesterBase(HarvesterBase):
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Create a new package, or update an existing one, from the package
        dictionary provided.

        The package dictionary should look like the REST API response for a
        package, e.g.:

            http://ckan.net/api/rest/package/statistics-catalunya

        ``package_dict`` must contain an ``id`` (use the remote dataset id);
        it is used to check whether the package has to be created or
        updated. If the remote server provides the modification date of the
        remote package, add it to ``package_dict['metadata_modified']``: an
        existing package is then only updated when that date is greater than
        the stored one.

        Returns True once the package has been created or updated and the
        session committed. Returns None when an existing package is skipped,
        and also when an exception was caught (it is logged and passed to
        ``self._save_object_error`` for the 'Import' stage).

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of the package_show logic function (maybe keeping
        support for REST API based dicts).
        '''
        try:
            # Start from the default create schema and override the
            # validators for 'id' and '__junk'
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # API version and user are read from self.config when it is set;
            # an unset/empty config yields the defaults (2 and u'harvest')
            harvest_config = self.config or {}
            try:
                api_version = int(harvest_config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
            # TODO: use site user when available
            user_name = harvest_config.get('user', u'harvest')

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
            }

            # Munge the tags and drop duplicates
            unique_tags = set(
                munge_tag(tag) for tag in package_dict.get('tags', []))
            package_dict['tags'] = list(unique_tags)

            # Check if the package exists; a NotFound raised anywhere in the
            # block below sends us to the creation branch
            lookup = {'id': package_dict['id']}
            try:
                existing_package_dict = get_action('package_show')(
                    context, lookup)

                # Without a remote modification date the package is always
                # updated, otherwise only when the remote date is greater
                needs_update = (
                    'metadata_modified' not in package_dict or
                    package_dict['metadata_modified'] >
                    existing_package_dict.get('metadata_modified'))
                if not needs_update:
                    log.info('Package with GUID %s not updated, skipping...'
                             % harvest_object.guid)
                    return

                log.info('Package with GUID %s exists and needs to be updated'
                         % harvest_object.guid)
                context['id'] = package_dict['id']
                new_package = get_action('package_update_rest')(
                    context, package_dict)

                # Flag the other objects linking to this package as not
                # current anymore
                from ckanext.harvest.model import harvest_object_table
                connection = Session.connection()
                matches_package = (
                    harvest_object_table.c.package_id ==
                    bindparam('b_package_id'))
                unset_current = update(harvest_object_table) \
                    .where(matches_package) \
                    .values(current=False)
                connection.execute(unset_current,
                                   b_package_id=new_package['id'])

                # Flag this as the current harvest object
                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created
                log.info('Package with GUID %s does not exist, let\'s create it'
                         % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed
                # with the harvest object id (on the after_show hook from the
                # harvester plugin)
                harvest_object.add()

                model.Session.execute(
                    'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = get_action('package_create_rest')(
                    context, package_dict)

            Session.commit()

            return True

        except ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r'
                % (harvest_object.guid, e.error_dict),
                harvest_object, 'Import')
        except Exception as e:
            log.exception(e)
            self._save_object_error('%r' % e, harvest_object, 'Import')

        return None
82 changes: 69 additions & 13 deletions ckanext/fso/harvesters/fsoharvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,23 @@

import urllib3
from lxml import etree
from uuid import NAMESPACE_OID, uuid4, uuid5

from ckan.lib.base import c
from ckan import model
from ckan.model import Session, Package
from ckan.logic import ValidationError, NotFound, get_action, action
from ckan.lib.helpers import json
from ckan.lib.munge import munge_title_to_name

from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
HarvestObjectError
from ckanext.harvest.harvesters import HarvesterBase
from base import OGDCHHarvesterBase

import logging
log = logging.getLogger(__name__)

class FSOHarvester(HarvesterBase):
class FSOHarvester(OGDCHHarvesterBase):
'''
The harvester for the FSO
'''
Expand All @@ -25,10 +27,20 @@ class FSOHarvester(HarvesterBase):
FILES_BASE_URL = "http://www.bfs.admin.ch/xmlns/opendata/"
HARVEST_USER = u'harvest'
ORGANIZATION = {
'de': u'Bundesamt für Statistik',
'fr': u'Office fédéral de la statistique',
'it': u'Ufficio federale di statistica',
'en': u'Swiss Federal Statistical Office',
'de': {
'name': u'Bundesamt für Statistik',
'description': u'Orientiert über den Stand und die Entwicklung der Schweiz in zahlreichen Lebensbereichen. Es liefert die quantitativen Informationen, um die Gegenwart zu verstehen und die Zukunft zu planen.',
'website': u'http://www.bfs.admin.ch/'
},
'fr': {
'name': u'Office fédéral de la statistique',
'description': u'Fournit des informations sur l’état et l’évolution de la Suisse dans de nombreux domaines. Les informations qu’il produit servent à comprendre le présent et à planifier l’avenir.'},
'it': {
'name': u'Ufficio federale di statistica',
'description': u'Fornisce informazioni sullo stato e sull’evoluzione della Svizzera nei più svariati settori che permettono di capire il presente e pianificare il futuro.'},
'en': {
'name': u'Swiss Federal Statistical Office',
'description': u'Publishes information on the situation and trends in Switzerland in many different areas of life. It provides the quantitative information needed to understand the present and to plan for the future.'}
}
GROUPS = {
'de': [u'Bevölkerung', u'Politik'],
Expand Down Expand Up @@ -77,6 +89,34 @@ class FSOHarvester(HarvesterBase):
'user': u'admin'
}

def _create_uuid(self, name=None):
'''
Create a new SHA-1 uuid for a given name or a random id
'''
if name:
new_uuid = uuid5(NAMESPACE_OID, str(name))
else:
new_uuid = uuid4()

return unicode(new_uuid)

def _gen_new_name(self, title, current_id=None):
    '''
    Build a URL friendly name from a title.

    Underscores become dashes and runs of dashes are collapsed. If another
    package (one whose id differs from current_id) already uses the name,
    the first five characters of a random uuid are appended.
    '''
    slug = munge_title_to_name(title).replace('_', '-')
    while '--' in slug:
        slug = slug.replace('--', '-')

    existing = Session.query(Package).filter(Package.name == slug).first()
    # The name is usable when nobody has it yet, or when it already belongs
    # to the package we are naming
    if not existing or existing.id == current_id:
        return slug

    return slug + str(uuid4())[:5]


def _file_is_available(self, url):
'''
Returns true if 200, False otherwise. (logs falses)
Expand Down Expand Up @@ -149,6 +189,15 @@ def _generate_term_translations(self, base_dataset, package):
'term_translation': group
})

for lang, org in self.ORGANIZATION.items():
if lang != 'de':
for field in ['name', 'description']:
translations.append({
'lang_code': lang,
'term': self.ORGANIZATION['de'][field],
'term_translation': org[field]
})

for dataset in package:
if base_dataset.get('datasetID') != dataset.get('datasetID'):
lang = dataset.get('{http://www.w3.org/XML/1998/namespace}lang')
Expand Down Expand Up @@ -240,7 +289,7 @@ def gather_stage(self, harvest_job):
metadata = self._generate_metadata(base_dataset, package)
if metadata:
obj = HarvestObject(
guid = base_dataset.get('datasetID'),
guid = self._create_uuid(base_dataset.get('datasetID')),
job = harvest_job,
content = json.dumps(metadata)
)
Expand Down Expand Up @@ -279,7 +328,7 @@ def import_stage(self, harvest_object):
package_dict = json.loads(harvest_object.content)

package_dict['id'] = harvest_object.guid
package_dict['name'] = self._gen_new_name(package_dict['title'])
package_dict['name'] = self._gen_new_name(package_dict['datasetID'], package_dict['id'])

user = model.User.get(self.HARVEST_USER)
context = {
Expand All @@ -294,7 +343,7 @@ def import_stage(self, harvest_object):
raise GroupNotFoundError('Group is not defined for dataset %s' % package_dict['title'])
data_dict = {
'id': group_name,
'name': self._gen_new_name(group_name),
'name': munge_title_to_name(group_name),
'title': group_name
}
try:
Expand All @@ -307,9 +356,16 @@ def import_stage(self, harvest_object):
try:
data_dict = {
'permission': 'edit_group',
'id': self._gen_new_name(self.ORGANIZATION['de']),
'name': self._gen_new_name(self.ORGANIZATION['de']),
'title': self.ORGANIZATION['de']
'id': munge_title_to_name(self.ORGANIZATION['de']['name']),
'name': munge_title_to_name(self.ORGANIZATION['de']['name']),
'title': self.ORGANIZATION['de']['name'],
'description': self.ORGANIZATION['de']['description'],
'extras': [
{
'key': 'website',
'value': self.ORGANIZATION['de']['website']
}
]
}
package_dict['owner_org'] = get_action('organization_show')(context, data_dict)['id']
except:
Expand All @@ -322,7 +378,7 @@ def import_stage(self, harvest_object):
if 'license_url' in package_dict:
extras.append(('license_url', package_dict['license_url']))
package_dict['extras'] = extras
log.debug('Extras %s' % extras)
log.debug('Extras %s' % extras)

package = model.Package.get(package_dict['id'])
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN)
Expand Down

0 comments on commit 20e96cc

Please sign in to comment.