Multisite #7 (Open)

Wants to merge 3 commits into base: main
3 changes: 3 additions & 0 deletions .gitignore
@@ -1 +1,4 @@
.~lock*
.ipynb_checkpoints/

urls.txt
12 changes: 12 additions & 0 deletions README.md
@@ -12,6 +12,18 @@ Inside the root folder you can find scripts used to generate the data files, and

Please check the Issues and leave us any questions there or in [our forum](https://forum.opendata.ch).

## Updating

Put a file called `urls.txt` in the root folder (next to `harvester.py`), containing the domain names of dribdat servers, one per line, e.g.:

```
now.makezurich.ch
baselhack.dribdat.cc
bd.hack4socialgood.ch
```

Then run `python harvester.py` to update the files in the **data** folder.
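
The resulting `data/projects.csv` and `data/events.csv` can then be loaded with any tabular tool. A minimal sketch using pandas (an assumption: pandas is not a dependency of this repository) to see how many projects each harvested server contributed:

```
import pandas as pd  # assumption: pandas is installed separately

# Every harvested row carries an `origin` column naming the dribdat server it came from
projects = pd.read_csv("data/projects.csv")
print(projects.groupby("origin").size())
```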

## License

All contents licensed [CC Attribution-Share Alike 4.0 International](http://creativecommons.org/licenses/by-sa/4.0/), unless otherwise stated.
158 changes: 0 additions & 158 deletions data/dribdat-projects-2020.csv

This file was deleted.

65 changes: 65 additions & 0 deletions data/events.csv

Large diffs are not rendered by default.

44 changes: 0 additions & 44 deletions data/hackopendata-events.csv

This file was deleted.

3,625 changes: 0 additions & 3,625 deletions data/hackopendata-projects.csv

This file was deleted.

5,213 changes: 5,213 additions & 0 deletions data/projects.csv

Large diffs are not rendered by default.

33 changes: 29 additions & 4 deletions datapackage.json
@@ -2,7 +2,8 @@
"profile": "tabular-data-package",
"resources": [
{
"path": "data/hackopendata-projects.csv",
"profile": "tabular-data-resource",
"path": "data/projects.csv",
"pathType": "local",
"title": "Archive of dribdat projects",
"name": "projects",
@@ -11,10 +12,17 @@
"encoding": "UTF-8",
"dialect": {
"delimiter": ",",
"quoteChar": "\""
"quoteChar": "\"",
"doubleQuote": true
},
"schema": {
"fields": [
{
"name": "origin",
"type": "string",
"format": "uri",
"description": "A link to the server where this project was uploaded"
},
{
"name": "id",
"type": "integer",
@@ -238,12 +246,29 @@
}
},
{
"profile": "tabular-data-resource",
"name": "events",
"title": "Archive of dribdat events",
"path": "data/events.csv",
"pathType": "local",
"profile": "tabular-data-resource",
"format": "csv",
"mediatype": "text/csv",
"encoding": "UTF-8",
"dialect": {
"delimiter": ",",
"quoteChar": "\"",
"doubleQuote": true
},

"schema": {
"fields": [
{
"name": "origin",
"type": "string",
"format": "uri",
"description": "A link to the server where this event was run"
},
{
"name": "community_url",
"type": "any",
@@ -291,12 +316,12 @@
},
{
"name": "location_lat",
"type": "integer",
"type": "number",
"format": "default"
},
{
"name": "location_lon",
"type": "integer",
"type": "number",
"format": "default"
},
{
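The expanded `datapackage.json` now declares a full Table Schema for both resources, so the regenerated CSVs can be checked against it. A minimal sketch, assuming the `frictionless` package is installed (it is not used by the harvester itself):

```
from frictionless import validate  # assumption: pip install frictionless

# Validate data/events.csv and data/projects.csv against the schemas in datapackage.json
report = validate("datapackage.json")
print(report.valid)
```
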
23 changes: 12 additions & 11 deletions harvester.ipynb
@@ -7,23 +7,21 @@
"source": [
"# Harvest all your hacks\n",
"\n",
"A Python 3 script which collects projects from a Dribdat instance. Just put in the URL of a server, and it will talk to the public API to collect the content.\n",
"\n",
"No special dependencies required, though you might need to `pip install requests`"
"A Python 3 script which collects projects from a Dribdat instance. Just put in the URL of your target servers, and it will talk to the public API to collect the content. No special dependencies required, though you might need to `pip install requests`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"id": "3a0e9c4c-5afe-4500-82ae-c56106d9f00c",
"metadata": {},
"outputs": [],
"source": [
"import csv, json, requests\n",
"\n",
"URL_DRIBDAT = 'https://hack.opendata.ch'\n",
"OUTPUT_PROJECTS = 'data/projects.csv'\n",
"OUTPUT_EVENTS = 'data/events.csv'\n",
"OUTPUT_PROJECTS = 'data/hackopendata-projects.csv'\n",
"OUTPUT_EVENTS = 'data/hackopendata-events.csv'\n",
"MAX_PROJECTS = 100000"
]
},
@@ -55,6 +53,8 @@
"source": [
"events_data = requests.get(URL_DRIBDAT + '/api/events.json').json()\n",
"events = events_data['events']\n",
"for event in events:\n",
" event['origin'] = URL_DRIBDAT\n",
"len(events)"
]
},
@@ -75,7 +75,7 @@
{
"data": {
"text/plain": [
"661"
"13"
]
},
"execution_count": 3,
@@ -108,7 +108,7 @@
{
"data": {
"text/plain": [
"661"
"13"
]
},
"execution_count": 4,
@@ -120,6 +120,7 @@
"all_projects = []\n",
"for pd in project_data:\n",
" for proj in project_data[pd]:\n",
" proj['origin'] = URL_DRIBDAT\n",
" all_projects.append(proj)\n",
"len(all_projects)"
]
@@ -141,7 +142,7 @@
{
"data": {
"text/plain": [
"'community_url,ends_at,gallery_url,has_finished,has_started,hashtags,hostname,id,location,location_lat,location_lon,logo_url,name,starts_at,summary,webpage_url'"
"'origin,community_url,ends_at,gallery_url,has_finished,has_started,hashtags,hostname,id,location,location_lat,location_lon,logo_url,name,starts_at,summary,webpage_url'"
]
},
"execution_count": 5,
@@ -169,7 +170,7 @@
{
"data": {
"text/plain": [
"'Wrote data/events.csv'"
"'Wrote data/hackopendata-events.csv'"
]
},
"execution_count": 6,
@@ -194,7 +195,7 @@
{
"data": {
"text/plain": [
"'Wrote data/projects.csv'"
"'Wrote data/hackopendata-projects.csv'"
]
},
"execution_count": 7,
104 changes: 104 additions & 0 deletions harvester.py
@@ -0,0 +1,104 @@
import csv, json, requests

from requests.exceptions import JSONDecodeError

INPUT_URLS = 'urls.txt'
OUTPUT_PROJECTS = 'data/projects.csv'
OUTPUT_EVENTS = 'data/events.csv'
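# Safety cap: stop fetching further events from a server once this many projects have been collected from it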
MAX_PROJECTS = 100000

def main():
with open(INPUT_URLS, 'r') as f:
urls = f.read().split('\n')
print("Getting ready to harvest %d sites" % len(urls))

eventcols, projectcols = get_datapackage_schema()
print("Data Package loaded, Table Schema ready!")

events = []
projects = []

for u in urls:
if not u.strip(): continue
print("--- Harvesting: %s" % u)
ee, pp = get_events_projects(u)
if ee is not None and pp is not None:
events.extend(ee)
projects.extend(pp)

save_data(eventcols, events, projectcols, projects)
print("Done.")


def get_datapackage_schema():
""" Loads a Data Package containing the target Table Schema """
projcols = eventcols = None
with open('datapackage.json') as f:
for res in json.load(f)['resources']:
if 'events' in res['name']:
eventcols = [ f['name'] for f in res['schema']['fields'] ]
if 'projects' in res['name']:
projcols = [ f['name'] for f in res['schema']['fields'] ]
return eventcols, projcols


def get_events_projects(urlhost):
""" Fetches all the events and projects from a Dribdat URL """

url = 'https://' + urlhost

try:
events_data = requests.get(url + '/api/events.json', timeout=5).json()
except JSONDecodeError:
print("!! invalid JSON data, skipping this server")
return None, None
all_events = events_data['events']
for event in all_events:
event['origin'] = urlhost
print('Collecting data from %d events' % len(all_events))

count_total = 0
project_data = {}
for event in all_events:
url_api = url + "/api/event/%d/projects.json" % event['id']
print('.. event %d (%s)' % (event['id'], event['name']))
try:
proj_data = requests.get(url_api).json()
except JSONDecodeError:
print("!! invalid JSON data, skipping this server")
return None, None
if 'projects' in proj_data:
project_data[event['id']] = proj_data['projects']
count_total = count_total + len(proj_data['projects'])
else:
project_data[event['id']] = []
print('!! no data for event %d' % event['id'])
if count_total > MAX_PROJECTS: break

all_projects = []
for pd in project_data:
for proj in project_data[pd]:
proj['origin'] = urlhost
all_projects.append(proj)
print('Downloaded a total of %d projects' % len(all_projects))

return all_events, all_projects


def save_data(eventcols, events, projectcols, projects):
""" Saves the events and projects to a file """

with open(OUTPUT_EVENTS, "w") as f:
cw = csv.DictWriter(f, eventcols, delimiter=',')
cw.writeheader()
cw.writerows(events)
print("Wrote %s" % OUTPUT_EVENTS)
with open(OUTPUT_PROJECTS, "w") as f:
cw = csv.DictWriter(f, projectcols, delimiter=',')
cw.writeheader()
cw.writerows(projects)
print("Wrote %s" % OUTPUT_PROJECTS)

if __name__ == "__main__":
main()