Multisite #7 (Open)

Wants to merge 3 commits into base: main
3 changes: 3 additions & 0 deletions .gitignore
@@ -1 +1,4 @@
.~lock*
.ipynb_checkpoints/

urls.txt
12 changes: 12 additions & 0 deletions README.md
@@ -12,6 +12,18 @@ Inside the root folder you can find scripts used to generate the data files, and

Please check the Issues and leave us any questions there or in [our forum](https://forum.opendata.ch).

## Updating

Put a file called `urls.txt` in the root folder (next to `harvester.py`), containing the domain names of dribdat servers, one per line, e.g.:

```
now.makezurich.ch
baselhack.dribdat.cc
bd.hack4socialgood.ch
```

Then run `python harvester.py` to update the files in the **data** folder.
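
The resulting `data/projects.csv` and `data/events.csv` can then be loaded with any tabular tool. A minimal sketch using pandas (an assumption: pandas is not a dependency of this repository) to see how many projects each harvested server contributed:

```
import pandas as pd  # assumption: pandas is installed separately

# Every harvested row carries an `origin` column naming the dribdat server it came from
projects = pd.read_csv("data/projects.csv")
print(projects.groupby("origin").size())
```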

## License

All contents licensed [CC Attribution-Share Alike 4.0 International](http://creativecommons.org/licenses/by-sa/4.0/), unless otherwise stated.
158 changes: 0 additions & 158 deletions data/dribdat-projects-2020.csv

This file was deleted.

65 changes: 65 additions & 0 deletions data/events.csv

Large diffs are not rendered by default.

44 changes: 0 additions & 44 deletions data/hackopendata-events.csv

This file was deleted.

3,625 changes: 0 additions & 3,625 deletions data/hackopendata-projects.csv

This file was deleted.

5,213 changes: 5,213 additions & 0 deletions data/projects.csv

Large diffs are not rendered by default.

33 changes: 29 additions & 4 deletions datapackage.json
@@ -2,7 +2,8 @@
"profile": "tabular-data-package",
"resources": [
{
"path": "data/hackopendata-projects.csv",
"profile": "tabular-data-resource",
"path": "data/projects.csv",
"pathType": "local",
"title": "Archive of dribdat projects",
"name": "projects",
@@ -11,10 +12,17 @@
"encoding": "UTF-8",
"dialect": {
"delimiter": ",",
"quoteChar": "\""
"quoteChar": "\"",
"doubleQuote": true
},
"schema": {
"fields": [
{
"name": "origin",
"type": "string",
"format": "uri",
"description": "A link to the server where this project was uploaded"
},
{
"name": "id",
"type": "integer",
@@ -238,12 +246,29 @@
}
},
{
"profile": "tabular-data-resource",
"name": "events",
"title": "Archive of dribdat events",
"path": "data/events.csv",
"pathType": "local",
"profile": "tabular-data-resource",
"format": "csv",
"mediatype": "text/csv",
"encoding": "UTF-8",
"dialect": {
"delimiter": ",",
"quoteChar": "\"",
"doubleQuote": true
},

"schema": {
"fields": [
{
"name": "origin",
"type": "string",
"format": "uri",
"description": "A link to the server where this event was run"
},
{
"name": "community_url",
"type": "any",
@@ -291,12 +316,12 @@
},
{
"name": "location_lat",
"type": "integer",
"type": "number",
"format": "default"
},
{
"name": "location_lon",
"type": "integer",
"type": "number",
"format": "default"
},
{
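The expanded `datapackage.json` now declares a full Table Schema for both resources, so the regenerated CSVs can be checked against it. A minimal sketch, assuming the `frictionless` package is installed (it is not used by the harvester itself):

```
from frictionless import validate  # assumption: pip install frictionless

# Validate data/events.csv and data/projects.csv against the schemas in datapackage.json
report = validate("datapackage.json")
print(report.valid)
```
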
23 changes: 12 additions & 11 deletions harvester.ipynb
@@ -7,23 +7,21 @@
"source": [
"# Harvest all your hacks\n",
"\n",
"A Python 3 script which collects projects from a Dribdat instance. Just put in the URL of a server, and it will talk to the public API to collect the content.\n",
"\n",
"No special dependencies required, though you might need to `pip install requests`"
"A Python 3 script which collects projects from a Dribdat instance. Just put in the URL of your target servers, and it will talk to the public API to collect the content. No special dependencies required, though you might need to `pip install requests`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"id": "3a0e9c4c-5afe-4500-82ae-c56106d9f00c",
"metadata": {},
"outputs": [],
"source": [
"import csv, json, requests\n",
"\n",
"URL_DRIBDAT = 'https://hack.opendata.ch'\n",
"OUTPUT_PROJECTS = 'data/projects.csv'\n",
"OUTPUT_EVENTS = 'data/events.csv'\n",
"OUTPUT_PROJECTS = 'data/hackopendata-projects.csv'\n",
"OUTPUT_EVENTS = 'data/hackopendata-events.csv'\n",
"MAX_PROJECTS = 100000"
]
},
@@ -55,6 +53,8 @@
"source": [
"events_data = requests.get(URL_DRIBDAT + '/api/events.json').json()\n",
"events = events_data['events']\n",
"for event in events:\n",
" event['origin'] = URL_DRIBDAT\n",
"len(events)"
]
},
@@ -75,7 +75,7 @@
{
"data": {
"text/plain": [
"661"
"13"
]
},
"execution_count": 3,
@@ -108,7 +108,7 @@
{
"data": {
"text/plain": [
"661"
"13"
]
},
"execution_count": 4,
@@ -120,6 +120,7 @@
"all_projects = []\n",
"for pd in project_data:\n",
" for proj in project_data[pd]:\n",
" proj['origin'] = URL_DRIBDAT\n",
" all_projects.append(proj)\n",
"len(all_projects)"
]
@@ -141,7 +142,7 @@
{
"data": {
"text/plain": [
"'community_url,ends_at,gallery_url,has_finished,has_started,hashtags,hostname,id,location,location_lat,location_lon,logo_url,name,starts_at,summary,webpage_url'"
"'origin,community_url,ends_at,gallery_url,has_finished,has_started,hashtags,hostname,id,location,location_lat,location_lon,logo_url,name,starts_at,summary,webpage_url'"
]
},
"execution_count": 5,
@@ -169,7 +170,7 @@
{
"data": {
"text/plain": [
"'Wrote data/events.csv'"
"'Wrote data/hackopendata-events.csv'"
]
},
"execution_count": 6,
@@ -194,7 +195,7 @@
{
"data": {
"text/plain": [
"'Wrote data/projects.csv'"
"'Wrote data/hackopendata-projects.csv'"
]
},
"execution_count": 7,
104 changes: 104 additions & 0 deletions harvester.py
@@ -0,0 +1,104 @@
import csv, json, requests

from requests.exceptions import JSONDecodeError

INPUT_URLS = 'urls.txt'
OUTPUT_PROJECTS = 'data/projects.csv'
OUTPUT_EVENTS = 'data/events.csv'
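# Safety cap: stop fetching further events from a server once this many projects have been collected from it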
MAX_PROJECTS = 100000

def main():
with open(INPUT_URLS, 'r') as f:
urls = f.read().split('\n')
print("Getting ready to harvest %d sites" % len(urls))

eventcols, projectcols = get_datapackage_schema()
print("Data Package loaded, Table Schema ready!")

events = []
projects = []

for u in urls:
if not u.strip(): continue
print("--- Harvesting: %s" % u)
ee, pp = get_events_projects(u)
if ee is not None and pp is not None:
events.extend(ee)
projects.extend(pp)

save_data(eventcols, events, projectcols, projects)
print("Done.")


def get_datapackage_schema():
""" Loads a Data Package containing the target Table Schema """
projcols = eventcols = None
with open('datapackage.json') as f:
for res in json.load(f)['resources']:
if 'events' in res['name']:
eventcols = [ f['name'] for f in res['schema']['fields'] ]
if 'projects' in res['name']:
projcols = [ f['name'] for f in res['schema']['fields'] ]
return eventcols, projcols


def get_events_projects(urlhost):
""" Fetches all the events and projects from a Dribdat URL """

url = 'https://' + urlhost

try:
events_data = requests.get(url + '/api/events.json', timeout=5).json()
except JSONDecodeError:
print("!! invalid JSON data, skipping this server")
return None, None
all_events = events_data['events']
for event in all_events:
event['origin'] = urlhost
print('Collecting data from %d events' % len(all_events))

count_total = 0
project_data = {}
for event in all_events:
url_api = url + "/api/event/%d/projects.json" % event['id']
print('.. event %d (%s)' % (event['id'], event['name']))
try:
proj_data = requests.get(url_api).json()
except JSONDecodeError:
print("!! invalid JSON data, skipping this server")
return None, None
if 'projects' in proj_data:
project_data[event['id']] = proj_data['projects']
count_total = count_total + len(proj_data['projects'])
else:
project_data[event['id']] = []
print('!! no data for event %d' % event['id'])
if count_total > MAX_PROJECTS: break

all_projects = []
for pd in project_data:
for proj in project_data[pd]:
proj['origin'] = urlhost
all_projects.append(proj)
print('Downloaded a total of %d projects' % len(all_projects))

return all_events, all_projects


def save_data(eventcols, events, projectcols, projects):
""" Saves the events and projects to a file """

with open(OUTPUT_EVENTS, "w") as f:
cw = csv.DictWriter(f, eventcols, delimiter=',')
cw.writeheader()
cw.writerows(events)
print("Wrote %s" % OUTPUT_EVENTS)
with open(OUTPUT_PROJECTS, "w") as f:
cw = csv.DictWriter(f, projectcols, delimiter=',')
cw.writeheader()
cw.writerows(projects)
print("Wrote %s" % OUTPUT_PROJECTS)

if __name__ == "__main__":
main()