1_send_resources_to_linkproxy.py
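"""Send recently modified data.gouv.fr resources to a link-proxy instance.

The script walks the data.gouv.fr datasets API (most recently modified
first), posts each eligible resource URL to link-proxy (hard-coded to
http://localhost:5010 below) and records the resulting checks and run
statistics in a local SQLite database (orchestrator.db).

Run it with (assuming the dependencies click, dataset and requests are
installed):

    python 1_send_resources_to_linkproxy.py run
"""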
from datetime import datetime, timedelta

import click
import dataset as dataset_lib
import requests

db = dataset_lib.connect("sqlite:///orchestrator.db")

EXCLUDED_PATTERNS = [
    "resources/donnees-temps-reel-de-mesure-des-concentrations-de-polluants-atmospheriques-reglementes-1",
    "files.data.gouv.fr/lcsqa/concentrations-de-polluants-atmospheriques-reglementes",
    # already ignored by link-proxy, but let's not clutter things up
    "files.geo.data.gouv.fr/link-proxy/",
]
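
# orchestrator.db holds two tables: "runs" (one row per execution of this
# script, see record_run) and "checks" (one row per resource sent to
# link-proxy, see handle_dataset).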


@click.group()
def cli():
    pass


def get_last_run():
    """Return the date of the last recorded run, or 24 hours ago if there is none."""
    table = db["runs"]
    last_run = table.find_one(order_by="-date")
    if not last_run:
        return datetime.now() - timedelta(days=1)
    return last_run["date"]


def record_run(count, count_ignored):
    """Record how many resources were sent and how many were ignored during this run."""
    table = db["runs"]
    table.insert({
        "nb_resources": count,
        "nb_resources_ignored": count_ignored,
        "date": datetime.now(),
    })


def get_datasets(page):
    """Fetch one page of datasets from data.gouv.fr, most recently modified first."""
    r = requests.get(f"https://www.data.gouv.fr/api/1/datasets/?sort=-last_modified&page={page}")
    return r.json()["data"]


def modified_datasets(last_run):
    """Collect the datasets modified since `last_run`.

    Pages are sorted by modification date, so we can stop as soon as a
    dataset older than `last_run` shows up.
    """
    got_everything = False
    results = []
    page = 1
    while not got_everything:
        data = get_datasets(page)
        for d in data:
            modified = datetime.fromisoformat(d["last_modified"])
            got_everything = modified < last_run
            if not got_everything:
                results.append(d)
            else:
                break
        if got_everything:
            break
        page += 1
    return results


def send_to_linkproxy(url):
    """Ask the link-proxy instance to create a check for the given URL."""
    r = requests.post("http://localhost:5010", json={
        "location": url,
    })
    r.raise_for_status()
    return r.json()


def handle_dataset(dataset, last_run):
    """Send the resources of `dataset` modified since `last_run` to link-proxy.

    Returns the number of resources sent and the number ignored because their
    URL matches one of EXCLUDED_PATTERNS.
    """
    count = 0
    count_ignored = 0
    table = db["checks"]
    for resource in dataset["resources"]:
        try:
            modified_date = datetime.fromisoformat(resource["last_modified"])
            if last_run > modified_date:
                continue
            if any(excl in resource["url"] for excl in EXCLUDED_PATTERNS):
                count_ignored += 1
                continue
            try:
                res = send_to_linkproxy(resource["url"])
            except requests.HTTPError as e:
                click.secho(f"Error while creating check: {e}", err=True, fg="red")
            else:
                count += 1
                existing = table.find_one(
                    check_id=res["_id"],
                    dataset_id=dataset["id"],
                    resource_id=resource["id"],
                )
                data = {
                    "check_id": res["_id"],
                    "modified_at": datetime.now(),
                    "dataset_id": dataset["id"],
                    "resource_id": resource["id"],
                    "url": resource["url"],
                    "dataset_title": dataset["title"],
                    "resource_title": resource["title"],
                }
                if not existing:
                    data["created_at"] = datetime.now()
                    table.insert(data)
                else:
                    table.update(data, ["check_id", "resource_id", "dataset_id"])
        except Exception as e:
            click.secho(f"Exception while handling resource: {e}", err=True, fg="red")
    return count, count_ignored


@cli.command()
def run():
    last_run = get_last_run()
    click.echo(f"Last run: {last_run}")
    datasets = modified_datasets(last_run)
    count = 0
    count_ignored = 0
    with click.progressbar(datasets, label=f"Analysing {len(datasets)} datasets") as all_datasets:
        for dataset in all_datasets:
            _count, _count_ignored = handle_dataset(dataset, last_run)
            count += _count
            count_ignored += _count_ignored
    record_run(count, count_ignored)
    click.secho("Done!", fg="green")


if __name__ == "__main__":
    cli()