Skip to content

Commit

Permalink
Merge pull request #135 from Datenschule/use-brandenburg-wfs
Browse files Browse the repository at this point in the history
[BB] Use WFS to get data
  • Loading branch information
k-nut authored Apr 22, 2024
2 parents 4d3efc8 + ef24a9f commit c8ae013
Showing 1 changed file with 39 additions and 45 deletions.
84 changes: 39 additions & 45 deletions jedeschule/spiders/brandenburg.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,54 @@
from typing import List, Optional
import xml.etree.ElementTree as ET

import scrapy
from scrapy import Item

from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider


def first_or_none(item: Optional[List[str]]) -> Optional[str]:
    """Return the first element of *item*, or ``None`` if there is none.

    Both an empty list and ``None`` are accepted and yield ``None``.
    """
    # `item or []` lets callers pass the result of `dict.get(...)` directly,
    # even when the key is missing.
    return next(iter(item or []), None)


class BrandenburgSpider(SchoolSpider):
    """Spider that reads school records for Brandenburg from a WFS endpoint.

    A single ``GetFeature`` request returns one GML document. Every
    ``gml:featureMember`` in it is flattened into one dict by :meth:`parse`,
    and :meth:`normalize` maps such a dict onto a ``School`` item.
    """

    name = "brandenburg"

    start_urls = [
        "https://schullandschaft.brandenburg.de/edugis/wfs/schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=ms:Schul_Standorte&srsname=epsg:4326"
    ]

    def parse(self, response):
        """Yield one flat dict per ``gml:featureMember`` in the WFS response.

        Each child element of a feature becomes a key (its tag name without
        the XML namespace) whose value is the element's text. The nested
        ``ms:msGeometry`` element is expanded into the keys ``lat`` and
        ``lon`` instead; these are omitted when the feature carries no
        usable ``gml:pos`` value.
        """
        tree = ET.fromstring(response.body)

        namespaces = {
            "gml": "http://www.opengis.net/gml",
            "ms": "http://mapserver.gis.umn.edu/mapserver",
        }
        geometry_tag = "{%s}msGeometry" % namespaces["ms"]

        for school in tree.findall("gml:featureMember", namespaces):
            if len(school) == 0:
                # A featureMember without a feature element has nothing to report.
                continue

            data_elem = {}
            for entry in school[0]:
                if entry.tag == geometry_tag:
                    # This nested entry contains the coordinates that we
                    # would like to expand.
                    # NOTE(review): assumes the server emits "lat lon" order
                    # for srsname=epsg:4326 -- confirm against the endpoint.
                    position = entry.findtext(
                        "gml:Point/gml:pos", namespaces=namespaces
                    )
                    coordinates = position.split() if position else []
                    if len(coordinates) == 2:
                        data_elem["lat"], data_elem["lon"] = coordinates
                    continue
                # Strip the "{namespace}" prefix before using the tag as key.
                data_elem[entry.tag.split("}", 1)[-1]] = entry.text
            yield data_elem

    @staticmethod
    def normalize(item: Item) -> School:
        """Map one dict produced by :meth:`parse` onto a ``School`` item.

        Keys that are missing from *item* result in ``None`` for the
        corresponding field; the id is always prefixed with ``BB-``.
        """
        return School(
            name=item.get("schulname"),
            id="BB-{}".format(item.get("schul_nr")),
            address=item.get("strasse_hausnr"),
            zip=item.get("plz"),
            city=item.get("ort"),
            website=item.get("homepage"),
            email=item.get("dienst_email"),
            school_type=item.get("schulform"),
            fax=item.get("faxnummer"),
            phone=item.get("telefonnummer"),
            provider=item.get("schulamtname"),
            longitude=item.get("lon"),
            latitude=item.get("lat"),
        )

0 comments on commit c8ae013

Please sign in to comment.