-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsample_spider_alternative.py
264 lines (241 loc) · 16.5 KB
/
sample_spider_alternative.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import scrapy
from scrapy.spiders import CrawlSpider
from converter.constants import Constants
from converter.items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \
LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, ResponseItemLoader, \
PermissionItemLoader, LomClassificationItemLoader
from converter.spiders.base_classes import LomBase
from converter.web_tools import WebEngine, WebTools
# This is an alternative approach to our previous "sample_spider.py" that might be easier to read and understand
# for web crawling beginners. Use whichever approach is more convenient for you.
# LAST UPDATE: 2021-08-20
# please also consult converter/items.py for all currently available keys/values in our crawler data model
class SampleSpiderAlternative(CrawlSpider, LomBase):
    """
    Template crawler that fills every (nested) ItemLoader by hand inside ``parse()``.

    This is an alternative to "sample_spider.py": instead of overriding one small method per metadata group,
    all loaders are created, filled and combined in a single ``parse()`` callback. Copy this class, replace the
    placeholder values and work through the TODO lists below.
    """

    name = "sample_spider_alternative"
    friendlyName = "Sample Source (alternative Method)"  # how your crawler should appear in the "Supplier"-list
    start_urls = ["https://edu-sharing.com"]  # starting point of your crawler, e.g. a sitemap, index, rss-feed etc.
    version = "0.0.1"  # this is used for timestamping your crawler results (if a source changes its layout/data,
    # make sure to increment this value to force a clear distinction between old and new crawler results)
    custom_settings = {
        'WEB_TOOLS': WebEngine.Playwright  # OPTIONAL: this attribute controls which tool is used for taking Screenshots
        # you can skip this attribute altogether if you want to use the default Settings (Splash)
    }

    def getId(self, response=None) -> str:
        """
        Placeholder for the unique identifier of a crawled material.

        As shipped, this method has no implementation and therefore returns ``None``.

        :param response: the response of the material that is currently being crawled (unused by the placeholder)
        :return: once implemented: the identifier (e.g. the URL of the material) as a string
        """
        # You have two choices here:
        # - either implement this method and return the current url of a material as a string
        # - or look into the parse()-method where
        #       base.add_value('sourceId', response.url)
        #   is set manually
        pass

    def getHash(self, response=None) -> str:
        """
        Placeholder for the hash that marks one version of a crawled material.

        As shipped, this method has no implementation and therefore returns ``None``.

        :param response: the response of the material that is currently being crawled (unused by the placeholder)
        :return: once implemented: the hash as a string
        """
        # The hash should always be unique, e.g. by string-concatenating using the publicationDate + self.version
        # you can implement this method here or simply look at the parse()-method where
        #       base.add_value('hash', hash_temp)
        # is set manually.
        pass

    def start_requests(self):
        """Yield one ``scrapy.Request`` per entry in ``start_urls``, each handled by ``parse()``."""
        for start_url in self.start_urls:
            yield scrapy.Request(url=start_url, callback=self.parse)

    async def parse(self, response: scrapy.http.Response, **kwargs):
        """
        Build the complete metadata item for ``response`` and yield it.

        This callback is an asynchronous generator (it awaits ``WebTools.getUrlData()`` and uses ``yield``):
        it creates a ``BaseItemLoader``, fills it with the nested LOM-, license-, valuespace-, permission- and
        response-items and finally yields the result of ``base.load_item()``.

        :param response: the response for one of the requests created in ``start_requests()``
        :param kwargs: additional keyword arguments passed to the callback (not used here)
        """
        # OPTIONAL: If you need to use playwright to crawl a website, this is how you can access the data provided
        # by Playwright's headless browser
        playwright_dict: dict = await WebTools.getUrlData(response.url, WebEngine.Playwright)
        html_body = playwright_dict.get("html")  # shown for reference only, this template does not use it further
        screenshot_bytes = playwright_dict.get("screenshot_bytes")  # to be used in base.screenshot_bytes

        base = BaseItemLoader()
        # ALL possible keys for the different Item and ItemLoader-classes can be found inside converter/items.py
        # TODO: fill "base"-keys with values for
        #  - sourceId           required    (see: getId()-method above)
        #  - hash               required    (see: getHash()-method above)
        #  - lom                required    (see: LomBaseItemloader below)
        #  - valuespaces        required    (see: ValuespaceItemLoader below)
        #  - permissions        required    (see: PermissionItemLoader below)
        #  - license            required    (see: LicenseItemLoader below)
        #  - lastModified       recommended
        #  - origin             optional    (only necessary if items need to be sorted into a specific sub-folder)
        #  - thumbnail          recommended
        #  - publisher          optional
        #  - binary             optional    (only needed if you're working with binary files (e.g. .pdf-files),
        #                                   if you want to see an example, check out "niedersachsen_abi_spider.py")
        #  - fulltext           optional    (if 'full text' content is provided by a source (e.g. raw HTML or a
        #                                   human readable string of text) store it within the 'fulltext' field.)
        #                                   If no 'fulltext' value was provided, the pipelines will try to fetch
        #                                   'full text' content from "ResponseItem.text" and save it here.
        #  - ai_allow_usage     optional    (filled automatically by the ``RobotsTxtPipeline`` and expects a boolean)
        #                                   indicates if an item is allowed to be used in AI training.
        base.add_value('sourceId', response.url)
        # if the source doesn't have a "datePublished" or "lastModified"-value in its header or JSON_LD,
        # you might have to help yourself with a unique string consisting of the datetime of the crawl + self.version
        hash_temp: str = "This string should consist of a date (publication date, preferably)" + self.version
        base.add_value('hash', hash_temp)
        last_modified = None
        base.add_value('lastModified', last_modified)
        thumbnail_url: str = "This string should hold the thumbnail URL"
        base.add_value('origin', 'premium_only')  # the OPTIONAL value for "origin" controls the subfolder-name
        # in the edu-sharing repository (e.g. if you need to make a distinction between learning objects that are free
        # to access or premium_only). in this example, items that have the "premium_only"-value will be sent to the
        # "SYNC_OBJ/<crawler_name>/premium_only/"-folder.
        # (This field is used in two different use-cases, both in "youtube_spider" and "lehreronline_spider")
        base.add_value('thumbnail', thumbnail_url)  # the thumbnail field expects an URL (as a String)
        base.add_value('screenshot_bytes', screenshot_bytes)  # this is an OPTIONAL field that will be CONSUMED within
        # the thumbnail pipeline to create a small/large thumbnail of the website itself

        lom = LomBaseItemloader()
        # TODO: afterwards fill up the LomBaseItem with
        #  - LomGeneralItem         required
        #  - LomTechnicalItem       required
        #  - LomLifeCycleItem       required    (multiple possible)
        #  - LomEducationalItem     required
        #  - LomClassificationItem  optional

        general = LomGeneralItemloader()
        # TODO: fill "general"-keys with values for
        #  - identifier         required
        #  - title              required
        #  - keyword            required
        #  - description        required
        #  - language           recommended (edu-sharing expects underscores in language-codes, e.g. 'en-US'
        #                                   needs to be replaced by 'en_US')
        #  - coverage           optional
        #  - structure          optional
        #  - aggregationLevel   optional
        # e.g.: the unique identifier might be the URL to a material
        general.add_value('identifier', response.url)
        # TODO: don't forget to add key-value-pairs for 'title', 'keyword' and 'description'!

        technical = LomTechnicalItemLoader()
        # TODO: fill "technical"-keys with values for
        #  - location                   required    (expected: URI / URL of a learning object / material)
        #  - format                     optional    (expected: MIME-type, e.g. 'text/html' for web-sites.
        #                                           Set this value only if absolutely necessary,
        #                                           e.g. during imports via oeh_spider)
        #  - size                       optional
        #  - requirement                optional
        #  - installationRemarks        optional
        #  - otherPlatformRequirements  optional
        #  - duration                   optional    (only applies to audiovisual content like videos/podcasts)
        # similar to how the "general"-LomGeneralItemLoader was filled with Items, individual values can be set with
        #     technical.add_value('key','value')
        # or replaced with:
        #     technical.replace_value('key', 'value')
        technical.add_value('location', response.url)  # if the learning object has a unique URL that's being
        # navigated by the crawler

        lifecycle = LomLifecycleItemloader()
        # TODO: fill "lifecycle"-keys with values for
        #  - role           recommended
        #  - firstName      recommended
        #  - lastName       recommended
        #  - url            recommended
        #  - date           recommended
        #  - organization   optional
        #  - email          optional
        #  - uuid           optional
        #  - title          optional    (academic title)
        #  - id_gnd         optional    (expected: URI)
        #  - id_orcid       optional    (expected: URI)
        #  - id_ror         optional    (expected: URI)
        #  - id_wikidata    optional    (expected: URI)
        lifecycle.add_value('role', 'author')
        # supported roles:
        # "author" / "editor" / "publisher" / "metadata_contributor" / "metadata_provider" / "unknown"
        # for further available role mappings, please take a look at converter/es_connector.py

        educational = LomEducationalItemLoader()
        # TODO: fill "educational"-keys with values for
        #  - description            recommended (= "Comments on how this learning object is to be used")
        #  - language               recommended
        #  - interactivityType      optional
        #  - interactivityLevel     optional
        #  - semanticDensity        optional
        #  - typicalAgeRange        optional
        #  - difficulty             optional
        #  - typicalLearningTime    optional

        classification = LomClassificationItemLoader()
        # TODO: fill "classification"-keys with values for
        #  - cost           optional
        #  - purpose        optional
        #  - taxonPath      optional
        #  - description    optional
        #  - keyword        optional

        # once you've filled "general", "technical", "lifecycle" and "educational" with values,
        # the LomBaseItem is loaded into the "base"-BaseItemLoader

        vs = ValuespaceItemLoader()
        # for possible values, either consult https://vocabs.openeduhub.de
        # or take a look at https://github.com/openeduhub/oeh-metadata-vocabs
        # wherever possible, please use the skos:Concept <key> instead of literal strings
        # (since they are more stable over a longer period of time)
        # TODO: fill "valuespaces"-keys with values for
        #  - discipline                 recommended
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/discipline.ttl)
        #    (please set discipline-values by their unique vocab-identifier: e.g. '060' for "Art education")
        #  - intendedEndUserRole        recommended
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/intendedEndUserRole.ttl)
        #  - learningResourceType       recommended
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl)
        #  - new_lrt                    recommended
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/new_lrt.ttl)
        #  - conditionsOfAccess         recommended
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/conditionsOfAccess.ttl)
        #  - containsAdvertisement      recommended
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/containsAdvertisement.ttl)
        #  - price                      recommended
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/price.ttl)
        #  - educationalContext         optional
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/educationalContext.ttl)
        #  - toolCategory               optional
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/toolCategory.ttl)
        #  - accessibilitySummary       optional
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/accessibilitySummary.ttl)
        #  - dataProtectionConformity   optional
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/dataProtectionConformity.ttl)
        #  - fskRating                  optional
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/fskRating.ttl)
        #  - languageLevel              optional
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/languageLevel.ttl)
        #  - oer                        optional
        #    (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/oer.ttl)
        vs.add_value('new_lrt', Constants.NEW_LRT_MATERIAL)

        lic = LicenseItemLoader()
        # TODO: fill "license"-keys with values for
        #  - url                required
        #  - oer                recommended ('oer' is automatically set if the 'url'-field above
        #                                   is recognized in LICENSE_MAPPINGS: for possible url-mapping values,
        #                                   please take a look at LICENSE_MAPPINGS in converter/constants.py)
        #  - author             recommended
        #  - internal           optional
        #  - description        optional
        #  - expirationDate     optional    (for content that expires, e.g. ÖR-Mediatheken)

        # Either fill the PermissionItemLoader manually (not necessary most of the time)
        permissions = PermissionItemLoader()
        # or (preferably) call the inherited getPermissions(response)-method
        # from converter/spiders/base_classes/lom_base.py by using super().:
        #     permissions = super().getPermissions(response)
        # TODO: if necessary, add/replace values for the following "permissions"-keys
        #  - public                     optional
        #  - groups                     optional
        #  - mediacenters               optional
        #  - autoCreateGroups           optional
        #  - autoCreateMediacenters     optional

        # Either fill the ResponseItemLoader manually (not necessary most of the time)
        response_loader = ResponseItemLoader()
        # or (preferably) call the inherited mapResponse(response)-method
        # from converter/spiders/base_classes/lom_base.py by using super().:
        #     response_loader = super().mapResponse(response)
        # TODO: if necessary, add/replace values for the following "response"-keys
        #  - url        required
        #  - status     unused
        #  - html       unused
        #  - text       optional    (use this field for 'full text' data)
        #  - headers    unused
        #  - cookies    unused
        #  - har        unused

        # once we've added all available values to the necessary keys in our LomGeneralItemLoader,
        # we call the load_item()-method to return a (now filled) LomGeneralItem to the LomBaseItemLoader.
        # We do the same for every other nested Item within LomBaseItem as well:
        lom.add_value('general', general.load_item())
        lom.add_value('technical', technical.load_item())
        lom.add_value('lifecycle', lifecycle.load_item())
        lom.add_value('educational', educational.load_item())
        lom.add_value('classification', classification.load_item())
        # after LomBaseItem is filled with metadata, we build and return it to our BaseItem
        base.add_value('lom', lom.load_item())
        base.add_value('license', lic.load_item())
        base.add_value('valuespaces', vs.load_item())
        base.add_value('permissions', permissions.load_item())
        base.add_value('response', response_loader.load_item())
        # once all scrapy.Items are loaded into our "base", we yield the BaseItem by calling the .load_item() method
        yield base.load_item()