-
Notifications
You must be signed in to change notification settings - Fork 125
/
Copy pathsparql_dataframe.py
74 lines (62 loc) · 2.37 KB
/
sparql_dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Query a SPARQL endpoint and return results as a Pandas dataframe.
"""
import io
from typing import TYPE_CHECKING, Any, Dict, List, Union
from SPARQLWrapper.SmartWrapper import Bindings, SPARQLWrapper2, Value
from SPARQLWrapper.Wrapper import CSV, SELECT, SPARQLWrapper
if TYPE_CHECKING:
import pandas as pd
class QueryException(Exception):
pass
def get_sparql_dataframe_orig(
endpoint: str, query: Union[str, bytes]
) -> "pd.DataFrame":
"""copy paste from: https://github.com/lawlesst/sparql-dataframe"""
# pandas inside to avoid requiring it
import pandas as pd
sparql = SPARQLWrapper(endpoint)
sparql.setQuery(query)
if sparql.queryType != SELECT:
raise QueryException("Only SPARQL SELECT queries are supported.")
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
if isinstance(results, bytes):
_csv = io.StringIO(results.decode("utf-8"))
return pd.read_csv(_csv, sep=",")
else:
raise TypeError(type(results))
def get_sparql_typed_dict(
endpoint: str, query: Union[str, bytes]
) -> List[Dict[str, Value]]:
"""modified from: https://github.com/lawlesst/sparql-dataframe"""
# pandas inside to avoid requiring it
import pandas as pd
# rdflib in here because there is some meta stuff in the setup.py and Travis fails because rdflib is installed later
import rdflib.term
sparql = SPARQLWrapper2(endpoint)
sparql.setQuery(query)
if sparql.queryType != SELECT:
raise QueryException("Only SPARQL SELECT queries are supported.")
# sparql.setReturnFormat(JSON)
results = sparql.query()
if not isinstance(results, Bindings):
raise TypeError(type(results))
# consider perf hacking later, probably slow
# convert list of dicts to python types
d = []
for x in results.bindings:
row = {}
for k in x:
v = x[k]
vv = rdflib.term.Literal(v.value, datatype=v.datatype).toPython() # type: ignore[no-untyped-call]
row[k] = vv
d.append(row)
return d
def get_sparql_dataframe(endpoint: str, query: Union[str, bytes]) -> "pd.DataFrame":
# pandas inside to avoid requiring it
import pandas as pd
d = get_sparql_typed_dict(endpoint, query)
# TODO: will nan fill somehow, make more strict if there is way of getting the nan types from rdflib
df = pd.DataFrame(d)
return df