-
-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathdump_lemmas.py
110 lines (100 loc) · 2.71 KB
/
dump_lemmas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import sqlite3
from pathlib import Path
try:
from .utils import (
Prefs,
custom_lemmas_folder,
get_spacy_model_version,
insert_installed_libs,
load_plugin_json,
use_kindle_ww_db,
)
except ImportError:
from utils import (
Prefs,
custom_lemmas_folder,
get_spacy_model_version,
insert_installed_libs,
load_plugin_json,
use_kindle_ww_db,
)
def spacy_doc_path(
    spacy_model: str,
    model_version: str,
    lemma_lang: str,
    is_kindle: bool,
    plugin_path: Path,
    prefs: Prefs,
):
    """Return the path of the serialized spaCy docs file for a configuration.

    The file lives in ``custom_lemmas_folder(plugin_path)`` and its name is
    built from, in order: the spaCy model name (or ``lemma_lang`` when the
    model name is empty), ``kindle`` or ``wiktionary``, the ``gloss_lang``
    preference, ``model_version`` and the running Python ``major.minor``
    version.
    """
    import platform

    gloss_lang = prefs["gloss_lang"]

    # "kindle" is only used when the caller asked for it AND
    # use_kindle_ww_db() agrees for this language; otherwise "wiktionary".
    if is_kindle and use_kindle_ww_db(lemma_lang, prefs):
        source_tag = "kindle"
    else:
        source_tag = "wiktionary"

    major, minor = platform.python_version_tuple()[:2]
    py_version = f"{major}.{minor}"

    name_prefix = spacy_model or lemma_lang
    file_name = (
        f"{name_prefix}_{source_tag}"
        f"_{gloss_lang}_{model_version}_{py_version}"
    )
    return custom_lemmas_folder(plugin_path).joinpath(file_name)
def dump_spacy_docs(
    spacy_model: str,
    is_kindle: bool,
    lemma_lang: str,
    db_path: Path,
    plugin_path: Path,
    prefs: Prefs,
):
    """Load a spaCy pipeline and dump the lemma docs built from ``db_path``.

    Loads ``spacy_model`` (or a blank ``lemma_lang`` pipeline when the model
    name is an empty string), opens the SQLite database at ``db_path``, reads
    the package versions from ``data/deps.json`` and delegates to
    ``save_spacy_docs``. The database connection is always closed, even when
    reading the versions or saving the docs raises.
    """
    # spaCy is imported only after this call; presumably it makes the
    # plugin's installed libraries importable -- confirm in utils.
    insert_installed_libs(plugin_path)
    import spacy

    nlp = spacy.load(spacy_model) if spacy_model != "" else spacy.blank(lemma_lang)
    lemmas_conn = sqlite3.connect(db_path)
    try:
        pkg_versions = load_plugin_json(plugin_path, "data/deps.json")
        save_spacy_docs(
            nlp,
            spacy_model,
            get_spacy_model_version(spacy_model, pkg_versions),
            lemma_lang,
            is_kindle,
            lemmas_conn,
            plugin_path,
            prefs,
        )
    finally:
        # Release the database handle even if anything above failed.
        lemmas_conn.close()
def save_spacy_docs(
    nlp,
    spacy_model: str,
    model_version: str,
    lemma_lang: str,
    is_kindle: bool,
    lemmas_conn: sqlite3.Connection,
    plugin_path: Path,
    prefs: Prefs,
):
    """Tokenize the enabled lemmas and forms and write them as a ``DocBin``.

    Selects every lemma, and every form whose part of speech matches a sense,
    that has an enabled sense with difficulty at or below the limit. Each
    word is lowercased, run through ``nlp.tokenizer`` and added to a
    ``DocBin`` that stores only the ``LOWER`` attribute. The result is
    written to the path returned by ``spacy_doc_path``.
    """
    from spacy.tokens import DocBin

    doc_bin = DocBin(attrs=["LOWER"])

    # Kindle data uses a fixed limit of 5; Wiktionary data reads the
    # per-language limit from the preferences.
    if is_kindle:
        max_difficulty = 5
    else:
        max_difficulty = prefs[f"{lemma_lang}_wiktionary_difficulty_limit"]

    select_words_sql = """
SELECT DISTINCT lemma
FROM lemmas l
JOIN senses s ON l.id = s.lemma_id AND enabled = 1 AND difficulty <= :difficulty
UNION ALL
SELECT DISTINCT form
FROM lemmas l
JOIN forms f ON l.id = f.lemma_id
JOIN senses s ON l.id = s.lemma_id AND f.pos = s.pos
AND enabled = 1 AND difficulty <= :difficulty
"""

    tokenize = nlp.tokenizer.pipe
    rows = lemmas_conn.execute(select_words_sql, {"difficulty": max_difficulty})
    # Each row has a single column: the lemma or the inflected form.
    lowered_words = (row[0].lower() for row in rows)
    for word_doc in tokenize(lowered_words):
        doc_bin.add(word_doc)

    output_path = spacy_doc_path(
        spacy_model, model_version, lemma_lang, is_kindle, plugin_path, prefs
    )
    doc_bin.to_disk(output_path)