Skip to content

Commit c5a3ad7

Browse files
Merge pull request #5 from dataiku/fix/broken-model-link
Fix/broken model link
2 parents 72b5b45 + 2380bd1 commit c5a3ad7

File tree

20 files changed

+639
-555
lines changed

20 files changed

+639
-555
lines changed

.wlock

Whitespace-only changes.

Makefile

+45-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,48 @@
1-
PLUGIN_VERSION=1.2.0
2-
PLUGIN_ID=named-entity-recognition
1+
# Makefile variables set automatically
2+
plugin_id=`cat plugin.json | python -c "import sys, json; print(str(json.load(sys.stdin)['id']).replace('/',''))"`
3+
plugin_version=`cat plugin.json | python -c "import sys, json; print(str(json.load(sys.stdin)['version']).replace('/',''))"`
4+
archive_file_name="dss-plugin-${plugin_id}-${plugin_version}.zip"
5+
remote_url=`git config --get remote.origin.url`
6+
last_commit_id=`git rev-parse HEAD`
7+
38

49
plugin:
5-
cat plugin.json|json_pp > /dev/null
10+
@echo "[START] Archiving plugin to dist/ folder..."
11+
@cat plugin.json | json_pp > /dev/null
12+
@rm -rf dist
13+
@mkdir dist
14+
@echo "{\"remote_url\":\"${remote_url}\",\"last_commit_id\":\"${last_commit_id}\"}" > release_info.json
15+
@git archive -v -9 --format zip -o dist/${archive_file_name} HEAD
16+
@zip -u dist/${archive_file_name} release_info.json
17+
@rm release_info.json
18+
@echo "[SUCCESS] Archiving plugin to dist/ folder: Done!"
19+
20+
unit-tests:
21+
@echo "[START] Running unit tests..."
22+
@( \
23+
PYTHON_VERSION=`python3 -V 2>&1 | sed 's/[^0-9]*//g' | cut -c 1,2`; \
24+
PYTHON_VERSION_IS_CORRECT=`cat code-env/python/desc.json | python3 -c "import sys, json; print(str($$PYTHON_VERSION) in [x[-2:] for x in json.load(sys.stdin)['acceptedPythonInterpreters']]);"`; \
25+
if ! $$PYTHON_VERSION_IS_CORRECT; then echo "Python version $$PYTHON_VERSION is not in acceptedPythonInterpreters"; exit 1; fi; \
26+
)
27+
@( \
28+
python3 -m venv env/; \
29+
source env/bin/activate; \
30+
pip3 install --upgrade pip; \
31+
pip3 install --no-cache-dir -r tests/python/requirements.txt; \
32+
pip3 install --no-cache-dir -r code-env/python/spec/requirements.txt; \
33+
export PYTHONPATH="$(PYTHONPATH):$(PWD)/python-lib"; \
34+
export DICTIONARY_FOLDER_PATH="$(PWD)/resource/dictionaries"; \
35+
pytest -o junit_family=xunit2 --junitxml=unit.xml tests/python/unit || true; \
36+
deactivate; \
37+
)
38+
@echo "[SUCCESS] Running unit tests: Done!"
39+
40+
integration-tests:
41+
@echo "[START] Running integration tests..."
42+
# TODO add integration tests
43+
@echo "[SUCCESS] Running integration tests: Done!"
44+
45+
tests: unit-tests integration-tests
46+
47+
dist-clean:
648
rm -rf dist
7-
mkdir dist
8-
zip --exclude "*.pyc" -r dist/dss-plugin-${PLUGIN_ID}-${PLUGIN_VERSION}.zip plugin.json code-env custom-recipes python-lib python-runnables resource webapps

README.md

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
# Named Entity Recognition
1+
# Named Entity Recognition Plugin
22

3-
This Dataiku DSS plugin provides recipes to recognize Named Entities (people, dates, places, etc.) in text data.
3+
This Dataiku DSS plugin provides a recipe, a macro, and a webapp to recognize named entities (people, dates, places, etc.) in text data.
44

55
Documentation: https://www.dataiku.com/product/plugins/named-entity-recognition/
66

7-
### Licence
7+
## License
8+
89
This plugin is distributed under the Apache License, version 2.0.

code-env/python/desc.json

100755100644
+6-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
{
2-
"acceptedPythonInterpreters": ["PYTHON36"],
3-
"forceConda": false,
4-
"installCorePackages": true,
5-
"installJupyterSupport": false
2+
"acceptedPythonInterpreters": [
3+
"PYTHON36"
4+
],
5+
"forceConda": false,
6+
"installCorePackages": true,
7+
"installJupyterSupport": false
68
}

code-env/python/spec/requirements.txt

+12-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
1-
torch==1.4.0
2-
flair==0.4.3
3-
spacy==2.1.8
4-
flask>=1.0,<1.1
1+
torch==1.6.0
2+
flair==0.6.1
3+
flask>=1.0,<1.1
4+
tqdm==4.50.0
5+
spacy==2.3.2
6+
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
7+
https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz
8+
https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-2.3.1/zh_core_web_sm-2.3.1.tar.gz
9+
https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-2.3.0/pl_core_news_sm-2.3.0.tar.gz
10+
https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-2.3.0/nb_core_news_sm-2.3.0.tar.gz
11+
https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
12+
https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz
+54-31
Original file line numberDiff line numberDiff line change
@@ -1,78 +1,101 @@
11
{
22
"meta": {
3-
"label": "Extract Named Entities",
4-
"description": "Identify “real-world objects” (i.e. People names, Dates, Places, etc) in a text column.",
3+
"label": "Named Entity Recognition",
4+
"description": "Identify “real-world objects” (i.e. People names, Dates, Places, etc) in a text column",
55
"icon": "icon-tag"
66
},
7-
87
"kind": "PYTHON",
9-
108
"selectableFromDataset": "input_dataset",
11-
129
"inputRoles": [
1310
{
1411
"name": "input_dataset",
1512
"label": "Input dataset",
16-
"description": "The dataset that contains your texts.",
13+
"description": "Dataset containing the text data to analyze",
1714
"arity": "UNARY",
1815
"required": true,
1916
"acceptsDataset": true
2017
},
2118
{
2219
"name": "model_folder",
23-
"label": "Model folder",
24-
"description": "A managed folder for saving the NER model (only required if using Flair).",
20+
"label": "Flair model (optional)",
21+
"description": "Folder containing Flair model weights",
2522
"arity": "UNARY",
2623
"required": false,
2724
"acceptsManagedFolder": true,
28-
"acceptsDataset": false
25+
"acceptsDataset": false,
26+
"mustBeStrictlyType": "Filesystem"
2927
}
3028
],
31-
3229
"outputRoles": [
3330
{
3431
"name": "output_dataset",
3532
"label": "Output dataset",
36-
"description": "A dataset with the input texts and their corresponding entities.",
33+
"description": "Dataset with the input text and the corresponding entities",
3734
"arity": "UNARY",
3835
"required": true,
3936
"acceptsDataset": true
4037
}
4138
],
42-
4339
"params": [
40+
{
41+
"name": "separator_input",
42+
"label": "Input parameters",
43+
"type": "SEPARATOR"
44+
},
4445
{
4546
"name": "text_column_name",
4647
"label": "Text column",
4748
"type": "COLUMN",
48-
"description": "Select a column to extract named entities.",
4949
"mandatory": true,
50-
"columnRole": "input_dataset"
50+
"columnRole": "input_dataset",
51+
"allowedColumnTypes": [
52+
"string"
53+
]
5154
},
5255
{
5356
"visibilityCondition": "model.ner_model=='spacy'",
5457
"name": "text_language_spacy",
55-
"label": "Text language",
58+
"label": "Language",
59+
"description": "List of supported languages",
5660
"type": "SELECT",
57-
"description": "Select the language of your texts.",
5861
"selectChoices": [
62+
{
63+
"value": "zh",
64+
"label": "Chinese"
65+
},
5966
{
6067
"value": "en",
6168
"label": "English"
6269
},
6370
{
6471
"value": "fr",
6572
"label": "French"
73+
},
74+
{
75+
"value": "de",
76+
"label": "German"
77+
},
78+
{
79+
"value": "nb",
80+
"label": "Norwegian Bokmål"
81+
},
82+
{
83+
"value": "pl",
84+
"label": "Polish"
85+
},
86+
{
87+
"value": "es",
88+
"label": "Spanish"
6689
}
6790
],
6891
"defaultValue": "en"
6992
},
7093
{
7194
"visibilityCondition": "model.ner_model=='flair'",
7295
"name": "text_language_flair",
73-
"label": "Text language",
96+
"label": "Language",
7497
"type": "SELECT",
75-
"description": "Select the language of your texts.",
98+
"description": "Only supported language",
7699
"selectChoices": [
77100
{
78101
"value": "en",
@@ -82,41 +105,41 @@
82105
"defaultValue": "en"
83106
},
84107
{
85-
"label": "Advanced",
108+
"name": "separator_configuration",
109+
"label": "Configuration",
86110
"type": "SEPARATOR"
87111
},
88112
{
89113
"name": "advanced_settings",
90-
"label": "Show advanced Settings",
114+
"label": "Expert mode",
91115
"type": "BOOLEAN",
92-
"description": "",
93-
"defaultValue": false
94-
},
95-
{
96-
"visibilityCondition": "model.advanced_settings",
97-
"name": "output_single_json",
98-
"label": "Output single column",
99-
"type": "BOOLEAN",
100-
"description": "Output a single JSON column rather than one column per entity type",
101116
"defaultValue": false
102117
},
103118
{
104119
"visibilityCondition": "model.advanced_settings",
105120
"name": "ner_model",
106121
"label": "Model",
107122
"type": "SELECT",
108-
"description": "SpaCy (multi-language, faster, less accurate) of Flair (Enlgish only, slower, more accurate).",
123+
"description": "spaCy (multi-lingual, faster) or Flair (English only, slower)",
109124
"selectChoices": [
110125
{
111126
"value": "spacy",
112-
"label": "SpaCy"
127+
"label": "spaCy"
113128
},
114129
{
115130
"value": "flair",
116131
"label": "Flair"
117132
}
118133
],
119134
"defaultValue": "spacy"
135+
},
136+
{
137+
"visibilityCondition": "model.advanced_settings",
138+
"name": "output_single_json",
139+
"label": "JSON output",
140+
"type": "BOOLEAN",
141+
"description": "Output a single JSON column rather than one column per entity type",
142+
"defaultValue": false
120143
}
121144
]
122145
}
+34-46
Original file line numberDiff line numberDiff line change
@@ -1,81 +1,69 @@
11
# -*- coding: utf-8 -*-
22
import dataiku
3-
import pandas as pd
4-
from dataiku.customrecipe import *
3+
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
54

6-
import warnings
7-
8-
warnings.filterwarnings(action='ignore')
9-
10-
#############################
11-
# Logging Settings
12-
#############################
13-
14-
import logging
15-
16-
FORMAT = '[NER RECIPE] %(asctime)s - %(name)s - %(levelname)s - %(message)s'
17-
logging.basicConfig(format=FORMAT)
18-
logger = logging.getLogger()
19-
logger.setLevel(logging.INFO)
5+
from dku_io_utils import process_dataset_chunks
206

217
#############################
228
# Input & Output datasets
239
#############################
2410

25-
input_dataset_name = get_input_names_for_role('input_dataset')[0]
11+
input_dataset_name = get_input_names_for_role("input_dataset")[0]
2612
input_dataset = dataiku.Dataset(input_dataset_name)
2713

28-
output_dataset_name = get_output_names_for_role('output_dataset')[0]
14+
output_dataset_name = get_output_names_for_role("output_dataset")[0]
2915
output_dataset = dataiku.Dataset(output_dataset_name)
3016

31-
input_df = input_dataset.get_dataframe()
32-
3317
#############################
3418
# Recipe Parameters
3519
#############################
3620

3721
recipe_config = get_recipe_config()
3822

39-
text_column_name = recipe_config.get('text_column_name', None)
40-
if text_column_name == None:
41-
raise ValueError("You did not choose a text column.")
23+
text_column_name = recipe_config.get("text_column_name", None)
24+
if not text_column_name:
25+
raise ValueError("Please choose a text column")
4226

43-
advanced_settings = recipe_config.get('advanced_settings', False)
27+
advanced_settings = recipe_config.get("advanced_settings", False)
4428
if advanced_settings:
45-
output_single_json = recipe_config.get('output_single_json', False)
46-
ner_model = recipe_config.get('ner_model', 'spacy')
29+
output_single_json = recipe_config.get("output_single_json", False)
30+
ner_model = recipe_config.get("ner_model", "spacy")
4731
else:
4832
output_single_json = False
49-
ner_model = 'spacy'
33+
ner_model = "spacy"
5034

51-
if ner_model == 'spacy':
35+
if ner_model == "spacy":
5236
from ner_utils_spacy import extract_entities
37+
38+
language = recipe_config.get("text_language_spacy", "en")
5339
else:
54-
from ner_utils_flair import extract_entities
40+
from ner_utils_flair import extract_entities, CustomSequenceTagger
41+
42+
try:
43+
model_folder = get_input_names_for_role("model_folder")[0]
44+
except IndexError:
45+
raise Exception(
46+
"To use Flair, download the model using the macro and add the resulting folder as input to the recipe."
47+
)
48+
folder_path = dataiku.Folder(model_folder).get_path()
49+
tagger = CustomSequenceTagger.load("ner-ontonotes-fast", folder_path)
5550

5651
#############################
5752
# Main Loop
5853
#############################
5954

60-
CHUNK_SIZE = 100
61-
n_lines = 0
62-
logger.info("Started chunk-processing of input Dataset.")
63-
for chunk_idx, df in enumerate(input_dataset.iter_dataframes(chunksize=CHUNK_SIZE)):
64-
# Process chunk
65-
out_df = extract_entities(df[text_column_name].fillna(" "), format=output_single_json)
55+
56+
def compute_entities_df(df):
57+
if ner_model == "spacy":
58+
out_df = extract_entities(df[text_column_name].fillna(" "), format=output_single_json, language=language)
59+
else:
60+
out_df = extract_entities(df[text_column_name].fillna(" "), format=output_single_json, tagger=tagger)
6661
df = df.reset_index(drop=True)
6762
out_df = out_df.reset_index(drop=True)
6863
out_df = df.merge(out_df, left_index=True, right_index=True)
64+
return out_df
6965

70-
# Append dataframe to output Dataset
71-
if chunk_idx == 0:
72-
output_dataset.write_schema_from_dataframe(out_df)
73-
writer = output_dataset.get_writer()
74-
writer.write_dataframe(out_df)
75-
else:
76-
writer.write_dataframe(out_df)
77-
78-
n_lines += len(df)
79-
logger.info("Finished processing {} lines".format(n_lines))
8066

81-
writer.close()
67+
process_dataset_chunks(
68+
input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
69+
)

0 commit comments

Comments
 (0)