dataiku
diff --git a/‎.wlock b/‎.wlock
diff --git a/‎Makefile
+45-5 b/‎Makefile
+45-5
diff --git a/‎README.md
+4-3 b/‎README.md
+4-3
diff --git a/‎code-env/python/desc.json
100755100644
+6-4 b/‎code-env/python/desc.json
100755100644
+6-4
diff --git a/‎code-env/python/spec/requirements.txt
+12-4 b/‎code-env/python/spec/requirements.txt
+12-4
diff --git a/‎custom-recipes/named-entity-recognition-extract/recipe.json
100755100644
+54-31 b/‎custom-recipes/named-entity-recognition-extract/recipe.json
100755100644
+54-31
diff --git a/‎custom-recipes/named-entity-recognition-extract/recipe.py
100755100644
+34-46 b/‎custom-recipes/named-entity-recognition-extract/recipe.py
100755100644
+34-46
@@ -1,8 +1,48 @@
-PLUGIN_VERSION=1.2.0
-PLUGIN_ID=named-entity-recognition
+# Values derived from plugin.json and git. The backticks are not run by make: they are pasted into the recipes and run by the recipe shell.
+# python3 is used (not python) to match the interpreter the unit-tests target invokes; '/' is stripped so the values are safe in a file name.
+plugin_id=`cat plugin.json | python3 -c "import sys, json; print(str(json.load(sys.stdin)['id']).replace('/',''))"`
+plugin_version=`cat plugin.json | python3 -c "import sys, json; print(str(json.load(sys.stdin)['version']).replace('/',''))"`
+archive_file_name="dss-plugin-${plugin_id}-${plugin_version}.zip"
+remote_url=`git config --get remote.origin.url`
+last_commit_id=`git rev-parse HEAD`
 
 plugin:
-	cat plugin.json|json_pp > /dev/null
+	@echo "[START] Archiving plugin to dist/ folder..."
+	@cat plugin.json | json_pp > /dev/null
+	@rm -rf dist
+	@mkdir dist
+	@echo "{\"remote_url\":\"${remote_url}\",\"last_commit_id\":\"${last_commit_id}\"}" > release_info.json
+	@git archive -v -9 --format zip -o dist/${archive_file_name} HEAD
+	@zip -u dist/${archive_file_name} release_info.json
+	@rm release_info.json
+	@echo "[SUCCESS] Archiving plugin to dist/ folder: Done!"
+
+# unit-tests: check that python3 is listed in code-env/python/desc.json, then run pytest inside a fresh virtualenv (env/).
+# Steps are chained with && so a failed install or a failing test fails the target; `.` replaces the bash-only `source`.
+unit-tests:
+	@echo "[START] Running unit tests..."
+	@PYTHON_VERSION=`python3 -c "import sys; print('{}{}'.format(*sys.version_info[:2]))"`; \
+	PYTHON_VERSION_IS_CORRECT=`python3 -c "import json; print('PYTHON$$PYTHON_VERSION' in json.load(open('code-env/python/desc.json'))['acceptedPythonInterpreters'])"`; \
+	if [ "$$PYTHON_VERSION_IS_CORRECT" != "True" ]; then echo "Python version $$PYTHON_VERSION is not in acceptedPythonInterpreters"; exit 1; fi
+	@( \
+		python3 -m venv env/ && \
+		. env/bin/activate && \
+		pip3 install --upgrade pip && \
+		pip3 install --no-cache-dir -r tests/python/requirements.txt && \
+		pip3 install --no-cache-dir -r code-env/python/spec/requirements.txt && \
+		export PYTHONPATH="$(PYTHONPATH):$(CURDIR)/python-lib" && \
+		export DICTIONARY_FOLDER_PATH="$(CURDIR)/resource/dictionaries" && \
+		pytest -o junit_family=xunit2 --junitxml=unit.xml tests/python/unit && \
+		deactivate; \
+	)
+	@echo "[SUCCESS] Running unit tests: Done!"
+
+integration-tests:
+	@echo "[START] Running integration tests..."
+# TODO add integration tests (written as a make comment, without a leading TAB, so it is neither echoed nor sent to the shell)
+	@echo "[SUCCESS] Running integration tests: Done!"
+
+tests: unit-tests integration-tests
+
+dist-clean:
 	rm -rf dist
-	mkdir dist
-	zip --exclude "*.pyc" -r dist/dss-plugin-${PLUGIN_ID}-${PLUGIN_VERSION}.zip plugin.json code-env custom-recipes python-lib python-runnables resource webapps
 
@@ -1,8 +1,9 @@
-# Named Entity Recognition
+# Named Entity Recognition Plugin
 
-This Dataiku DSS plugin provides recipes to recognize Named Entities (people, dates, places, etc.) in text data.
+This Dataiku DSS plugin provides a recipe, macro and webapp to recognize Named Entities (people, dates, places, etc.) in text data.
 
 Documentation: https://www.dataiku.com/product/plugins/named-entity-recognition/
 
-### Licence
+## License
+
 This plugin is distributed under Apache License version 2.0
@@ -1,6 +1,8 @@
 {
-  "acceptedPythonInterpreters": ["PYTHON36"],
-  "forceConda": false,
-  "installCorePackages": true,
-  "installJupyterSupport": false
+    "acceptedPythonInterpreters": [
+        "PYTHON36"
+    ],
+    "forceConda": false,
+    "installCorePackages": true,
+    "installJupyterSupport": false
 }
@@ -1,4 +1,12 @@
-torch==1.4.0
-flair==0.4.3
-spacy==2.1.8
-flask>=1.0,<1.1
+torch==1.6.0
+flair==0.6.1
+flask>=1.0,<1.1
+tqdm==4.50.0
+spacy==2.3.2
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
+https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz
+https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-2.3.1/zh_core_web_sm-2.3.1.tar.gz
+https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-2.3.0/pl_core_news_sm-2.3.0.tar.gz
+https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-2.3.0/nb_core_news_sm-2.3.0.tar.gz
+https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
+https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz
@@ -1,78 +1,101 @@
 {
     "meta": {
-        "label": "Extract Named Entities",
-        "description": "Identify “real-world objects” (i.e. People names, Dates, Places, etc) in a text column.",
+        "label": "Named Entity Recognition",
+        "description": "Identify “real-world objects” (e.g. people names, dates, places, etc.) in a text column",
         "icon": "icon-tag"
     },
-
     "kind": "PYTHON",
-
     "selectableFromDataset": "input_dataset",
-
     "inputRoles": [
         {
             "name": "input_dataset",
             "label": "Input dataset",
-            "description": "The dataset that contains your texts.",
+            "description": "Dataset containing the text data to analyze",
             "arity": "UNARY",
             "required": true,
             "acceptsDataset": true
         },
         {
             "name": "model_folder",
-            "label": "Model folder",
-            "description": "A managed folder for saving the NER model (only required if using Flair).",
+            "label": "Flair model (optional)",
+            "description": "Folder containing Flair model weights",
             "arity": "UNARY",
             "required": false,
             "acceptsManagedFolder": true,
-            "acceptsDataset": false
+            "acceptsDataset": false,
+            "mustBeStrictlyType": "Filesystem"
         }
     ],
-
     "outputRoles": [
         {
             "name": "output_dataset",
             "label": "Output dataset",
-            "description": "A dataset with the input texts and their corresponding entities.",
+            "description": "Dataset with the input text and the corresponding entities",
             "arity": "UNARY",
             "required": true,
             "acceptsDataset": true
         }
     ],
-
     "params": [
+        {
+            "name": "separator_input",
+            "label": "Input parameters",
+            "type": "SEPARATOR"
+        },
         {
             "name": "text_column_name",
             "label": "Text column",
             "type": "COLUMN",
-            "description": "Select a column to extract named entities.",
             "mandatory": true,
-            "columnRole": "input_dataset"
+            "columnRole": "input_dataset",
+            "allowedColumnTypes": [
+                "string"
+            ]
         },
         {
             "visibilityCondition": "model.ner_model=='spacy'",
             "name": "text_language_spacy",
-            "label": "Text language",
+            "label": "Language",
+            "description": "List of supported languages",
             "type": "SELECT",
-            "description": "Select the language of your texts.",
             "selectChoices": [
+                {
+                    "value": "zh",
+                    "label": "Chinese"
+                },
                 {
                     "value": "en",
                     "label": "English"
                 },
                 {
                     "value": "fr",
                     "label": "French"
+                },
+                {
+                    "value": "de",
+                    "label": "German"
+                },
+                {
+                    "value": "nb",
+                    "label": "Norwegian Bokmål"
+                },
+                {
+                    "value": "pl",
+                    "label": "Polish"
+                },
+                {
+                    "value": "es",
+                    "label": "Spanish"
                 }
             ],
             "defaultValue": "en"
         },
         {
             "visibilityCondition": "model.ner_model=='flair'",
             "name": "text_language_flair",
-            "label": "Text language",
+            "label": "Language",
             "type": "SELECT",
-            "description": "Select the language of your texts.",
+            "description": "Only supported language",
             "selectChoices": [
                 {
                     "value": "en",
@@ -82,41 +105,41 @@
             "defaultValue": "en"
         },
         {
-            "label": "Advanced",
+            "name": "separator_configuration",
+            "label": "Configuration",
             "type": "SEPARATOR"
         },
         {
             "name": "advanced_settings",
-            "label": "Show advanced Settings",
+            "label": "Expert mode",
             "type": "BOOLEAN",
-            "description": "",
-            "defaultValue": false
-        },
-        {
-            "visibilityCondition": "model.advanced_settings",
-            "name": "output_single_json",
-            "label": "Output single column",
-            "type": "BOOLEAN",
-            "description": "Output a single JSON column rather than one column per entity type",
             "defaultValue": false
         },
         {
             "visibilityCondition": "model.advanced_settings",
             "name": "ner_model",
             "label": "Model",
             "type": "SELECT",
-            "description": "SpaCy (multi-language, faster, less accurate) of Flair (Enlgish only, slower, more accurate).",
+            "description": "spaCy (multi-lingual, faster) or Flair (English only, slower)",
             "selectChoices": [
                 {
                     "value": "spacy",
-                    "label": "SpaCy"
+                    "label": "spaCy"
                 },
                 {
                     "value": "flair",
                     "label": "Flair"
                 }
             ],
             "defaultValue": "spacy"
+        },
+        {
+            "visibilityCondition": "model.advanced_settings",
+            "name": "output_single_json",
+            "label": "JSON output",
+            "type": "BOOLEAN",
+            "description": "Output a single JSON column rather than one column per entity type",
+            "defaultValue": false
         }
     ]
 }
@@ -1,81 +1,69 @@
 # -*- coding: utf-8 -*-
 import dataiku
-import pandas as pd
-from dataiku.customrecipe import *
+from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 
-import warnings
-
-warnings.filterwarnings(action='ignore')
-
-#############################
-# Logging Settings
-#############################
-
-import logging
-
-FORMAT = '[NER RECIPE] %(asctime)s - %(name)s - %(levelname)s - %(message)s'
-logging.basicConfig(format=FORMAT)
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
+from dku_io_utils import process_dataset_chunks
 
 #############################
 # Input & Output datasets
 #############################
 
-input_dataset_name = get_input_names_for_role('input_dataset')[0]
+input_dataset_name = get_input_names_for_role("input_dataset")[0]
 input_dataset = dataiku.Dataset(input_dataset_name)
 
-output_dataset_name = get_output_names_for_role('output_dataset')[0]
+output_dataset_name = get_output_names_for_role("output_dataset")[0]
 output_dataset = dataiku.Dataset(output_dataset_name)
 
-input_df = input_dataset.get_dataframe()
-
 #############################
 # Recipe Parameters
 #############################
 
 recipe_config = get_recipe_config()
 
-text_column_name = recipe_config.get('text_column_name', None)
-if text_column_name == None:
-    raise ValueError("You did not choose a text column.")
+text_column_name = recipe_config.get("text_column_name", None)
+if not text_column_name:
+    raise ValueError("Please choose a text column")
 
-advanced_settings = recipe_config.get('advanced_settings', False)
+advanced_settings = recipe_config.get("advanced_settings", False)
 if advanced_settings:
-    output_single_json = recipe_config.get('output_single_json', False)
-    ner_model = recipe_config.get('ner_model', 'spacy')
+    output_single_json = recipe_config.get("output_single_json", False)
+    ner_model = recipe_config.get("ner_model", "spacy")
 else:
     output_single_json = False
-    ner_model = 'spacy'
+    ner_model = "spacy"
 
-if ner_model == 'spacy':
+if ner_model == "spacy":
     from ner_utils_spacy import extract_entities
+
+    language = recipe_config.get("text_language_spacy", "en")
 else:
-    from ner_utils_flair import extract_entities
+    from ner_utils_flair import extract_entities, CustomSequenceTagger
+
+    try:
+        model_folder = get_input_names_for_role("model_folder")[0]
+    except IndexError:
+        raise Exception(
+            "To use Flair, download the model using the macro and add the resulting folder as input to the recipe."
+        )
+    folder_path = dataiku.Folder(model_folder).get_path()
+    tagger = CustomSequenceTagger.load("ner-ontonotes-fast", folder_path)
 
 #############################
 # Main Loop
 #############################
 
-CHUNK_SIZE = 100
-n_lines = 0
-logger.info("Started chunk-processing of input Dataset.")
-for chunk_idx, df in enumerate(input_dataset.iter_dataframes(chunksize=CHUNK_SIZE)):
-    # Process chunk
-    out_df = extract_entities(df[text_column_name].fillna(" "), format=output_single_json)
+
+def compute_entities_df(df):
+    """Run the selected NER model on df's text column and merge the resulting entity columns onto df by row position."""
+    model_kwargs = {"language": language} if ner_model == "spacy" else {"tagger": tagger}
+    texts = df[text_column_name].fillna(" ")
+    out_df = extract_entities(texts, format=output_single_json, **model_kwargs)
     df = df.reset_index(drop=True)
     out_df = out_df.reset_index(drop=True)
     out_df = df.merge(out_df, left_index=True, right_index=True)
+    return out_df
 
-    # Append dataframe to output Dataset
-    if chunk_idx == 0:
-        output_dataset.write_schema_from_dataframe(out_df)
-        writer = output_dataset.get_writer()
-        writer.write_dataframe(out_df)
-    else:
-        writer.write_dataframe(out_df)
-
-    n_lines += len(df)
-    logger.info("Finished processing {} lines".format(n_lines))
 
-writer.close()
+process_dataset_chunks(
+    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
+)
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,8 @@`
`1`	`1`	`{`
`2`		`- "acceptedPythonInterpreters": ["PYTHON36"],`
`3`		`- "forceConda": false,`
`4`		`- "installCorePackages": true,`
`5`		`- "installJupyterSupport": false`
	`2`	`+ "acceptedPythonInterpreters": [`
	`3`	`+ "PYTHON36"`
	`4`	`+ ],`
	`5`	`+ "forceConda": false,`
	`6`	`+ "installCorePackages": true,`
	`7`	`+ "installJupyterSupport": false`
`6`	`8`	`}`