Commit 2380bd1

Code cleaning
1 parent ce2b352 commit 2380bd1

File tree

12 files changed: +244 -97 lines

README.md
+2 -2

@@ -1,6 +1,6 @@
-# Named Entity Recognition
+# Named Entity Recognition Plugin
 
-This Dataiku DSS plugin provides a recipe, macro and webapp to recognize Named Entities (people, dates, places, etc.) in text data.
+This Dataiku DSS plugin provides a recipe, macro and webapp to recognize Named Entities (people, dates, places, etc.) in text data.
 
 Documentation: https://www.dataiku.com/product/plugins/named-entity-recognition/
 

custom-recipes/named-entity-recognition-extract/recipe.json
+50 -22

@@ -10,15 +10,15 @@
         {
             "name": "input_dataset",
             "label": "Input dataset",
-            "description": "The dataset that contains your texts.",
+            "description": "Dataset containing the text data to analyze",
             "arity": "UNARY",
             "required": true,
             "acceptsDataset": true
         },
         {
             "name": "model_folder",
-            "label": "Model folder",
-            "description": "A managed folder for saving the NER model (only required if using Flair)",
+            "label": "Flair model (optional)",
+            "description": "Folder containing Flair model weights",
             "arity": "UNARY",
             "required": false,
             "acceptsManagedFolder": true,
@@ -30,44 +30,72 @@
         {
             "name": "output_dataset",
             "label": "Output dataset",
-            "description": "A dataset with the input texts and their corresponding entities",
+            "description": "Dataset with the input text and the corresponding entities",
             "arity": "UNARY",
             "required": true,
             "acceptsDataset": true
         }
     ],
     "params": [
+        {
+            "name": "separator_input",
+            "label": "Input parameters",
+            "type": "SEPARATOR"
+        },
         {
             "name": "text_column_name",
             "label": "Text column",
             "type": "COLUMN",
             "mandatory": true,
-            "columnRole": "input_dataset"
+            "columnRole": "input_dataset",
+            "allowedColumnTypes": [
+                "string"
+            ]
         },
         {
             "visibilityCondition": "model.ner_model=='spacy'",
             "name": "text_language_spacy",
-            "label": "Text language",
+            "label": "Language",
+            "description": "List of supported languages",
             "type": "SELECT",
-            "description": "Select the language of your texts.",
             "selectChoices": [
+                {
+                    "value": "zh",
+                    "label": "Chinese"
+                },
                 {
                     "value": "en",
                    "label": "English"
                 },
                 {
                     "value": "fr",
                     "label": "French"
+                },
+                {
+                    "value": "de",
+                    "label": "German"
+                },
+                {
+                    "value": "nb",
+                    "label": "Norwegian Bokmål"
+                },
+                {
+                    "value": "pl",
+                    "label": "Polish"
+                },
+                {
+                    "value": "es",
+                    "label": "Spanish"
                 }
             ],
             "defaultValue": "en"
         },
         {
             "visibilityCondition": "model.ner_model=='flair'",
             "name": "text_language_flair",
-            "label": "Text language",
+            "label": "Language",
             "type": "SELECT",
-            "description": "Select the language of your texts.",
+            "description": "Only supported language",
             "selectChoices": [
                 {
                     "value": "en",
@@ -77,41 +105,41 @@
             "defaultValue": "en"
         },
         {
-            "label": "Advanced",
+            "name": "separator_configuration",
+            "label": "Configuration",
             "type": "SEPARATOR"
         },
         {
             "name": "advanced_settings",
-            "label": "Show advanced Settings",
+            "label": "Expert mode",
             "type": "BOOLEAN",
-            "description": "",
-            "defaultValue": false
-        },
-        {
-            "visibilityCondition": "model.advanced_settings",
-            "name": "output_single_json",
-            "label": "Output single column",
-            "type": "BOOLEAN",
-            "description": "Output a single JSON column rather than one column per entity type",
             "defaultValue": false
         },
         {
             "visibilityCondition": "model.advanced_settings",
             "name": "ner_model",
             "label": "Model",
             "type": "SELECT",
-            "description": "SpaCy (multi-language, faster, less accurate) of Flair (Enlgish only, slower, more accurate).",
+            "description": "spaCy (multi-lingual, faster) or Flair (English only, slower)",
             "selectChoices": [
                 {
                     "value": "spacy",
-                    "label": "SpaCy"
+                    "label": "spaCy"
                 },
                 {
                     "value": "flair",
                     "label": "Flair"
                 }
             ],
             "defaultValue": "spacy"
+        },
+        {
+            "visibilityCondition": "model.advanced_settings",
+            "name": "output_single_json",
+            "label": "JSON output",
+            "type": "BOOLEAN",
+            "description": "Output a single JSON column rather than one column per entity type",
+            "defaultValue": false
         }
     ]
 }

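These parameters are consumed in the recipe's Python code through get_recipe_config(). A minimal sketch of those reads, with invented example values (the keys mirror the "name" fields declared above; this only runs inside a DSS recipe):

    from dataiku.customrecipe import get_recipe_config

    # Hypothetical runtime reads; keys match the "name" fields of the params above
    recipe_config = get_recipe_config()
    text_column_name = recipe_config.get("text_column_name")          # e.g. "review_text" (invented)
    advanced_settings = recipe_config.get("advanced_settings", False)
    ner_model = recipe_config.get("ner_model", "spacy")               # "spacy" or "flair"
    output_single_json = recipe_config.get("output_single_json", False)
    if ner_model == "spacy":
        language = recipe_config.get("text_language_spacy", "en")     # one of the SELECT choices above
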
@@ -1,11 +1,9 @@
 # -*- coding: utf-8 -*-
-import logging
-
-from tqdm import tqdm
-
 import dataiku
 from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 
+from dku_io_utils import process_dataset_chunks
+
 #############################
 # Input & Output datasets
 #############################
@@ -16,8 +14,6 @@
 output_dataset_name = get_output_names_for_role("output_dataset")[0]
 output_dataset = dataiku.Dataset(output_dataset_name)
 
-input_df = input_dataset.get_dataframe()
-
 #############################
 # Recipe Parameters
 #############################
@@ -26,7 +22,7 @@
 
 text_column_name = recipe_config.get("text_column_name", None)
 if not text_column_name:
-    raise ValueError("You did not choose a text column.")
+    raise ValueError("Please choose a text column")
 
 advanced_settings = recipe_config.get("advanced_settings", False)
 if advanced_settings:
@@ -38,8 +34,10 @@
 
 if ner_model == "spacy":
     from ner_utils_spacy import extract_entities
+
+    language = recipe_config.get("text_language_spacy", "en")
 else:
-    from ner_utils_flair import extract_entities
+    from ner_utils_flair import extract_entities, CustomSequenceTagger
 
     try:
         model_folder = get_input_names_for_role("model_folder")[0]
@@ -48,28 +46,24 @@
             "To use Flair, download the model using the macro and add the resulting folder as input to the recipe."
         )
     folder_path = dataiku.Folder(model_folder).get_path()
+    tagger = CustomSequenceTagger.load("ner-ontonotes-fast", folder_path)
 
 #############################
 # Main Loop
 #############################
 
-CHUNK_SIZE = 100
-n_lines = 0
-logging.info("Started chunk-processing of input Dataset.")
-for chunk_idx, df in enumerate(tqdm(input_dataset.iter_dataframes(chunksize=CHUNK_SIZE))):
-    # Process chunk
-    out_df = extract_entities(df[text_column_name].fillna(" "), format=output_single_json)
+
+def compute_entities_df(df):
+    if ner_model == "spacy":
+        out_df = extract_entities(df[text_column_name].fillna(" "), format=output_single_json, language=language)
+    else:
+        out_df = extract_entities(df[text_column_name].fillna(" "), format=output_single_json, tagger=tagger)
     df = df.reset_index(drop=True)
     out_df = out_df.reset_index(drop=True)
     out_df = df.merge(out_df, left_index=True, right_index=True)
+    return out_df
 
-    # Append dataframe to output Dataset
-    if chunk_idx == 0:
-        output_dataset.write_schema_from_dataframe(out_df)
-        writer = output_dataset.get_writer()
-        writer.write_dataframe(out_df)
-    else:
-        writer.write_dataframe(out_df)
-    n_lines += len(df)
-logging.info("Finished processing {} lines".format(n_lines))
-writer.close()
+
+process_dataset_chunks(
+    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
+)
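
The refactor above relies on a simple per-chunk contract: the function passed to process_dataset_chunks takes one pandas DataFrame and returns a DataFrame with the same rows plus the new columns. A self-contained sketch of that contract with plain pandas (the function and column names are invented for illustration and are not part of the plugin):

    import pandas as pd

    def add_text_length(df: pd.DataFrame) -> pd.DataFrame:
        # Any transformation works, as long as one DataFrame goes in and one comes out
        out_df = df.copy()
        out_df["text_length"] = out_df["text"].fillna("").str.len()
        return out_df

    chunk = pd.DataFrame({"text": ["Alice went to Paris.", None]})
    print(add_text_length(chunk))
    # In the recipe, this role is played by compute_entities_df, passed as `func`
    # to process_dataset_chunks together with chunksize=100.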

plugin.json
+8 -5

@@ -1,14 +1,17 @@
 {
     "id": "named-entity-recognition",
-    "version": "1.2.1",
+    "version": "1.3.0",
     "meta": {
         "label": "Named Entity Recognition",
         "category": "Natural Language Processing",
-        "description": "Identify “real-world objects” (people names, dates, places, etc.) in a text",
-        "author": "Dataiku (Alex COMBESSIE and Hicham EL BOUKKOURI)",
+        "description": "Recognize Named Entities in text data using pre-trained models",
+        "author": "Dataiku (Alex COMBESSIE, Du PHAN, Hicham EL BOUKKOURI)",
         "icon": "icon-tags",
         "licenseInfo": "Apache Software License",
         "url": "https://www.dataiku.com/product/plugins/named-entity-recognition/",
-        "tags": ["NLP"]
+        "tags": [
+            "NLP"
+        ],
+        "supportLevel": "NOT_SUPPORTED"
     }
-}
+}

python-lib/dku_io_utils.py
+119

@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+"""Module with read/write utility functions based on the Dataiku API"""
+
+import logging
+import math
+from time import time
+from typing import Callable, Dict
+
+from tqdm import tqdm
+import dataiku
+
+
+def count_records(dataset: dataiku.Dataset) -> int:
+    """Count the number of records of a dataset using the Dataiku dataset metrics API
+
+    Args:
+        dataset: dataiku.Dataset instance
+
+    Returns:
+        Number of records
+    """
+    metric_id = "records:COUNT_RECORDS"
+    partitions = dataset.read_partitions
+    client = dataiku.api_client()
+    project = client.get_project(dataset.project_key)
+    record_count = 0
+    logging.info("Counting records of dataset: {}...".format(dataset.name))
+    if partitions is None or len(partitions) == 0:
+        project.get_dataset(dataset.short_name).compute_metrics(metric_ids=[metric_id])
+        metric = dataset.get_last_metric_values()
+        record_count = dataiku.ComputedMetrics.get_value_from_data(metric.get_global_data(metric_id=metric_id))
+        logging.info("Dataset {} contains {:d} records and is not partitioned".format(dataset.name, record_count))
+    else:
+        for partition in partitions:
+            project.get_dataset(dataset.name).compute_metrics(partition=partition, metric_ids=[metric_id])
+            metric = dataset.get_last_metric_values()
+            record_count += dataiku.ComputedMetrics.get_value_from_data(
+                metric.get_partition_data(partition=partition, metric_id=metric_id)
+            )
+        logging.info(
+            "Dataset {} contains {:d} records in partition(s) {}".format(dataset.name, record_count, partitions)
+        )
+    return record_count
+
+
+def process_dataset_chunks(
+    input_dataset: dataiku.Dataset, output_dataset: dataiku.Dataset, func: Callable, chunksize: float = 1000, **kwargs
+) -> None:
+    """Read a dataset by chunks, process each dataframe chunk with a function and write back to another dataset.
+
+    Passes keyword arguments to the function, adds a tqdm progress bar and generic logging.
+    Directly writes chunks to the output_dataset, so that only one chunk needs to be processed in-memory at a time.
+
+    Args:
+        input_dataset: Input dataiku.Dataset instance
+        output_dataset: Output dataiku.Dataset instance
+        func: The function to apply to the `input_dataset` by chunks of pandas.DataFrame
+            This function must take a pandas.DataFrame as first input argument,
+            and output another pandas.DataFrame
+        chunksize: Number of rows of each chunk of pandas.DataFrame fed to `func`
+        **kwargs: Optional keyword arguments fed to `func`
+    """
+    input_count_records = count_records(input_dataset)
+    if input_count_records == 0:
+        raise ValueError("Input dataset has no records")
+    logging.info(
+        "Processing dataset {} of {:d} rows by chunks of {:d}...".format(
+            input_dataset.name, input_count_records, chunksize
+        )
+    )
+    start = time()
+    with output_dataset.get_writer() as writer:
+        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize, infer_with_pandas=False)
+        len_iterator = math.ceil(input_count_records / chunksize)
+        for i, df in tqdm(enumerate(df_iterator), total=len_iterator):
+            output_df = func(df=df, **kwargs)
+            if i == 0:
+                output_dataset.write_schema_from_dataframe(
+                    output_df, dropAndCreate=bool(not output_dataset.writePartition)
+                )
+            writer.write_dataframe(output_df)
+    logging.info(
+        "Processing dataset {} of {:d} rows: Done in {:.2f} seconds.".format(
+            input_dataset.name, input_count_records, time() - start
+        )
+    )
+
+
+def set_column_description(
+    output_dataset: dataiku.Dataset, column_description_dict: Dict, input_dataset: dataiku.Dataset = None
+) -> None:
+    """Set column descriptions of the output dataset based on a dictionary of column descriptions
+
+    Retains the column descriptions from the input dataset if the column name matches.
+
+    Args:
+        output_dataset: Output dataiku.Dataset instance
+        column_description_dict: Dictionary holding column descriptions (value) by column name (key)
+        input_dataset: Optional input dataiku.Dataset instance
+            in case you want to retain input column descriptions
+    """
+    output_dataset_schema = output_dataset.read_schema()
+    input_dataset_schema = []
+    input_columns_names = []
+    if input_dataset is not None:
+        input_dataset_schema = input_dataset.read_schema()
+        input_columns_names = [col["name"] for col in input_dataset_schema]
+    for output_col_info in output_dataset_schema:
+        output_col_name = output_col_info.get("name", "")
+        output_col_info["comment"] = column_description_dict.get(output_col_name)
+        if output_col_name in input_columns_names:
+            matched_comment = [
+                input_col_info.get("comment", "")
+                for input_col_info in input_dataset_schema
+                if input_col_info.get("name") == output_col_name
+            ]
+            if len(matched_comment) != 0:
+                output_col_info["comment"] = matched_comment[0]
+    output_dataset.write_schema(output_dataset_schema)
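
For context, a hedged usage sketch of these helpers from a Python recipe or notebook inside DSS (the dataset names and the column description mapping are invented; only count_records, process_dataset_chunks and set_column_description come from the module above):

    import dataiku
    from dku_io_utils import count_records, process_dataset_chunks, set_column_description

    input_dataset = dataiku.Dataset("reviews")         # hypothetical dataset names
    output_dataset = dataiku.Dataset("reviews_scored")

    print(count_records(input_dataset))  # record count computed via the metrics API

    def passthrough(df):
        # Per-chunk function: takes a pandas DataFrame, returns a pandas DataFrame
        return df

    process_dataset_chunks(
        input_dataset=input_dataset, output_dataset=output_dataset, func=passthrough, chunksize=1000
    )

    # Optionally document the output columns, keeping descriptions of columns copied from the input
    set_column_description(
        output_dataset=output_dataset,
        column_description_dict={"sentence": "Raw input text"},  # invented mapping
        input_dataset=input_dataset,
    )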
