From 19dfb9ab8e4cfad383f5f436fbc60e4bc43b31a0 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 14:22:14 +0100 Subject: [PATCH 01/13] Sort resources before persisting them --- snips_nlu/resources.py | 9 ++++++--- snips_nlu/slot_filler/feature_factory.py | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/snips_nlu/resources.py b/snips_nlu/resources.py index 42dc2817d..65de1ae87 100644 --- a/snips_nlu/resources.py +++ b/snips_nlu/resources.py @@ -254,6 +254,7 @@ def _load_stop_words(stop_words_path): def _persist_stop_words(stop_words, path): + stop_words = sorted(stop_words) with path.open(encoding="utf8", mode="w") as f: for stop_word in stop_words: f.write("%s\n" % stop_word) @@ -305,7 +306,7 @@ def _load_word_clusters(path): def _persist_word_clusters(word_clusters, path): with path.open(encoding="utf8", mode="w") as f: - for word, cluster in iteritems(word_clusters): + for word, cluster in sorted(iteritems(word_clusters)): f.write("%s\t%s\n" % (word, cluster)) @@ -327,6 +328,8 @@ def _load_gazetteer(path): def _persist_gazetteer(gazetteer, path): + # Sort gazetteer to avoid serialization diffs + gazetteer = sorted(gazetteer) with path.open(encoding="utf8", mode="w") as f: for word in gazetteer: f.write("%s\n" % word) @@ -355,7 +358,7 @@ def _persist_stems(stems, path): for value, stem in iteritems(stems): reversed_stems[stem].append(value) with path.open(encoding="utf8", mode="w") as f: - for stem, values in iteritems(reversed_stems): - elements = [stem] + values + for stem, values in sorted(iteritems(reversed_stems)): + elements = [stem] + sorted(values) line = ",".join(elements) f.write("%s\n" % line) diff --git a/snips_nlu/slot_filler/feature_factory.py b/snips_nlu/slot_filler/feature_factory.py index 743410726..c539b5b09 100644 --- a/snips_nlu/slot_filler/feature_factory.py +++ b/snips_nlu/slot_filler/feature_factory.py @@ -508,7 +508,8 @@ def language(self, value): def fit(self, dataset, intent): self.language = 
dataset[LANGUAGE] - self.builtin_entities = self._get_builtin_entity_scope(dataset, intent) + self.builtin_entities = sorted( + self._get_builtin_entity_scope(dataset, intent)) self.args["entity_labels"] = self.builtin_entities def build_features(self): From 17daa1ee2e129fc914e68435a1882604b07dbcb9 Mon Sep 17 00:00:00 2001 From: Rosa Stern Date: Wed, 20 Feb 2019 17:35:11 +0100 Subject: [PATCH 02/13] Add default configurations for Portuguese (PT and BR) - V0. --- snips_nlu/default_configs/config_pt_br.py | 170 ++++++++++++++++++++++ snips_nlu/default_configs/config_pt_pt.py | 170 ++++++++++++++++++++++ 2 files changed, 340 insertions(+) create mode 100644 snips_nlu/default_configs/config_pt_br.py create mode 100644 snips_nlu/default_configs/config_pt_pt.py diff --git a/snips_nlu/default_configs/config_pt_br.py b/snips_nlu/default_configs/config_pt_br.py new file mode 100644 index 000000000..e835f94f4 --- /dev/null +++ b/snips_nlu/default_configs/config_pt_br.py @@ -0,0 +1,170 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "deterministic_intent_parser", + "max_queries": 500, + "max_pattern_length": 1000, + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [ + -2, + -1, + 0, + 1, + 2 + ] + }, + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [ + -2, + 1 + ] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [ + -1, + 0, + 1 + ] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [ + -2, + -1, + 0 + ] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [ + 0, + 1, 
+ 2 + ] + }, + { + "args": { + "n": 1 + }, + "factory_name": "shape_ngram", + "offsets": [ + 0 + ] + }, + { + "args": { + "n": 2 + }, + "factory_name": "shape_ngram", + "offsets": [ + -1, + 0 + ] + }, + { + "args": { + "n": 3 + }, + "factory_name": "shape_ngram", + "offsets": [ + -1 + ] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2 + }, + "factory_name": "entity_match", + "offsets": [ + -2, + -1, + 0 + ], + "drop_out": 0.5 + }, + { + "args": { + "tagging_scheme_code": 1 + }, + "factory_name": "builtin_entity_match", + "offsets": [ + -2, + -1, + 0 + ] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + }, + "random_seed": None + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "random_seed": None + } + } + ] +} diff --git a/snips_nlu/default_configs/config_pt_pt.py b/snips_nlu/default_configs/config_pt_pt.py new file mode 100644 index 000000000..e835f94f4 --- /dev/null +++ b/snips_nlu/default_configs/config_pt_pt.py @@ -0,0 +1,170 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": 
"deterministic_intent_parser", + "max_queries": 500, + "max_pattern_length": 1000, + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [ + -2, + -1, + 0, + 1, + 2 + ] + }, + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [ + -2, + 1 + ] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [ + -1, + 0, + 1 + ] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [ + -2, + -1, + 0 + ] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [ + 0, + 1, + 2 + ] + }, + { + "args": { + "n": 1 + }, + "factory_name": "shape_ngram", + "offsets": [ + 0 + ] + }, + { + "args": { + "n": 2 + }, + "factory_name": "shape_ngram", + "offsets": [ + -1, + 0 + ] + }, + { + "args": { + "n": 3 + }, + "factory_name": "shape_ngram", + "offsets": [ + -1 + ] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2 + }, + "factory_name": "entity_match", + "offsets": [ + -2, + -1, + 0 + ], + "drop_out": 0.5 + }, + { + "args": { + "tagging_scheme_code": 1 + }, + "factory_name": "builtin_entity_match", + "offsets": [ + -2, + -1, + 0 + ] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + }, + "random_seed": None + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + 
"unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "random_seed": None + } + } + ] +} From 24a7cda11775f5f2b3cebe43d8d7390208a3cd3c Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Thu, 28 Feb 2019 11:30:09 +0100 Subject: [PATCH 03/13] Bump snips-nlu-parsers to 0.2.0 and snips-nlu-utils to 0.8.0 --- sample_datasets/beverage_dataset.json | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sample_datasets/beverage_dataset.json b/sample_datasets/beverage_dataset.json index ae90db545..4473ddde8 100644 --- a/sample_datasets/beverage_dataset.json +++ b/sample_datasets/beverage_dataset.json @@ -136,4 +136,4 @@ } }, "language": "en" -} \ No newline at end of file +} diff --git a/setup.py b/setup.py index f574442fc..45f687d67 100644 --- a/setup.py +++ b/setup.py @@ -24,8 +24,8 @@ "scikit-learn>=0.19,<0.20", "sklearn-crfsuite>=0.3.6,<0.4", "semantic_version>=2.6,<3.0", - "snips_nlu_utils>=0.7,<0.8", - "snips_nlu_parsers>=0.1,<0.2", + "snips_nlu_utils>=0.8,<0.9", + "snips_nlu_parsers>=0.2,<0.3", "num2words>=0.5.6,<0.6", "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", From ffbf66a9efb615b0f26381b3a6f2361c3d0c29bc Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Thu, 28 Feb 2019 13:42:45 +0100 Subject: [PATCH 04/13] Fix resources names --- snips_nlu/default_configs/config_pt_br.py | 4 ++-- snips_nlu/default_configs/config_pt_pt.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snips_nlu/default_configs/config_pt_br.py b/snips_nlu/default_configs/config_pt_br.py index e835f94f4..e8a478b5b 
100644 --- a/snips_nlu/default_configs/config_pt_br.py +++ b/snips_nlu/default_configs/config_pt_br.py @@ -17,7 +17,7 @@ { "args": { "common_words_gazetteer_name": - "top_10000_words_stemmed", + "top_5000_words_stemmed", "use_stemming": True, "n": 1 }, @@ -33,7 +33,7 @@ { "args": { "common_words_gazetteer_name": - "top_10000_words_stemmed", + "top_5000_words_stemmed", "use_stemming": True, "n": 2 }, diff --git a/snips_nlu/default_configs/config_pt_pt.py b/snips_nlu/default_configs/config_pt_pt.py index e835f94f4..e8a478b5b 100644 --- a/snips_nlu/default_configs/config_pt_pt.py +++ b/snips_nlu/default_configs/config_pt_pt.py @@ -17,7 +17,7 @@ { "args": { "common_words_gazetteer_name": - "top_10000_words_stemmed", + "top_5000_words_stemmed", "use_stemming": True, "n": 1 }, @@ -33,7 +33,7 @@ { "args": { "common_words_gazetteer_name": - "top_10000_words_stemmed", + "top_5000_words_stemmed", "use_stemming": True, "n": 2 }, From 19d2485034a700788a86c6e2e350cfcfc59e549b Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Fri, 1 Mar 2019 16:02:00 +0100 Subject: [PATCH 05/13] Fix setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 45f687d67..006cb4dde 100644 --- a/setup.py +++ b/setup.py @@ -24,8 +24,8 @@ "scikit-learn>=0.19,<0.20", "sklearn-crfsuite>=0.3.6,<0.4", "semantic_version>=2.6,<3.0", - "snips_nlu_utils>=0.8,<0.9", - "snips_nlu_parsers>=0.2,<0.3", + "snips-nlu-utils>=0.8,<0.9", + "snips-nlu-parsers>=0.2,<0.3", "num2words>=0.5.6,<0.6", "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", From b0f7deea16e6cfb623eb7b2e1d44312d07c3b37a Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Fri, 1 Mar 2019 17:00:05 +0100 Subject: [PATCH 06/13] Temporarily update the CI to directly download PT resources --- tox.ini | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index c1cd55591..e16019964 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,20 @@ envlist = py27, py35,
py36, py37, integration-test, codecov, docs-html skip_install = true commands = pip install -e ".[test]" - snips-nlu download-all-languages + + snips-nlu download -d snips_nlu_pt_br-0.1.0 + snips-nlu download -d snips_nlu_pt_pt-0.1.0 + snips-nlu link snips_nlu_pt_br pt_br -f + snips-nlu link snips_nlu_pt_pt pt_pt -f + + snips-nlu download de + snips-nlu download en + snips-nlu download es + snips-nlu download fr + snips-nlu download it + snips-nlu download ja + snips-nlu download ko + snips-nlu download-language-entities fr snips-nlu download-language-entities en coverage run -m unittest discover @@ -19,7 +32,20 @@ basepython = python3.6 skip_install = true commands = pip install -e ".[test]" - snips-nlu download-all-languages + + snips-nlu download -d snips_nlu_pt_br-0.1.0 + snips-nlu download -d snips_nlu_pt_pt-0.1.0 + snips-nlu link snips_nlu_pt_br pt_br -f + snips-nlu link snips_nlu_pt_pt pt_pt -f + + snips-nlu download de + snips-nlu download en + snips-nlu download es + snips-nlu download fr + snips-nlu download it + snips-nlu download ja + snips-nlu download ko + python -m unittest discover -p 'linting_test*.py' python -m unittest discover -p 'integration_test*.py' From 0e92d557df12e55f42d90045e131aea27d373e21 Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Tue, 5 Mar 2019 17:43:35 +0100 Subject: [PATCH 07/13] Expose PT configs --- snips_nlu/constants.py | 2 ++ snips_nlu/default_configs/__init__.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/snips_nlu/constants.py b/snips_nlu/constants.py index 3acdeeef4..f37e48ac4 100644 --- a/snips_nlu/constants.py +++ b/snips_nlu/constants.py @@ -75,3 +75,5 @@ LANGUAGE_IT = "it" LANGUAGE_JA = "ja" LANGUAGE_KO = "ko" +LANGUAGE_PT_BR = "pt_br" +LANGUAGE_PT_PT = "pt_pt" diff --git a/snips_nlu/default_configs/__init__.py b/snips_nlu/default_configs/__init__.py index adc6fc9ea..d1b1f92fd 100644 --- a/snips_nlu/default_configs/__init__.py +++ b/snips_nlu/default_configs/__init__.py @@ -2,7 +2,7 @@ from 
snips_nlu.constants import ( LANGUAGE_DE, LANGUAGE_EN, LANGUAGE_ES, LANGUAGE_FR, LANGUAGE_IT, - LANGUAGE_JA, LANGUAGE_KO) + LANGUAGE_JA, LANGUAGE_KO, LANGUAGE_PT_BR, LANGUAGE_PT_PT) from .config_de import CONFIG as CONFIG_DE from .config_en import CONFIG as CONFIG_EN from .config_es import CONFIG as CONFIG_ES @@ -10,6 +10,8 @@ from .config_it import CONFIG as CONFIG_IT from .config_ja import CONFIG as CONFIG_JA from .config_ko import CONFIG as CONFIG_KO +from .config_pt_br import CONFIG as CONFIG_PT_BR +from .config_pt_pt import CONFIG as CONFIG_PT_PT DEFAULT_CONFIGS = { LANGUAGE_DE: CONFIG_DE, @@ -19,4 +21,6 @@ LANGUAGE_IT: CONFIG_IT, LANGUAGE_JA: CONFIG_JA, LANGUAGE_KO: CONFIG_KO, + LANGUAGE_PT_BR: CONFIG_PT_BR, + LANGUAGE_PT_PT: CONFIG_PT_PT, } From 30409f24a14cc842d4caaf28fb01a8148c388b22 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 15:46:19 +0100 Subject: [PATCH 08/13] Leverage entity scopes of each intent in deterministic intent parser --- .../deterministic_intent_parser.py | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/snips_nlu/intent_parser/deterministic_intent_parser.py b/snips_nlu/intent_parser/deterministic_intent_parser.py index 63238b9e6..29d78dfa9 100644 --- a/snips_nlu/intent_parser/deterministic_intent_parser.py +++ b/snips_nlu/intent_parser/deterministic_intent_parser.py @@ -56,7 +56,7 @@ def __init__(self, config=None, **shared): self._group_names_to_slot_names = None self.slot_names_to_group_names = None self.regexes_per_intent = None - self.builtin_scope = None + self.entity_scopes = None self.stop_words = None @property @@ -82,11 +82,16 @@ def slot_names_to_entities(self): def slot_names_to_entities(self, value): self._slot_names_to_entities = value if value is None: - self.builtin_scope = None + self.entity_scopes = None else: - self.builtin_scope = { - ent for slot_mapping in itervalues(value) - for ent in itervalues(slot_mapping) if is_builtin_entity(ent)} + self.entity_scopes = { 
+ intent: { + "builtin": {ent for ent in itervalues(slot_mapping) + if is_builtin_entity(ent)}, + "custom": {ent for ent in itervalues(slot_mapping) + if not is_builtin_entity(ent)} + } + for intent, slot_mapping in iteritems(value)} @property def group_names_to_slot_names(self): @@ -208,30 +213,27 @@ def _parse_top_intents(self, text, top_n, intents=None): "top_n argument must be greater or equal to 1, but got: %s" % top_n) - builtin_entities = self.builtin_entity_parser.parse( - text, scope=self.builtin_scope, use_cache=True) - custom_entities = self.custom_entity_parser.parse( - text, use_cache=True) - all_entities = builtin_entities + custom_entities - placeholder_fn = lambda entity_name: _get_entity_name_placeholder( - entity_name, self.language) - ranges_mapping, processed_text = replace_entities_with_placeholders( - text, all_entities, placeholder_fn=placeholder_fn) - - # We try to match both the input text and the preprocessed text to - # cover inconsistencies between labeled data and builtin entity parsing - cleaned_text = self._preprocess_text(text) - cleaned_processed_text = self._preprocess_text(processed_text) + def placeholder_fn(entity_name): + return _get_entity_name_placeholder(entity_name, self.language) results = [] + cleaned_text = self._preprocess_text(text) - for intent, regexes in iteritems(self.regexes_per_intent): + for intent, entity_scope in iteritems(self.entity_scopes): if intents is not None and intent not in intents: continue - for regex in regexes: + builtin_entities = self.builtin_entity_parser.parse( + text, scope=entity_scope["builtin"], use_cache=True) + custom_entities = self.custom_entity_parser.parse( + text, scope=entity_scope["custom"], use_cache=True) + all_entities = builtin_entities + custom_entities + mapping, processed_text = replace_entities_with_placeholders( + text, all_entities, placeholder_fn=placeholder_fn) + cleaned_processed_text = self._preprocess_text(processed_text) + for regex in 
self.regexes_per_intent[intent]: res = self._get_matching_result(text, cleaned_processed_text, - regex, intent, ranges_mapping) - if res is None: + regex, intent, mapping) + if res is None and cleaned_text != cleaned_processed_text: res = self._get_matching_result(text, cleaned_text, regex, intent) if res is not None: @@ -239,6 +241,7 @@ def _parse_top_intents(self, text, top_n, intents=None): break if len(results) == top_n: return results + return results @fitted_required From 674dd2b90e56e1c4fa6e89aa5b93840ca6b634df Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 15:58:02 +0100 Subject: [PATCH 09/13] Fix sample datasets --- sample_datasets/beverage_dataset.json | 3 +++ sample_datasets/flights_dataset.json | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/sample_datasets/beverage_dataset.json b/sample_datasets/beverage_dataset.json index 4473ddde8..0af16623c 100644 --- a/sample_datasets/beverage_dataset.json +++ b/sample_datasets/beverage_dataset.json @@ -66,6 +66,9 @@ "slot_name": "number_of_cups", "text": "one" }, + { + "text": " " + }, { "entity": "beverage_type", "slot_name": "beverage_type", diff --git a/sample_datasets/flights_dataset.json b/sample_datasets/flights_dataset.json index 9f88139a4..2e730b4a1 100644 --- a/sample_datasets/flights_dataset.json +++ b/sample_datasets/flights_dataset.json @@ -42,6 +42,9 @@ "slot_name": "destination", "text": "London" }, + { + "text": " " + }, { "entity": "snips/datetime", "slot_name": "flight_time", @@ -87,6 +90,9 @@ "slot_name": "departure", "text": "istanbul" }, + { + "text": " " + }, { "entity": "snips/datetime", "slot_name": "flight_time", @@ -112,6 +118,9 @@ "slot_name": "destination", "text": "chicago" }, + { + "text": " " + }, { "entity": "snips/datetime", "slot_name": "flight_datetime", @@ -145,6 +154,9 @@ "slot_name": "flight_time", "text": "this weekend" }, + { + "text": " " + }, { "text": " ?" 
} From 518f4345520e7bcff97245922c332c1fe7caf1c5 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 6 Mar 2019 10:30:56 +0100 Subject: [PATCH 10/13] Increase integration tests threshold to 0.95 --- snips_nlu/tests/integration_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snips_nlu/tests/integration_test.py b/snips_nlu/tests/integration_test.py index 6a92fdfad..ba93a374a 100644 --- a/snips_nlu/tests/integration_test.py +++ b/snips_nlu/tests/integration_test.py @@ -9,7 +9,7 @@ from snips_nlu.preprocessing import tokenize_light from snips_nlu.tests.utils import PERFORMANCE_DATASET_PATH, SnipsTest -INTENT_CLASSIFICATION_THRESHOLD = 0.9 +INTENT_CLASSIFICATION_THRESHOLD = 0.95 SLOT_FILLING_THRESHOLD = 0.85 SKIPPED_DATE_PREFIXES = {"at", "in", "for", "on"} From e590981b4e70d62d72d336a3359eaea283885f6b Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 6 Mar 2019 11:17:17 +0100 Subject: [PATCH 11/13] Add support for portuguese in documentation --- docs/source/languages.rst | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/docs/source/languages.rst b/docs/source/languages.rst index 68d7cddb1..3c6723764 100644 --- a/docs/source/languages.rst +++ b/docs/source/languages.rst @@ -7,22 +7,26 @@ Snips NLU supports various languages, that are speficied in the dataset in the ``"language"`` attribute. 
Here is the list of supported language along with their isocode: -+------------+------------+ -| Language | ISO code | -+============+============+ -| German | de | -+------------+------------+ -| English | en | -+------------+------------+ -| Spanish | es | -+------------+------------+ -| French | fr | -+------------+------------+ -| Italian | it | -+------------+------------+ -| Japanese | ja | -+------------+------------+ -| Korean | ko | -+------------+------------+ ++-----------------+------------+ +| Language | ISO code | ++=================+============+ +| German | de | ++-----------------+------------+ +| English | en | ++-----------------+------------+ +| Spanish | es | ++-----------------+------------+ +| French | fr | ++-----------------+------------+ +| Italian | it | ++-----------------+------------+ +| Japanese | ja | ++-----------------+------------+ +| Korean | ko | ++-----------------+------------+ +| Portuguese (BR) | pt_br | ++-----------------+------------+ +| Portuguese (PT) | pt_pt | ++-----------------+------------+ Support for additional languages will come in the future, stay tuned :) From 74409ac4c9182a02d576e69ac30c3f5687376ea0 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 6 Mar 2019 15:23:41 +0100 Subject: [PATCH 12/13] Update Changelog --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23cdc145c..d0d9e33f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ # Changelog All notable changes to this project will be documented in this file. +## [0.19.4] - 2019-03-06 +### Added +- Support for Portuguese: "pt_pt" and "pt_br" + +### Changed +- Enhancement: leverage entity scopes of each intent in deterministic intent parser + ## [0.19.3] - 2019-03-05 ### Fixed - Issue with intent classification reducing classification accuracy @@ -243,6 +250,7 @@ several commands. 
- Fix compiling issue with `bindgen` dependency when installing from source - Fix issue in `CRFSlotFiller` when handling builtin entities +[0.19.4]: https://github.com/snipsco/snips-nlu/compare/0.19.3...0.19.4 [0.19.3]: https://github.com/snipsco/snips-nlu/compare/0.19.2...0.19.3 [0.19.2]: https://github.com/snipsco/snips-nlu/compare/0.19.1...0.19.2 [0.19.1]: https://github.com/snipsco/snips-nlu/compare/0.19.0...0.19.1 From 532c84079b3ad3dfdeae477bb59908670f1c4afb Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 6 Mar 2019 15:24:03 +0100 Subject: [PATCH 13/13] Bump version to 0.19.4 --- snips_nlu/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py index 0c0398efb..d38334cd8 100644 --- a/snips_nlu/__about__.py +++ b/snips_nlu/__about__.py @@ -11,7 +11,7 @@ __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai" __license__ = "Apache License, Version 2.0" -__version__ = "0.19.3" +__version__ = "0.19.4" __model_version__ = "0.19.0" __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"