From 19dfb9ab8e4cfad383f5f436fbc60e4bc43b31a0 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 14:22:14 +0100 Subject: [PATCH 01/13] Sort resources before persisting them --- snips_nlu/resources.py | 9 ++++++--- snips_nlu/slot_filler/feature_factory.py | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/snips_nlu/resources.py b/snips_nlu/resources.py index 42dc2817d..65de1ae87 100644 --- a/snips_nlu/resources.py +++ b/snips_nlu/resources.py @@ -254,6 +254,7 @@ def _load_stop_words(stop_words_path): def _persist_stop_words(stop_words, path): + stop_words = sorted(stop_words) with path.open(encoding="utf8", mode="w") as f: for stop_word in stop_words: f.write("%s\n" % stop_word) @@ -305,7 +306,7 @@ def _load_word_clusters(path): def _persist_word_clusters(word_clusters, path): with path.open(encoding="utf8", mode="w") as f: - for word, cluster in iteritems(word_clusters): + for word, cluster in sorted(iteritems(word_clusters)): f.write("%s\t%s\n" % (word, cluster)) @@ -327,6 +328,8 @@ def _load_gazetteer(path): def _persist_gazetteer(gazetteer, path): + # Sort gazetteer to avoid serialization diffs + gazetteer = sorted(gazetteer) with path.open(encoding="utf8", mode="w") as f: for word in gazetteer: f.write("%s\n" % word) @@ -355,7 +358,7 @@ def _persist_stems(stems, path): for value, stem in iteritems(stems): reversed_stems[stem].append(value) with path.open(encoding="utf8", mode="w") as f: - for stem, values in iteritems(reversed_stems): - elements = [stem] + values + for stem, values in sorted(iteritems(reversed_stems)): + elements = [stem] + sorted(values) line = ",".join(elements) f.write("%s\n" % line) diff --git a/snips_nlu/slot_filler/feature_factory.py b/snips_nlu/slot_filler/feature_factory.py index 743410726..c539b5b09 100644 --- a/snips_nlu/slot_filler/feature_factory.py +++ b/snips_nlu/slot_filler/feature_factory.py @@ -508,7 +508,8 @@ def language(self, value): def fit(self, dataset, intent): self.language = 
dataset[LANGUAGE] - self.builtin_entities = self._get_builtin_entity_scope(dataset, intent) + self.builtin_entities = sorted( + self._get_builtin_entity_scope(dataset, intent)) self.args["entity_labels"] = self.builtin_entities def build_features(self): From 17daa1ee2e129fc914e68435a1882604b07dbcb9 Mon Sep 17 00:00:00 2001 From: Rosa Stern Date: Wed, 20 Feb 2019 17:35:11 +0100 Subject: [PATCH 02/13] Add default configurations for Portuguese (PT and BR) - V0. --- snips_nlu/default_configs/config_pt_br.py | 170 ++++++++++++++++++++++ snips_nlu/default_configs/config_pt_pt.py | 170 ++++++++++++++++++++++ 2 files changed, 340 insertions(+) create mode 100644 snips_nlu/default_configs/config_pt_br.py create mode 100644 snips_nlu/default_configs/config_pt_pt.py diff --git a/snips_nlu/default_configs/config_pt_br.py b/snips_nlu/default_configs/config_pt_br.py new file mode 100644 index 000000000..e835f94f4 --- /dev/null +++ b/snips_nlu/default_configs/config_pt_br.py @@ -0,0 +1,170 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": "deterministic_intent_parser", + "max_queries": 500, + "max_pattern_length": 1000, + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [ + -2, + -1, + 0, + 1, + 2 + ] + }, + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [ + -2, + 1 + ] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [ + -1, + 0, + 1 + ] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [ + -2, + -1, + 0 + ] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [ + 0, + 1, 
+ 2 + ] + }, + { + "args": { + "n": 1 + }, + "factory_name": "shape_ngram", + "offsets": [ + 0 + ] + }, + { + "args": { + "n": 2 + }, + "factory_name": "shape_ngram", + "offsets": [ + -1, + 0 + ] + }, + { + "args": { + "n": 3 + }, + "factory_name": "shape_ngram", + "offsets": [ + -1 + ] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2 + }, + "factory_name": "entity_match", + "offsets": [ + -2, + -1, + 0 + ], + "drop_out": 0.5 + }, + { + "args": { + "tagging_scheme_code": 1 + }, + "factory_name": "builtin_entity_match", + "offsets": [ + -2, + -1, + 0 + ] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + }, + "random_seed": None + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + "unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "random_seed": None + } + } + ] +} diff --git a/snips_nlu/default_configs/config_pt_pt.py b/snips_nlu/default_configs/config_pt_pt.py new file mode 100644 index 000000000..e835f94f4 --- /dev/null +++ b/snips_nlu/default_configs/config_pt_pt.py @@ -0,0 +1,170 @@ +from __future__ import unicode_literals + +CONFIG = { + "unit_name": "nlu_engine", + "intent_parsers_configs": [ + { + "unit_name": 
"deterministic_intent_parser", + "max_queries": 500, + "max_pattern_length": 1000, + "ignore_stop_words": True + }, + { + "unit_name": "probabilistic_intent_parser", + "slot_filler_config": { + "unit_name": "crf_slot_filler", + "feature_factory_configs": [ + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 1 + }, + "factory_name": "ngram", + "offsets": [ + -2, + -1, + 0, + 1, + 2 + ] + }, + { + "args": { + "common_words_gazetteer_name": + "top_10000_words_stemmed", + "use_stemming": True, + "n": 2 + }, + "factory_name": "ngram", + "offsets": [ + -2, + 1 + ] + }, + { + "args": {}, + "factory_name": "is_digit", + "offsets": [ + -1, + 0, + 1 + ] + }, + { + "args": {}, + "factory_name": "is_first", + "offsets": [ + -2, + -1, + 0 + ] + }, + { + "args": {}, + "factory_name": "is_last", + "offsets": [ + 0, + 1, + 2 + ] + }, + { + "args": { + "n": 1 + }, + "factory_name": "shape_ngram", + "offsets": [ + 0 + ] + }, + { + "args": { + "n": 2 + }, + "factory_name": "shape_ngram", + "offsets": [ + -1, + 0 + ] + }, + { + "args": { + "n": 3 + }, + "factory_name": "shape_ngram", + "offsets": [ + -1 + ] + }, + { + "args": { + "use_stemming": True, + "tagging_scheme_code": 2 + }, + "factory_name": "entity_match", + "offsets": [ + -2, + -1, + 0 + ], + "drop_out": 0.5 + }, + { + "args": { + "tagging_scheme_code": 1 + }, + "factory_name": "builtin_entity_match", + "offsets": [ + -2, + -1, + 0 + ] + } + ], + "crf_args": { + "c1": 0.1, + "c2": 0.1, + "algorithm": "lbfgs" + }, + "tagging_scheme": 1, + "data_augmentation_config": { + "min_utterances": 200, + "capitalization_ratio": 0.2, + "add_builtin_entities_examples": True + }, + "random_seed": None + }, + "intent_classifier_config": { + "unit_name": "log_reg_intent_classifier", + "data_augmentation_config": { + "min_utterances": 20, + "noise_factor": 5, + "add_builtin_entities_examples": False, + "max_unknown_words": None, + "unknown_word_prob": 0.0, + 
"unknown_words_replacement_string": None + }, + "featurizer_config": { + "unit_name": "featurizer", + "pvalue_threshold": 0.4, + "added_cooccurrence_feature_ratio": 0.0, + "tfidf_vectorizer_config": { + "unit_name": "tfidf_vectorizer", + "use_stemming": True, + "word_clusters_name": None + }, + "cooccurrence_vectorizer_config": { + "unit_name": "cooccurrence_vectorizer", + "window_size": None, + "filter_stop_words": True, + "unknown_words_replacement_string": None, + "keep_order": True + } + }, + "random_seed": None + } + } + ] +} From 24a7cda11775f5f2b3cebe43d8d7390208a3cd3c Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Thu, 28 Feb 2019 11:30:09 +0100 Subject: [PATCH 03/13] Bump snips-nlu-parsers to 0.2.0 and snips-nlu-utils to 0.8.0 --- sample_datasets/beverage_dataset.json | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sample_datasets/beverage_dataset.json b/sample_datasets/beverage_dataset.json index ae90db545..4473ddde8 100644 --- a/sample_datasets/beverage_dataset.json +++ b/sample_datasets/beverage_dataset.json @@ -136,4 +136,4 @@ } }, "language": "en" -} \ No newline at end of file +} diff --git a/setup.py b/setup.py index f574442fc..45f687d67 100644 --- a/setup.py +++ b/setup.py @@ -24,8 +24,8 @@ "scikit-learn>=0.19,<0.20", "sklearn-crfsuite>=0.3.6,<0.4", "semantic_version>=2.6,<3.0", - "snips_nlu_utils>=0.7,<0.8", - "snips_nlu_parsers>=0.1,<0.2", + "snips_nlu_utils>=0.8,<0.9", + "snips_nlu_parsers>=0.2,<0.3", "num2words>=0.5.6,<0.6", "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", From ffbf66a9efb615b0f26381b3a6f2361c3d0c29bc Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Thu, 28 Feb 2019 13:42:45 +0100 Subject: [PATCH 04/13] Fix resources names --- snips_nlu/default_configs/config_pt_br.py | 4 ++-- snips_nlu/default_configs/config_pt_pt.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snips_nlu/default_configs/config_pt_br.py b/snips_nlu/default_configs/config_pt_br.py index e835f94f4..e8a478b5b 
100644 --- a/snips_nlu/default_configs/config_pt_br.py +++ b/snips_nlu/default_configs/config_pt_br.py @@ -17,7 +17,7 @@ { "args": { "common_words_gazetteer_name": - "top_10000_words_stemmed", + "top_5000_words_stemmed", "use_stemming": True, "n": 1 }, @@ -33,7 +33,7 @@ { "args": { "common_words_gazetteer_name": - "top_10000_words_stemmed", + "top_5000_words_stemmed", "use_stemming": True, "n": 2 }, diff --git a/snips_nlu/default_configs/config_pt_pt.py b/snips_nlu/default_configs/config_pt_pt.py index e835f94f4..e8a478b5b 100644 --- a/snips_nlu/default_configs/config_pt_pt.py +++ b/snips_nlu/default_configs/config_pt_pt.py @@ -17,7 +17,7 @@ { "args": { "common_words_gazetteer_name": - "top_10000_words_stemmed", + "top_5000_words_stemmed", "use_stemming": True, "n": 1 }, @@ -33,7 +33,7 @@ { "args": { "common_words_gazetteer_name": - "top_10000_words_stemmed", + "top_5000_words_stemmed", "use_stemming": True, "n": 2 }, From 19d2485034a700788a86c6e2e350cfcfc59e549b Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Fri, 1 Mar 2019 16:02:00 +0100 Subject: [PATCH 05/13] Fix setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 45f687d67..006cb4dde 100644 --- a/setup.py +++ b/setup.py @@ -24,8 +24,8 @@ "scikit-learn>=0.19,<0.20", "sklearn-crfsuite>=0.3.6,<0.4", "semantic_version>=2.6,<3.0", - "snips_nlu_utils>=0.8,<0.9", - "snips_nlu_parsers>=0.2,<0.3", + "snips-nlu-utils>=0.8,<0.9", + "snips-nlu-parsers>=0.2,<0.3", "num2words>=0.5.6,<0.6", "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", From b0f7deea16e6cfb623eb7b2e1d44312d07c3b37a Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Fri, 1 Mar 2019 17:00:05 +0100 Subject: [PATCH 06/13] Temporarily update the CI to directly download PT resources --- tox.ini | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index c1cd55591..e16019964 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,20 @@ envlist = py27, py35,
py36, py37, integration-test, codecov, docs-html skip_install = true commands = pip install -e ".[test]" - snips-nlu download-all-languages + + snips-nlu download -d snips_nlu_pt_br-0.1.0 + snips-nlu download -d snips_nlu_pt_pt-0.1.0 + snips-nlu link snips_nlu_pt_br pt_br -f + snips-nlu link snips_nlu_pt_pt pt_pt -f + + snips-nlu download de + snips-nlu download en + snips-nlu download es + snips-nlu download fr + snips-nlu download it + snips-nlu download ja + snips-nlu download ko + snips-nlu download-language-entities fr snips-nlu download-language-entities en coverage run -m unittest discover @@ -19,7 +32,20 @@ basepython = python3.6 skip_install = true commands = pip install -e ".[test]" - snips-nlu download-all-languages + + snips-nlu download -d snips_nlu_pt_br-0.1.0 + snips-nlu download -d snips_nlu_pt_pt-0.1.0 + snips-nlu link snips_nlu_pt_br pt_br -f + snips-nlu link snips_nlu_pt_pt pt_pt -f + + snips-nlu download de + snips-nlu download en + snips-nlu download es + snips-nlu download fr + snips-nlu download it + snips-nlu download ja + snips-nlu download ko + python -m unittest discover -p 'linting_test*.py' python -m unittest discover -p 'integration_test*.py' From 0e92d557df12e55f42d90045e131aea27d373e21 Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Tue, 5 Mar 2019 17:43:35 +0100 Subject: [PATCH 07/13] Expose PT configs --- snips_nlu/constants.py | 2 ++ snips_nlu/default_configs/__init__.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/snips_nlu/constants.py b/snips_nlu/constants.py index 3acdeeef4..f37e48ac4 100644 --- a/snips_nlu/constants.py +++ b/snips_nlu/constants.py @@ -75,3 +75,5 @@ LANGUAGE_IT = "it" LANGUAGE_JA = "ja" LANGUAGE_KO = "ko" +LANGUAGE_PT_BR = "pt_br" +LANGUAGE_PT_PT = "pt_pt" diff --git a/snips_nlu/default_configs/__init__.py b/snips_nlu/default_configs/__init__.py index adc6fc9ea..d1b1f92fd 100644 --- a/snips_nlu/default_configs/__init__.py +++ b/snips_nlu/default_configs/__init__.py @@ -2,7 +2,7 @@ from 
snips_nlu.constants import ( LANGUAGE_DE, LANGUAGE_EN, LANGUAGE_ES, LANGUAGE_FR, LANGUAGE_IT, - LANGUAGE_JA, LANGUAGE_KO) + LANGUAGE_JA, LANGUAGE_KO, LANGUAGE_PT_BR, LANGUAGE_PT_PT) from .config_de import CONFIG as CONFIG_DE from .config_en import CONFIG as CONFIG_EN from .config_es import CONFIG as CONFIG_ES @@ -10,6 +10,8 @@ from .config_it import CONFIG as CONFIG_IT from .config_ja import CONFIG as CONFIG_JA from .config_ko import CONFIG as CONFIG_KO +from .config_pt_br import CONFIG as CONFIG_PT_BR +from .config_pt_pt import CONFIG as CONFIG_PT_PT DEFAULT_CONFIGS = { LANGUAGE_DE: CONFIG_DE, @@ -19,4 +21,6 @@ LANGUAGE_IT: CONFIG_IT, LANGUAGE_JA: CONFIG_JA, LANGUAGE_KO: CONFIG_KO, + LANGUAGE_PT_BR: CONFIG_PT_BR, + LANGUAGE_PT_PT: CONFIG_PT_PT, } From 30409f24a14cc842d4caaf28fb01a8148c388b22 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 15:46:19 +0100 Subject: [PATCH 08/13] Leverage entity scopes of each intent in deterministic intent parser --- .../deterministic_intent_parser.py | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/snips_nlu/intent_parser/deterministic_intent_parser.py b/snips_nlu/intent_parser/deterministic_intent_parser.py index 63238b9e6..29d78dfa9 100644 --- a/snips_nlu/intent_parser/deterministic_intent_parser.py +++ b/snips_nlu/intent_parser/deterministic_intent_parser.py @@ -56,7 +56,7 @@ def __init__(self, config=None, **shared): self._group_names_to_slot_names = None self.slot_names_to_group_names = None self.regexes_per_intent = None - self.builtin_scope = None + self.entity_scopes = None self.stop_words = None @property @@ -82,11 +82,16 @@ def slot_names_to_entities(self): def slot_names_to_entities(self, value): self._slot_names_to_entities = value if value is None: - self.builtin_scope = None + self.entity_scopes = None else: - self.builtin_scope = { - ent for slot_mapping in itervalues(value) - for ent in itervalues(slot_mapping) if is_builtin_entity(ent)} + self.entity_scopes = { 
+ intent: { + "builtin": {ent for ent in itervalues(slot_mapping) + if is_builtin_entity(ent)}, + "custom": {ent for ent in itervalues(slot_mapping) + if not is_builtin_entity(ent)} + } + for intent, slot_mapping in iteritems(value)} @property def group_names_to_slot_names(self): @@ -208,30 +213,27 @@ def _parse_top_intents(self, text, top_n, intents=None): "top_n argument must be greater or equal to 1, but got: %s" % top_n) - builtin_entities = self.builtin_entity_parser.parse( - text, scope=self.builtin_scope, use_cache=True) - custom_entities = self.custom_entity_parser.parse( - text, use_cache=True) - all_entities = builtin_entities + custom_entities - placeholder_fn = lambda entity_name: _get_entity_name_placeholder( - entity_name, self.language) - ranges_mapping, processed_text = replace_entities_with_placeholders( - text, all_entities, placeholder_fn=placeholder_fn) - - # We try to match both the input text and the preprocessed text to - # cover inconsistencies between labeled data and builtin entity parsing - cleaned_text = self._preprocess_text(text) - cleaned_processed_text = self._preprocess_text(processed_text) + def placeholder_fn(entity_name): + return _get_entity_name_placeholder(entity_name, self.language) results = [] + cleaned_text = self._preprocess_text(text) - for intent, regexes in iteritems(self.regexes_per_intent): + for intent, entity_scope in iteritems(self.entity_scopes): if intents is not None and intent not in intents: continue - for regex in regexes: + builtin_entities = self.builtin_entity_parser.parse( + text, scope=entity_scope["builtin"], use_cache=True) + custom_entities = self.custom_entity_parser.parse( + text, scope=entity_scope["custom"], use_cache=True) + all_entities = builtin_entities + custom_entities + mapping, processed_text = replace_entities_with_placeholders( + text, all_entities, placeholder_fn=placeholder_fn) + cleaned_processed_text = self._preprocess_text(processed_text) + for regex in 
self.regexes_per_intent[intent]: res = self._get_matching_result(text, cleaned_processed_text, - regex, intent, ranges_mapping) - if res is None: + regex, intent, mapping) + if res is None and cleaned_text != cleaned_processed_text: res = self._get_matching_result(text, cleaned_text, regex, intent) if res is not None: @@ -239,6 +241,7 @@ def _parse_top_intents(self, text, top_n, intents=None): break if len(results) == top_n: return results + return results @fitted_required From 674dd2b90e56e1c4fa6e89aa5b93840ca6b634df Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 15:58:02 +0100 Subject: [PATCH 09/13] Fix sample datasets --- sample_datasets/beverage_dataset.json | 3 +++ sample_datasets/flights_dataset.json | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/sample_datasets/beverage_dataset.json b/sample_datasets/beverage_dataset.json index 4473ddde8..0af16623c 100644 --- a/sample_datasets/beverage_dataset.json +++ b/sample_datasets/beverage_dataset.json @@ -66,6 +66,9 @@ "slot_name": "number_of_cups", "text": "one" }, + { + "text": " " + }, { "entity": "beverage_type", "slot_name": "beverage_type", diff --git a/sample_datasets/flights_dataset.json b/sample_datasets/flights_dataset.json index 9f88139a4..2e730b4a1 100644 --- a/sample_datasets/flights_dataset.json +++ b/sample_datasets/flights_dataset.json @@ -42,6 +42,9 @@ "slot_name": "destination", "text": "London" }, + { + "text": " " + }, { "entity": "snips/datetime", "slot_name": "flight_time", @@ -87,6 +90,9 @@ "slot_name": "departure", "text": "istanbul" }, + { + "text": " " + }, { "entity": "snips/datetime", "slot_name": "flight_time", @@ -112,6 +118,9 @@ "slot_name": "destination", "text": "chicago" }, + { + "text": " " + }, { "entity": "snips/datetime", "slot_name": "flight_datetime", @@ -145,6 +154,9 @@ "slot_name": "flight_time", "text": "this weekend" }, + { + "text": " " + }, { "text": " ?" 
} From 518f4345520e7bcff97245922c332c1fe7caf1c5 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 6 Mar 2019 10:30:56 +0100 Subject: [PATCH 10/13] Increase integration tests threshold to 0.95 --- snips_nlu/tests/integration_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snips_nlu/tests/integration_test.py b/snips_nlu/tests/integration_test.py index 6a92fdfad..ba93a374a 100644 --- a/snips_nlu/tests/integration_test.py +++ b/snips_nlu/tests/integration_test.py @@ -9,7 +9,7 @@ from snips_nlu.preprocessing import tokenize_light from snips_nlu.tests.utils import PERFORMANCE_DATASET_PATH, SnipsTest -INTENT_CLASSIFICATION_THRESHOLD = 0.9 +INTENT_CLASSIFICATION_THRESHOLD = 0.95 SLOT_FILLING_THRESHOLD = 0.85 SKIPPED_DATE_PREFIXES = {"at", "in", "for", "on"} From e590981b4e70d62d72d336a3359eaea283885f6b Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 6 Mar 2019 11:17:17 +0100 Subject: [PATCH 11/13] Add support for portuguese in documentation --- docs/source/languages.rst | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/docs/source/languages.rst b/docs/source/languages.rst index 68d7cddb1..3c6723764 100644 --- a/docs/source/languages.rst +++ b/docs/source/languages.rst @@ -7,22 +7,26 @@ Snips NLU supports various languages, that are speficied in the dataset in the ``"language"`` attribute. 
Here is the list of supported language along with their isocode: -+------------+------------+ -| Language | ISO code | -+============+============+ -| German | de | -+------------+------------+ -| English | en | -+------------+------------+ -| Spanish | es | -+------------+------------+ -| French | fr | -+------------+------------+ -| Italian | it | -+------------+------------+ -| Japanese | ja | -+------------+------------+ -| Korean | ko | -+------------+------------+ ++-----------------+------------+ +| Language | ISO code | ++=================+============+ +| German | de | ++-----------------+------------+ +| English | en | ++-----------------+------------+ +| Spanish | es | ++-----------------+------------+ +| French | fr | ++-----------------+------------+ +| Italian | it | ++-----------------+------------+ +| Japanese | ja | ++-----------------+------------+ +| Korean | ko | ++-----------------+------------+ +| Portuguese (BR) | pt_br | ++-----------------+------------+ +| Portuguese (PT) | pt_pt | ++-----------------+------------+ Support for additional languages will come in the future, stay tuned :) From 74409ac4c9182a02d576e69ac30c3f5687376ea0 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 6 Mar 2019 15:23:41 +0100 Subject: [PATCH 12/13] Update Changelog --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23cdc145c..d0d9e33f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ # Changelog All notable changes to this project will be documented in this file. +## [0.19.4] - 2019-03-06 +### Added +- Support for Portuguese: "pt_pt" and "pt_br" + +### Changed +- Enhancement: leverage entity scopes of each intent in deterministic intent parser + ## [0.19.3] - 2019-03-05 ### Fixed - Issue with intent classification reducing classification accuracy @@ -243,6 +250,7 @@ several commands. 
- Fix compiling issue with `bindgen` dependency when installing from source - Fix issue in `CRFSlotFiller` when handling builtin entities +[0.19.4]: https://github.com/snipsco/snips-nlu/compare/0.19.3...0.19.4 [0.19.3]: https://github.com/snipsco/snips-nlu/compare/0.19.2...0.19.3 [0.19.2]: https://github.com/snipsco/snips-nlu/compare/0.19.1...0.19.2 [0.19.1]: https://github.com/snipsco/snips-nlu/compare/0.19.0...0.19.1 From 532c84079b3ad3dfdeae477bb59908670f1c4afb Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 6 Mar 2019 15:24:03 +0100 Subject: [PATCH 13/13] Bump version to 0.19.4 --- snips_nlu/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py index 0c0398efb..d38334cd8 100644 --- a/snips_nlu/__about__.py +++ b/snips_nlu/__about__.py @@ -11,7 +11,7 @@ __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai" __license__ = "Apache License, Version 2.0" -__version__ = "0.19.3" +__version__ = "0.19.4" __model_version__ = "0.19.0" __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"