Feature/kskip-ngram #82

Open · wants to merge 24 commits into main
24 commits
a0ad101
[WIP] Ngrams: parity with ngrams and everygram but not with skipgram
joshlk Jul 1, 2020
76010c0
[WIP] skipgram n=3, k=1 parity with nltk
joshlk Jul 3, 2020
eb373f7
[WIP] parity with all nltk functions
joshlk Jul 3, 2020
062aefe
[WIP] refactored using index combinations
joshlk Jul 6, 2020
4e0d95d
Tidy code
joshlk Jul 6, 2020
a37592f
Refactored and documented
joshlk Jul 6, 2020
9462475
Refactored and documented
joshlk Jul 6, 2020
56f756f
Added documentation to `SampleCombinations`
joshlk Jul 7, 2020
da791fe
Change error to `EstimatorErr` enum and add `InputError` variant
joshlk Jul 15, 2020
146a13b
Make struct private and improve error
joshlk Jul 15, 2020
461fd52
Simplified code by chaining iterators. Parity with previous code test…
joshlk Jul 16, 2020
e6dab70
Tidied code, docs and cargo fmt
joshlk Jul 16, 2020
298ca35
Module name renamed to `token_processing`
joshlk Jul 16, 2020
9453129
KSkipNGrams Python API
joshlk Jul 16, 2020
5f6698e
Fixed doc
joshlk Jul 17, 2020
b37e47d
Update python/vtext/tests/test_token_processing.py
rth Jul 21, 2020
3bea027
Fix syntax error
rth Jul 21, 2020
e382b97
Minor fixes in the hypothesis test
rth Jul 21, 2020
dc80e5d
Increase min_size for tokens in python tests
rth Jul 21, 2020
d58ac99
`cargo clippy` suggestions
joshlk Jul 22, 2020
7b7ef18
Empty or input that is smaller than n gives empty output
joshlk Jul 22, 2020
6f1cadf
ngram and skipgram benchmark:
joshlk Jul 22, 2020
083b4ec
cargo fmt
joshlk Jul 22, 2020
6345cf3
`black python/ benchmarks/ evaluation/`
joshlk Jul 22, 2020
2 changes: 2 additions & 0 deletions python/src/lib.rs
@@ -8,6 +8,7 @@ use pyo3::prelude::*;
use pyo3::wrap_pyfunction;

mod stem;
mod token_processing;
mod tokenize;
mod tokenize_sentence;
mod utils;
@@ -183,6 +184,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<tokenize::VTextTokenizer>()?;
m.add_class::<tokenize::CharacterTokenizer>()?;
m.add_class::<stem::SnowballStemmer>()?;
m.add_class::<token_processing::KSkipNGrams>()?;
m.add_wrapped(wrap_pyfunction!(dice_similarity))?;
m.add_wrapped(wrap_pyfunction!(jaro_similarity))?;
m.add_wrapped(wrap_pyfunction!(jaro_winkler_similarity))?;
96 changes: 96 additions & 0 deletions python/src/token_processing.rs
@@ -0,0 +1,96 @@
// Copyright 2019 vtext developers
//
// Licensed under the Apache License, Version 2.0,
// <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied,
// modified, or distributed except according to those terms.

use pyo3::prelude::*;
use pyo3::types::{PyIterator, PyList, PyString};
use pyo3::PyIterProtocol;

use crate::utils::{deserialize_params, serialize_params};
use vtext::token_processing::*;

/// __init__(self, min_n: int, max_n: int, max_k: int)
///
/// K-Skip-N-Grams generator
///
/// Provided with a list of tokens, it generates k-skip-n-grams.
///
/// Parameters
/// ----------
/// min_n : int
/// The minimum degree of the ngram
/// max_n : int
/// The maximum degree of the ngram
/// max_k : int
/// The maximum degree of the skipgram: the maximum total skip between items
#[pyclass(module = "vtext.token_processing")]
pub struct KSkipNGrams {
inner: vtext::token_processing::KSkipNGrams,
}

#[pymethods]
impl KSkipNGrams {
#[new]
fn new(min_n: usize, max_n: usize, max_k: usize) -> PyResult<Self> {
let kskipngrams = vtext::token_processing::KSkipNGrams::new(min_n, max_n, max_k);
Ok(KSkipNGrams { inner: kskipngrams })
}

/// transform(self, items: List[str],
/// pad_left: Optional[str]=None, pad_right: Optional[str]=None) -> List[List[str]]
///
/// Transforms a given sequence of `items` into k-skip-n-grams.
///
/// Parameters
/// ----------
/// items : List[str]
/// The list of items to create the k-skip-n-grams from.
/// pad_left : Optional[str]
/// Optional string to use as left padding
/// pad_right : Optional[str]
/// Optional string to use as right padding
///
/// Returns
/// -------
/// k-skip-n-grams : List[List[str]]
/// computed k-skip-n-grams
#[args(pad_left = "None", pad_right = "None")]
fn transform<'py>(
&self,
py: Python<'py>,
items: Vec<&str>,
pad_left: Option<&str>,
pad_right: Option<&str>,
) -> PyResult<&'py PyList> {
let res: Vec<_> = self
.inner
.transform(Box::new(items.into_iter()), pad_left, pad_right)?
.collect();
let output = PyList::new(py, res);
Ok(output)
}

/// get_params(self)
///
/// Get parameters for this estimator.
///
/// Returns
/// -------
/// params : mapping of string to any
/// Parameter names mapped to their values.
fn get_params(&self) -> PyResult<KSkipNGramsParams> {
Ok(self.inner.params.clone())
}

pub fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
serialize_params(&self.inner.params, py)
}

pub fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
let mut params: KSkipNGramsParams = deserialize_params(py, state)?;
self.inner = params.build();
Ok(())
}
}
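
A minimal usage sketch of the Python API defined above, based on the docstrings and on the unit test in the next file; the padded call illustrates how pad_left/pad_right are meant to be passed and its output is not asserted anywhere in this PR:

from vtext.token_processing import KSkipNGrams

# Plain bigrams: max_k=0 disables skipping entirely.
gramizer = KSkipNGrams(min_n=2, max_n=2, max_k=0)
assert gramizer.transform(["One", "Two", "Three"]) == [
    ["One", "Two"],
    ["Two", "Three"],
]

# Optional boundary markers via pad_left/pad_right (hypothetical tokens).
padded = gramizer.transform(["One", "Two"], pad_left="<s>", pad_right="</s>")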
40 changes: 40 additions & 0 deletions python/vtext/tests/test_token_processing.py
@@ -0,0 +1,40 @@
# Copyright 2019 vtext developers
#
# Licensed under the Apache License, Version 2.0,
# <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied,
# modified, or distributed except according to those terms.

import pytest
import hypothesis
import hypothesis.strategies as st

from vtext.token_processing import KSkipNGrams


def test_kskipngrams():

gramizer = KSkipNGrams(min_n=2, max_n=2, max_k=0)
assert gramizer.transform(["One", "Two", "Three"]) == [
["One", "Two"],
["Two", "Three"],
]

with pytest.raises(TypeError):
KSkipNGrams()

# n == 0
with pytest.raises(ValueError):
KSkipNGrams(min_n=0, max_n=0, max_k=0).transform(["One", "Two", "Three"])

# min_n > max_n
with pytest.raises(ValueError):
KSkipNGrams(min_n=1, max_n=0, max_k=0).transform(["One", "Two", "Three"])

# max_k < 0
with pytest.raises(OverflowError):
KSkipNGrams(min_n=1, max_n=1, max_k=-1).transform(["One", "Two", "Three"])


@hypothesis.given(st.text(min_size=2))
def test_tokenize_edge_cases(txt):
KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(list(txt))
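
The commit history targets parity with NLTK's skipgram and everygram helpers. Assuming that parity holds, a 1-skip bigram pass should behave like the sketch below; the expected values mirror nltk.util.skipgrams (up to ordering) and are an assumption, not something asserted by the tests above:

from vtext.token_processing import KSkipNGrams

tokens = ["a", "b", "c"]
# With max_k=1 each bigram may skip at most one intervening token,
# so ["a", "c"] appears alongside the adjacent bigrams.
grams = KSkipNGrams(min_n=2, max_n=2, max_k=1).transform(tokens)
# Expected, assuming NLTK skipgrams parity: [["a", "b"], ["a", "c"], ["b", "c"]]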
9 changes: 9 additions & 0 deletions python/vtext/token_processing.py
@@ -0,0 +1,9 @@
# Copyright 2019 vtext developers
#
# Licensed under the Apache License, Version 2.0,
# <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied,
# modified, or distributed except according to those terms.

from ._lib import KSkipNGrams

__all__ = ["KSkipNGrams"]
4 changes: 3 additions & 1 deletion src/errors.rs
@@ -5,13 +5,15 @@ use thiserror::Error;

#[derive(Error, Debug)]
pub enum EstimatorErr {
#[error("Invalid paramer: `{0}`")]
#[error("Invalid params: `{0}`")]
InvalidParams(String),
#[error("Invalid regex parameter")]
RegexErr {
#[from]
source: regex::Error,
},
#[error("Invalid Input: `{0}`")]
InvalidInput(String),
}

#[cfg(feature = "python")]
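
On the Python side these Rust error variants surface as built-in exceptions; the tests above show parameter failures arriving as ValueError. A short sketch of that mapping (which exception corresponds to which variant is inferred from the tests, not spelled out in this diff):

from vtext.token_processing import KSkipNGrams

try:
    # min_n > max_n: rejected by the estimator, raised as ValueError in Python.
    KSkipNGrams(min_n=1, max_n=0, max_k=0).transform(["One", "Two"])
except ValueError as err:
    print(err)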
1 change: 1 addition & 0 deletions src/lib.rs
@@ -41,6 +41,7 @@ assert_eq!(tokens, vec!["Flights", "ca", "n't", "depart", "after", "2:00", "pm",
pub mod errors;
mod math;
pub mod metrics;
pub mod token_processing;
pub mod tokenize;
pub mod tokenize_sentence;
pub mod vectorize;