Feature/kskip-ngram #82

Open · wants to merge 24 commits into main
24 commits
a0ad101
[WIP] Ngrams: parity with ngrams and everygram but not with skipgram
joshlk Jul 1, 2020
76010c0
[WIP] skipgram n=3, k=1 parity with nltk
joshlk Jul 3, 2020
eb373f7
[WIP] parity with all nltk functions
joshlk Jul 3, 2020
062aefe
[WIP] refactored using index combinations
joshlk Jul 6, 2020
4e0d95d
Tidy code
joshlk Jul 6, 2020
a37592f
Refactored and documented
joshlk Jul 6, 2020
9462475
Refactored and documented
joshlk Jul 6, 2020
56f756f
Added documentation to `SampleCombinations`
joshlk Jul 7, 2020
da791fe
Change error to `EstimatorErr` enum and add `InputError` variant
joshlk Jul 15, 2020
146a13b
Make struct private and improve error
joshlk Jul 15, 2020
461fd52
Simplified code by chaining iterators. Parity with previous code test…
joshlk Jul 16, 2020
e6dab70
Tidied code, docs and cargo fmt
joshlk Jul 16, 2020
298ca35
Module name renamed to `token_processing`
joshlk Jul 16, 2020
9453129
KSkipNGrams Python API
joshlk Jul 16, 2020
5f6698e
Fixed doc
joshlk Jul 17, 2020
b37e47d
Update python/vtext/tests/test_token_processing.py
rth Jul 21, 2020
3bea027
Fix syntax error
rth Jul 21, 2020
e382b97
Minor fixes in the hypothesis test
rth Jul 21, 2020
dc80e5d
Increase min_size for tokens in python tests
rth Jul 21, 2020
d58ac99
`cargo clippy` suggestions
joshlk Jul 22, 2020
7b7ef18
Empty or input that is smaller than n gives empty output
joshlk Jul 22, 2020
6f1cadf
ngram and skipgram benchmark:
joshlk Jul 22, 2020
083b4ec
cargo fmt
joshlk Jul 22, 2020
6345cf3
`black python/ benchmarks/ evaluation/`
joshlk Jul 22, 2020
2 changes: 2 additions & 0 deletions python/src/lib.rs
@@ -8,6 +8,7 @@ use pyo3::prelude::*;
use pyo3::wrap_pyfunction;

mod stem;
mod token_processing;
mod tokenize;
mod tokenize_sentence;
mod utils;
@@ -183,6 +184,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<tokenize::VTextTokenizer>()?;
m.add_class::<tokenize::CharacterTokenizer>()?;
m.add_class::<stem::SnowballStemmer>()?;
m.add_class::<token_processing::KSkipNGrams>()?;
m.add_wrapped(wrap_pyfunction!(dice_similarity))?;
m.add_wrapped(wrap_pyfunction!(jaro_similarity))?;
m.add_wrapped(wrap_pyfunction!(jaro_winkler_similarity))?;
96 changes: 96 additions & 0 deletions python/src/token_processing.rs
@@ -0,0 +1,96 @@
// Copyright 2019 vtext developers
//
// Licensed under the Apache License, Version 2.0,
// <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied,
// modified, or distributed except according to those terms.

use pyo3::prelude::*;
use pyo3::types::{PyIterator, PyList, PyString};
use pyo3::PyIterProtocol;

use crate::utils::{deserialize_params, serialize_params};
use vtext::token_processing::*;

/// __init__(self, min_n: int, max_n: int, max_k: int)
///
/// K-Skip-N-Grams generator
///
/// Provided with a list of tokens, it generates k-skip-n-grams.
///
/// Parameters
/// ----------
/// min_n : int
/// The minimum degree of the ngram
/// max_n : int
/// The maximum degree of the ngram
/// max_k : int
/// The maximum degree of the skipgram: the maximum total skip between items
#[pyclass(module = "vtext.token_processing")]
pub struct KSkipNGrams {
inner: vtext::token_processing::KSkipNGrams,
}

#[pymethods]
impl KSkipNGrams {
#[new]
fn new(min_n: usize, max_n: usize, max_k: usize) -> PyResult<Self> {
let kskipngrams = vtext::token_processing::KSkipNGrams::new(min_n, max_n, max_k);
Ok(KSkipNGrams { inner: kskipngrams })
}

/// transform(self, items: List[str],
/// pad_left: Optional[str]=None, pad_right: Optional[str]=None) -> List[List[str]]
///
/// Transforms a given sequence of `items` into k-skip-n-grams.
///
/// Parameters
/// ----------
/// items : List[str]
/// The list of items to create the k-skip-n-grams from.
/// pad_left : Optional[str]
/// Optional string to use as left padding
/// pad_right : Optional[str]
/// Optional string to use as right padding
///
/// Returns
/// -------
/// k-skip-n-grams : List[List[str]]
/// computed k-skip-n-grams
#[args(pad_left = "None", pad_right = "None")]
fn transform<'py>(
&self,
py: Python<'py>,
items: Vec<&str>,
pad_left: Option<&str>,
pad_right: Option<&str>,
) -> PyResult<&'py PyList> {
let res: Vec<_> = self
.inner
.transform(Box::new(items.into_iter()), pad_left, pad_right)?
.collect();
let output = PyList::new(py, res);
Ok(output)
}

/// get_params(self)
///
/// Get parameters for this estimator.
///
/// Returns
/// -------
/// params : mapping of string to any
/// Parameter names mapped to their values.
fn get_params(&self) -> PyResult<KSkipNGramsParams> {
Ok(self.inner.params.clone())
}

pub fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
serialize_params(&self.inner.params, py)
}

pub fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
let mut params: KSkipNGramsParams = deserialize_params(py, state)?;
self.inner = params.build();
Ok(())
}
}
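
A minimal usage sketch of the Python API defined above, based on the docstrings and on the unit test in the next file; the padded call illustrates how pad_left/pad_right are meant to be passed and its output is not asserted anywhere in this PR:

from vtext.token_processing import KSkipNGrams

# Plain bigrams: max_k=0 disables skipping entirely.
gramizer = KSkipNGrams(min_n=2, max_n=2, max_k=0)
assert gramizer.transform(["One", "Two", "Three"]) == [
    ["One", "Two"],
    ["Two", "Three"],
]

# Optional boundary markers via pad_left/pad_right (hypothetical tokens).
padded = gramizer.transform(["One", "Two"], pad_left="<s>", pad_right="</s>")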
40 changes: 40 additions & 0 deletions python/vtext/tests/test_token_processing.py
@@ -0,0 +1,40 @@
# Copyright 2019 vtext developers
#
# Licensed under the Apache License, Version 2.0,
# <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied,
# modified, or distributed except according to those terms.

import pytest
import hypothesis
import hypothesis.strategies as st

from vtext.token_processing import KSkipNGrams


def test_kskipngrams():

gramizer = KSkipNGrams(min_n=2, max_n=2, max_k=0)
assert gramizer.transform(["One", "Two", "Three"]) == [
["One", "Two"],
["Two", "Three"],
]

with pytest.raises(TypeError):
KSkipNGrams()

# n == 0
with pytest.raises(ValueError):
KSkipNGrams(min_n=0, max_n=0, max_k=0).transform(["One", "Two", "Three"])

# min_n > max_n
with pytest.raises(ValueError):
KSkipNGrams(min_n=1, max_n=0, max_k=0).transform(["One", "Two", "Three"])

# max_k < 0
with pytest.raises(OverflowError):
KSkipNGrams(min_n=1, max_n=1, max_k=-1).transform(["One", "Two", "Three"])


@hypothesis.given(st.text(min_size=2))
def test_tokenize_edge_cases(txt):
KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(list(txt))
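
The commit history targets parity with NLTK's skipgram and everygram helpers. Assuming that parity holds, a 1-skip bigram pass should behave like the sketch below; the expected values mirror nltk.util.skipgrams (up to ordering) and are an assumption, not something asserted by the tests above:

from vtext.token_processing import KSkipNGrams

tokens = ["a", "b", "c"]
# With max_k=1 each bigram may skip at most one intervening token,
# so ["a", "c"] appears alongside the adjacent bigrams.
grams = KSkipNGrams(min_n=2, max_n=2, max_k=1).transform(tokens)
# Expected, assuming NLTK skipgrams parity: [["a", "b"], ["a", "c"], ["b", "c"]]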
9 changes: 9 additions & 0 deletions python/vtext/token_processing.py
@@ -0,0 +1,9 @@
# Copyright 2019 vtext developers
#
# Licensed under the Apache License, Version 2.0,
# <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied,
# modified, or distributed except according to those terms.

from ._lib import KSkipNGrams

__all__ = ["KSkipNGrams"]
4 changes: 3 additions & 1 deletion src/errors.rs
@@ -5,13 +5,15 @@ use thiserror::Error;

#[derive(Error, Debug)]
pub enum EstimatorErr {
#[error("Invalid paramer: `{0}`")]
#[error("Invalid params: `{0}`")]
InvalidParams(String),
#[error("Invalid regex parameter")]
RegexErr {
#[from]
source: regex::Error,
},
#[error("Invalid Input: `{0}`")]
InvalidInput(String),
}

#[cfg(feature = "python")]
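
On the Python side these Rust error variants surface as built-in exceptions; the tests above show parameter failures arriving as ValueError. A short sketch of that mapping (which exception corresponds to which variant is inferred from the tests, not spelled out in this diff):

from vtext.token_processing import KSkipNGrams

try:
    # min_n > max_n: rejected by the estimator, raised as ValueError in Python.
    KSkipNGrams(min_n=1, max_n=0, max_k=0).transform(["One", "Two"])
except ValueError as err:
    print(err)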
1 change: 1 addition & 0 deletions src/lib.rs
@@ -41,6 +41,7 @@ assert_eq!(tokens, vec!["Flights", "ca", "n't", "depart", "after", "2:00", "pm",
pub mod errors;
mod math;
pub mod metrics;
pub mod token_processing;
pub mod tokenize;
pub mod tokenize_sentence;
pub mod vectorize;