Skip to content

Commit

Permalink
lccs now retrieving the sequence
Browse files Browse the repository at this point in the history
  • Loading branch information
Natooz committed Mar 12, 2024
1 parent 713a899 commit fa39c09
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 21 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ lccs_len = lccs_length(seq1, seq2) # 5, [2, 3, 4, 5, 6]

## TODOs

* implement lccs with suffix tree;
* batch methods, i.e. supporting 2D arrays;
* batch methods with any number of dimensions (nD array) and add a `dim` argument;
* make it work with an unlimited number of sequences, and set `dim` and `pad_token` as kwargs only;
35 changes: 32 additions & 3 deletions src/cpu/lccs_cpu_dyn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ namespace nb = nanobind;
using namespace nb::literals;


// Calculating the length of the longest common contiguous subsequence with dynamic programming
int lccs_length(
// Returns the length of the longest common contiguous subsequence and the index of its last element in s1
std::vector<int> lccs_length_idx(
const nb::ndarray<double, nb::ndim<1>>& s1,
const nb::ndarray<double, nb::ndim<1>>& s2
) {
Expand All @@ -21,16 +21,45 @@ int lccs_length(

std::vector<std::vector<int>> table(s1Len + 1, std::vector<int>(s2Len + 1, 0));
int max_length = 0;
int imax = 0; // ending idx of the lccs
for (int i = 0; i < s1Len; ++i) {
for (int j = 0; j < s2Len; ++j) {
if (v1(i) == v2(j)) {
table[i + 1][j + 1] = table[i][j] + 1;
if (table[i + 1][j + 1] > max_length) {
imax = i;
max_length = table[i + 1][j + 1];
}
}
}
}
std::vector<int> lccs_len_idx = {max_length, imax};
return lccs_len_idx;
}


// Length of the longest common contiguous subsequence of s1 and s2,
// computed with dynamic programming (delegated to lccs_length_idx).
int lccs_length(
    const nb::ndarray<double, nb::ndim<1>>& s1,
    const nb::ndarray<double, nb::ndim<1>>& s2
) {
    // The helper returns {length, end index}; only the length is needed here.
    return lccs_length_idx(s1, s2).front();
}


// Calculating the longest common contiguous subsequence with dynamic programming.
//
// Delegates to lccs_length_idx(), which yields {length, index of the end of the
// match in s1}, then copies that window out of s1.
//
// Returns the matched elements of s1 converted to int; the result is empty
// when the reported length is 0.
std::vector<int> lccs(
    const nb::ndarray<double, nb::ndim<1>>& s1,
    const nb::ndarray<double, nb::ndim<1>>& s2
) {
    const std::vector<int> lccs_len_idx = lccs_length_idx(s1, s2);
    const int length = lccs_len_idx[0];
    const int last = lccs_len_idx[1];  // inclusive end index in s1

    // Extract the longest common substring from s1. With length == 0 the
    // start index is past `last`, so the loop body never runs.
    std::vector<int> longestSubseq(length);
    int idx = 0;
    for (int i = last - length + 1; i <= last; ++i) {
        // s1 holds doubles; make the conversion to int explicit.
        longestSubseq[idx++] = static_cast<int>(s1(i));
    }

    return longestSubseq;
}
2 changes: 2 additions & 0 deletions src/lcs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ NB_MODULE(lcsvec_ext, m) {
m.def("lcs_table", &createLCSTable, "seq1"_a, "seq2"_a,
"Returns the longest common subsequence (lcs) table from `seq1` and `seq2`.");

m.def("lccs", &lccs, "seq1"_a, "seq2"_a,
"Returns the longest common contiguous subsequence (lccs) from `seq1` and `seq2`.");
m.def("lccs_length", &lccs_length, "seq1"_a, "seq2"_a,
"Returns the length of the longest common contiguous subsequence (lccs) from `seq1` and `seq2`.");
}
4 changes: 2 additions & 2 deletions src/lcsvec/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Main LCSvec module."""

from .lcsvec_ext import lccs_length, lcs, lcs_length, lcs_table
from .lcsvec_ext import lccs, lccs_length, lcs, lcs_length, lcs_table

__version__ = "0.0.1"

__all__ = ["lccs_length", "lcs", "lcs_length", "lcs_table"]
__all__ = ["lccs", "lccs_length", "lcs", "lcs_length", "lcs_table"]
68 changes: 52 additions & 16 deletions tests/test_lccs_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,73 @@
from typing import TYPE_CHECKING

import numpy as np
from lcsvec import lccs_length
import pytest
from lcsvec import lccs, lccs_length
from torch import IntTensor, LongTensor, arange

if TYPE_CHECKING:
from numpy.typing import NDArray


# Each case is a tuple (seq1, seq2, ref): the two input sequences and the
# expected longest common contiguous subsequence. `range` entries are expanded
# into numpy arrays / torch tensors by the parametrized tests below.
TEST_CASES = [
(range(12), [8, 0, 1, 2, 8, 2, 3, 8, 4, 0], range(3)),
(range(12), [8, 0, 9, 2, 8, 2, 7, 3, 4, 5], range(3, 6)),
(range(12), [0, 1, 2, 3, 8, 9, 2, 3, 4, 5], range(4)),
(range(-2, 10), [9, -1, 0, 1, 2, 9, 2, 4, 4, 5], range(-1, 3)),
]


def _test_lccs(
seq1: NDArray | IntTensor | LongTensor,
seq2: NDArray | IntTensor | LongTensor,
ref: list[int],
) -> None:
lcs_len = lccs_length(seq1, seq2)
assert lcs_len == len(ref)
) -> True:
lccs_ = lccs(seq1, seq2)
lccs_len = lccs_length(seq1, seq2)

assert lccs_len == len(ref)
assert lccs_ == ref
return True

def test_lccs_numpy() -> None:

@pytest.mark.parametrize("sequences", TEST_CASES)
def test_lccs_numpy(
sequences: tuple[list[int] | range, list[int] | range, list[int] | range],
) -> None:
r"""Test the LCCS methods with numpy."""
seq1 = np.arange(0, 12)
seq2 = np.array([8, 0, 1, 2, 8, 2, 3, 8, 4, 0], dtype=np.int64)
ref = np.arange(0, 3).tolist()
seq1, seq2, ref = sequences
seq1 = (
np.arange(seq1.start, seq1.stop)
if isinstance(seq1, range)
else np.array(seq1, dtype=np.int64)
)
seq2 = (
np.arange(seq2.start, seq2.stop)
if isinstance(seq2, range)
else np.array(seq2, dtype=np.int64)
)
ref = (
np.arange(ref.start, ref.stop).tolist()
if isinstance(ref, range)
else np.array(ref, dtype=np.int64).tolist()
)

lcs_len = lccs_length(seq1, seq2)
assert lcs_len == len(ref)
assert _test_lccs(seq1, seq2, ref)


def test_lccs_torch() -> None:
@pytest.mark.parametrize("sequences", TEST_CASES)
def test_lccs_torch(
sequences: tuple[list[int] | range, list[int] | range, list[int] | range],
) -> None:
r"""Test the LCCS methods with pytorch."""
seq1 = arange(0, 12)
seq2 = LongTensor([8, 0, 1, 2, 8, 2, 3, 8, 4, 0])
ref = arange(0, 3).tolist()
seq1, seq2, ref = sequences
seq1 = (
arange(seq1.start, seq1.stop) if isinstance(seq1, range) else LongTensor(seq1)
)
seq2 = (
arange(seq2.start, seq2.stop) if isinstance(seq2, range) else LongTensor(seq2)
)
ref = arange(ref.start, ref.stop) if isinstance(ref, range) else LongTensor(ref)
ref = ref.tolist()

lcs_len = lccs_length(seq1, seq2)
assert lcs_len == len(ref)
assert _test_lccs(seq1, seq2, ref)

0 comments on commit fa39c09

Please sign in to comment.