From 51e83d6ac18f749fd5bbcbd3a828a6d42fe63ee1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 26 Feb 2025 14:22:57 -0500 Subject: [PATCH 1/9] Add nvtext substring deduplication API --- cpp/CMakeLists.txt | 1 + cpp/include/nvtext/dedup.hpp | 53 +++++ cpp/src/text/dedup.cu | 206 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/text/dedup_tests.cpp | 59 +++++ python/cudf/cudf/core/column/string.py | 28 +++ .../cudf/cudf/tests/text/test_text_methods.py | 22 ++ .../pylibcudf/libcudf/nvtext/dedup.pxd | 13 ++ .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 6 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 6 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 4 +- python/pylibcudf/pylibcudf/nvtext/dedup.pxd | 6 + python/pylibcudf/pylibcudf/nvtext/dedup.pyi | 5 + python/pylibcudf/pylibcudf/nvtext/dedup.pyx | 39 ++++ .../pylibcudf/tests/test_nvtext_dedup.py | 48 ++++ 15 files changed, 491 insertions(+), 6 deletions(-) create mode 100644 cpp/include/nvtext/dedup.hpp create mode 100644 cpp/src/text/dedup.cu create mode 100644 cpp/tests/text/dedup_tests.cpp create mode 100644 python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/dedup.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/dedup.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/dedup.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0282282b5f3..3e8852c7300 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -739,6 +739,7 @@ add_library( src/table/table.cpp src/table/table_device_view.cu src/table/table_view.cpp + src/text/dedup.cu src/text/detokenize.cu src/text/edit_distance.cu src/text/generate_ngrams.cu diff --git a/cpp/include/nvtext/dedup.hpp b/cpp/include/nvtext/dedup.hpp new file mode 100644 index 00000000000..4595bcdef57 --- /dev/null +++ b/cpp/include/nvtext/dedup.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +//! NVText APIs +namespace CUDF_EXPORT nvtext { +/** + * @addtogroup nvtext_replace + * @{ + * @file + */ + +/** + * @brief Returns the duplicate strings found in the given input + * + * The internal implementation creates a suffix array of the input which + * requires ~10x the input size for temporary memory. + * + * The output includes any strings of at least `min_width` bytes that + * appear more than once in the entire input. + * + * @param input Strings column to dedup + * @param min_width Minimum number of bytes that must match to specify a duplicate + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column containing the duplicate strings found + */ +std::unique_ptr substring_deduplicate( + cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** @} */ // end of group +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu new file mode 100644 index 00000000000..add038e20c8 --- /dev/null +++ b/cpp/src/text/dedup.cu @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace nvtext { +namespace detail { +namespace { + +using string_index = cudf::strings::detail::string_index_pair; + +struct sort_comparator_fn { + char const* d_chars; + int64_t chars_size; + __device__ bool operator()(int64_t lhs, int64_t rhs) const + { + constexpr int64_t max_size = cuda::std::numeric_limits::max(); + + auto const lhs_size = static_cast(cuda::std::min(max_size, chars_size - lhs)); + auto const rhs_size = static_cast(cuda::std::min(max_size, chars_size - rhs)); + auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size); + auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size); + return lh_str < rh_str; + } +}; + +__device__ cudf::size_type count_common_bytes(cudf::string_view lhs, cudf::string_view rhs) +{ + auto const size1 = lhs.size_bytes(); + auto const size2 = rhs.size_bytes(); + auto const* ptr1 = lhs.data(); + auto const* ptr2 = rhs.data(); + + cudf::size_type idx = 0; + for (; (idx < size1) && (idx < size2); ++idx) { + if (*ptr1 != *ptr2) { break; } + ++ptr1; + ++ptr2; + } + return idx; +} + +struct find_duplicates_fn { + char const* d_chars; + int64_t chars_size; + cudf::size_type width; + int64_t const* d_indices; + 
__device__ int16_t operator()(int64_t idx) const + { + if (idx == 0) { return 0; } + constexpr int64_t max_size = cuda::std::numeric_limits::max(); + + auto const lhs = d_indices[idx - 1]; + auto const rhs = d_indices[idx]; + auto const lhs_size = static_cast(cuda::std::min(max_size, chars_size - lhs)); + auto const rhs_size = static_cast(cuda::std::min(max_size, chars_size - rhs)); + + auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size); + auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size); + + auto const size = + cuda::std::min(count_common_bytes(lh_str, rh_str), + static_cast(cuda::std::numeric_limits::max())); + return size >= width ? static_cast(size) : 0; + } +}; + +struct collapse_overlaps_fn { + char const* d_chars; + int64_t const* d_offsets; + int16_t const* d_sizes; + __device__ string_index operator()(int64_t idx) const + { + auto size = d_sizes[idx]; + auto offset = d_offsets[idx]; + if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1]) && (size < d_sizes[idx - 1])) { + return string_index{nullptr, 0}; + } + auto d_ptr = d_chars + offset; + return string_index(d_ptr, size); + } +}; + +} // namespace + +std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(min_width > 8, "min_width should be at least 8"); + auto d_strings = cudf::column_device_view::create(input.parent(), stream); + + // need to handle slicing + auto d_input_chars = input.chars_begin(stream); + auto chars_size = input.chars_size(stream); + CUDF_EXPECTS(min_width < chars_size, "min_width value larger than the input"); + + auto indices = rmm::device_uvector(chars_size - min_width, stream); + auto sizes = rmm::device_uvector(indices.size(), stream); + + thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end()); + // thrust::sort may be limited to a 32-bit range + 
thrust::sort(rmm::exec_policy_nosync(stream), + indices.begin(), + indices.end(), + sort_comparator_fn{d_input_chars, chars_size}); + + // locate candidate duplicates within the suffixes produced by sort + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(indices.size()), + sizes.begin(), + find_duplicates_fn{d_input_chars, chars_size, min_width, indices.data()}); + + // remove the non-candidate entries from indices and sizes + thrust::remove_if( + rmm::exec_policy_nosync(stream), + indices.begin(), + indices.end(), + thrust::counting_iterator(0), + [d_sizes = sizes.data()] __device__(int64_t idx) -> bool { return d_sizes[idx] == 0; }); + auto end = thrust::remove(rmm::exec_policy(stream), sizes.begin(), sizes.end(), 0); + sizes.resize(thrust::distance(sizes.begin(), end), stream); + indices.resize(sizes.size(), stream); + + // sort the resulting indices/sizes for overlap filtering + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), sizes.begin()); + + // produce final duplicates for make_strings_column and collapse any overlapping candidates + auto duplicates = + rmm::device_uvector(indices.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(indices.size()), + duplicates.begin(), + collapse_overlaps_fn{d_input_chars, indices.data(), sizes.data()}); + + // filter out the remaining non-viable candidates + duplicates.resize( + thrust::distance( + duplicates.begin(), + thrust::remove( + rmm::exec_policy(stream), duplicates.begin(), duplicates.end(), string_index{nullptr, 0})), + stream); + + // sort result by size descending (should be very fast) + thrust::sort(rmm::exec_policy_nosync(stream), + duplicates.begin(), + duplicates.end(), + [] __device__(auto lhs, auto rhs) -> bool { return lhs.second > rhs.second; }); + + return cudf::strings::detail::make_strings_column( + duplicates.begin(), 
duplicates.end(), stream, mr); +} +} // namespace detail + +std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::substring_deduplicate(input, min_width, stream, mr); +} + +} // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cfc6a0dc425..881238939e4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -607,6 +607,7 @@ ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_te ConfigureTest( TEXT_TEST text/bpe_tests.cpp + text/dedup_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp text/minhash_tests.cpp diff --git a/cpp/tests/text/dedup_tests.cpp b/cpp/tests/text/dedup_tests.cpp new file mode 100644 index 00000000000..0bad8ae22ae --- /dev/null +++ b/cpp/tests/text/dedup_tests.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +#include + +struct TextDedupTest : public cudf::test::BaseFixture {}; + +TEST_F(TextDedupTest, StringDedup) +{ + // https://loremipsum.io/generator?n=25&t=p + // clang-format off + auto input = cudf::test::strings_column_wrapper({ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ", // 90 + "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ", // 180 + "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit ", // 270 + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 ", // 360 + "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. ", // 450 + "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil ", // 540 + "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum ", // 630 + "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo ", // 720 + "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur ", // 810 + "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit ", // 900 + "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 ", // 990 + }); + // clang-format on + + auto sv = cudf::strings_column_view(input); + + auto results = nvtext::substring_deduplicate(sv, 20); + auto expected = cudf::test::strings_column_wrapper({" 01234567890123456789 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = nvtext::substring_deduplicate(sv, 15); + expected = cudf::test::strings_column_wrapper( + {" 01234567890123456789 ", ". 
012345678901234", " reprehenderit "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c0ad33ec7d6..af1b4ad7463 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5343,6 +5343,27 @@ def is_vowel(self, position) -> SeriesOrIndex: self._column.is_letter(True, position) # type: ignore[arg-type] ) + def substring_deduplicate(self, min_width) -> SeriesOrIndex: + """ + Returns duplicate strings of at least ``min_width`` bytes. + + Parameters + ---------- + min_width : int32 + The minimum number of bytes to determine duplicates + + Returns + ------- + Series of duplicate strings found + + """ + return self._return_or_inplace( + self._column.substring_deduplicate(min_width), # type: ignore[arg-type] + inplace=False, + expand=False, + retain_index=False, + ) + def edit_distance(self, targets) -> SeriesOrIndex: """ The ``targets`` strings are measured against the strings in this @@ -6210,6 +6231,13 @@ def hash_character_ngrams( ) return type(self).from_pylibcudf(result) # type: ignore[return-value] + @acquire_spill_lock() + def substring_deduplicate(self, min_width: int) -> Self: + result = plc.nvtext.dedup.substring_deduplicate( + self.to_pylibcudf(mode="read"), min_width + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + @acquire_spill_lock() def edit_distance(self, targets: Self) -> NumericalColumn: result = plc.nvtext.edit_distance.edit_distance( diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index dc45827d2e8..f0b975d56e2 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1043,3 +1043,25 @@ def test_byte_pair_encoding(separator, input, results): actual = encoder(strings, separator) assert type(expected) is type(actual) assert_eq(expected, actual) + + +def test_substring_deduplicate(): + text = ( + "Lorem 
ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt " # 90 + "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation " # 180 + "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " # 270 + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 " # 360 + "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. " # 450 + "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil " # 540 + "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum " # 630 + "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo " # 720 + "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur " # 810 + "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit " # 900 + "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 " # 990 + ) + input = cudf.Series([text]) + actual = input.str.substring_deduplicate(15) + expected = cudf.Series( + [" 01234567890123456789 ", ". 012345678901234", " reprehenderit "] + ) + assert_eq(expected, actual) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd new file mode 100644 index 00000000000..9e38b9b4c51 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type + + +cdef extern from "nvtext/dedup.hpp" namespace "nvtext" nogil: + + cdef unique_ptr[column] substring_deduplicate( + column_view source_strings, + size_type min_width) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 93e3fb15259..0333c84badd 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx + dedup.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx + normalize.pyx replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index ef837167eb9..1b58752d292 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,7 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from . 
cimport ( byte_pair_encode, + dedup, edit_distance, generate_ngrams, jaccard, @@ -15,11 +16,12 @@ from . cimport ( ) __all__ = [ + "byte_pair_encode", + "dedup", "edit_distance", "generate_ngrams", "jaccard", "minhash", - "byte_pair_encode" "ngrams_tokenize", "normalize", "replace", diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index d88a7d4b825..0740c8c6e4e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,7 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from . import ( byte_pair_encode, + dedup, edit_distance, generate_ngrams, jaccard, @@ -16,6 +17,7 @@ __all__ = [ "byte_pair_encode", + "dedup", "edit_distance", "generate_ngrams", "jaccard", diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd new file mode 100644 index 00000000000..5ff85f60b68 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + +cpdef Column substring_deduplicate(Column input, size_type min_width) diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyi b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi new file mode 100644 index 00000000000..c324cfbd9c2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def substring_deduplicate(input: Column, min_width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyx b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx new file mode 100644 index 00000000000..5c65129f380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.dedup cimport ( + substring_deduplicate as cpp_substring_deduplicate, +) +from pylibcudf.libcudf.types cimport size_type + +__all__ = ["substring_deduplicate"] + + +cpdef Column substring_deduplicate(Column input, size_type min_width): + """ + Returns duplicate strings found anywhere in the input column + with min_width minimum number of bytes. + + For details, see :cpp:func:`substring_deduplicate` + + Parameters + ---------- + input : Column + Strings column of text + min_width : size_type + Minimum width of bytes to detect duplicates + + Returns + ------- + Column + New column of duplicate strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_substring_deduplicate(input.view(), min_width) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py new file mode 100644 index 00000000000..e7f4a971f08 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["trouble", "toy", "syzygy"] + return pa.array(arr) + + +@pytest.mark.parametrize("check_vowels", [True, False]) +@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) +def test_is_letter(input_col, check_vowels, indices): + def is_letter(s, i, check): + vowels = "aeiouy" + return (s[i] in vowels) == check + + result = plc.nvtext.stemmer.is_letter( + plc.interop.from_arrow(input_col), + check_vowels, + plc.interop.from_arrow(pa.array(indices)) + if isinstance(indices, list) + else indices, + ) + expected = pa.array( + [ + is_letter( + s, + indices[i] if isinstance(indices, list) else indices, + check_vowels, + ) + for i, s in enumerate(input_col.to_pylist()) + ] + ) + assert_column_eq(result, expected) + + +def test_porter_stemmer_measure(input_col): + result = plc.nvtext.stemmer.porter_stemmer_measure( + plc.interop.from_arrow(input_col), + ) + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) From b2025fd8e3895a7d606acd892a911515029fc813 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Mar 2025 11:48:43 -0500 Subject: [PATCH 2/9] add sliced support; fix pytest --- cpp/src/text/dedup.cu | 24 +++++++---- cpp/tests/text/dedup_tests.cpp | 8 ++++ .../pylibcudf/tests/test_nvtext_dedup.py | 43 +++++-------------- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index add038e20c8..fbce1914461 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -96,9 +97,11 @@ struct find_duplicates_fn { auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size); auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size); - auto const size = - cuda::std::min(count_common_bytes(lh_str, rh_str), - 
static_cast(cuda::std::numeric_limits::max())); + constexpr auto max_run_length = + static_cast(cuda::std::numeric_limits::max()); + + auto const size = cuda::std::min(count_common_bytes(lh_str, rh_str), max_run_length); + return size >= width ? static_cast(size) : 0; } }; @@ -112,6 +115,7 @@ struct collapse_overlaps_fn { auto size = d_sizes[idx]; auto offset = d_offsets[idx]; if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1]) && (size < d_sizes[idx - 1])) { + // TODO: need to handle chains longer than max return string_index{nullptr, 0}; } auto d_ptr = d_chars + offset; @@ -129,16 +133,18 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co CUDF_EXPECTS(min_width > 8, "min_width should be at least 8"); auto d_strings = cudf::column_device_view::create(input.parent(), stream); - // need to handle slicing - auto d_input_chars = input.chars_begin(stream); - auto chars_size = input.chars_size(stream); - CUDF_EXPECTS(min_width < chars_size, "min_width value larger than the input"); + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + + auto d_input_chars = input.chars_begin(stream) + first_offset; + auto chars_size = last_offset - first_offset; + CUDF_EXPECTS(min_width < chars_size, "min_width value cannot exceed the input size"); - auto indices = rmm::device_uvector(chars_size - min_width, stream); + auto indices = rmm::device_uvector(chars_size - min_width + 1, stream); auto sizes = rmm::device_uvector(indices.size(), stream); thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end()); - // thrust::sort may be limited to a 32-bit range + // note: thrust::sort may be limited to a 32-bit range thrust::sort(rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), diff --git a/cpp/tests/text/dedup_tests.cpp b/cpp/tests/text/dedup_tests.cpp index 0bad8ae22ae..6429b70363a 100644 --- a/cpp/tests/text/dedup_tests.cpp +++ b/cpp/tests/text/dedup_tests.cpp @@ -56,4 +56,12 @@ 
TEST_F(TextDedupTest, StringDedup) expected = cudf::test::strings_column_wrapper( {" 01234567890123456789 ", ". 012345678901234", " reprehenderit "}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + // Test with sliced input + auto const sliced_input = cudf::slice(input, {1, 10}).front(); + + sv = cudf::strings_column_view(sliced_input); + results = nvtext::substring_deduplicate(sv, 15); + expected = cudf::test::strings_column_wrapper({"01234567890123456789 ", " reprehenderit "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py index e7f4a971f08..7987045435c 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import pyarrow as pa import pytest @@ -9,40 +9,19 @@ @pytest.fixture(scope="module") def input_col(): - arr = ["trouble", "toy", "syzygy"] + arr = [ + "01234567890123456789", + "01234567890123456789", + "01234567890123456789", + ] return pa.array(arr) -@pytest.mark.parametrize("check_vowels", [True, False]) -@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) -def test_is_letter(input_col, check_vowels, indices): - def is_letter(s, i, check): - vowels = "aeiouy" - return (s[i] in vowels) == check - - result = plc.nvtext.stemmer.is_letter( - plc.interop.from_arrow(input_col), - check_vowels, - plc.interop.from_arrow(pa.array(indices)) - if isinstance(indices, list) - else indices, - ) - expected = pa.array( - [ - is_letter( - s, - indices[i] if isinstance(indices, list) else indices, - check_vowels, - ) - for i, s in enumerate(input_col.to_pylist()) - ] - ) - assert_column_eq(result, expected) - - -def test_porter_stemmer_measure(input_col): - result = plc.nvtext.stemmer.porter_stemmer_measure( +@pytest.mark.parametrize("min_width", [10, 
20]) +def test_substring_deduplicate(input_col, min_width): + result = plc.nvtext.dedup.substring_deduplicate( plc.interop.from_arrow(input_col), + min_width, ) - expected = pa.array([1, 1, 2], type=pa.int32()) + expected = pa.array(["01234567890123456789012345678901234567890123456789"]) assert_column_eq(result, expected) From 915a290b5f4253ace582fd29ccb3df05759218ac Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Mar 2025 19:22:08 -0500 Subject: [PATCH 3/9] fix pytest --- cpp/src/text/dedup.cu | 4 +++- .../cudf/cudf/tests/text/test_text_methods.py | 17 ++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index fbce1914461..3c59d9121ea 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -115,9 +115,11 @@ struct collapse_overlaps_fn { auto size = d_sizes[idx]; auto offset = d_offsets[idx]; if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1]) && (size < d_sizes[idx - 1])) { - // TODO: need to handle chains longer than max return string_index{nullptr, 0}; } + // TODO: need to handle chains longer than max + // size == d_sizes[idx-1] == max + auto d_ptr = d_chars + offset; return string_index(d_ptr, size); } diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 1c3d378ba5c..1463f1d3351 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1089,17 +1089,12 @@ def test_byte_pair_encoding(separator, input, results): def test_substring_deduplicate(): text = ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt " # 90 - "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation " # 180 - "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " # 270 - "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
01234567890123456789 " # 360 - "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. " # 450 - "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil " # 540 - "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum " # 630 - "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo " # 720 - "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur " # 810 - "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit " # 900 - "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 " # 990 + " 01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation " + "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 " + "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur " + "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit " + "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 
012345678901234 " ) input = cudf.Series([text]) actual = input.str.substring_deduplicate(15) From 2c9f6750e02682a3d2f095d7cf8b8241c5ee0b76 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 5 Mar 2025 15:13:28 -0500 Subject: [PATCH 4/9] add max-run-length edge case handling --- cpp/src/text/dedup.cu | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index 3c59d9121ea..fe08742a1ae 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -112,13 +112,30 @@ struct collapse_overlaps_fn { int16_t const* d_sizes; __device__ string_index operator()(int64_t idx) const { + constexpr auto max_run_length = + static_cast(cuda::std::numeric_limits::max()); + auto size = d_sizes[idx]; auto offset = d_offsets[idx]; - if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1]) && (size < d_sizes[idx - 1])) { - return string_index{nullptr, 0}; + if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1])) { + if (size < d_sizes[idx - 1]) { return string_index{nullptr, 0}; } + if (size == d_sizes[idx - 1] && size == max_run_length) { + // check if we are in the middle of a chain + auto prev_idx = idx - max_run_length; + auto prev_offset = offset; + while (prev_idx >= 0) { + if (d_offsets[prev_idx] != (prev_offset - max_run_length)) { + prev_idx = -1; + } else { + if (d_sizes[idx + 1] < size) { break; } // final edge + prev_offset = d_offsets[prev_idx]; + if ((prev_idx == 0) || ((prev_offset - 1) != d_offsets[prev_idx - 1])) { break; } + prev_idx -= max_run_length; + } + } + if (prev_idx < 0) { return string_index{nullptr, 0}; } + } } - // TODO: need to handle chains longer than max - // size == d_sizes[idx-1] == max auto d_ptr = d_chars + offset; return string_index(d_ptr, size); From b1e44f2875c16f9275538ccbc725e4fdf2ea1d7e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 7 Mar 2025 15:05:32 -0500 Subject: [PATCH 5/9] use cub sort instead of thrust sort --- cpp/src/text/dedup.cu | 17 
+++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index fe08742a1ae..95ecdadc2bc 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -162,12 +162,17 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co auto indices = rmm::device_uvector(chars_size - min_width + 1, stream); auto sizes = rmm::device_uvector(indices.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end()); - // note: thrust::sort may be limited to a 32-bit range - thrust::sort(rmm::exec_policy_nosync(stream), - indices.begin(), - indices.end(), - sort_comparator_fn{d_input_chars, chars_size}); + { + auto const cmp_op = sort_comparator_fn{d_input_chars, chars_size}; + auto const seq = thrust::make_counting_iterator(0); + auto tmp_bytes = std::size_t{0}; + cub::DeviceMergeSort::SortKeysCopy( + nullptr, tmp_bytes, seq, indices.begin(), indices.size(), cmp_op, stream.value()); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + // std::cout << indices.size() * sizeof(int64_t) << "/" << tmp_bytes << std::endl; + cub::DeviceMergeSort::SortKeysCopy( + tmp_stg.data(), tmp_bytes, seq, indices.begin(), indices.size(), cmp_op, stream.value()); + } // locate candidate duplicates within the suffixes produced by sort thrust::transform(rmm::exec_policy_nosync(stream), From 031bd6de29d2a363be4af0dbc79c0c159063ec05 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 11 Mar 2025 17:55:18 -0400 Subject: [PATCH 6/9] add remove-if-safe --- cpp/src/text/dedup.cu | 84 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index 95ecdadc2bc..9f47cc5ce84 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,33 @@ struct sort_comparator_fn { } }; +#if 0 +__global__ void 
bitonic_sort_step( + sort_comparator_fn scfn, int64_t* d_indices, int64_t size, int64_t j, int64_t k) +{ + auto const i = cudf::detail::grid_1d::global_thread_id(); + auto const ixj = i ^ j; + + if (i >= size || ixj >= size) { return; } + + if ((ixj) > i) { + if ((i & k) == 0) { + if (scfn(d_indices[ixj], d_indices[i])) { //(dev_values[i] > dev_values[ixj]) + auto const temp = d_indices[i]; // dev_values[i]; + d_indices[i] = d_indices[ixj]; // dev_values[i] = dev_values[ixj]; + d_indices[ixj] = temp; // dev_values[ixj] = temp; + } + } else { + if (scfn(d_indices[i], d_indices[ixj])) { //(dev_values[i] < dev_values[ixj]) + auto const temp = d_indices[i]; // dev_values[i]; + d_indices[i] = d_indices[ixj]; // dev_values[i] = dev_values[ixj]; + d_indices[ixj] = temp; // dev_values[ixj] = temp; + } + } + } +} +#endif + __device__ cudf::size_type count_common_bytes(cudf::string_view lhs, cudf::string_view rhs) { auto const size1 = lhs.size_bytes(); @@ -142,6 +170,39 @@ struct collapse_overlaps_fn { } }; +template +Iterator remove_if_safe( + Iterator first, Iterator last, Stencil stencil, Predicate const& fn, rmm::cuda_stream_view stream) +{ + auto const size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto result = first; + auto itr = first; + while (itr != last) { + auto end = static_cast(std::distance(itr, last)) <= size ? last : itr + size; + result = thrust::remove_if(rmm::exec_policy(stream), itr, end, stencil, fn); + itr = end; + } + return result; +} + +// handles ranges above int32 max +template +Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream) +{ + auto const size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto result = first; + auto itr = first; + while (itr != last) { + auto end = static_cast(std::distance(itr, last)) <= size ? 
last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} } // namespace std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, @@ -173,6 +234,21 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co cub::DeviceMergeSort::SortKeysCopy( tmp_stg.data(), tmp_bytes, seq, indices.begin(), indices.size(), cmp_op, stream.value()); } +#if 0 + { + thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end()); + auto const cmp_op = sort_comparator_fn{d_input_chars, chars_size}; + auto size2 = 1 << static_cast(std::log2(chars_size) + 1.0); + for (auto k = 2L; k <= size2; k <<= 1) { + for (auto j = k >> 1; j > 0; j = j >> 1) { + auto grid = cudf::detail::grid_1d(chars_size, 512); + bitonic_sort_step<<>>( + cmp_op, indices.data(), (int64_t)indices.size(), j, k); + } + } + std::cout << "bitonic-sort " << (int)cudaStreamSynchronize(stream.value()) << std::endl; + } +#endif // locate candidate duplicates within the suffixes produced by sort thrust::transform(rmm::exec_policy_nosync(stream), @@ -182,13 +258,13 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co find_duplicates_fn{d_input_chars, chars_size, min_width, indices.data()}); // remove the non-candidate entries from indices and sizes - thrust::remove_if( - rmm::exec_policy_nosync(stream), + remove_if_safe( indices.begin(), indices.end(), thrust::counting_iterator(0), - [d_sizes = sizes.data()] __device__(int64_t idx) -> bool { return d_sizes[idx] == 0; }); - auto end = thrust::remove(rmm::exec_policy(stream), sizes.begin(), sizes.end(), 0); + [d_sizes = sizes.data()] __device__(int64_t idx) -> bool { return d_sizes[idx] == 0; }, + stream); + auto end = remove_safe(sizes.begin(), sizes.end(), 0, stream); sizes.resize(thrust::distance(sizes.begin(), end), stream); indices.resize(sizes.size(), stream); From c5ff64b382fd2b90ed309f5405bffd3ec24034d4 Mon Sep 17 00:00:00 2001 From: David 
Wendt Date: Wed, 12 Mar 2025 15:41:25 -0400 Subject: [PATCH 7/9] build_suffix_array --- cpp/include/nvtext/dedup.hpp | 17 +++++ cpp/src/text/dedup.cu | 70 +++++++++++++------ python/cudf/cudf/core/column/string.py | 16 +++++ .../pylibcudf/libcudf/nvtext/dedup.pxd | 10 +++ python/pylibcudf/pylibcudf/nvtext/dedup.pxd | 1 + python/pylibcudf/pylibcudf/nvtext/dedup.pyi | 1 + python/pylibcudf/pylibcudf/nvtext/dedup.pyx | 48 ++++++++++++- 7 files changed, 141 insertions(+), 22 deletions(-) diff --git a/cpp/include/nvtext/dedup.hpp b/cpp/include/nvtext/dedup.hpp index 4595bcdef57..ba31938545d 100644 --- a/cpp/include/nvtext/dedup.hpp +++ b/cpp/include/nvtext/dedup.hpp @@ -20,6 +20,9 @@ #include #include +#include +#include + //! NVText APIs namespace CUDF_EXPORT nvtext { /** @@ -49,5 +52,19 @@ std::unique_ptr substring_deduplicate( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Builds a suffix array for the input strings column + * + * @param input Strings column to build suffix array for + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Sorted suffix array and corresponding sizes + */ +std::pair>, + std::unique_ptr>> +build_suffix_array(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index 9f47cc5ce84..ee2ca96bc84 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -205,14 +205,13 @@ Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_st } } // namespace -std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, - 
cudf::size_type min_width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::pair>, + std::unique_ptr>> +build_suffix_array(cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(min_width > 8, "min_width should be at least 8"); - auto d_strings = cudf::column_device_view::create(input.parent(), stream); - auto [first_offset, last_offset] = cudf::strings::detail::get_first_and_last_offset(input, stream); @@ -250,36 +249,57 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co } #endif + return std::make_pair(std::make_unique>(std::move(indices)), + std::make_unique>(std::move(sizes))); +} + +std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(min_width > 8, "min_width should be at least 8"); + auto d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + + auto d_input_chars = input.chars_begin(stream) + first_offset; + auto chars_size = last_offset - first_offset; + CUDF_EXPECTS(min_width < chars_size, "min_width value cannot exceed the input size"); + + auto [indices, sizes] = build_suffix_array(input, min_width, stream, mr); + // locate candidate duplicates within the suffixes produced by sort thrust::transform(rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), - thrust::counting_iterator(indices.size()), - sizes.begin(), - find_duplicates_fn{d_input_chars, chars_size, min_width, indices.data()}); + thrust::counting_iterator(indices->size()), + sizes->begin(), + find_duplicates_fn{d_input_chars, chars_size, min_width, indices->data()}); // remove the non-candidate entries from indices and sizes remove_if_safe( - indices.begin(), - indices.end(), + 
indices->begin(), + indices->end(), thrust::counting_iterator(0), - [d_sizes = sizes.data()] __device__(int64_t idx) -> bool { return d_sizes[idx] == 0; }, + [d_sizes = sizes->data()] __device__(int64_t idx) -> bool { return d_sizes[idx] == 0; }, stream); - auto end = remove_safe(sizes.begin(), sizes.end(), 0, stream); - sizes.resize(thrust::distance(sizes.begin(), end), stream); - indices.resize(sizes.size(), stream); + auto end = remove_safe(sizes->begin(), sizes->end(), 0, stream); + sizes->resize(thrust::distance(sizes->begin(), end), stream); + indices->resize(sizes->size(), stream); // sort the resulting indices/sizes for overlap filtering thrust::sort_by_key( - rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), sizes.begin()); + rmm::exec_policy_nosync(stream), indices->begin(), indices->end(), sizes->begin()); // produce final duplicates for make_strings_column and collapse any overlapping candidates auto duplicates = - rmm::device_uvector(indices.size(), stream); + rmm::device_uvector(indices->size(), stream); thrust::transform(rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), - thrust::counting_iterator(indices.size()), + thrust::counting_iterator(indices->size()), duplicates.begin(), - collapse_overlaps_fn{d_input_chars, indices.data(), sizes.data()}); + collapse_overlaps_fn{d_input_chars, indices->data(), sizes->data()}); // filter out the remaining non-viable candidates duplicates.resize( @@ -298,6 +318,7 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co return cudf::strings::detail::make_strings_column( duplicates.begin(), duplicates.end(), stream, mr); } + } // namespace detail std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, @@ -309,4 +330,13 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co return detail::substring_deduplicate(input, min_width, stream, mr); } +std::pair>, + std::unique_ptr>> +build_suffix_array(cudf::strings_column_view const& input, + 
rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::build_suffix_array(input, 8, stream, mr); +} } // namespace nvtext diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2856ac604d7..6342a37ab9c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5364,6 +5364,15 @@ def substring_deduplicate(self, min_width) -> SeriesOrIndex: retain_index=False, ) + def build_suffix_array(self) -> SeriesOrIndex: + """ """ + return self._return_or_inplace( + self._column.build_suffix_array(), # type: ignore[arg-type] + inplace=False, + expand=False, + retain_index=False, + ) + def edit_distance(self, targets) -> SeriesOrIndex: """ The ``targets`` strings are measured against the strings in this @@ -6351,6 +6360,13 @@ def substring_deduplicate(self, min_width: int) -> Self: ) return type(self).from_pylibcudf(result) # type: ignore[return-value] + @acquire_spill_lock() + def build_suffix_array(self) -> Self: + result = plc.nvtext.dedup.build_suffix_array( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + @acquire_spill_lock() def edit_distance(self, targets: Self) -> NumericalColumn: result = plc.nvtext.edit_distance.edit_distance( diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd index 9e38b9b4c51..83bc6c31875 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd @@ -1,13 +1,23 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
+ from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type +from rmm.librmm.device_uvector cimport device_uvector + +ctypedef unique_ptr[device_uvector[int]] suffix_array_type +ctypedef unique_ptr[device_uvector[int]] suffix_size_type +ctypedef pair[suffix_array_type, suffix_size_type] suffix_array_pair_type cdef extern from "nvtext/dedup.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] substring_deduplicate( column_view source_strings, size_type min_width) except +libcudf_exception_handler + + cdef suffix_array_pair_type build_suffix_array( + column_view source_strings) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd index 5ff85f60b68..d5713175788 100644 --- a/python/pylibcudf/pylibcudf/nvtext/dedup.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd @@ -4,3 +4,4 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type cpdef Column substring_deduplicate(Column input, size_type min_width) +cpdef tuple build_suffix_array(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyi b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi index c324cfbd9c2..26b06eed8ec 100644 --- a/python/pylibcudf/pylibcudf/nvtext/dedup.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi @@ -3,3 +3,4 @@ from pylibcudf.column import Column def substring_deduplicate(input: Column, min_width: int) -> Column: ... +def build_suffix_array(input: Column) -> tuple[Column, Column]: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyx b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx index 5c65129f380..ab956475b15 100644 --- a/python/pylibcudf/pylibcudf/nvtext/dedup.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx @@ -1,16 +1,33 @@ # Copyright (c) 2025, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr +from cython.operator import dereference + +from libcpp.memory cimport unique_ptr, make_unique from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.dedup cimport ( substring_deduplicate as cpp_substring_deduplicate, + build_suffix_array as cpp_build_suffix_array, + suffix_array_pair_type as cpp_suffix_array_pair_type, ) from pylibcudf.libcudf.types cimport size_type -__all__ = ["substring_deduplicate"] +from rmm.librmm.device_buffer cimport device_buffer + +__all__ = ["substring_deduplicate", "build_suffix_array"] +cdef Column _column_from_suffix_array(cpp_suffix_array_pair_type suffix_array): + # helper to convert a suffix array to a Column + return Column.from_libcudf( + move( + make_unique[column]( + move(dereference(suffix_array.get())), + device_buffer(), + 0 + ) + ) + ) cpdef Column substring_deduplicate(Column input, size_type min_width): """ @@ -37,3 +54,30 @@ cpdef Column substring_deduplicate(Column input, size_type min_width): c_result = cpp_substring_deduplicate(input.view(), min_width) return Column.from_libcudf(move(c_result)) + + +cpdef tuple build_suffix_array(Column input): + """ + Builds a suffix array for the input strings column + + For details, see :cpp:func:`build_suffix_array` + + Parameters + ---------- + input : Column + Strings column of text + + Returns + ------- + Column + New column of suffix array + """ + cdef cpp_suffix_array_pair_type c_result + + with nogil: + c_result = cpp_build_suffix_array(input.view()) + + return ( + _column_from_suffix_array(move(c_result.first)), + 
_column_from_suffix_array(move(c_result.second)), + ) From 9b89188e2268cb3c045108edf910b7b7acdb48d6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 12 Mar 2025 17:50:32 -0400 Subject: [PATCH 8/9] fix cython interface --- cpp/include/nvtext/dedup.hpp | 9 ++++----- cpp/src/text/dedup.cu | 12 ++++++------ python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd | 7 +++---- python/pylibcudf/pylibcudf/nvtext/dedup.pxd | 2 +- python/pylibcudf/pylibcudf/nvtext/dedup.pyi | 2 +- python/pylibcudf/pylibcudf/nvtext/dedup.pyx | 13 +++++-------- 6 files changed, 20 insertions(+), 25 deletions(-) diff --git a/cpp/include/nvtext/dedup.hpp b/cpp/include/nvtext/dedup.hpp index ba31938545d..6d3a1d80ec6 100644 --- a/cpp/include/nvtext/dedup.hpp +++ b/cpp/include/nvtext/dedup.hpp @@ -60,11 +60,10 @@ std::unique_ptr substring_deduplicate( * @param mr Device memory resource used to allocate the returned column's device memory * @return Sorted suffix array and corresponding sizes */ -std::pair>, - std::unique_ptr>> -build_suffix_array(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +std::unique_ptr> build_suffix_array( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index ee2ca96bc84..1e7a333593a 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -330,13 +330,13 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co return detail::substring_deduplicate(input, min_width, stream, mr); } -std::pair>, - std::unique_ptr>> -build_suffix_array(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr> 
build_suffix_array( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::build_suffix_array(input, 8, stream, mr); + auto result = detail::build_suffix_array(input, 8, stream, mr); + return std::move(result.first); } } // namespace nvtext diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd index 83bc6c31875..386d2f50a8c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +from libc.stdint cimport int64_t from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -9,9 +10,7 @@ from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_uvector cimport device_uvector -ctypedef unique_ptr[device_uvector[int]] suffix_array_type -ctypedef unique_ptr[device_uvector[int]] suffix_size_type -ctypedef pair[suffix_array_type, suffix_size_type] suffix_array_pair_type +ctypedef unique_ptr[device_uvector[int64_t]] suffix_array_type cdef extern from "nvtext/dedup.hpp" namespace "nvtext" nogil: @@ -19,5 +18,5 @@ cdef extern from "nvtext/dedup.hpp" namespace "nvtext" nogil: column_view source_strings, size_type min_width) except +libcudf_exception_handler - cdef suffix_array_pair_type build_suffix_array( + cdef suffix_array_type build_suffix_array( column_view source_strings) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd index d5713175788..197c722ffd4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/dedup.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd @@ -4,4 +4,4 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type cpdef Column substring_deduplicate(Column input, size_type 
min_width) -cpdef tuple build_suffix_array(Column input) +cpdef Column build_suffix_array(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyi b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi index 26b06eed8ec..06249cc989b 100644 --- a/python/pylibcudf/pylibcudf/nvtext/dedup.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi @@ -3,4 +3,4 @@ from pylibcudf.column import Column def substring_deduplicate(input: Column, min_width: int) -> Column: ... -def build_suffix_array(input: Column) -> tuple[Column, Column]: ... +def build_suffix_array(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyx b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx index ab956475b15..58f07f83df6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/dedup.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.dedup cimport ( substring_deduplicate as cpp_substring_deduplicate, build_suffix_array as cpp_build_suffix_array, - suffix_array_pair_type as cpp_suffix_array_pair_type, + suffix_array_type as cpp_suffix_array_type ) from pylibcudf.libcudf.types cimport size_type @@ -17,7 +17,7 @@ from rmm.librmm.device_buffer cimport device_buffer __all__ = ["substring_deduplicate", "build_suffix_array"] -cdef Column _column_from_suffix_array(cpp_suffix_array_pair_type suffix_array): +cdef Column _column_from_suffix_array(cpp_suffix_array_type suffix_array): # helper to convert a suffix array to a Column return Column.from_libcudf( move( @@ -56,7 +56,7 @@ cpdef Column substring_deduplicate(Column input, size_type min_width): return Column.from_libcudf(move(c_result)) -cpdef tuple build_suffix_array(Column input): +cpdef Column build_suffix_array(Column input): """ Builds a suffix array for the input strings column @@ -72,12 +72,9 @@ cpdef tuple build_suffix_array(Column input): Column New column of suffix array """ - cdef cpp_suffix_array_pair_type c_result + cdef 
cpp_suffix_array_type c_result with nogil: c_result = cpp_build_suffix_array(input.view()) - return ( - _column_from_suffix_array(move(c_result.first)), - _column_from_suffix_array(move(c_result.second)), - ) + return _column_from_suffix_array(move(c_result)) From f5a23d2ec82084d639c886cbac106ccff8b2ea62 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 13 Mar 2025 20:08:52 -0400 Subject: [PATCH 9/9] add benchmark; debug bitonic --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/text/dedup.cpp | 55 ++++++++++++++++++++++++ cpp/include/nvtext/dedup.hpp | 2 + cpp/src/text/dedup.cu | 77 +++++++++++++++++++++------------- cpp/tests/text/dedup_tests.cpp | 61 +++++++++++++++++++++------ 5 files changed, 156 insertions(+), 40 deletions(-) create mode 100644 cpp/benchmarks/text/dedup.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5f1768bdcac..9bff2402eca 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -361,6 +361,7 @@ ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp transform/transform ConfigureNVBench( TEXT_NVBENCH text/byte_pair_encoding.cpp + text/dedup.cpp text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp diff --git a/cpp/benchmarks/text/dedup.cpp b/cpp/benchmarks/text/dedup.cpp new file mode 100644 index 00000000000..cb4d67dab3d --- /dev/null +++ b/cpp/benchmarks/text/dedup.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include + +#include + +static void bench_substring_deduplicate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const dup_width = static_cast(state.get_int64("dup_width")); + + data_profile const strings_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input(strings_table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + // output are integers (one per row) + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::substring_deduplicate(input, dup_width); + }); +} + +NVBENCH_BENCH(bench_substring_deduplicate) + .set_name("substring_deduplicate") + .add_int64_axis("dup_width", {50}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144}); diff --git a/cpp/include/nvtext/dedup.hpp b/cpp/include/nvtext/dedup.hpp index 6d3a1d80ec6..cc911ee64e3 100644 --- a/cpp/include/nvtext/dedup.hpp +++ b/cpp/include/nvtext/dedup.hpp @@ -56,12 +56,14 @@ std::unique_ptr substring_deduplicate( * @brief Builds a suffix array for the input strings column * * @param input Strings column to build suffix array for + * @param bitonic If true, use bitonic sort to build the suffix array * @param stream CUDA stream used for device 
memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Sorted suffix array and corresponding sizes */ std::unique_ptr> build_suffix_array( cudf::strings_column_view const& input, + bool bitonic = false, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index 1e7a333593a..f322634be97 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -43,6 +43,8 @@ #include #include +#include + namespace nvtext { namespace detail { namespace { @@ -64,32 +66,47 @@ struct sort_comparator_fn { } }; -#if 0 -__global__ void bitonic_sort_step( - sort_comparator_fn scfn, int64_t* d_indices, int64_t size, int64_t j, int64_t k) +__global__ [[maybe_unused]] void bitonic_sort_step( + sort_comparator_fn scfn, int64_t* d_indices, int64_t size, int64_t size2, int64_t j, int64_t k) { auto const i = cudf::detail::grid_1d::global_thread_id(); auto const ixj = i ^ j; - if (i >= size || ixj >= size) { return; } + if (i >= size2) { return; } - if ((ixj) > i) { + if (ixj > i) { + auto const di = d_indices[i]; + auto const dixj = d_indices[ixj]; + auto const gtlt = (i & k) == 0 ? 
'>' : '<'; + if (di == 2 || dixj == 2 || di == 16 || dixj == 16) + printf("%ld=%ld %c %ld=%ld\n", i, di, gtlt, ixj, dixj); if ((i & k) == 0) { - if (scfn(d_indices[ixj], d_indices[i])) { //(dev_values[i] > dev_values[ixj]) - auto const temp = d_indices[i]; // dev_values[i]; - d_indices[i] = d_indices[ixj]; // dev_values[i] = dev_values[ixj]; - d_indices[ixj] = temp; // dev_values[ixj] = temp; + if (di >= size || dixj >= size) { + if (dixj < di) { + auto const temp = di; + d_indices[i] = dixj; + d_indices[ixj] = temp; + } + } else if (scfn(dixj, di)) { //(dev_values[i] > dev_values[ixj]) + auto const temp = di; // dev_values[i]; + d_indices[i] = dixj; // dev_values[i] = dev_values[ixj]; + d_indices[ixj] = temp; // dev_values[ixj] = temp; } } else { - if (scfn(d_indices[i], d_indices[ixj])) { //(dev_values[i] < dev_values[ixj]) - auto const temp = d_indices[i]; // dev_values[i]; - d_indices[i] = d_indices[ixj]; // dev_values[i] = dev_values[ixj]; - d_indices[ixj] = temp; // dev_values[ixj] = temp; + if (di >= size || dixj >= size) { + if (dixj < di) { + auto const temp = di; + d_indices[i] = dixj; + d_indices[ixj] = temp; + } + } else if (scfn(di, dixj)) { //(dev_values[i] < dev_values[ixj]) + auto const temp = di; // dev_values[i]; + d_indices[i] = dixj; // dev_values[i] = dev_values[ixj]; + d_indices[ixj] = temp; // dev_values[ixj] = temp; } } } } -#endif __device__ cudf::size_type count_common_bytes(cudf::string_view lhs, cudf::string_view rhs) { @@ -215,14 +232,15 @@ build_suffix_array(cudf::strings_column_view const& input, auto [first_offset, last_offset] = cudf::strings::detail::get_first_and_last_offset(input, stream); - auto d_input_chars = input.chars_begin(stream) + first_offset; - auto chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + auto const chars_size = last_offset - first_offset; CUDF_EXPECTS(min_width < chars_size, "min_width value cannot exceed the input size"); - auto indices = 
rmm::device_uvector(chars_size - min_width + 1, stream); + auto size = chars_size - min_width + 1; + auto indices = rmm::device_uvector(size, stream); auto sizes = rmm::device_uvector(indices.size(), stream); - { + if (min_width > 0) { auto const cmp_op = sort_comparator_fn{d_input_chars, chars_size}; auto const seq = thrust::make_counting_iterator(0); auto tmp_bytes = std::size_t{0}; @@ -232,22 +250,25 @@ build_suffix_array(cudf::strings_column_view const& input, // std::cout << indices.size() * sizeof(int64_t) << "/" << tmp_bytes << std::endl; cub::DeviceMergeSort::SortKeysCopy( tmp_stg.data(), tmp_bytes, seq, indices.begin(), indices.size(), cmp_op, stream.value()); - } -#if 0 - { + } else { + size = chars_size - 8 + 1; + auto const size2 = 1L << static_cast(std::ceil(std::log2(size))); + std::cout << "size: " << size << " size2: " << size2 << std::endl; + indices.resize(size2, stream); + std::cout << "indices size: " << indices.size() << std::endl; thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end()); auto const cmp_op = sort_comparator_fn{d_input_chars, chars_size}; - auto size2 = 1 << static_cast(std::log2(chars_size) + 1.0); + auto const grid = cudf::detail::grid_1d(size2, 512); for (auto k = 2L; k <= size2; k <<= 1) { for (auto j = k >> 1; j > 0; j = j >> 1) { - auto grid = cudf::detail::grid_1d(chars_size, 512); + // printf("--------- k=%ld j=%ld\n", k, j); bitonic_sort_step<<>>( - cmp_op, indices.data(), (int64_t)indices.size(), j, k); + cmp_op, indices.data(), size, size2, j, k); } } - std::cout << "bitonic-sort " << (int)cudaStreamSynchronize(stream.value()) << std::endl; + // std::cout << "bitonic-sort " << (int)cudaStreamSynchronize(stream.value()) << std::endl; + indices.resize(size, stream); } -#endif return std::make_pair(std::make_unique>(std::move(indices)), std::make_unique>(std::move(sizes))); @@ -332,11 +353,11 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co std::unique_ptr> 
build_suffix_array( cudf::strings_column_view const& input, + bool bitonic, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto result = detail::build_suffix_array(input, 8, stream, mr); - return std::move(result.first); + return std::get<0>(detail::build_suffix_array(input, bitonic ? 0 : 8, stream, mr)); } } // namespace nvtext diff --git a/cpp/tests/text/dedup_tests.cpp b/cpp/tests/text/dedup_tests.cpp index 6429b70363a..d40aff218e2 100644 --- a/cpp/tests/text/dedup_tests.cpp +++ b/cpp/tests/text/dedup_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -48,20 +49,56 @@ TEST_F(TextDedupTest, StringDedup) auto sv = cudf::strings_column_view(input); - auto results = nvtext::substring_deduplicate(sv, 20); - auto expected = cudf::test::strings_column_wrapper({" 01234567890123456789 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // auto results = nvtext::substring_deduplicate(sv, 20); + // auto expected = cudf::test::strings_column_wrapper({" 01234567890123456789 "}); + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - results = nvtext::substring_deduplicate(sv, 15); - expected = cudf::test::strings_column_wrapper( - {" 01234567890123456789 ", ". 012345678901234", " reprehenderit "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + auto results = nvtext::substring_deduplicate(sv, 15); + cudf::test::print(results->view()); + // auto expected = cudf::test::strings_column_wrapper( + // {" 01234567890123456789 ", ". 
012345678901234", " reprehenderit "}); + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); // Test with sliced input - auto const sliced_input = cudf::slice(input, {1, 10}).front(); + // auto const sliced_input = cudf::slice(input, {1, 10}).front(); + // + // sv = cudf::strings_column_view(sliced_input); + // results = nvtext::substring_deduplicate(sv, 15); + // expected = cudf::test::strings_column_wrapper({"01234567890123456789 ", " reprehenderit "}); + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + +TEST_F(TextDedupTest, SuffixArray) +{ + // https://loremipsum.io/generator?n=25&t=p + // clang-format off + auto input = cudf::test::strings_column_wrapper({ + "Lorem ipsum dolor sit am"//et, consectetur adipiscing elit, sed do eiusmod tempor incididunt ", // 90 + // "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ", // 180 + // "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit ", // 270 + // "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 ", // 360 + // "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. ", // 450 + // "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil ", // 540 + // "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum ", // 630 + // "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo ", // 720 + // "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur ", // 810 + // "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit ", // 900 + // "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 
012345678901234 ", // 990 + }); + // clang-format on + + auto sv = cudf::strings_column_view(input); + std::cout << "input size: " << sv.chars_size(cudf::get_default_stream()) << std::endl; + + auto results = nvtext::build_suffix_array(sv); + auto results_column = + std::make_unique(std::move(*(results.release())), rmm::device_buffer{}, 0); + std::cout << "non-bitonic results: " << results_column->size() << std::endl; + cudf::test::print(results_column->view()); - sv = cudf::strings_column_view(sliced_input); - results = nvtext::substring_deduplicate(sv, 15); - expected = cudf::test::strings_column_wrapper({"01234567890123456789 ", " reprehenderit "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + results = nvtext::build_suffix_array(sv, true); + results_column = + std::make_unique(std::move(*(results.release())), rmm::device_buffer{}, 0); + std::cout << "bitonic results: " << results_column->size() << std::endl; + cudf::test::print(results_column->view()); }