From eacfbba67d13bda239a79fa612942517e261f442 Mon Sep 17 00:00:00 2001 From: Maciej Date: Thu, 30 Nov 2023 22:16:02 +0100 Subject: [PATCH] sample size k build changed, tests added --- .Rproj.user/E3DB6272/pcs/files-pane.pper | 2 +- .Rproj.user/E3DB6272/pcs/source-pane.pper | 2 +- .Rproj.user/shared/notebooks/paths | 1 + NEWS.md | 1 + R/method_nnd.R | 2 +- README.Rmd | 1 + README.md | 26 +++++++++++------------ inst/tinytest/test_reclin2.R | 2 +- man/controls_ann.Rd | 6 +++--- 9 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.Rproj.user/E3DB6272/pcs/files-pane.pper b/.Rproj.user/E3DB6272/pcs/files-pane.pper index 95effdb..7355041 100644 --- a/.Rproj.user/E3DB6272/pcs/files-pane.pper +++ b/.Rproj.user/E3DB6272/pcs/files-pane.pper @@ -9,5 +9,5 @@ "ascending": false } ], - "path": "~/git/nauka/ncn-foreigners/software/blocking/R" + "path": "~/git/nauka/ncn-foreigners/software/blocking" } \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/pcs/source-pane.pper b/.Rproj.user/E3DB6272/pcs/source-pane.pper index be19143..c755c58 100644 --- a/.Rproj.user/E3DB6272/pcs/source-pane.pper +++ b/.Rproj.user/E3DB6272/pcs/source-pane.pper @@ -1,4 +1,4 @@ { - "activeTab": 0, + "activeTab": 2, "activeTabSourceWindow0": 0 } \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index 2302d89..5149a5a 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -9,6 +9,7 @@ /Users/berenz/git/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="B0938ADD" /Users/berenz/git/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="C19508EB" /Users/berenz/git/nauka/ncn-foreigners/software/blocking/R/method_mlpack.R="9402F0E3" +/Users/berenz/git/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="E5F797EE" /Users/berenz/git/nauka/ncn-foreigners/software/blocking/R/methods.R="081419BC" /Users/berenz/git/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="D089A6FC" 
/Users/berenz/git/nauka/ncn-foreigners/software/blocking/README.Rmd="2B1049F0" diff --git a/NEWS.md b/NEWS.md index 7d88a39..7bb0268 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,3 +10,4 @@ 8. first vignette added. 9. evaluation with standard metrics (recall, fpr etc) added, works with vector for deduplication. 10. added saving index for hnsw and annoy +11. `rnndescent` support added. diff --git a/R/method_nnd.R b/R/method_nnd.R index 8ae94bd..46c4c72 100644 --- a/R/method_nnd.R +++ b/R/method_nnd.R @@ -27,7 +27,7 @@ method_nnd <- function(x, control) { l_ind <- rnndescent::rnnd_build(data = x, - k = control$nnd$k_build, + k = if (nrow(x) < control$nnd$k_build) nrow(x) else control$nnd$k_build, metric = distance, verbose = verbose, n_threads = n_threads, diff --git a/README.Rmd b/README.Rmd index b7337db..3250a0d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -24,6 +24,7 @@ A small package used to block records for data deduplication and record linkage Currently supports the following R packages that binds to specific ANN algorithms ++ [rnndescent](https://cran.r-project.org/package=rnndescent) (default), + [RcppHNSW](https://cran.r-project.org/package=RcppHNSW), + [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy), + [mlpack](https://cran.r-project.org/package=RcppAnnoy) (see `mlpack::lsh` and `mlpack::knn`). diff --git a/README.md b/README.md index d6a3ed8..7506bd8 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ and graphs (via `igraph`). Currently supports the following R packages that binds to specific ANN algorithms +- [rnndescent](https://cran.r-project.org/package=rnndescent) (default), - [RcppHNSW](https://cran.r-project.org/package=RcppHNSW), - [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy), - [mlpack](https://cran.r-project.org/package=RcppAnnoy) (see @@ -93,13 +94,10 @@ Deduplication using blocking ``` r blocking_result <- blocking(x = df_example$txt) -#> 'as(, "dgCMatrix")' is deprecated. -#> Use 'as(., "CsparseMatrix")' instead. 
-#> See help("Deprecated") and help("Matrix-deprecated"). ## data frame with indices and block blocking_result #> ======================================================== -#> Blocking based on the hnsw method. +#> Blocking based on the nnd method. #> Number of blocks: 2. #> Number of columns used for blocking: 28. #> Reduction ratio: 0.5714. @@ -113,16 +111,16 @@ Table with blocking ``` r blocking_result$result -#> x y block -#> -#> 1: 1 2 1 -#> 2: 2 1 1 -#> 3: 2 3 1 -#> 4: 2 4 1 -#> 5: 5 6 2 -#> 6: 5 7 2 -#> 7: 5 8 2 -#> 8: 6 5 2 +#> x y block dist +#> +#> 1: 1 2 1 0.10000005 +#> 2: 1 3 1 0.14188367 +#> 3: 1 4 1 0.28286284 +#> 4: 2 1 1 0.10000005 +#> 5: 5 6 2 0.08333336 +#> 6: 5 7 2 0.13397458 +#> 7: 5 8 2 0.27831215 +#> 8: 6 5 2 0.08333336 ``` Deduplication followed by the `reclin2` package diff --git a/inst/tinytest/test_reclin2.R b/inst/tinytest/test_reclin2.R index fa29719..b6d56fd 100644 --- a/inst/tinytest/test_reclin2.R +++ b/inst/tinytest/test_reclin2.R @@ -1,7 +1,7 @@ source("test_data.R") expect_silent( - pair_ann(x = df_example, on = "txt") + pair_ann(x = df_example, on = "txt", ann = "hnsw") ) expect_equal( diff --git a/man/controls_ann.Rd b/man/controls_ann.Rd index a07d62b..0131e3f 100644 --- a/man/controls_ann.Rd +++ b/man/controls_ann.Rd @@ -6,9 +6,9 @@ \usage{ controls_ann( sparse = FALSE, - nnd = list(k_build = 2, use_alt_metric = TRUE, init = "tree", n_trees = NULL, leaf_size - = NULL, max_tree_depth = 200, margin = "auto", n_iters = NULL, delta = 0.001, - max_candidates = NULL, low_memory = TRUE, n_search_trees = 1, + nnd = list(k_build = 30, use_alt_metric = TRUE, init = "tree", n_trees = NULL, + leaf_size = NULL, max_tree_depth = 200, margin = "auto", n_iters = NULL, delta = + 0.001, max_candidates = NULL, low_memory = TRUE, n_search_trees = 1, pruning_degree_multiplier = 1.5, diversify_prob = 1, progress = "bar", obs = "R"), hnsw = list(M = 25, ef_c = 200, ef_s = 200, grain_size = 1, byrow = TRUE), lsh = list(bucket_size = 500, hash_width = 
10, num_probes = 0, projections = 10, tables