Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Use c++ for ints and doubles #26

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@
.DS_Store
.pre-commit-config.yaml
docs
todo.txt
*.txt
CRAN-SUBMISSION
codecov.yml
revdep/
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,5 @@ Depends:
R (>= 4.1.0)
LazyData: true
Config/Needs/website: rmarkdown
LinkingTo:
cpp11
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,4 @@ importFrom(vctrs,vec_as_names)
importFrom(vctrs,vec_interleave)
importFrom(vctrs,vec_locate_matches)
importFrom(vctrs,vec_ptype_common)
useDynLib(versus, .registration = TRUE)
45 changes: 8 additions & 37 deletions R/compare.R
Original file line number Diff line number Diff line change
Expand Up @@ -146,37 +146,28 @@ locate_matches <- function(table_a, table_b, by) {
fsubset(table_a, j = by),
fsubset(table_b, j = by),
relationship = "one-to-one",
no_match = -1L,
remaining = -2L
remaining = NA_integer_
)
match_group <- fcase(
matches$haystack == -1, "a",
matches$needles == -2, "b",
default = "common"
)
out <- lapply(matches, gsplit, match_group, use.g.names = TRUE)
out$haystack$a <- NULL
out$needles$b <- NULL
out
exec(split_matches, !!!matches)
}

get_unmatched_rows <- function(table_a, table_b, by, matches) {
unmatched <- list(
a = fsubset(table_a, matches$needles$a, by),
b = fsubset(table_b, matches$haystack$b, by)
a = fsubset(table_a, matches$a, by),
b = fsubset(table_b, matches$b, by)
)
unmatched %>%
bind_rows(.id = "table") %>%
mutate(row = with(matches, c(needles$a, haystack$b)) %||% integer(0)) %>%
mutate(row = with(matches, c(a, b))) %>%
as_tibble()
}

converge <- function(table_a, table_b, by, matches) {
common_cols <- setdiff(intersect(names(table_a), names(table_b)), by)

by_a <- fsubset(table_a, matches$needles$common, by)
common_a <- fsubset(table_a, matches$needles$common, common_cols)
common_b <- fsubset(table_b, matches$haystack$common, common_cols)
by_a <- fsubset(table_a, matches$common$a, by)
common_a <- fsubset(table_a, matches$common$a, common_cols)
common_b <- fsubset(table_b, matches$common$b, common_cols)

add_vars(
by_a,
Expand Down Expand Up @@ -208,26 +199,6 @@ get_contents <- function(table_a, table_b, by) {
out
}

get_diff_rows <- function(col, table_a, table_b, matches, allow_both_NA) {
col_a <- fsubset(table_a, matches$needles$common, col)[[1]]
col_b <- fsubset(table_b, matches$haystack$common, col)[[1]]
not_equal <- which(not_equal(col_a, col_b, allow_both_NA))
tibble(
row_a = matches$needles$common[not_equal] %||% integer(0),
row_b = matches$haystack$common[not_equal] %||% integer(0)
)
}

not_equal <- function(col_a, col_b, allow_both_NA) {
neq <- col_a != col_b
if (allow_both_NA) {
out <- fcoalesce(neq, is.na(col_a) != is.na(col_b))
} else {
out <- fcoalesce(neq, is.na(col_a) | is.na(col_b))
}
out
}

store_tables <- function(table_a, table_b) {
inform_dt_copy(table_a, table_b)
env <- new_environment()
Expand Down
13 changes: 13 additions & 0 deletions R/cpp11.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Generated by cpp11: do not edit by hand

get_diff_rows_int <- function(vec_a, vec_b, idx_a, idx_b) {
.Call(`_versus_get_diff_rows_int`, vec_a, vec_b, idx_a, idx_b)
}

get_diff_rows_dbl <- function(vec_a, vec_b, idx_a, idx_b) {
.Call(`_versus_get_diff_rows_dbl`, vec_a, vec_b, idx_a, idx_b)
}

split_matches <- function(needles, haystack) {
.Call(`_versus_split_matches`, needles, haystack)
}
57 changes: 57 additions & 0 deletions R/get-diff-rows.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
get_diff_rows <- function(col, table_a, table_b, matches, allow_both_NA) {
if (can_use_cpp(col, table_a, table_b, matches, allow_both_NA)) {
cpp_get_diff_rows(
table_a[[col]],
table_b[[col]],
matches$common$a,
matches$common$b
)
} else {
r_get_diff_rows(col, table_a, table_b, matches, allow_both_NA)
}
}

can_use_cpp <- function(col, table_a, table_b, matches, allow_both_NA) {
if (!allow_both_NA) {
return(FALSE)
}
class_a <- class(table_a[[col]])
class_b <- class(table_b[[col]])
if (!identical(class_a, class_b)) {
return(FALSE)
}
for (cpp_class in c("integer", "numeric", "Date")) {
if (identical(class_a, cpp_class)) {
return(TRUE)
}
}
return(FALSE)
}

cpp_get_diff_rows <- function(vec_a, vec_b, idx_a, idx_b) {
class_a <- class(vec_a)
if (identical(class_a, "numeric") || identical(class_a, "Date")) {
return(get_diff_rows_dbl(vec_a, vec_b, idx_a, idx_b))
}
if (identical(class_a, "integer")) {
return(get_diff_rows_int(vec_a, vec_b, idx_a, idx_b))
}
}

r_get_diff_rows <- function(col, table_a, table_b, matches, allow_both_NA) {
col_a <- fsubset(table_a, matches$common$a, col)[[1]]
col_b <- fsubset(table_b, matches$common$b, col)[[1]]
matches$common %>%
fsubset(not_equal(col_a, col_b, allow_both_NA)) %>%
rename_with(\(x) paste0("row_", x))
}

not_equal <- function(col_a, col_b, allow_both_NA) {
neq <- col_a != col_b
if (allow_both_NA) {
out <- fcoalesce(neq, is.na(col_a) != is.na(col_b))
} else {
out <- fcoalesce(neq, is.na(col_a) | is.na(col_b))
}
out
}
5 changes: 1 addition & 4 deletions R/helpers.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
fsubset <- function(x, i, j) {
if (!missing(i) && (is_null(i) || identical(i, 0))) {
i <- integer(0)
}
ss(x, i, j, check = FALSE)
}

Expand Down Expand Up @@ -52,7 +49,7 @@ table_init <- function(comparison, cols = c("intersection", "by"), tbl = c("a",
# simulate a data frame with the same classes as table_[tbl]
cols <- arg_match(cols)
tbl <- arg_match(tbl)
fsubset(comparison$input$value[[tbl]], 0, comparison[[cols]]$column)
fsubset(comparison$input$value[[tbl]], integer(0), comparison[[cols]]$column)
}

get_cols_from_comparison <- function(
Expand Down
4 changes: 2 additions & 2 deletions R/slice-diffs.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ slice_diffs <- function(comparison, table, column = everything()) {
}

slice_diffs_impl <- function(comparison, table, column, j, call = caller_env()) {
diff_cols <- identify_diff_cols(comparison, column)
diff_cols <- identify_diff_cols(comparison, column, call = call)
if (is_empty(diff_cols)) {
out <- fsubset(comparison$input$value[[table]], 0, j)
out <- fsubset(comparison$input$value[[table]], integer(0), j)
return(as_tibble(out))
}
rows <- fsubset(comparison$intersection, diff_cols, "diff_rows")[[1]] %>%
Expand Down
1 change: 1 addition & 0 deletions R/versus.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@
#' @importFrom tibble tibble rownames_to_column enframe
#' @importFrom collapse ss add_vars frename gsplit
#' @importFrom data.table fcase fcoalesce copy
#' @useDynLib versus, .registration = TRUE
"_PACKAGE"
3 changes: 3 additions & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.o
*.so
*.dll
66 changes: 66 additions & 0 deletions src/code.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#include <cpp11.hpp>
using namespace cpp11;

[[cpp11::register]]
data_frame get_diff_rows_int(integers vec_a, integers vec_b, integers idx_a, integers idx_b) {
int n = idx_a.size();
writable::integers row_a, row_b;

for (int i = 0; i < n; ++i) {
if (vec_a[idx_a[i] - 1] != vec_b[idx_b[i] - 1]) {
row_a.push_back(idx_a[i]);
row_b.push_back(idx_b[i]);
}
}
writable::data_frame out({"row_a"_nm = row_a, "row_b"_nm = row_b});
out.attr("class") = {"tbl_df", "tbl", "data.frame"};
return std::move(out);
}


[[cpp11::register]]
data_frame get_diff_rows_dbl(doubles vec_a, doubles vec_b, integers idx_a, integers idx_b) {
int n = idx_a.size();
writable::integers row_a, row_b;

for (int i = 0; i < n; ++i) {
double a = vec_a[idx_a[i] - 1];
double b = vec_b[idx_b[i] - 1];
if (a != b) {
if (R_IsNA(a) && R_IsNA(b)) {
continue;
}
row_a.push_back(idx_a[i]);
row_b.push_back(idx_b[i]);
}
}
writable::data_frame out({"row_a"_nm = row_a, "row_b"_nm = row_b});
out.attr("class") = {"tbl_df", "tbl", "data.frame"};
return std::move(out);
}

[[cpp11::register]]
list split_matches(integers needles, integers haystack) {
// split `matches` into
// a = vector of rows only in `table_a`
// b = vector of rows only in `table_b`
// common = tibble of indices from both tables
int n = needles.size();
writable::integers a, b, common_a, common_b;

for (int i = 0; i < n; ++i) {
if (needles[i] == NA_INTEGER) {
b.push_back(haystack[i]);
} else if (haystack[i] == NA_INTEGER) {
a.push_back(needles[i]);
} else {
common_a.push_back(needles[i]);
common_b.push_back(haystack[i]);
}
}
writable::data_frame common({"a"_nm = common_a, "b"_nm = common_b});
common.attr("class") = {"tbl_df", "tbl", "data.frame"};
writable::list out({"common"_nm = common, "a"_nm = a, "b"_nm = b});
return std::move(out);
}

43 changes: 43 additions & 0 deletions src/cpp11.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Generated by cpp11: do not edit by hand
// clang-format off


#include "cpp11/declarations.hpp"
#include <R_ext/Visibility.h>

// code.cpp
data_frame get_diff_rows_int(integers vec_a, integers vec_b, integers idx_a, integers idx_b);
extern "C" SEXP _versus_get_diff_rows_int(SEXP vec_a, SEXP vec_b, SEXP idx_a, SEXP idx_b) {
BEGIN_CPP11
return cpp11::as_sexp(get_diff_rows_int(cpp11::as_cpp<cpp11::decay_t<integers>>(vec_a), cpp11::as_cpp<cpp11::decay_t<integers>>(vec_b), cpp11::as_cpp<cpp11::decay_t<integers>>(idx_a), cpp11::as_cpp<cpp11::decay_t<integers>>(idx_b)));
END_CPP11
}
// code.cpp
data_frame get_diff_rows_dbl(doubles vec_a, doubles vec_b, integers idx_a, integers idx_b);
extern "C" SEXP _versus_get_diff_rows_dbl(SEXP vec_a, SEXP vec_b, SEXP idx_a, SEXP idx_b) {
BEGIN_CPP11
return cpp11::as_sexp(get_diff_rows_dbl(cpp11::as_cpp<cpp11::decay_t<doubles>>(vec_a), cpp11::as_cpp<cpp11::decay_t<doubles>>(vec_b), cpp11::as_cpp<cpp11::decay_t<integers>>(idx_a), cpp11::as_cpp<cpp11::decay_t<integers>>(idx_b)));
END_CPP11
}
// code.cpp
list split_matches(integers needles, integers haystack);
extern "C" SEXP _versus_split_matches(SEXP needles, SEXP haystack) {
BEGIN_CPP11
return cpp11::as_sexp(split_matches(cpp11::as_cpp<cpp11::decay_t<integers>>(needles), cpp11::as_cpp<cpp11::decay_t<integers>>(haystack)));
END_CPP11
}

extern "C" {
static const R_CallMethodDef CallEntries[] = {
{"_versus_get_diff_rows_dbl", (DL_FUNC) &_versus_get_diff_rows_dbl, 4},
{"_versus_get_diff_rows_int", (DL_FUNC) &_versus_get_diff_rows_int, 4},
{"_versus_split_matches", (DL_FUNC) &_versus_split_matches, 2},
{NULL, NULL, 0}
};
}

extern "C" attribute_visible void R_init_versus(DllInfo* dll){
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
R_forceSymbols(dll, TRUE);
}
19 changes: 19 additions & 0 deletions tests/testthat/_snaps/slice-diffs.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,22 @@
Error in `slice_diffs()`:
! `table` is absent but must be supplied.

# Error on slice_diffs() with empty selection

Code
slice_diffs(comp, "a", where(is.factor))
Condition
Error in `slice_diffs()`:
! Problem with argument `column = where(is.factor)`:
* Must select at least one item.

# Error on value_diffs when column doesn't exist

Code
slice_diffs(comp, "a", bear)
Condition
Error in `slice_diffs()`:
! Problem with argument `column = bear`:
* Must select columns from `comparison$intersection`
i column `bear` is not part of the supplied comparison

56 changes: 56 additions & 0 deletions tests/testthat/_snaps/weave-diffs.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,59 @@
! Problem with argument `comparison = example_df_a`
i `comparison` must be the output of `versus::compare()`

# Error when `column` isn't a comparison

Code
weave_diffs_long(example_df_a, disp)
Condition
Error in `weave_diffs_long()`:
! Problem with argument `comparison = example_df_a`
i `comparison` must be the output of `versus::compare()`

---

Code
weave_diffs_wide(example_df_a, disp)
Condition
Error in `weave_diffs_wide()`:
! Problem with argument `comparison = example_df_a`
i `comparison` must be the output of `versus::compare()`

# Error on weave_diffs() with empty selection

Code
weave_diffs_long(comp, where(is.factor))
Condition
Error in `weave_diffs_long()`:
! Problem with argument `column = where(is.factor)`:
* Must select at least one item.

---

Code
weave_diffs_wide(comp, where(is.factor))
Condition
Error in `weave_diffs_wide()`:
! Problem with argument `column = where(is.factor)`:
* Must select at least one item.

# Error on value_diffs when column doesn't exist

Code
weave_diffs_long(comp, bear)
Condition
Error in `weave_diffs_long()`:
! Problem with argument `column = bear`:
* Must select columns from `comparison$intersection`
i column `bear` is not part of the supplied comparison

---

Code
weave_diffs_wide(comp, bear)
Condition
Error in `weave_diffs_wide()`:
! Problem with argument `column = bear`:
* Must select columns from `comparison$intersection`
i column `bear` is not part of the supplied comparison

Loading