Skip to content

Commit

Permalink
Rename miss_count to n_diff in diff_tbl and update output in summary()
Browse files Browse the repository at this point in the history
  • Loading branch information
gadenbuie committed Aug 31, 2018
1 parent 132eb08 commit 48b2f22
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 78 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: different
Title: Compare Differences in Data Frames
Version: 0.0.0.9102
Version: 0.0.0.9103
Authors@R: person("Garrick", "Aden-Buie", email =
"Garrick.Aden-Buie@moffitt.org", role = c("aut", "cre"))
Description: Tools for comparing and resolving differences between data
Expand Down
12 changes: 6 additions & 6 deletions R/compare.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ diff_compare <- function(
misses = purrr::map2(value.x, value.y, ~ not_equal(..1, ..2, tolerance)),
misses = purrr::pmap(list(misses, value.x, value.y), ~ union(..1, which(xor(is.na(..2), is.na(..3))))),
misses = purrr::map(misses, sort),
miss_count = purrr::map_int(misses, length),
miss_count = ifelse(purrr::map_lgl(value.x, is.null), NA, miss_count),
miss_count = ifelse(purrr::map_lgl(value.y, is.null), NA, miss_count),
state = ifelse(miss_count == 0, "same", "diff"),
n_diff = purrr::map_int(misses, length),
n_diff = ifelse(purrr::map_lgl(value.x, is.null), NA, n_diff),
n_diff = ifelse(purrr::map_lgl(value.y, is.null), NA, n_diff),
state = ifelse(n_diff == 0, "same", "diff"),
state = ifelse(purrr::map_lgl(value.x, is.null), "unique_y", state),
state = ifelse(purrr::map_lgl(value.y, is.null), "unique_x", state),
state = ifelse(grepl("^_row\\.", variable), "same", state) # Manually move row indices to "same" group
Expand All @@ -94,7 +94,7 @@ diff_compare <- function(
z_tidy_diff$value.x <- purrr::map2(z$diff$value.x, z$diff$misses, function(x, y) x[y])
z_tidy_diff$value.y <- purrr::map2(z$diff$value.y, z$diff$misses, function(x, y) x[y])
z_tidy_diff <- z_tidy_diff %>%
select(-miss_count, -state) %>%
select(-n_diff, -state) %>%
split(.$variable) %>%
purrr::map(~ {
tidyr::unnest(.) %>%
Expand All @@ -121,7 +121,7 @@ diff_compare <- function(
}

z <- purrr::map_dfr(z, ~ {
select(., variable, state, miss_count)
select(., variable, state, n_diff)
})

z <- if (is.null(z_tidy_diff)) {
Expand Down
53 changes: 35 additions & 18 deletions R/print.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ wrap_lines <- function(x, indent = 4) {
}

cat_differences <- function(z) {
n_differences <- sum(z$miss_count, na.rm = TRUE)
n_diffs <- sum(z$n_diff, na.rm = TRUE)
n_unique_columns_x <- sum(z$state == "unique_x")
n_unique_columns_y <- sum(z$state == "unique_y")
if (!n_differences) {
if (!n_diffs) {
if (n_unique_columns_x + n_unique_columns_y) {
cat_bullet("There were no differences in overlapping columns between {paste0('`', metadata(z)$names, '`', collapse = ' and ')}", bullet = "tick")
} else {
Expand All @@ -45,30 +45,38 @@ cat_differences <- function(z) {
} else {
n_diff_rows <- purrr::map(z$diff, "miss_index") %>%
purrr::reduce(union) %>% length()
n_differences <- glue::glue("{crayon::bold(n_differences)} differences")
cat_bullet("There were {crayon::red(n_differences)} ",
"across {sum(z$state == 'diff')} cols ",
"and {n_diff_rows} rows",
bullet = "pointer")
n_diffs <- glue::glue("{crayon::bold(n_diffs)} differences")
cat_bullet("There were {crayon::red(n_diffs)} across {sum(z$state == 'diff')} cols and {n_diff_rows} rows", bullet = "cross")
}
}

cat_overlapping_columns <- function(z) {
stopifnot(is_diff_tbl(z))
overlaps <- filter(z, !grepl("unique", state), !grepl("^_row", variable)) %>%
overlaps <-
z %>%
filter(
!grepl("unique", state),
!grepl("^_row", variable)
) %>%
group_by(state) %>%
summarize(vars = paste0("`", crayon::bold(variable), "`", collapse = ", "), n = n())
summarize(
vars = paste0("`", crayon::bold(variable), "`", collapse = ", "),
n = n()
)

cat_bullet("There are {crayon::bold(sum(overlaps$n))} columns that appear in both")
noun <- pluralize(sum(overlaps$n), "column")
cat_bullet("{crayon::bold(sum(overlaps$n))} {noun} appear in both {df_name(z, 'x', 0.3)} and {df_name(z, 'y', 0.3)}")

if ("same" %in% overlaps$state) {
same_cols <- overlaps %>% filter(state == "same")
cli::cat_line(crayon::green(glue::glue(" \U2714 {crayon::bold(same_cols$n)} cols are identical: ")))
noun <- pluralize(same_cols$n, "column")
cli::cat_line(crayon::green(glue::glue(" \U2714 {crayon::bold(same_cols$n)} {noun} have identical entries: ")))
cat_variable_names(z, "same")
}
if ("diff" %in% overlaps$state) {
diff_cols <- overlaps %>% filter(state == "diff")
cli::cat_line(crayon::red(glue::glue(" \u2716 {crayon::bold(diff_cols$n)} cols have differences: ")))
noun <- pluralize(diff_cols$n, "column")
cli::cat_line(crayon::red(glue::glue(" \u2716 {crayon::bold(diff_cols$n)} {noun} have differences: ")))
cat_variable_names(z, "diff", 4)
}
}
Expand All @@ -89,15 +97,15 @@ cat_unique_columns <- function(z) {
n_unique_columns_y <- sum(z$state == "unique_y")

if (n_unique_columns_x) {
plural <- if (n_unique_columns_x > 1) "s" else ""
cat_bullet("{crayon::italic(metadata(z, 'names')['x'])} has ",
"{crayon::bold(n_unique_columns_x)} unique column{plural}:")
noun <- pluralize(n_unique_columns_x, "column")
verb <- pluralize(n_unique_columns_x, "is", "are")
cat_bullet("{crayon::bold(n_unique_columns_x)} {noun} in {crayon::bold(df_name(z, 'x'))} {verb} not in {df_name(z, 'y')}:")
cat_variable_names(z, "unique_x")
}
if (n_unique_columns_y) {
plural <- if (n_unique_columns_y > 1) "s" else ""
cat_bullet("{crayon::italic(metadata(z, 'names')['y'])} has ",
"{crayon::bold(n_unique_columns_y)} unique column{plural}:")
noun <- pluralize(n_unique_columns_y, "column")
verb <- pluralize(n_unique_columns_y, "is", "are")
cat_bullet("{crayon::bold(n_unique_columns_y)} {noun} in {crayon::bold(df_name(z, 'y'))} {verb} not in {df_name(z, 'x')}:")
cat_variable_names(z, "unique_y")
}
}
Expand All @@ -110,3 +118,12 @@ cat_dimensions <- function(z) {
cli::cat_line(dims)
cli::cat_line()
}

# Pick the label used for one side of a comparison in printed output.
#
# z:              object whose "names" metadata holds the data frame names.
# name_of:        which entry of that metadata to look up (default "x").
# truncate_width: fraction of the console width the stored name may occupy.
#
# Returns the stored name, or `name_of` itself when the stored name is wider
# than the allowed share of the console.
df_name <- function(z, name_of = "x", truncate_width = 0.45) {
  label <- metadata(z, "names")[[name_of]]
  max_chars <- cli::console_width() * truncate_width
  if (nchar(label) > max_chars) {
    return(name_of)
  }
  label
}

# Choose the singular or plural form of a word to go with a count.
#
# n:          the count being described (a single number).
# base:       singular form, returned only when `n` is exactly 1.
# pluralized: plural form, returned for every other count; defaults to
#             `base` with "s" appended.
#
# Returns either `base` or `pluralized`.
pluralize <- function(n, base, pluralized = paste0(base, "s")) {
  # Only a count of exactly one is singular; zero takes the plural form
  # ("0 columns are"), so test for equality with 1 rather than `n < 2`.
  if (n == 1) base else pluralized
}
106 changes: 53 additions & 53 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,42 +18,42 @@ Suppose you have two data sets that you would like to compare.

``` r
x
#> # A tibble: 996 x 11
#> # Groups: id_01, id_02 [996]
#> id_01 id_02 colname_01 colname_02 colname_03 colname_04 colname_05
#> <chr> <chr> <chr> <chr> <int> <fct> <int>
#> 1 jwmq… nhxu… pwxvgf uvwtvc 20 b -37
#> 2 edps… yjhk… hrlqtz hxeucz 46 a 7
#> 3 ykal… sazu… abptix jiuvod -20 g 7
#> 4 bemr… plsf… bwgyok vzvsuj -2 c -14
#> 5 rseu… umjp… wghpbq ldkdbn -2 h -29
#> 6 mded… rlyy… gaezvt wiirlg 3 c -38
#> 7 qxqd… mtgl… phunsl bsxvoo 50 a 3
#> 8 rwau… htcy… irboxg kacplf -11 h -28
#> 9 jlfb… foac… iweywv sxcnhv 8 h -38
#> 10 pird… aqoe… vsnlia vovrjp -35 i 33
#> # ... with 986 more rows, and 4 more variables: colname_06 <int>,
#> # colname_07 <chr>, colname_08 <chr>, colname_10 <int>
#> # A tibble: 997 x 11
#> # Groups: id_01, id_02 [997]
#> id_01 id_02 colname_01 colname_02 colname_04 colname_05 colname_06
#> <int> <chr> <dbl> <int> <dbl> <fct> <fct>
#> 1 148 maqt… 62.7 -32 26.7 c a
#> 2 397 bfqt… 55.4 38 46.5 b h
#> 3 167 oqxt… 19.7 -33 66.6 c i
#> 4 105 vyfo… 26.5 -41 56.0 a e
#> 5 583 wzzf… 52.8 -26 0.0360 g a
#> 6 693 ykuj… 42.8 2 89.5 f h
#> 7 637 kcxq… 18.2 -29 14.5 d e
#> 8 777 vcsx… 42.7 19 43.0 c h
#> 9 814 mrrz… 73.9 28 69.4 g d
#> 10 933 yusl… 25.2 -44 20.4 j i
#> # ... with 987 more rows, and 4 more variables: colname_07 <chr>,
#> # colname_08 <chr>, colname_09 <chr>, colname_10 <fct>
```

``` r
y
#> # A tibble: 993 x 10
#> # Groups: id_01, id_02 [993]
#> id_01 id_02 colname_01 colname_02 colname_04 colname_05 colname_06
#> <chr> <chr> <chr> <chr> <fct> <chr> <int>
#> 1 bwjl… lhri… ajltcz fyznuc f 18 38
#> 2 pefj… vdeh… mslfwp spoxof a -16 -18
#> 3 yeev… jwue… leisxo qmbpdo h 10 -29
#> 4 herd… anfh… fdprqg mtdacb a -27 -19
#> 5 ydmj… pjrk… vgvkqn lclsci f 31 46
#> 6 pdnv… vzdq… safrup lahlvy i 19 -46
#> 7 wohv… xwti… uxtimp zuvrqb f -21 -44
#> 8 lpqc… atog… iaynzf trmxkp b 46 44
#> 9 lzrs… oyyr… trjucy jmtbxm f -28 -40
#> 10 fmbt… hlip… iegvcp ofcqmh e 20 -14
#> # ... with 983 more rows, and 3 more variables: colname_07 <chr>,
#> # colname_08 <chr>, colname_10 <int>
#> id_01 id_02 colname_01 colname_02 colname_04 colname_06 colname_07
#> <int> <chr> <dbl> <chr> <dbl> <fct> <chr>
#> 1 489 szgh… 9.58 -35 11.7 f esuqju
#> 2 198 twkh… 61.7 34 59.6 i zhdoss
#> 3 612 ttki… 91.4 7 75.6 h wbjhhf
#> 4 873 dsng… 66.7 41 77.5 h wsudet
#> 5 24 mnzo… 45.2 -32 63.4 h yamuei
#> 6 15 lpzc… 42.6 26 60.4 g flhcox
#> 7 757 ygah… 46.3 -42 36.1 b xrtzih
#> 8 909 dmjd… 23.9 50 92.2 d lvudci
#> 9 717 immn… 17.3 -13 16.0 c csvjhr
#> 10 196 ddzh… 5.07 34 72.8 i orqjut
#> # ... with 983 more rows, and 3 more variables: colname_08 <chr>,
#> # colname_09 <chr>, colname_10 <fct>
```

Both have ID columns `id_01` and `id_02`, but we know they will differ
Expand All @@ -66,7 +66,7 @@ in their number of rows and columns and that the rows are unordered.
``` r
> z
<diff_tbl: x vs y>
There were 243 differences across 8 cols and 152 rows
There were 231 differences across 8 cols and 148 rows
```

``` r
Expand All @@ -75,35 +75,35 @@ in their number of rows and columns and that the rows are unordered.
# Dimensions
set rows cols
----- ----- -----
x 996 11
x 997 11
y 993 10

# Columns
x has 1 unique column:
`colname_03`
There are 10 columns that appear in both
2 cols are identical:
1 column in x is not in y:
`colname_05`
10 columns appear in both x and y
2 columns have identical entries:
`id_01`, `id_02`
8 cols have differences:
`colname_01`, `colname_02`, `colname_04`, `colname_05`, `colname_06`, `colname_07`,
`colname_08`,
8 columns have differences:
`colname_01`, `colname_02`, `colname_04`, `colname_06`, `colname_07`, `colname_08`,
`colname_09`,
`colname_10`

# Differences
There were 243 differences across 8 cols and 152 rows
variable type.x type.y state miss_count diff
----- ----- ----- ----- ----- ------
colname_01 character character diff 31 <tibble [31 × 7]>
colname_02 character character diff 31 <tibble [31 × 7]>
colname_04 factor factor diff 29 <tibble [29 × 7]>
colname_05 integer character diff 30 <tibble [30 × 7]>
colname_06 integer integer diff 29 <tibble [29 × 7]>
colname_07 character character diff 31 <tibble [31 × 7]>
colname_08 character character diff 31 <tibble [31 × 7]>
colname_10 integer integer diff 31 <tibble [31 × 7]>
id_01 character character same 0 <NULL>
id_02 character character same 0 <NULL>
colname_03 integer <NA> unique_x NA <NULL>
There were 231 differences across 8 cols and 148 rows
variable type.x type.y state n_diff diff
----- ----- ----- ----- ----- ------
colname_01 numeric numeric diff 29 <tibble [29 × 7]>
colname_02 integer character diff 30 <tibble [30 × 7]>
colname_04 numeric numeric diff 30 <tibble [30 × 7]>
colname_06 factor factor diff 26 <tibble [26 × 7]>
colname_07 character character diff 30 <tibble [30 × 7]>
colname_08 character character diff 30 <tibble [30 × 7]>
colname_09 character character diff 30 <tibble [30 × 7]>
colname_10 factor factor diff 26 <tibble [26 × 7]>
id_01 integer integer same 0 <NULL>
id_02 character character same 0 <NULL>
colname_05 factor <NA> unique_x NA <NULL>
```

``` r
Expand Down
Binary file modified man/figures/README-diff_tbl-plot-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 48b2f22

Please sign in to comment.