Rename miss_count to n_diff in diff_tbl and update output in summary() …

…#7
GerkeLab · Aug 31, 2018 · 48b2f22 · 48b2f22
1 parent 132eb08
commit 48b2f22
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 78 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: different
 Title: Compare Differences in Data Frames
-Version: 0.0.0.9102
+Version: 0.0.0.9103
 Authors@R: person("Garrick", "Aden-Buie", email =
         "Garrick.Aden-Buie@moffitt.org", role = c("aut", "cre"))
 Description: Tools for comparing and resolving differences between data

diff --git a/R/compare.R b/R/compare.R
@@ -79,10 +79,10 @@ diff_compare <- function(
       misses     = purrr::map2(value.x, value.y, ~ not_equal(..1, ..2, tolerance)),
       misses     = purrr::pmap(list(misses, value.x, value.y), ~ union(..1, which(xor(is.na(..2), is.na(..3))))),
       misses     = purrr::map(misses, sort),
-      miss_count = purrr::map_int(misses, length),
-      miss_count = ifelse(purrr::map_lgl(value.x, is.null), NA, miss_count),
-      miss_count = ifelse(purrr::map_lgl(value.y, is.null), NA, miss_count),
-      state      = ifelse(miss_count == 0, "same", "diff"),
+      n_diff     = purrr::map_int(misses, length),
+      n_diff     = ifelse(purrr::map_lgl(value.x, is.null), NA, n_diff),
+      n_diff     = ifelse(purrr::map_lgl(value.y, is.null), NA, n_diff),
+      state      = ifelse(n_diff == 0, "same", "diff"),
       state      = ifelse(purrr::map_lgl(value.x, is.null), "unique_y", state),
       state      = ifelse(purrr::map_lgl(value.y, is.null), "unique_x", state),
       state      = ifelse(grepl("^_row\\.", variable), "same", state)             # Manually move row indices to "same" group
@@ -94,7 +94,7 @@ diff_compare <- function(
     z_tidy_diff$value.x <- purrr::map2(z$diff$value.x, z$diff$misses, function(x, y) x[y])
     z_tidy_diff$value.y <- purrr::map2(z$diff$value.y, z$diff$misses, function(x, y) x[y])
     z_tidy_diff <- z_tidy_diff %>%
-      select(-miss_count, -state) %>%
+      select(-n_diff, -state) %>%
       split(.$variable) %>%
       purrr::map(~ {
         tidyr::unnest(.) %>%
@@ -121,7 +121,7 @@ diff_compare <- function(
   }
 
   z <- purrr::map_dfr(z, ~ {
-    select(., variable, state, miss_count)
+    select(., variable, state, n_diff)
   })
 
   z <- if (is.null(z_tidy_diff)) {

diff --git a/R/print.R b/R/print.R
@@ -33,10 +33,10 @@ wrap_lines <- function(x, indent = 4) {
 }
 
 cat_differences <- function(z) {
-  n_differences <- sum(z$miss_count, na.rm = TRUE)
+  n_diffs <- sum(z$n_diff, na.rm = TRUE)
   n_unique_columns_x <- sum(z$state == "unique_x")
   n_unique_columns_y <- sum(z$state == "unique_y")
-  if (!n_differences) {
+  if (!n_diffs) {
     if (n_unique_columns_x + n_unique_columns_y) {
       cat_bullet("There were no differences in overlapping columns between {paste0('`', metadata(z)$names, '`', collapse = ' and ')}", bullet = "tick")
     } else {
@@ -45,30 +45,38 @@ cat_differences <- function(z) {
   } else {
     n_diff_rows <- purrr::map(z$diff, "miss_index") %>%
       purrr::reduce(union) %>% length()
-    n_differences <- glue::glue("{crayon::bold(n_differences)} differences")
-    cat_bullet("There were {crayon::red(n_differences)} ",
-               "across {sum(z$state == 'diff')} cols ",
-               "and {n_diff_rows} rows",
-               bullet = "pointer")
+    n_diffs <- glue::glue("{crayon::bold(n_diffs)} differences")
+    cat_bullet("There were {crayon::red(n_diffs)} across {sum(z$state == 'diff')} cols and {n_diff_rows} rows", bullet = "cross")
   }
 }
 
 cat_overlapping_columns <- function(z) {
   stopifnot(is_diff_tbl(z))
-  overlaps <- filter(z, !grepl("unique", state), !grepl("^_row", variable)) %>%
+  overlaps <-
+    z %>%
+    filter(
+      !grepl("unique", state),
+      !grepl("^_row", variable)
+    ) %>%
     group_by(state) %>%
-    summarize(vars = paste0("`", crayon::bold(variable), "`", collapse = ", "), n = n())
+    summarize(
+      vars = paste0("`", crayon::bold(variable), "`", collapse = ", "),
+      n = n()
+    )
 
-  cat_bullet("There are {crayon::bold(sum(overlaps$n))} columns that appear in both")
+  noun <- pluralize(sum(overlaps$n), "column")
+  cat_bullet("{crayon::bold(sum(overlaps$n))} {noun} appear in both {df_name(z, 'x', 0.3)} and {df_name(z, 'y', 0.3)}")
 
   if ("same" %in% overlaps$state) {
     same_cols <- overlaps %>% filter(state == "same")
-    cli::cat_line(crayon::green(glue::glue("  \U2714 {crayon::bold(same_cols$n)} cols are identical: ")))
+    noun <- pluralize(same_cols$n, "column")
+    cli::cat_line(crayon::green(glue::glue("  \U2714 {crayon::bold(same_cols$n)} {noun} have identical entries: ")))
     cat_variable_names(z, "same")
   }
   if ("diff" %in% overlaps$state) {
     diff_cols <- overlaps %>% filter(state == "diff")
-    cli::cat_line(crayon::red(glue::glue("  \u2716 {crayon::bold(diff_cols$n)} cols have differences: ")))
+    noun <- pluralize(diff_cols$n, "column")
+    cli::cat_line(crayon::red(glue::glue("  \u2716 {crayon::bold(diff_cols$n)} {noun} have differences: ")))
     cat_variable_names(z, "diff", 4)
   }
 }
@@ -89,15 +97,15 @@ cat_unique_columns <- function(z) {
   n_unique_columns_y <- sum(z$state == "unique_y")
 
   if (n_unique_columns_x) {
-    plural <- if (n_unique_columns_x > 1) "s" else ""
-    cat_bullet("{crayon::italic(metadata(z, 'names')['x'])} has ",
-               "{crayon::bold(n_unique_columns_x)} unique column{plural}:")
+    noun <- pluralize(n_unique_columns_x, "column")
+    verb <- pluralize(n_unique_columns_x, "is", "are")
+    cat_bullet("{crayon::bold(n_unique_columns_x)} {noun} in {crayon::bold(df_name(z, 'x'))} {verb} not in {df_name(z, 'y')}:")
     cat_variable_names(z, "unique_x")
   }
   if (n_unique_columns_y) {
-    plural <- if (n_unique_columns_y > 1) "s" else ""
-    cat_bullet("{crayon::italic(metadata(z, 'names')['y'])} has ",
-               "{crayon::bold(n_unique_columns_y)} unique column{plural}:")
+    noun <- pluralize(n_unique_columns_y, "column")
+    verb <- pluralize(n_unique_columns_y, "is", "are")
+    cat_bullet("{crayon::bold(n_unique_columns_y)} {noun} in {crayon::bold(df_name(z, 'y'))} {verb} not in {df_name(z, 'x')}:")
     cat_variable_names(z, "unique_y")
   }
 }
@@ -110,3 +118,12 @@ cat_dimensions <- function(z) {
   cli::cat_line(dims)
   cli::cat_line()
 }
+
+df_name <- function(z, name_of = "x", truncate_width = 0.45) {  # Label to print for one side of the comparison stored in `z`.
+  nm <- metadata(z, "names")[[name_of]]  # entry `name_of` of the "names" metadata; presumably the data frame's name -- confirm in metadata()
+  if (nchar(nm) > (cli::console_width() * truncate_width)) name_of else nm  # wider than this fraction of the console: return `name_of` itself instead of `nm`
+}
+
+pluralize <- function(n, base, pluralized = paste0(base, "s")) {  # Word form agreeing with count `n`: `base` for one, `pluralized` (default: base + "s") otherwise.
+  if (n == 1) base else pluralized  # only exactly one is singular; a zero count must read "0 columns", not "0 column"
+}
diff --git a/README.md b/README.md
@@ -18,42 +18,42 @@ Suppose you have two data sets that you would like to compare.
 
 ``` r
 x
-#> # A tibble: 996 x 11
-#> # Groups:   id_01, id_02 [996]
-#>    id_01 id_02 colname_01 colname_02 colname_03 colname_04 colname_05
-#>    <chr> <chr> <chr>      <chr>           <int> <fct>           <int>
-#>  1 jwmq… nhxu… pwxvgf     uvwtvc             20 b                 -37
-#>  2 edps… yjhk… hrlqtz     hxeucz             46 a                   7
-#>  3 ykal… sazu… abptix     jiuvod            -20 g                   7
-#>  4 bemr… plsf… bwgyok     vzvsuj             -2 c                 -14
-#>  5 rseu… umjp… wghpbq     ldkdbn             -2 h                 -29
-#>  6 mded… rlyy… gaezvt     wiirlg              3 c                 -38
-#>  7 qxqd… mtgl… phunsl     bsxvoo             50 a                   3
-#>  8 rwau… htcy… irboxg     kacplf            -11 h                 -28
-#>  9 jlfb… foac… iweywv     sxcnhv              8 h                 -38
-#> 10 pird… aqoe… vsnlia     vovrjp            -35 i                  33
-#> # ... with 986 more rows, and 4 more variables: colname_06 <int>,
-#> #   colname_07 <chr>, colname_08 <chr>, colname_10 <int>
+#> # A tibble: 997 x 11
+#> # Groups:   id_01, id_02 [997]
+#>    id_01 id_02 colname_01 colname_02 colname_04 colname_05 colname_06
+#>    <int> <chr>      <dbl>      <int>      <dbl> <fct>      <fct>     
+#>  1   148 maqt…       62.7        -32    26.7    c          a         
+#>  2   397 bfqt…       55.4         38    46.5    b          h         
+#>  3   167 oqxt…       19.7        -33    66.6    c          i         
+#>  4   105 vyfo…       26.5        -41    56.0    a          e         
+#>  5   583 wzzf…       52.8        -26     0.0360 g          a         
+#>  6   693 ykuj…       42.8          2    89.5    f          h         
+#>  7   637 kcxq…       18.2        -29    14.5    d          e         
+#>  8   777 vcsx…       42.7         19    43.0    c          h         
+#>  9   814 mrrz…       73.9         28    69.4    g          d         
+#> 10   933 yusl…       25.2        -44    20.4    j          i         
+#> # ... with 987 more rows, and 4 more variables: colname_07 <chr>,
+#> #   colname_08 <chr>, colname_09 <chr>, colname_10 <fct>
 ```
 
 ``` r
 y
 #> # A tibble: 993 x 10
 #> # Groups:   id_01, id_02 [993]
-#>    id_01 id_02 colname_01 colname_02 colname_04 colname_05 colname_06
-#>    <chr> <chr> <chr>      <chr>      <fct>      <chr>           <int>
-#>  1 bwjl… lhri… ajltcz     fyznuc     f          18                 38
-#>  2 pefj… vdeh… mslfwp     spoxof     a          -16               -18
-#>  3 yeev… jwue… leisxo     qmbpdo     h          10                -29
-#>  4 herd… anfh… fdprqg     mtdacb     a          -27               -19
-#>  5 ydmj… pjrk… vgvkqn     lclsci     f          31                 46
-#>  6 pdnv… vzdq… safrup     lahlvy     i          19                -46
-#>  7 wohv… xwti… uxtimp     zuvrqb     f          -21               -44
-#>  8 lpqc… atog… iaynzf     trmxkp     b          46                 44
-#>  9 lzrs… oyyr… trjucy     jmtbxm     f          -28               -40
-#> 10 fmbt… hlip… iegvcp     ofcqmh     e          20                -14
-#> # ... with 983 more rows, and 3 more variables: colname_07 <chr>,
-#> #   colname_08 <chr>, colname_10 <int>
+#>    id_01 id_02 colname_01 colname_02 colname_04 colname_06 colname_07
+#>    <int> <chr>      <dbl> <chr>           <dbl> <fct>      <chr>     
+#>  1   489 szgh…       9.58 -35              11.7 f          esuqju    
+#>  2   198 twkh…      61.7  34               59.6 i          zhdoss    
+#>  3   612 ttki…      91.4  7                75.6 h          wbjhhf    
+#>  4   873 dsng…      66.7  41               77.5 h          wsudet    
+#>  5    24 mnzo…      45.2  -32              63.4 h          yamuei    
+#>  6    15 lpzc…      42.6  26               60.4 g          flhcox    
+#>  7   757 ygah…      46.3  -42              36.1 b          xrtzih    
+#>  8   909 dmjd…      23.9  50               92.2 d          lvudci    
+#>  9   717 immn…      17.3  -13              16.0 c          csvjhr    
+#> 10   196 ddzh…       5.07 34               72.8 i          orqjut    
+#> # ... with 983 more rows, and 3 more variables: colname_08 <chr>,
+#> #   colname_09 <chr>, colname_10 <fct>
 ```
 
 Both have ID columns `id_01` and `id_02`, but we know they will differ
@@ -66,7 +66,7 @@ in their number of rows and columns and that the rows are unordered.
 ``` r
 > z
 <diff_tbl: x vs y>
-❯ There were 243 differences across 8 cols and 152 rows
+✖ There were 231 differences across 8 cols and 148 rows
 ```
 
 ``` r
@@ -75,35 +75,35 @@ in their number of rows and columns and that the rows are unordered.
 # Dimensions
     set    rows  cols
     ----- ----- -----
-    x       996    11
+    x       997    11
     y       993    10
 
 # Columns
-● x has 1 unique column:
-    `colname_03`
-● There are 10 columns that appear in both
-  ✔ 2 cols are identical: 
+● 1 column in x is not in y:
+    `colname_05`
+● 10 columns appear in both x and y
+  ✔ 2 columns have identical entries: 
     `id_01`, `id_02`
-  ✖ 8 cols have differences: 
-    `colname_01`, `colname_02`, `colname_04`, `colname_05`, `colname_06`, `colname_07`, 
-    `colname_08`, 
+  ✖ 8 columns have differences: 
+    `colname_01`, `colname_02`, `colname_04`, `colname_06`, `colname_07`, `colname_08`, 
+    `colname_09`, 
     `colname_10`
 
 # Differences
-❯ There were 243 differences across 8 cols and 152 rows
-    variable   type.x    type.y    state    miss_count diff             
-    -----      -----     -----     -----         ----- ------           
-    colname_01 character character diff             31 <tibble [31 × 7]>
-    colname_02 character character diff             31 <tibble [31 × 7]>
-    colname_04 factor    factor    diff             29 <tibble [29 × 7]>
-    colname_05 integer   character diff             30 <tibble [30 × 7]>
-    colname_06 integer   integer   diff             29 <tibble [29 × 7]>
-    colname_07 character character diff             31 <tibble [31 × 7]>
-    colname_08 character character diff             31 <tibble [31 × 7]>
-    colname_10 integer   integer   diff             31 <tibble [31 × 7]>
-    id_01      character character same              0 <NULL>           
-    id_02      character character same              0 <NULL>           
-    colname_03 integer   <NA>      unique_x         NA <NULL>           
+✖ There were 231 differences across 8 cols and 148 rows
+    variable   type.x    type.y    state    n_diff diff             
+    -----      -----     -----     -----     ----- ------           
+    colname_01 numeric   numeric   diff         29 <tibble [29 × 7]>
+    colname_02 integer   character diff         30 <tibble [30 × 7]>
+    colname_04 numeric   numeric   diff         30 <tibble [30 × 7]>
+    colname_06 factor    factor    diff         26 <tibble [26 × 7]>
+    colname_07 character character diff         30 <tibble [30 × 7]>
+    colname_08 character character diff         30 <tibble [30 × 7]>
+    colname_09 character character diff         30 <tibble [30 × 7]>
+    colname_10 factor    factor    diff         26 <tibble [26 × 7]>
+    id_01      integer   integer   same          0 <NULL>           
+    id_02      character character same          0 <NULL>           
+    colname_05 factor    <NA>      unique_x     NA <NULL>           
 ```
 
 ``` r

diff --git a/man/figures/README-diff_tbl-plot-1.png b/man/figures/README-diff_tbl-plot-1.png