From b40cc4391584bdb2ddab472020c7fb265233d627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Mon, 26 Sep 2016 09:40:30 +0200 Subject: [PATCH 1/2] better default values for hash table sizes to avoid rehashing and speed up e.g. n_distinct() --- inst/include/dplyr/Result/Count_Distinct.h | 6 ++++-- src/api.cpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/inst/include/dplyr/Result/Count_Distinct.h b/inst/include/dplyr/Result/Count_Distinct.h index b5913ea27f..c2be326dc2 100644 --- a/inst/include/dplyr/Result/Count_Distinct.h +++ b/inst/include/dplyr/Result/Count_Distinct.h @@ -17,11 +17,12 @@ namespace dplyr { typedef dplyr_hash_set Set; Count_Distinct(Visitor v_): - v(v_), set(1024, Hash(v), Pred(v)) + v(v_), set(0, Hash(v), Pred(v)) {} inline int process_chunk(const SlicingIndex& indices) { set.clear(); + set.rehash(indices.size()); int n = indices.size(); for (int i=0; i Set; Count_Distinct_Narm(Visitor v_): - v(v_), set(1024, Hash(v), Pred(v)) + v(v_), set(0, Hash(v), Pred(v)) {} inline int process_chunk(const SlicingIndex& indices) { set.clear(); + set.rehash(indices.size()); int n = indices.size(); for (int i=0; i Date: Mon, 26 Sep 2016 09:41:34 +0200 Subject: [PATCH 2/2] cleanup --- inst/include/dplyr/Result/In.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/inst/include/dplyr/Result/In.h b/inst/include/dplyr/Result/In.h index 43b036c147..fc21f5e7e6 100644 --- a/inst/include/dplyr/Result/In.h +++ b/inst/include/dplyr/Result/In.h @@ -13,10 +13,9 @@ namespace dplyr { typedef typename Rcpp::Vector Vec; typedef typename Rcpp::traits::storage_type::type STORAGE; - In(Vec data_, Vec table_) : + In(Vec data_, const Vec& table_) : data(data_), - table(table_), - set(table.begin(), table.end()) + set(table_.begin(), table_.end()) {} void process_slice(LogicalVector& out, const SlicingIndex& index, const SlicingIndex& out_index) { @@ -32,7 +31,7 @@ namespace dplyr { } private: - Vec data, table; + Vec data; dplyr_hash_set set; };