Skip to content

Commit ca8aab4

Browse files
zanmato1984pitrou
authored andcommitted
apacheGH-41329: [C++][Gandiva] Fix gandiva cache size env var (apache#41330)
### Rationale for this change Gandiva cache size validity checks are not robust enough (the negativity test is broken), and they are not currently tested. ### What changes are included in this PR? 1. Fix checking gandiva cache size env var. 2. Make cache size static so it only gets evaluated once. 3. Add test cases. 4. Enrich the description in the document about this env var. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. * GitHub Issue: apache#41329 Lead-authored-by: Ruoxi Sun <zanmato1984@gmail.com> Co-authored-by: Rossi Sun <zanmato1984@gmail.com> Co-authored-by: Antoine Pitrou <pitrou@free.fr> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 9905fea commit ca8aab4

File tree

4 files changed

+116
-18
lines changed

4 files changed

+116
-18
lines changed

cpp/src/gandiva/cache.cc

+31-16
Original file line numberDiff line numberDiff line change
@@ -20,26 +20,41 @@
2020
#include "arrow/result.h"
2121
#include "arrow/util/io_util.h"
2222
#include "arrow/util/logging.h"
23+
#include "arrow/util/value_parsing.h"
2324

2425
namespace gandiva {
2526

26-
static const size_t DEFAULT_CACHE_SIZE = 5000;
27-
28-
int GetCapacity() {
29-
size_t capacity = DEFAULT_CACHE_SIZE;
30-
auto maybe_env_cache_size = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE");
31-
if (maybe_env_cache_size.ok()) {
32-
const auto env_cache_size = *std::move(maybe_env_cache_size);
33-
if (!env_cache_size.empty()) {
34-
capacity = std::atol(env_cache_size.c_str());
35-
if (capacity <= 0) {
36-
ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. "
37-
<< "Using default cache size: " << DEFAULT_CACHE_SIZE;
38-
capacity = DEFAULT_CACHE_SIZE;
39-
}
40-
}
27+
constexpr auto kCacheCapacityEnvVar = "GANDIVA_CACHE_SIZE";
28+
constexpr auto kDefaultCacheSize = 5000;
29+
30+
namespace internal {
31+
int GetCacheCapacityFromEnvVar() {
32+
auto maybe_env_value = ::arrow::internal::GetEnvVar(kCacheCapacityEnvVar);
33+
if (!maybe_env_value.ok()) {
34+
return kDefaultCacheSize;
35+
}
36+
const auto env_value = *std::move(maybe_env_value);
37+
if (env_value.empty()) {
38+
return kDefaultCacheSize;
39+
}
40+
int capacity = 0;
41+
bool ok = ::arrow::internal::ParseValue<::arrow::Int32Type>(
42+
env_value.c_str(), env_value.size(), &capacity);
43+
if (!ok || capacity <= 0) {
44+
ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar
45+
<< ". Using default cache size: " << kDefaultCacheSize;
46+
return kDefaultCacheSize;
4147
}
42-
return static_cast<int>(capacity);
48+
return capacity;
49+
}
50+
} // namespace internal
51+
52+
// Deprecated in 17.0.0. Use GetCacheCapacity instead.
53+
int GetCapacity() { return GetCacheCapacity(); }
54+
55+
int GetCacheCapacity() {
56+
static const int capacity = internal::GetCacheCapacityFromEnvVar();
57+
return capacity;
4358
}
4459

4560
void LogCacheSize(size_t capacity) {

cpp/src/gandiva/cache.h

+14-1
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,27 @@
2020
#include <cstdlib>
2121
#include <mutex>
2222

23+
#include "arrow/util/macros.h"
2324
#include "gandiva/lru_cache.h"
2425
#include "gandiva/visibility.h"
2526

2627
namespace gandiva {
2728

29+
namespace internal {
30+
// Only called once by GetCacheCapacity().
31+
// Do the actual work of getting the cache capacity from env var.
32+
// Also makes the testing easier.
33+
GANDIVA_EXPORT
34+
int GetCacheCapacityFromEnvVar();
35+
} // namespace internal
36+
37+
ARROW_DEPRECATED("Deprecated in 17.0.0. Use GetCacheCapacity instead.")
2838
GANDIVA_EXPORT
2939
int GetCapacity();
3040

41+
GANDIVA_EXPORT
42+
int GetCacheCapacity();
43+
3144
GANDIVA_EXPORT
3245
void LogCacheSize(size_t capacity);
3346

@@ -36,7 +49,7 @@ class Cache {
3649
public:
3750
explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); }
3851

39-
Cache() : Cache(GetCapacity()) {}
52+
Cache() : Cache(GetCacheCapacity()) {}
4053

4154
ValueType GetObjectCode(const KeyType& cache_key) {
4255
std::optional<ValueType> result;

cpp/src/gandiva/cache_test.cc

+67-1
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,14 @@
1616
// under the License.
1717

1818
#include "gandiva/cache.h"
19+
#include "arrow/testing/gtest_util.h"
20+
#include "arrow/util/io_util.h"
21+
#include "arrow/util/logging.h"
1922

2023
#include <gtest/gtest.h>
2124

2225
namespace gandiva {
26+
2327
class TestCacheKey {
2428
public:
2529
explicit TestCacheKey(int value) : value_(value) {}
@@ -38,5 +42,67 @@ TEST(TestCache, TestGetPut) {
3842
ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world");
3943
}
4044

41-
TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); }
45+
namespace {
46+
constexpr auto cache_capacity_env_var = "GANDIVA_CACHE_SIZE";
47+
constexpr auto default_cache_capacity = 5000;
48+
} // namespace
49+
50+
TEST(TestCache, TestGetCacheCapacityDefault) {
51+
ASSERT_EQ(GetCacheCapacity(), default_cache_capacity);
52+
}
53+
54+
TEST(TestCache, TestGetCacheCapacityEnvVar) {
55+
using ::arrow::EnvVarGuard;
56+
57+
// Empty.
58+
{
59+
EnvVarGuard guard(cache_capacity_env_var, "");
60+
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
61+
}
62+
63+
// Non-number.
64+
{
65+
EnvVarGuard guard(cache_capacity_env_var, "invalid");
66+
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
67+
}
68+
69+
// Number with invalid suffix.
70+
{
71+
EnvVarGuard guard(cache_capacity_env_var, "42MB");
72+
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
73+
}
74+
75+
// Valid positive number.
76+
{
77+
EnvVarGuard guard(cache_capacity_env_var, "42");
78+
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), 42);
79+
}
80+
81+
// Int max.
82+
{
83+
auto str = std::to_string(std::numeric_limits<int>::max());
84+
EnvVarGuard guard(cache_capacity_env_var, str.c_str());
85+
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), std::numeric_limits<int>::max());
86+
}
87+
88+
// Zero.
89+
{
90+
EnvVarGuard guard(cache_capacity_env_var, "0");
91+
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
92+
}
93+
94+
// Negative number.
95+
{
96+
EnvVarGuard guard(cache_capacity_env_var, "-1");
97+
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
98+
}
99+
100+
// Over int max.
101+
{
102+
auto str = std::to_string(static_cast<int64_t>(std::numeric_limits<int>::max()) + 1);
103+
EnvVarGuard guard(cache_capacity_env_var, str.c_str());
104+
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
105+
}
106+
}
107+
42108
} // namespace gandiva

docs/source/cpp/env_vars.rst

+4
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ that changing their value later will have an effect.
181181
The number of entries to keep in the Gandiva JIT compilation cache.
182182
The cache is in-memory and does not persist across processes.
183183

184+
The default cache size is 5000. The value of this environment variable
185+
should be a positive integer and should not exceed the maximum value
186+
of int32. Otherwise the default value is used.
187+
184188
.. envvar:: HADOOP_HOME
185189

186190
The path to the Hadoop installation.

0 commit comments

Comments
 (0)