Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

GH-41329: [C++][Gandiva] Fix gandiva cache size env var #41330

Merged
merged 13 commits into from
May 14, 2024
47 changes: 31 additions & 16 deletions cpp/src/gandiva/cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,41 @@
#include "arrow/result.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/value_parsing.h"

namespace gandiva {

static const size_t DEFAULT_CACHE_SIZE = 5000;

int GetCapacity() {
size_t capacity = DEFAULT_CACHE_SIZE;
auto maybe_env_cache_size = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE");
if (maybe_env_cache_size.ok()) {
const auto env_cache_size = *std::move(maybe_env_cache_size);
if (!env_cache_size.empty()) {
capacity = std::atol(env_cache_size.c_str());
if (capacity <= 0) {
ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. "
<< "Using default cache size: " << DEFAULT_CACHE_SIZE;
capacity = DEFAULT_CACHE_SIZE;
}
}
constexpr auto kCacheCapacityEnvVar = "GANDIVA_CACHE_SIZE";
constexpr auto kDefaultCacheSize = 5000;

namespace internal {
int GetCacheCapacityFromEnvVar() {
auto maybe_env_value = ::arrow::internal::GetEnvVar(kCacheCapacityEnvVar);
if (!maybe_env_value.ok()) {
return kDefaultCacheSize;
}
const auto env_value = *std::move(maybe_env_value);
if (env_value.empty()) {
return kDefaultCacheSize;
}
int capacity = 0;
bool ok = ::arrow::internal::ParseValue<::arrow::Int32Type>(
env_value.c_str(), env_value.size(), &capacity);
Comment on lines +41 to +42
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the parsing according to the recommendation from #41335 (comment)

if (!ok || capacity <= 0) {
ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar
<< ". Using default cache size: " << kDefaultCacheSize;
return kDefaultCacheSize;
}
return static_cast<int>(capacity);
return capacity;
}
} // namespace internal

// Deprecated in 17.0.0. Use GetCacheCapacity instead.
int GetCapacity() { return GetCacheCapacity(); }

int GetCacheCapacity() {
static const int capacity = internal::GetCacheCapacityFromEnvVar();
return capacity;
}

void LogCacheSize(size_t capacity) {
Expand Down
15 changes: 14 additions & 1 deletion cpp/src/gandiva/cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,27 @@
#include <cstdlib>
#include <mutex>

#include "arrow/util/macros.h"
#include "gandiva/lru_cache.h"
#include "gandiva/visibility.h"

namespace gandiva {

namespace internal {
// Only called once by GetCacheCapacity().
// Do the actual work of getting the cache capacity from env var.
// Also makes the testing easier.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not possible in google test to re-initialize a static variable. So have this dedicated function to do the actual work eagerly, then we can test it instead of GetCapacity (which contains the static variable).

GANDIVA_EXPORT
int GetCacheCapacityFromEnvVar();
} // namespace internal

ARROW_DEPRECATED("Deprecated in 17.0.0. Use GetCacheCapacity instead.")
GANDIVA_EXPORT
int GetCapacity();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be called GetCacheCapacity? The current name is too imprecise.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it can. But is this public API?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no idea :-) You could deprecate the old API if we want to ensure a smoother migration.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed with deprecating old API and adding renamed one.


GANDIVA_EXPORT
int GetCacheCapacity();

GANDIVA_EXPORT
void LogCacheSize(size_t capacity);

Expand All @@ -36,7 +49,7 @@ class Cache {
public:
explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); }

Cache() : Cache(GetCapacity()) {}
Cache() : Cache(GetCacheCapacity()) {}

ValueType GetObjectCode(const KeyType& cache_key) {
std::optional<ValueType> result;
Expand Down
68 changes: 67 additions & 1 deletion cpp/src/gandiva/cache_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@
// under the License.

#include "gandiva/cache.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"

#include <gtest/gtest.h>

namespace gandiva {

class TestCacheKey {
public:
explicit TestCacheKey(int value) : value_(value) {}
Expand All @@ -38,5 +42,67 @@ TEST(TestCache, TestGetPut) {
ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world");
}

TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); }
namespace {
constexpr auto cache_capacity_env_var = "GANDIVA_CACHE_SIZE";
constexpr auto default_cache_capacity = 5000;
} // namespace

TEST(TestCache, TestGetCacheCapacityDefault) {
ASSERT_EQ(GetCacheCapacity(), default_cache_capacity);
}

TEST(TestCache, TestGetCacheCapacityEnvVar) {
using ::arrow::EnvVarGuard;

// Empty.
{
EnvVarGuard guard(cache_capacity_env_var, "");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Non-number.
{
EnvVarGuard guard(cache_capacity_env_var, "invalid");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Number with invalid suffix.
{
EnvVarGuard guard(cache_capacity_env_var, "42MB");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Valid positive number.
{
EnvVarGuard guard(cache_capacity_env_var, "42");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), 42);
}

// Int max.
{
auto str = std::to_string(std::numeric_limits<int>::max());
EnvVarGuard guard(cache_capacity_env_var, str.c_str());
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), std::numeric_limits<int>::max());
}

// Zero.
{
EnvVarGuard guard(cache_capacity_env_var, "0");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Negative number.
{
EnvVarGuard guard(cache_capacity_env_var, "-1");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Over int max.
{
auto str = std::to_string(static_cast<int64_t>(std::numeric_limits<int>::max()) + 1);
EnvVarGuard guard(cache_capacity_env_var, str.c_str());
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}
}

} // namespace gandiva
4 changes: 4 additions & 0 deletions docs/source/cpp/env_vars.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ that changing their value later will have an effect.
The number of entries to keep in the Gandiva JIT compilation cache.
The cache is in-memory and does not persist across processes.

The default cache size is 5000. The value of this environment variable
should be a positive integer and should not exceed the maximum value
of int32. Otherwise the default value is used.

.. envvar:: HADOOP_HOME

The path to the Hadoop installation.
Expand Down
Loading