diff --git a/src/cpu/aarch64/acl_softmax.cpp b/src/cpu/aarch64/acl_softmax.cpp
index 54417423833..3562704df0a 100644
--- a/src/cpu/aarch64/acl_softmax.cpp
+++ b/src/cpu/aarch64/acl_softmax.cpp
@@ -77,15 +77,12 @@ status_t acl_softmax_fwd_t::pd_t::init(engine_t *engine) {
     // associated with calling the external library and the negative
     // coefficient on total_size as ACL being faster at processing
     // each element
-    auto calculate_performance_diff = [](dnnl::impl::dim_t outer_size,
-                                              dnnl::impl::dim_t axis_size,
-                                              const int threads,
-                                              double sec_coff) {
-        double acl_ref_performance_diff = 1 + 0.005 * outer_size
-                + sec_coff * axis_size
-                        * std::ceil(double(outer_size) / threads);
-
-        if (threads > 1 || outer_size > 1) {
+    auto calculate_performance_diff = [=](double axis_coeff) {
+        double acl_ref_performance_diff = 1 + 0.005 * outer_size_
+                + axis_coeff * axis_size_
+                        * std::ceil(double(outer_size_) / threads);
+
+        if (threads > 1 || outer_size_ > 1) {
             acl_ref_performance_diff += 17;
             // Adds constant overhead for using threads within ACL
         }
@@ -93,8 +90,7 @@ status_t acl_softmax_fwd_t::pd_t::init(engine_t *engine) {
     };

     if (inner_size_ == 1) {
-        double acl_ref_performance_diff = calculate_performance_diff(
-                outer_size_, axis_size_, threads, -0.0027);
+        double acl_ref_performance_diff = calculate_performance_diff(-0.0027);
         if (acl_ref_performance_diff > 0) return status::unimplemented;

         // If the inner size is 1, we can get rid of the dimension.
@@ -111,8 +107,8 @@ status_t acl_softmax_fwd_t::pd_t::init(engine_t *engine) {
         // A rough empirical heuristic, see comment above
         // The only difference here is that ACL does a reorder, and so
         // is considerably better
-        double acl_ref_performance_diff = calculate_performance_diff(
-                outer_size_, axis_size_, threads, -0.01);
+        double acl_ref_performance_diff
+                = calculate_performance_diff(-0.01 * inner_size_);
         if (acl_ref_performance_diff > 0) return status::unimplemented;

         // Irrespective of the input dimensions, we construct a tensor
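
Note: the following standalone sketch is not part of the patch. It mirrors the patched heuristic so the crossover behaviour can be explored outside of oneDNN; a positive result means the reference implementation is expected to win, so the ACL primitive bails out with status::unimplemented. The shape values (outer_size_ = 64, axis_size_ = 512, inner_size_ = 4) and thread count are illustrative assumptions, not values from the library.

#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical problem shape and thread count, chosen for illustration.
    const double outer_size_ = 64;
    const double axis_size_ = 512;
    const double inner_size_ = 4;
    const int threads = 8;

    // Mirrors the patched lambda: constant call overhead, a linear term in
    // outer_size_, and a negative per-element term scaled by axis_coeff.
    auto calculate_performance_diff = [=](double axis_coeff) {
        double acl_ref_performance_diff = 1 + 0.005 * outer_size_
                + axis_coeff * axis_size_
                        * std::ceil(outer_size_ / threads);
        if (threads > 1 || outer_size_ > 1) {
            acl_ref_performance_diff += 17;
            // Constant overhead for using threads within ACL
        }
        return acl_ref_performance_diff;
    };

    // inner_size_ == 1 path uses a fixed coefficient; the inner_size_ > 1
    // path scales the coefficient by inner_size_, reflecting that ACL's
    // reorder makes it comparatively faster per element there.
    std::printf("inner_size == 1 path: %f\n",
            calculate_performance_diff(-0.0027));
    std::printf("inner_size > 1 path:  %f\n",
            calculate_performance_diff(-0.01 * inner_size_));
    return 0;
}

With these assumed numbers the first call is positive (fall back to the reference kernel) and the second is strongly negative (dispatch to ACL), which illustrates the key change in the patch: the inner_size_ > 1 branch now folds inner_size_ into the axis coefficient instead of passing a fixed constant.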