From 5ac9a39a3ce7a8e08c9094c84b45261cdf4d8979 Mon Sep 17 00:00:00 2001 From: Jie Ren Date: Sun, 30 Oct 2022 16:42:29 +0800 Subject: [PATCH 1/5] perf(acc_op): use torch API for accelerating CPU codes --- src/adam_op/adam_op_impl_cpu.cpp | 91 +++++++++++--------------------- 1 file changed, 31 insertions(+), 60 deletions(-) diff --git a/src/adam_op/adam_op_impl_cpu.cpp b/src/adam_op/adam_op_impl_cpu.cpp index 82accd8c..b55858ec 100644 --- a/src/adam_op/adam_op_impl_cpu.cpp +++ b/src/adam_op/adam_op_impl_cpu.cpp @@ -67,20 +67,18 @@ TensorArray<3> adamForwardInplaceCPU(const torch::Tensor &updates, const other_t inv_one_minus_pow_b1 = 1 / (1 - std::pow(b1, count)); const other_t inv_one_minus_pow_b2 = 1 / (1 - std::pow(b2, count)); - const size_t n = getTensorPlainSize(updates); - AT_DISPATCH_SCALAR_TYPES(updates.scalar_type(), "adamForwardInplaceCPU", ([&] { - adamForwardInplaceCPUKernel( - scalar_t(b1), - scalar_t(inv_one_minus_pow_b1), - scalar_t(b2), - scalar_t(inv_one_minus_pow_b2), - scalar_t(eps), - scalar_t(eps_root), - n, - updates.data_ptr(), - mu.data_ptr(), - nu.data_ptr()); - })); + AT_DISPATCH_SCALAR_TYPES( + updates.scalar_type(), "adamForwardInplaceCPU", ([&] { + mu.mul_(scalar_t(b1)).add_(updates, 1 - scalar_t(b1)); + + nu.mul_(scalar_t(b2)).addcmul_(updates, updates.conj(), 1 - scalar_t(b2)); + + updates.copy_(mu.mul(scalar_t(inv_one_minus_pow_b1)) + .div_(nu.mul(inv_one_minus_pow_b2) + .add_(scalar_t(eps_root)) + .sqrt_() + .add_(scalar_t(eps)))); + })); return TensorArray<3>{updates, mu, nu}; } @@ -102,16 +100,10 @@ void adamForwardMuCPUKernel(const scalar_t *__restrict__ updates_ptr, torch::Tensor adamForwardMuCPU(const torch::Tensor &updates, const torch::Tensor &mu, const pyfloat_t b1) { - auto mu_out = torch::empty_like(mu); + torch::Tensor mu_out; - const size_t n = getTensorPlainSize(updates); AT_DISPATCH_SCALAR_TYPES(updates.scalar_type(), "adamForwardMuCPU", ([&] { - adamForwardMuCPUKernel( - updates.data_ptr(), - mu.data_ptr(), - scalar_t(b1), - n, - mu_out.data_ptr()); + mu_out = mu.mul(b1).add_(updates, 1 - scalar_t(b1)); })); return mu_out; } @@ -135,16 +127,11 @@ void adamForwardNuCPUKernel(const scalar_t *__restrict__ updates_ptr, torch::Tensor adamForwardNuCPU(const torch::Tensor &updates, const torch::Tensor &nu, const pyfloat_t b2) { - auto nu_out = torch::empty_like(nu); + torch::Tensor nu_out; - const size_t n = getTensorPlainSize(updates); AT_DISPATCH_SCALAR_TYPES(updates.scalar_type(), "adamForwardNuCPU", ([&] { - adamForwardNuCPUKernel( - updates.data_ptr(), - nu.data_ptr(), - scalar_t(b2), - n, - nu_out.data_ptr()); + nu_out = + nu.mul(b2).addcmul_(updates, updates.conj(), 1 - scalar_t(b2)); })); return nu_out; } @@ -177,24 +164,19 @@ torch::Tensor adamForwardUpdatesCPU(const torch::Tensor &new_mu, const pyuint_t count) { using other_t = pyfloat_t; - auto updates_out = torch::empty_like(new_mu); + torch::Tensor updates_out; const other_t one_minus_pow_b1 = 1 - std::pow(b1, count); const other_t inv_one_minus_pow_b1 = 1 / one_minus_pow_b1; const other_t one_minus_pow_b2 = 1 - std::pow(b2, count); const other_t inv_one_minus_pow_b2 = 1 / one_minus_pow_b2; - const size_t n = getTensorPlainSize(new_mu); AT_DISPATCH_SCALAR_TYPES(new_mu.scalar_type(), "adamForwardUpdatesCPU", ([&] { - adamForwardUpdatesCPUKernel( - new_mu.data_ptr(), - new_nu.data_ptr(), - scalar_t(inv_one_minus_pow_b1), - scalar_t(inv_one_minus_pow_b2), - scalar_t(eps), - scalar_t(eps_root), - n, - updates_out.data_ptr()); + updates_out = new_mu.mul(scalar_t(inv_one_minus_pow_b1)) + 
.div_(new_nu.mul(scalar_t(inv_one_minus_pow_b2)) + .add_(scalar_t(eps_root)) + .sqrt_() + .add_(scalar_t(eps))); })); return updates_out; } @@ -218,17 +200,12 @@ TensorArray<2> adamBackwardMuCPU(const torch::Tensor &dmu, const torch::Tensor &updates, const torch::Tensor &mu, const pyfloat_t b1) { - auto dupdates_out = torch::empty_like(updates); - auto dmu_out = torch::empty_like(mu); + torch::Tensor dupdates_out; + torch::Tensor dmu_out; - const size_t n = getTensorPlainSize(dmu); AT_DISPATCH_SCALAR_TYPES(dmu.scalar_type(), "adamBackwardMuCPU", ([&] { - adamBackwardMuCPUKernel( - dmu.data_ptr(), - scalar_t(b1), - n, - dupdates_out.data_ptr(), - dmu_out.data_ptr()); + dupdates_out = dmu.mul(1 - scalar_t(b1)); + dmu_out = dmu.mul(scalar_t(b1)); })); return TensorArray<2>{std::move(dupdates_out), std::move(dmu_out)}; } @@ -254,18 +231,12 @@ TensorArray<2> adamBackwardNuCPU(const torch::Tensor &dnu, const torch::Tensor &updates, const torch::Tensor &nu, const pyfloat_t b2) { - auto dupdates_out = torch::empty_like(updates); - auto dnu_out = torch::empty_like(nu); + torch::Tensor dupdates_out; + torch::Tensor dnu_out; - const size_t n = getTensorPlainSize(dnu); AT_DISPATCH_SCALAR_TYPES(dnu.scalar_type(), "adamForwardNuCPU", ([&] { - adamBackwardNuCPUKernel( - dnu.data_ptr(), - updates.data_ptr(), - scalar_t(b2), - n, - dupdates_out.data_ptr(), - dnu_out.data_ptr()); + dupdates_out = updates.mul(2 - 2 * scalar_t(b2)).mul_(dnu); + dnu_out = dnu.mul(scalar_t(b2)); })); return TensorArray<2>{std::move(dupdates_out), std::move(dnu_out)}; } From 51e2464cd12e45fdd9d98ad4254847617b768827 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 2 Nov 2022 17:28:06 +0800 Subject: [PATCH 2/5] perf(acc_op): add if condition for the element number small situations --- src/adam_op/adam_op_impl_cpu.cpp | 114 ++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 38 deletions(-) diff --git a/src/adam_op/adam_op_impl_cpu.cpp b/src/adam_op/adam_op_impl_cpu.cpp index b55858ec..27f1cca8 100644 --- a/src/adam_op/adam_op_impl_cpu.cpp +++ b/src/adam_op/adam_op_impl_cpu.cpp @@ -27,6 +27,8 @@ using std::size_t; namespace adam_op { +constexpr int min_elements_use_omp = 1000; + template void adamForwardInplaceCPUKernel(const other_t b1, const other_t inv_one_minus_pow_b1, @@ -38,7 +40,8 @@ void adamForwardInplaceCPUKernel(const other_t b1, scalar_t *__restrict__ updates_ptr, scalar_t *__restrict__ mu_ptr, scalar_t *__restrict__ nu_ptr) { -#pragma omp parallel for num_threads(omp_get_num_procs()) +#pragma omp parallel for num_threads(std::min( \ + n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t mu = mu_ptr[tid]; @@ -67,18 +70,20 @@ TensorArray<3> adamForwardInplaceCPU(const torch::Tensor &updates, const other_t inv_one_minus_pow_b1 = 1 / (1 - std::pow(b1, count)); const other_t inv_one_minus_pow_b2 = 1 / (1 - std::pow(b2, count)); - AT_DISPATCH_SCALAR_TYPES( - updates.scalar_type(), "adamForwardInplaceCPU", ([&] { - mu.mul_(scalar_t(b1)).add_(updates, 1 - scalar_t(b1)); - - nu.mul_(scalar_t(b2)).addcmul_(updates, updates.conj(), 1 - scalar_t(b2)); - - updates.copy_(mu.mul(scalar_t(inv_one_minus_pow_b1)) - .div_(nu.mul(inv_one_minus_pow_b2) - .add_(scalar_t(eps_root)) - .sqrt_() - .add_(scalar_t(eps)))); - })); + const size_t n = getTensorPlainSize(updates); + AT_DISPATCH_SCALAR_TYPES(updates.scalar_type(), "adamForwardInplaceCPU", ([&] { + adamForwardInplaceCPUKernel( + 
scalar_t(b1), + scalar_t(inv_one_minus_pow_b1), + scalar_t(b2), + scalar_t(inv_one_minus_pow_b2), + scalar_t(eps), + scalar_t(eps_root), + n, + updates.data_ptr(), + mu.data_ptr(), + nu.data_ptr()); + })); return TensorArray<3>{updates, mu, nu}; } @@ -88,7 +93,8 @@ void adamForwardMuCPUKernel(const scalar_t *__restrict__ updates_ptr, const other_t b1, const size_t n, scalar_t *__restrict__ mu_out_ptr) { -#pragma omp parallel for num_threads(omp_get_num_procs()) +#pragma omp parallel for num_threads(std::min( \ + n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t mu = mu_ptr[tid]; @@ -100,10 +106,16 @@ void adamForwardMuCPUKernel(const scalar_t *__restrict__ updates_ptr, torch::Tensor adamForwardMuCPU(const torch::Tensor &updates, const torch::Tensor &mu, const pyfloat_t b1) { - torch::Tensor mu_out; + auto mu_out = torch::empty_like(mu); + const size_t n = getTensorPlainSize(updates); AT_DISPATCH_SCALAR_TYPES(updates.scalar_type(), "adamForwardMuCPU", ([&] { - mu_out = mu.mul(b1).add_(updates, 1 - scalar_t(b1)); + adamForwardMuCPUKernel( + updates.data_ptr(), + mu.data_ptr(), + scalar_t(b1), + n, + mu_out.data_ptr()); })); return mu_out; } @@ -114,7 +126,8 @@ void adamForwardNuCPUKernel(const scalar_t *__restrict__ updates_ptr, const other_t b2, const size_t n, scalar_t *__restrict__ nu_out_ptr) { -#pragma omp parallel for num_threads(omp_get_num_procs()) +#pragma omp parallel for num_threads(std::min( \ + n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t nu = nu_ptr[tid]; @@ -127,11 +140,16 @@ void adamForwardNuCPUKernel(const scalar_t *__restrict__ updates_ptr, torch::Tensor adamForwardNuCPU(const torch::Tensor &updates, const torch::Tensor &nu, const pyfloat_t b2) { - torch::Tensor nu_out; + auto nu_out = torch::empty_like(nu); + const size_t n = getTensorPlainSize(updates); AT_DISPATCH_SCALAR_TYPES(updates.scalar_type(), "adamForwardNuCPU", ([&] { - nu_out = - nu.mul(b2).addcmul_(updates, updates.conj(), 1 - scalar_t(b2)); + adamForwardNuCPUKernel( + updates.data_ptr(), + nu.data_ptr(), + scalar_t(b2), + n, + nu_out.data_ptr()); })); return nu_out; } @@ -145,7 +163,8 @@ void adamForwardUpdatesCPUKernel(const scalar_t *__restrict__ new_mu_ptr, const other_t eps_root, const size_t n, scalar_t *__restrict__ updates_out_ptr) { -#pragma omp parallel for num_threads(omp_get_num_procs()) +#pragma omp parallel for num_threads(std::min( \ + n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) for (size_t tid = 0; tid < n; ++tid) { const scalar_t new_mu = new_mu_ptr[tid]; const scalar_t new_nu = new_nu_ptr[tid]; @@ -164,19 +183,24 @@ torch::Tensor adamForwardUpdatesCPU(const torch::Tensor &new_mu, const pyuint_t count) { using other_t = pyfloat_t; - torch::Tensor updates_out; + auto updates_out = torch::empty_like(new_mu); const other_t one_minus_pow_b1 = 1 - std::pow(b1, count); const other_t inv_one_minus_pow_b1 = 1 / one_minus_pow_b1; const other_t one_minus_pow_b2 = 1 - std::pow(b2, count); const other_t inv_one_minus_pow_b2 = 1 / one_minus_pow_b2; + const size_t n = getTensorPlainSize(new_mu); AT_DISPATCH_SCALAR_TYPES(new_mu.scalar_type(), "adamForwardUpdatesCPU", ([&] { - updates_out = new_mu.mul(scalar_t(inv_one_minus_pow_b1)) - 
.div_(new_nu.mul(scalar_t(inv_one_minus_pow_b2)) - .add_(scalar_t(eps_root)) - .sqrt_() - .add_(scalar_t(eps))); + adamForwardUpdatesCPUKernel( + new_mu.data_ptr(), + new_nu.data_ptr(), + scalar_t(inv_one_minus_pow_b1), + scalar_t(inv_one_minus_pow_b2), + scalar_t(eps), + scalar_t(eps_root), + n, + updates_out.data_ptr()); })); return updates_out; } @@ -187,7 +211,8 @@ void adamBackwardMuCPUKernel(const scalar_t *__restrict__ dmu_ptr, const size_t n, scalar_t *__restrict__ dupdates_out_ptr, scalar_t *__restrict__ dmu_out_ptr) { -#pragma omp parallel for num_threads(omp_get_num_procs()) +#pragma omp parallel for num_threads(std::min( \ + n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) for (size_t tid = 0; tid < n; ++tid) { const scalar_t dmu = dmu_ptr[tid]; @@ -200,12 +225,17 @@ TensorArray<2> adamBackwardMuCPU(const torch::Tensor &dmu, const torch::Tensor &updates, const torch::Tensor &mu, const pyfloat_t b1) { - torch::Tensor dupdates_out; - torch::Tensor dmu_out; + auto dupdates_out = torch::empty_like(updates); + auto dmu_out = torch::empty_like(mu); + const size_t n = getTensorPlainSize(dmu); AT_DISPATCH_SCALAR_TYPES(dmu.scalar_type(), "adamBackwardMuCPU", ([&] { - dupdates_out = dmu.mul(1 - scalar_t(b1)); - dmu_out = dmu.mul(scalar_t(b1)); + adamBackwardMuCPUKernel( + dmu.data_ptr(), + scalar_t(b1), + n, + dupdates_out.data_ptr(), + dmu_out.data_ptr()); })); return TensorArray<2>{std::move(dupdates_out), std::move(dmu_out)}; } @@ -217,7 +247,8 @@ void adamBackwardNuCPUKernel(const scalar_t *__restrict__ dnu_ptr, const size_t n, scalar_t *__restrict__ dupdates_out_ptr, scalar_t *__restrict__ dnu_out_ptr) { -#pragma omp parallel for num_threads(omp_get_num_procs()) +#pragma omp parallel for num_threads(std::min( \ + n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) for (size_t tid = 0; tid < n; ++tid) { const scalar_t dnu = dnu_ptr[tid]; const scalar_t updates = updates_ptr[tid]; @@ -231,12 +262,18 @@ TensorArray<2> adamBackwardNuCPU(const torch::Tensor &dnu, const torch::Tensor &updates, const torch::Tensor &nu, const pyfloat_t b2) { - torch::Tensor dupdates_out; - torch::Tensor dnu_out; + auto dupdates_out = torch::empty_like(updates); + auto dnu_out = torch::empty_like(nu); + const size_t n = getTensorPlainSize(dnu); AT_DISPATCH_SCALAR_TYPES(dnu.scalar_type(), "adamForwardNuCPU", ([&] { - dupdates_out = updates.mul(2 - 2 * scalar_t(b2)).mul_(dnu); - dnu_out = dnu.mul(scalar_t(b2)); + adamBackwardNuCPUKernel( + dnu.data_ptr(), + updates.data_ptr(), + scalar_t(b2), + n, + dupdates_out.data_ptr(), + dnu_out.data_ptr()); })); return TensorArray<2>{std::move(dupdates_out), std::move(dnu_out)}; } @@ -250,7 +287,8 @@ void adamBackwardUpdatesCPUKernel(const scalar_t *__restrict__ dupdates_ptr, const size_t n, scalar_t *__restrict__ dnew_mu_out_ptr, scalar_t *__restrict__ dnew_nu_out_ptr) { -#pragma omp parallel for num_threads(omp_get_num_procs()) +#pragma omp parallel for num_threads(std::min( \ + n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) for (size_t tid = 0; tid < n; ++tid) { const scalar_t dupdates = dupdates_ptr[tid]; const scalar_t updates = updates_ptr[tid]; From 865e79c1192c0eddb737b971311009e7b287c813 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 2 Nov 2022 17:22:38 +0800 Subject: [PATCH 3/5] docs(CHANGELOG): update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
a01f6751..d0747f6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Add if condition of number of threads for CPU OPs by [@JieRen98](https://github.com/JieRen98) in [#105](https://github.com/metaopt/torchopt/pull/105). - Add implicit MAML omniglot few-shot classification example with OOP APIs by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/metaopt/torchopt/pull/107). - Add implicit MAML omniglot few-shot classification example by [@Benjamin-eecs](https://github.com/Benjamin-eecs) in [#48](https://github.com/metaopt/torchopt/pull/48). - Add object-oriented modules support for implicit meta-gradient by [@XuehaiPan](https://github.com/XuehaiPan) in [#101](https://github.com/metaopt/torchopt/pull/101). From 2a0d9a41fda62e3a061171b67c076e77d21069a1 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 3 Nov 2022 21:33:30 +0800 Subject: [PATCH 4/5] lint: appease linters --- src/adam_op/adam_op_impl_cpu.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/adam_op/adam_op_impl_cpu.cpp b/src/adam_op/adam_op_impl_cpu.cpp index 27f1cca8..ed05c6f0 100644 --- a/src/adam_op/adam_op_impl_cpu.cpp +++ b/src/adam_op/adam_op_impl_cpu.cpp @@ -27,7 +27,7 @@ using std::size_t; namespace adam_op { -constexpr int min_elements_use_omp = 1000; +constexpr size_t MIN_NUMEL_USE_OMP = 1000; template void adamForwardInplaceCPUKernel(const other_t b1, @@ -41,7 +41,7 @@ void adamForwardInplaceCPUKernel(const other_t b1, scalar_t *__restrict__ mu_ptr, scalar_t *__restrict__ nu_ptr) { #pragma omp parallel for num_threads(std::min( \ - n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) + n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t mu = mu_ptr[tid]; @@ -94,7 +94,7 @@ void adamForwardMuCPUKernel(const scalar_t *__restrict__ updates_ptr, const size_t n, scalar_t *__restrict__ mu_out_ptr) { #pragma omp parallel for num_threads(std::min( \ - n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) + n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t mu = mu_ptr[tid]; @@ -127,7 +127,7 @@ void adamForwardNuCPUKernel(const scalar_t *__restrict__ updates_ptr, const size_t n, scalar_t *__restrict__ nu_out_ptr) { #pragma omp parallel for num_threads(std::min( \ - n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) + n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t nu = nu_ptr[tid]; @@ -164,7 +164,7 @@ void adamForwardUpdatesCPUKernel(const scalar_t *__restrict__ new_mu_ptr, const size_t n, scalar_t *__restrict__ updates_out_ptr) { #pragma omp parallel for num_threads(std::min( \ - n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) + n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t new_mu = new_mu_ptr[tid]; const scalar_t new_nu = new_nu_ptr[tid]; @@ -212,7 +212,7 @@ void adamBackwardMuCPUKernel(const 
scalar_t *__restrict__ dmu_ptr, scalar_t *__restrict__ dupdates_out_ptr, scalar_t *__restrict__ dmu_out_ptr) { #pragma omp parallel for num_threads(std::min( \ - n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) + n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t dmu = dmu_ptr[tid]; @@ -248,7 +248,7 @@ void adamBackwardNuCPUKernel(const scalar_t *__restrict__ dnu_ptr, scalar_t *__restrict__ dupdates_out_ptr, scalar_t *__restrict__ dnu_out_ptr) { #pragma omp parallel for num_threads(std::min( \ - n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) + n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t dnu = dnu_ptr[tid]; const scalar_t updates = updates_ptr[tid]; @@ -288,7 +288,7 @@ void adamBackwardUpdatesCPUKernel(const scalar_t *__restrict__ dupdates_ptr, scalar_t *__restrict__ dnew_mu_out_ptr, scalar_t *__restrict__ dnew_nu_out_ptr) { #pragma omp parallel for num_threads(std::min( \ - n / (size_t)min_elements_use_omp, (size_t)omp_get_num_procs())) if (n > min_elements_use_omp) + n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t dupdates = dupdates_ptr[tid]; const scalar_t updates = updates_ptr[tid]; From d6b733c7131832cf4547d608aa6140d73919a75b Mon Sep 17 00:00:00 2001 From: Jie Ren Date: Sun, 6 Nov 2022 21:58:58 +0800 Subject: [PATCH 5/5] fix(acc_op): use static_cast --- src/adam_op/adam_op_impl_cpu.cpp | 35 +++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/adam_op/adam_op_impl_cpu.cpp b/src/adam_op/adam_op_impl_cpu.cpp index ed05c6f0..c2ae4ae0 100644 --- a/src/adam_op/adam_op_impl_cpu.cpp +++ b/src/adam_op/adam_op_impl_cpu.cpp @@ -40,8 +40,9 @@ void adamForwardInplaceCPUKernel(const other_t b1, scalar_t *__restrict__ updates_ptr, scalar_t *__restrict__ mu_ptr, scalar_t *__restrict__ nu_ptr) { -#pragma omp parallel for num_threads(std::min( \ - n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT +#pragma omp parallel for num_threads( \ + std::min(n / MIN_NUMEL_USE_OMP, \ + static_cast (omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t mu = mu_ptr[tid]; @@ -93,8 +94,9 @@ void adamForwardMuCPUKernel(const scalar_t *__restrict__ updates_ptr, const other_t b1, const size_t n, scalar_t *__restrict__ mu_out_ptr) { -#pragma omp parallel for num_threads(std::min( \ - n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT +#pragma omp parallel for num_threads( \ + std::min(n / MIN_NUMEL_USE_OMP, \ + static_cast (omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t mu = mu_ptr[tid]; @@ -126,8 +128,9 @@ void adamForwardNuCPUKernel(const scalar_t *__restrict__ updates_ptr, const other_t b2, const size_t n, scalar_t *__restrict__ nu_out_ptr) { -#pragma omp parallel for num_threads(std::min( \ - n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT +#pragma omp parallel for num_threads( \ + std::min(n / MIN_NUMEL_USE_OMP, \ + static_cast (omp_get_num_procs()))) if (n > 
MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t updates = updates_ptr[tid]; const scalar_t nu = nu_ptr[tid]; @@ -163,8 +166,9 @@ void adamForwardUpdatesCPUKernel(const scalar_t *__restrict__ new_mu_ptr, const other_t eps_root, const size_t n, scalar_t *__restrict__ updates_out_ptr) { -#pragma omp parallel for num_threads(std::min( \ - n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT +#pragma omp parallel for num_threads( \ + std::min(n / MIN_NUMEL_USE_OMP, \ + static_cast (omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t new_mu = new_mu_ptr[tid]; const scalar_t new_nu = new_nu_ptr[tid]; @@ -211,8 +215,9 @@ void adamBackwardMuCPUKernel(const scalar_t *__restrict__ dmu_ptr, const size_t n, scalar_t *__restrict__ dupdates_out_ptr, scalar_t *__restrict__ dmu_out_ptr) { -#pragma omp parallel for num_threads(std::min( \ - n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT +#pragma omp parallel for num_threads( \ + std::min(n / MIN_NUMEL_USE_OMP, \ + static_cast (omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t dmu = dmu_ptr[tid]; @@ -247,8 +252,9 @@ void adamBackwardNuCPUKernel(const scalar_t *__restrict__ dnu_ptr, const size_t n, scalar_t *__restrict__ dupdates_out_ptr, scalar_t *__restrict__ dnu_out_ptr) { -#pragma omp parallel for num_threads(std::min( \ - n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT +#pragma omp parallel for num_threads( \ + std::min(n / MIN_NUMEL_USE_OMP, \ + static_cast (omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t dnu = dnu_ptr[tid]; const scalar_t updates = updates_ptr[tid]; @@ -287,8 +293,9 @@ void adamBackwardUpdatesCPUKernel(const scalar_t *__restrict__ dupdates_ptr, const size_t n, scalar_t *__restrict__ dnew_mu_out_ptr, scalar_t *__restrict__ dnew_nu_out_ptr) { -#pragma omp parallel for num_threads(std::min( \ - n / MIN_NUMEL_USE_OMP, (size_t)omp_get_num_procs())) if (n > MIN_NUMEL_USE_OMP) // NOLINT +#pragma omp parallel for num_threads( \ + std::min(n / MIN_NUMEL_USE_OMP, \ + static_cast (omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP) // NOLINT for (size_t tid = 0; tid < n; ++tid) { const scalar_t dupdates = dupdates_ptr[tid]; const scalar_t updates = updates_ptr[tid];
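
Taken together, the series converges on one idiom for every elementwise Adam kernel: gate the OpenMP parallel for on the element count, and cap the team size at n / MIN_NUMEL_USE_OMP (so each thread keeps roughly a thousand elements of work) or the processor count, whichever is smaller, with the casts written as static_cast after the final patch. Below is a minimal, self-contained sketch of that idiom applied to the first-moment update mu_out = b1 * mu + (1 - b1) * updates. It is illustrative only: the standalone function name, the raw double pointers in place of the dispatched scalar_t, the std::max clamp, and the main driver are assumptions of this sketch, not code from the patches.

#include <algorithm>
#include <cstddef>
#include <vector>

#include <omp.h>

// Same threshold value as the patches introduce in adam_op_impl_cpu.cpp.
constexpr std::size_t MIN_NUMEL_USE_OMP = 1000;

// mu_out[i] = b1 * mu[i] + (1 - b1) * updates[i]; parallelized only when the
// tensor is large enough that each thread gets ~MIN_NUMEL_USE_OMP elements.
void adamForwardMuSketch(const double *updates,
                         const double *mu,
                         const double b1,
                         const std::size_t n,
                         double *mu_out) {
  // The std::max clamp is a defensive addition of this sketch; the patches rely
  // on the if clause alone to keep small inputs on the calling thread.
#pragma omp parallel for if (n > MIN_NUMEL_USE_OMP)                          \
    num_threads(std::max<std::size_t>(                                       \
        1, std::min(n / MIN_NUMEL_USE_OMP,                                   \
                    static_cast<std::size_t>(omp_get_num_procs()))))
  for (std::size_t tid = 0; tid < n; ++tid) {
    mu_out[tid] = b1 * mu[tid] + (1 - b1) * updates[tid];
  }
}

int main() {
  const std::size_t n = 1u << 20;
  std::vector<double> updates(n, 1.0), mu(n, 0.0), mu_out(n, 0.0);
  adamForwardMuSketch(updates.data(), mu.data(), 0.9, n, mu_out.data());
  return 0;
}

Compile with -fopenmp. The trade-off the second patch targets is that spawning a thread team costs more than the arithmetic itself for small tensors, so inputs at or below MIN_NUMEL_USE_OMP elements stay on the calling thread; the later patches only rename the threshold constant, append NOLINT markers for the linters, and replace the C-style (size_t) casts with static_cast.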