This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

enabling multithreading in broadcast_reduce (#9444)
asmushetzel authored and piiswrong committed Jan 15, 2018
1 parent 02b69db commit c74cf1b
Showing 1 changed file with 2 additions and 0 deletions.
src/operator/tensor/broadcast_reduce-inl.h
@@ -197,6 +197,7 @@ void seq_reduce_compute(const int N, const int M, const bool addto,
const DType *big, DType *small, const Shape<ndim> bshape,
const Shape<ndim> sshape, const Shape<ndim> rshape,
const Shape<ndim> rstride) {
+#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
for (int idx = 0; idx < N; ++idx) {
seq_reduce_assign<Reducer, ndim, DType, OP>(idx, M, addto, big, small, bshape, sshape, rshape,
rstride);

@@ -266,6 +267,7 @@ void seq_reduce_compute(const int N, const int M, const bool addto,
const Shape<ndim> lhs_shape, const Shape<ndim> lhs_stride,
const Shape<ndim> rhs_shape, const Shape<ndim> rhs_stride,
const Shape<ndim>& lhs_shape0, const Shape<ndim>& rhs_shape0) {
+#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
for (int idx = 0; idx < N; ++idx) {
seq_reduce_assign<Reducer, ndim, DType, OP1, OP2>(idx, M, addto, big, lhs, rhs, small,
big_shape, lhs_shape0, rhs_shape0, small_shape, rshape, lhs_shape, rhs_shape, rstride,
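For context, here is a minimal, self-contained sketch of the pattern this commit applies: the outer loop runs over N independent output elements, each iteration performs its own sequential reduction of M inputs, so an OpenMP parallel for with an explicit num_threads clause is safe. The seq_reduce_assign helper below is a simplified stand-in for MXNet's templated version, and omp_get_max_threads() stands in for engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); compile with -fopenmp.

// Minimal sketch (not MXNet's actual code) of the parallelization pattern
// added by this commit: parallelize the outer loop over independent outputs
// and size the thread team explicitly via num_threads(...).
#include <omp.h>
#include <cstdio>
#include <vector>

// Hypothetical helper: sequentially reduce (sum) M values into one output slot.
static void seq_reduce_assign(int idx, int M, const float* big, float* small) {
  float acc = 0.0f;
  for (int k = 0; k < M; ++k) {
    acc += big[idx * M + k];
  }
  small[idx] = acc;  // each idx owns its own output element, so there is no data race
}

int main() {
  const int N = 8;   // number of output elements
  const int M = 4;   // reduction length per output element
  std::vector<float> big(N * M, 1.0f);
  std::vector<float> small(N, 0.0f);

  // The same shape as the change above: an OpenMP parallel for over idx,
  // with the thread count pinned via num_threads(...).
  #pragma omp parallel for num_threads(omp_get_max_threads())
  for (int idx = 0; idx < N; ++idx) {
    seq_reduce_assign(idx, M, big.data(), small.data());
  }

  for (int idx = 0; idx < N; ++idx) {
    std::printf("small[%d] = %.1f\n", idx, small[idx]);
  }
  return 0;
}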
