Skip to content

Commit fb08adf

Browse files
authored
[SYCL] Improve performance of generic shuffles (#3815)
* Fix upper bound in GenericCall Previous upper bound considered only the offset, allowing a memcpy for the final chunk to walk off the end of the byte array. * Replace detail::memcpy with std::memcpy sycl::detail::memcpy is implemented as a loop, resulting in different optimizations than std::memcpy. Signed-off-by: John Pennycook <john.pennycook@intel.com>
1 parent e9d308e commit fb08adf

File tree

1 file changed

+13
-13
lines changed

1 file changed

+13
-13
lines changed

sycl/include/CL/sycl/detail/spirv.hpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ template <typename T, typename Functor>
5555
void GenericCall(const Functor &ApplyToBytes) {
5656
if (sizeof(T) >= sizeof(ShuffleChunkT)) {
5757
#pragma unroll
58-
for (size_t Offset = 0; Offset < sizeof(T);
58+
for (size_t Offset = 0; Offset + sizeof(ShuffleChunkT) <= sizeof(T);
5959
Offset += sizeof(ShuffleChunkT)) {
6060
ApplyToBytes(Offset, sizeof(ShuffleChunkT));
6161
}
@@ -160,9 +160,9 @@ EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
160160
char *ResultBytes = reinterpret_cast<char *>(&Result);
161161
auto BroadcastBytes = [=](size_t Offset, size_t Size) {
162162
uint64_t BroadcastX, BroadcastResult;
163-
detail::memcpy(&BroadcastX, XBytes + Offset, Size);
163+
std::memcpy(&BroadcastX, XBytes + Offset, Size);
164164
BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
165-
detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
165+
std::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
166166
};
167167
GenericCall<T>(BroadcastBytes);
168168
return Result;
@@ -213,9 +213,9 @@ EnableIfGenericBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
213213
char *ResultBytes = reinterpret_cast<char *>(&Result);
214214
auto BroadcastBytes = [=](size_t Offset, size_t Size) {
215215
uint64_t BroadcastX, BroadcastResult;
216-
detail::memcpy(&BroadcastX, XBytes + Offset, Size);
216+
std::memcpy(&BroadcastX, XBytes + Offset, Size);
217217
BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
218-
detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
218+
std::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
219219
};
220220
GenericCall<T>(BroadcastBytes);
221221
return Result;
@@ -697,9 +697,9 @@ EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
697697
char *ResultBytes = reinterpret_cast<char *>(&Result);
698698
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
699699
ShuffleChunkT ShuffleX, ShuffleResult;
700-
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
700+
std::memcpy(&ShuffleX, XBytes + Offset, Size);
701701
ShuffleResult = SubgroupShuffle(ShuffleX, local_id);
702-
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
702+
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
703703
};
704704
GenericCall<T>(ShuffleBytes);
705705
return Result;
@@ -712,9 +712,9 @@ EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
712712
char *ResultBytes = reinterpret_cast<char *>(&Result);
713713
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
714714
ShuffleChunkT ShuffleX, ShuffleResult;
715-
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
715+
std::memcpy(&ShuffleX, XBytes + Offset, Size);
716716
ShuffleResult = SubgroupShuffleXor(ShuffleX, local_id);
717-
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
717+
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
718718
};
719719
GenericCall<T>(ShuffleBytes);
720720
return Result;
@@ -727,9 +727,9 @@ EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
727727
char *ResultBytes = reinterpret_cast<char *>(&Result);
728728
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
729729
ShuffleChunkT ShuffleX, ShuffleResult;
730-
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
730+
std::memcpy(&ShuffleX, XBytes + Offset, Size);
731731
ShuffleResult = SubgroupShuffleDown(ShuffleX, local_id);
732-
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
732+
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
733733
};
734734
GenericCall<T>(ShuffleBytes);
735735
return Result;
@@ -742,9 +742,9 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
742742
char *ResultBytes = reinterpret_cast<char *>(&Result);
743743
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
744744
ShuffleChunkT ShuffleX, ShuffleResult;
745-
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
745+
std::memcpy(&ShuffleX, XBytes + Offset, Size);
746746
ShuffleResult = SubgroupShuffleUp(ShuffleX, local_id);
747-
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
747+
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
748748
};
749749
GenericCall<T>(ShuffleBytes);
750750
return Result;

0 commit comments

Comments
 (0)