Skip to content

Commit c5f3f7c

Browse files
committed
Properly set nontemporal when volatile is also present
The iterator MI can advance in insertWait(), but we need the original instruction to set the temporal hint. Just move the nontemporal handling before the volatile handling.
1 parent 8540ea5 commit c5f3f7c

File tree

4 files changed

+11
-11
lines changed

4 files changed

+11
-11
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -2392,6 +2392,11 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23922392

23932393
bool Changed = false;
23942394

2395+
if (IsNonTemporal) {
2396+
// Set non-temporal hint for all cache levels.
2397+
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2398+
}
2399+
23952400
if (IsVolatile) {
23962401
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
23972402

@@ -2407,11 +2412,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
24072412
Position::AFTER);
24082413
}
24092414

2410-
if (IsNonTemporal) {
2411-
// Set non-temporal hint for all cache levels.
2412-
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2413-
}
2414-
24152415
return Changed;
24162416
}
24172417

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -824,7 +824,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
824824
; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
825825
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
826826
; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
827-
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
827+
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
828828
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
829829
; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
830830
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
@@ -836,7 +836,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
836836
; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
837837
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
838838
; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
839-
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
839+
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
840840
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
841841
; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
842842
; GFX12-CU-NEXT: s_wait_dscnt 0x0

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
807807
; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
808808
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
809809
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
810-
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
810+
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
811811
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
812812
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
813813
; GFX12-WGP-NEXT: s_nop 0
@@ -819,7 +819,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
819819
; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
820820
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
821821
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
822-
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
822+
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
823823
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
824824
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[2:3]
825825
; GFX12-CU-NEXT: s_nop 0

llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -989,7 +989,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
989989
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
990990
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0
991991
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
992-
; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 scope:SCOPE_SYS
992+
; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
993993
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
994994
; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
995995
; GFX12-WGP-NEXT: s_nop 0
@@ -1003,7 +1003,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
10031003
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
10041004
; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0
10051005
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1006-
; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 scope:SCOPE_SYS
1006+
; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
10071007
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
10081008
; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1]
10091009
; GFX12-CU-NEXT: s_nop 0

0 commit comments

Comments
 (0)