Skip to content

Commit 27ce512

Browse files
authored
[AMDGPU] Fix setting nontemporal in memory legalizer (#83815)
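The iterator MI can be advanced by insertWait(), but the original instruction is needed to set the non-temporal hint. Move the non-temporal handling (the setTH() call) before the volatile handling so that it is applied while MI still refers to the original instruction.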
1 parent d9ae4a6 commit 27ce512

5 files changed

+710
-5
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -2392,6 +2392,11 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23922392

23932393
bool Changed = false;
23942394

2395+
if (IsNonTemporal) {
2396+
// Set non-temporal hint for all cache levels.
2397+
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2398+
}
2399+
23952400
if (IsVolatile) {
23962401
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
23972402

@@ -2407,11 +2412,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
24072412
Position::AFTER);
24082413
}
24092414

2410-
if (IsNonTemporal) {
2411-
// Set non-temporal hint for all cache levels.
2412-
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2413-
}
2414-
24152415
return Changed;
24162416
}
24172417

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll

+165
Original file line numberDiff line numberDiff line change
@@ -684,5 +684,170 @@ entry:
684684
ret void
685685
}
686686

687+
define amdgpu_kernel void @flat_nontemporal_volatile_load(
688+
; GFX7-LABEL: flat_nontemporal_volatile_load:
689+
; GFX7: ; %bb.0: ; %entry
690+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
691+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
692+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
693+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
694+
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
695+
; GFX7-NEXT: s_waitcnt vmcnt(0)
696+
; GFX7-NEXT: v_mov_b32_e32 v0, s2
697+
; GFX7-NEXT: v_mov_b32_e32 v1, s3
698+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
699+
; GFX7-NEXT: flat_store_dword v[0:1], v2
700+
; GFX7-NEXT: s_endpgm
701+
;
702+
; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
703+
; GFX10-WGP: ; %bb.0: ; %entry
704+
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
705+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
706+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
707+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
708+
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
709+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
710+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
711+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
712+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
713+
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
714+
; GFX10-WGP-NEXT: s_endpgm
715+
;
716+
; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
717+
; GFX10-CU: ; %bb.0: ; %entry
718+
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
719+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
720+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
721+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
722+
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
723+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
724+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
725+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
726+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
727+
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
728+
; GFX10-CU-NEXT: s_endpgm
729+
;
730+
; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
731+
; SKIP-CACHE-INV: ; %bb.0: ; %entry
732+
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
733+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
734+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
735+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
736+
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
737+
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
738+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
739+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
740+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
741+
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
742+
; SKIP-CACHE-INV-NEXT: s_endpgm
743+
;
744+
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
745+
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
746+
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
747+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
748+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
749+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
750+
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
751+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
752+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
753+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
754+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
755+
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
756+
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
757+
;
758+
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
759+
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
760+
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
761+
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
762+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
763+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
764+
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
765+
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
766+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
767+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
768+
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
769+
; GFX90A-TGSPLIT-NEXT: s_endpgm
770+
;
771+
; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
772+
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
773+
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
774+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
775+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
776+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
777+
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
778+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
779+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
780+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
781+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
782+
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
783+
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
784+
;
785+
; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
786+
; GFX940-TGSPLIT: ; %bb.0: ; %entry
787+
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
788+
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
789+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
790+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
791+
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
792+
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
793+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
794+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
795+
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
796+
; GFX940-TGSPLIT-NEXT: s_endpgm
797+
;
798+
; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
799+
; GFX11-WGP: ; %bb.0: ; %entry
800+
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
801+
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
802+
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
803+
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
804+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
805+
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
806+
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
807+
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
808+
; GFX11-WGP-NEXT: s_endpgm
809+
;
810+
; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
811+
; GFX11-CU: ; %bb.0: ; %entry
812+
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
813+
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
814+
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
815+
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
816+
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
817+
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
818+
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
819+
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
820+
; GFX11-CU-NEXT: s_endpgm
821+
;
822+
; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
823+
; GFX12-WGP: ; %bb.0: ; %entry
824+
; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
825+
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
826+
; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
827+
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
828+
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
829+
; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
830+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
831+
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
832+
; GFX12-WGP-NEXT: s_endpgm
833+
;
834+
; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
835+
; GFX12-CU: ; %bb.0: ; %entry
836+
; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
837+
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
838+
; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
839+
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
840+
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
841+
; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
842+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
843+
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
844+
; GFX12-CU-NEXT: s_endpgm
845+
ptr %in, ptr %out) {
846+
entry:
847+
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
848+
store i32 %val, ptr %out
849+
ret void
850+
}
851+
687852
!0 = !{i32 1}
688853
declare i32 @llvm.amdgcn.workitem.id.x()

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll

+158
Original file line numberDiff line numberDiff line change
@@ -674,5 +674,163 @@ entry:
674674
ret void
675675
}
676676

677+
define amdgpu_kernel void @global_nontemporal_volatile_load(
678+
; GFX6-LABEL: global_nontemporal_volatile_load:
679+
; GFX6: ; %bb.0: ; %entry
680+
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
681+
; GFX6-NEXT: s_mov_b32 s7, 0x100f000
682+
; GFX6-NEXT: s_mov_b32 s6, -1
683+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
684+
; GFX6-NEXT: s_mov_b32 s4, s0
685+
; GFX6-NEXT: s_mov_b32 s5, s1
686+
; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
687+
; GFX6-NEXT: s_waitcnt vmcnt(0)
688+
; GFX6-NEXT: s_mov_b32 s4, s2
689+
; GFX6-NEXT: s_mov_b32 s5, s3
690+
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
691+
; GFX6-NEXT: s_endpgm
692+
;
693+
; GFX7-LABEL: global_nontemporal_volatile_load:
694+
; GFX7: ; %bb.0: ; %entry
695+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
696+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
697+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
698+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
699+
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
700+
; GFX7-NEXT: s_waitcnt vmcnt(0)
701+
; GFX7-NEXT: v_mov_b32_e32 v0, s2
702+
; GFX7-NEXT: v_mov_b32_e32 v1, s3
703+
; GFX7-NEXT: flat_store_dword v[0:1], v2
704+
; GFX7-NEXT: s_endpgm
705+
;
706+
; GFX10-WGP-LABEL: global_nontemporal_volatile_load:
707+
; GFX10-WGP: ; %bb.0: ; %entry
708+
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
709+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
710+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
711+
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
712+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
713+
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
714+
; GFX10-WGP-NEXT: s_endpgm
715+
;
716+
; GFX10-CU-LABEL: global_nontemporal_volatile_load:
717+
; GFX10-CU: ; %bb.0: ; %entry
718+
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
719+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
720+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
721+
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
722+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
723+
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
724+
; GFX10-CU-NEXT: s_endpgm
725+
;
726+
; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load:
727+
; SKIP-CACHE-INV: ; %bb.0: ; %entry
728+
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
729+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
730+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
731+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
732+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
733+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1
734+
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
735+
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
736+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
737+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
738+
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
739+
; SKIP-CACHE-INV-NEXT: s_endpgm
740+
;
741+
; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
742+
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
743+
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
744+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
745+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
746+
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
747+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
748+
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
749+
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
750+
;
751+
; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load:
752+
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
753+
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
754+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
755+
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
756+
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
757+
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
758+
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
759+
; GFX90A-TGSPLIT-NEXT: s_endpgm
760+
;
761+
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
762+
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
763+
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
764+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
765+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
766+
; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
767+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
768+
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
769+
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
770+
;
771+
; GFX940-TGSPLIT-LABEL: global_nontemporal_volatile_load:
772+
; GFX940-TGSPLIT: ; %bb.0: ; %entry
773+
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
774+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
775+
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
776+
; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
777+
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
778+
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
779+
; GFX940-TGSPLIT-NEXT: s_endpgm
780+
;
781+
; GFX11-WGP-LABEL: global_nontemporal_volatile_load:
782+
; GFX11-WGP: ; %bb.0: ; %entry
783+
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
784+
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
785+
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
786+
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
787+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
788+
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
789+
; GFX11-WGP-NEXT: s_nop 0
790+
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
791+
; GFX11-WGP-NEXT: s_endpgm
792+
;
793+
; GFX11-CU-LABEL: global_nontemporal_volatile_load:
794+
; GFX11-CU: ; %bb.0: ; %entry
795+
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
796+
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
797+
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
798+
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
799+
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
800+
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
801+
; GFX11-CU-NEXT: s_nop 0
802+
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
803+
; GFX11-CU-NEXT: s_endpgm
804+
;
805+
; GFX12-WGP-LABEL: global_nontemporal_volatile_load:
806+
; GFX12-WGP: ; %bb.0: ; %entry
807+
; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
808+
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
809+
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
810+
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
811+
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
812+
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
813+
; GFX12-WGP-NEXT: s_nop 0
814+
; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
815+
; GFX12-WGP-NEXT: s_endpgm
816+
;
817+
; GFX12-CU-LABEL: global_nontemporal_volatile_load:
818+
; GFX12-CU: ; %bb.0: ; %entry
819+
; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
820+
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
821+
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
822+
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
823+
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
824+
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[2:3]
825+
; GFX12-CU-NEXT: s_nop 0
826+
; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
827+
; GFX12-CU-NEXT: s_endpgm
828+
ptr addrspace(1) %in, ptr addrspace(1) %out) {
829+
entry:
830+
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
831+
store i32 %val, ptr addrspace(1) %out
832+
ret void
833+
}
834+
677835
!0 = !{i32 1}
678836
declare i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)