@@ -684,5 +684,170 @@ entry:
684
684
ret void
685
685
}
686
686
687
; Kernel under test: reads an i32 from %in with a load that is both `volatile`
; and tagged `!nontemporal`, then writes the value to %out with a plain store.
;
; The per-target check blocks placed inside the parameter list pin the complete
; expected instruction sequence. In those blocks the load carries:
;   GFX7, SKIP-CACHE-INV, GFX90A (both variants)   ->  glc
;   GFX10 and GFX11 (WGP and CU variants)          ->  glc dlc
;   GFX940 (both variants)                         ->  sc0 sc1
;   GFX12 (WGP and CU variants)                    ->  th:TH_LOAD_NT scope:SCOPE_SYS
; and is directly followed by `s_waitcnt vmcnt(0)` (`s_wait_loadcnt 0x0` on GFX12).
; The two TGSPLIT blocks expect no `s_waitcnt lgkmcnt(0)` before the store, unlike
; their NOTTGSPLIT counterparts; the GFX940 blocks also expect `sc0 sc1` on the store.
;
; NOTE(review): the check lines look machine-generated (presumably by
; update_llc_test_checks.py) and the prefixes presumably map to RUN lines at the
; top of the file, outside this chunk - confirm, and regenerate instead of hand-editing.
+ define amdgpu_kernel void @flat_nontemporal_volatile_load (
688
+ ; GFX7-LABEL: flat_nontemporal_volatile_load:
689
+ ; GFX7: ; %bb.0: ; %entry
690
+ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
691
+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
692
+ ; GFX7-NEXT: v_mov_b32_e32 v0, s0
693
+ ; GFX7-NEXT: v_mov_b32_e32 v1, s1
694
+ ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
695
+ ; GFX7-NEXT: s_waitcnt vmcnt(0)
696
+ ; GFX7-NEXT: v_mov_b32_e32 v0, s2
697
+ ; GFX7-NEXT: v_mov_b32_e32 v1, s3
698
+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
699
+ ; GFX7-NEXT: flat_store_dword v[0:1], v2
700
+ ; GFX7-NEXT: s_endpgm
701
+ ;
702
+ ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
703
+ ; GFX10-WGP: ; %bb.0: ; %entry
704
+ ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
705
+ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
706
+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
707
+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
708
+ ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
709
+ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
710
+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
711
+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
712
+ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
713
+ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
714
+ ; GFX10-WGP-NEXT: s_endpgm
715
+ ;
716
+ ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
717
+ ; GFX10-CU: ; %bb.0: ; %entry
718
+ ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
719
+ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
720
+ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
721
+ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
722
+ ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
723
+ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
724
+ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
725
+ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
726
+ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
727
+ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
728
+ ; GFX10-CU-NEXT: s_endpgm
729
+ ;
730
+ ; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
731
+ ; SKIP-CACHE-INV: ; %bb.0: ; %entry
732
+ ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
733
+ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
734
+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
735
+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
736
+ ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
737
+ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
738
+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
739
+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
740
+ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
741
+ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
742
+ ; SKIP-CACHE-INV-NEXT: s_endpgm
743
+ ;
744
+ ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
745
+ ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
746
+ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
747
+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
748
+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
749
+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
750
+ ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
751
+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
752
+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
753
+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
754
+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
755
+ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
756
+ ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
757
+ ;
758
+ ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
759
+ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
760
+ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
761
+ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
762
+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
763
+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
764
+ ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
765
+ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
766
+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
767
+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
768
+ ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
769
+ ; GFX90A-TGSPLIT-NEXT: s_endpgm
770
+ ;
771
+ ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
772
+ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
773
+ ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
774
+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
775
+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
776
+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
777
+ ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
778
+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
779
+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
780
+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
781
+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
782
+ ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
783
+ ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
784
+ ;
785
+ ; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
786
+ ; GFX940-TGSPLIT: ; %bb.0: ; %entry
787
+ ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
788
+ ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
789
+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
790
+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
791
+ ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
792
+ ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
793
+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
794
+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
795
+ ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
796
+ ; GFX940-TGSPLIT-NEXT: s_endpgm
797
+ ;
798
+ ; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
799
+ ; GFX11-WGP: ; %bb.0: ; %entry
800
+ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
801
+ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
802
+ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
803
+ ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
804
+ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
805
+ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
806
+ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
807
+ ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
808
+ ; GFX11-WGP-NEXT: s_endpgm
809
+ ;
810
+ ; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
811
+ ; GFX11-CU: ; %bb.0: ; %entry
812
+ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
813
+ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
814
+ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
815
+ ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
816
+ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
817
+ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
818
+ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
819
+ ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
820
+ ; GFX11-CU-NEXT: s_endpgm
821
+ ;
822
+ ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
823
+ ; GFX12-WGP: ; %bb.0: ; %entry
824
+ ; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
825
+ ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
826
+ ; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
827
+ ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
828
+ ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
829
+ ; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
830
+ ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
831
+ ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
832
+ ; GFX12-WGP-NEXT: s_endpgm
833
+ ;
834
+ ; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
835
+ ; GFX12-CU: ; %bb.0: ; %entry
836
+ ; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
837
+ ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
838
+ ; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
839
+ ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
840
+ ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
841
+ ; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
842
+ ; GFX12-CU-NEXT: s_wait_dscnt 0x0
843
+ ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
844
+ ; GFX12-CU-NEXT: s_endpgm
845
+ ptr %in , ptr %out ) {
846
+ entry:
847
; The load combines `volatile` with `!nontemporal !0`; the store that follows
; carries neither marker.
+ %val = load volatile i32 , ptr %in , align 4 , !nontemporal !0
848
+ store i32 %val , ptr %out
849
+ ret void
850
+ }
851
+
687
852
; Metadata node referenced as `!nontemporal !0` by the load in
; @flat_nontemporal_volatile_load; it holds the single value i32 1.
!0 = !{i32 1 }
688
853
; Declaration of the intrinsic @llvm.amdgcn.workitem.id.x (no arguments, returns i32).
; No call to it is visible in this chunk - presumably used by other tests in the file.
declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments