From 32d9510d67187f1f3a379cce81302cdd15a557d2 Mon Sep 17 00:00:00 2001 From: Nandor Licker Date: Wed, 13 Nov 2024 19:03:50 +0000 Subject: [PATCH] bugfix: fix the alignment of o_frag (#608) Since `o_frag` was not always aligned to a 16-byte boundary, `memcpy` implemented using 4x float moves was crashing in `cuda-gdb` when compiled with `-G`. --- include/flashinfer/attention/prefill.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/flashinfer/attention/prefill.cuh b/include/flashinfer/attention/prefill.cuh index c9d760ae..750e150b 100644 --- a/include/flashinfer/attention/prefill.cuh +++ b/include/flashinfer/attention/prefill.cuh @@ -1154,7 +1154,7 @@ __launch_bounds__(NUM_WARPS_Q* NUM_WARPS_KV* WARP_SIZE) void SinglePrefillWithKV constexpr uint32_t channel_size_128b_out = head_dim / num_elems_per_128b(); DTypeQKAccum s_frag[NUM_FRAGS_Q][NUM_FRAGS_KV][8]; - float o_frag[NUM_FRAGS_Q][NUM_FRAGS_D][8]; + alignas(16) float o_frag[NUM_FRAGS_Q][NUM_FRAGS_D][8]; DTypeQKAccum m[NUM_FRAGS_Q][2]; float d[NUM_FRAGS_Q][2]; float rope_freq[NUM_FRAGS_D / 2][4]; @@ -1579,7 +1579,7 @@ __launch_bounds__(NUM_WARPS_Q* NUM_WARPS_KV* WARP_SIZE) void BatchPrefillWithRag constexpr uint32_t channel_size_128b_out = head_dim / num_elems_per_128b(); DTypeQKAccum s_frag[NUM_FRAGS_Q][NUM_FRAGS_KV][8]; - float o_frag[NUM_FRAGS_Q][NUM_FRAGS_D][8]; + alignas(16) float o_frag[NUM_FRAGS_Q][NUM_FRAGS_D][8]; DTypeQKAccum m[NUM_FRAGS_Q][2]; float d[NUM_FRAGS_Q][2]; float rope_freq[NUM_FRAGS_D / 2][4]; @@ -1866,7 +1866,7 @@ __launch_bounds__(NUM_WARPS_Q* NUM_WARPS_KV* WARP_SIZE) void BatchPrefillWithPag constexpr uint32_t channel_size_128b_out = head_dim / num_elems_per_128b(); DTypeQKAccum s_frag[NUM_FRAGS_Q][NUM_FRAGS_KV][8]; - float o_frag[NUM_FRAGS_Q][NUM_FRAGS_D][8]; + alignas(16) float o_frag[NUM_FRAGS_Q][NUM_FRAGS_D][8]; DTypeQKAccum m[NUM_FRAGS_Q][2]; float d[NUM_FRAGS_Q][2]; float rope_freq[NUM_FRAGS_D / 2][4];