-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Speed up compilation by splitting into separate .cu files
- Loading branch information
Showing
13 changed files
with
251 additions
and
318 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
// Copyright (c) 2022, Tri Dao. | ||
|
||
// Splitting the different head dimensions to different files to speed up compilation.
|
||
#include "fmha_bwd_launch_template.h" | ||
|
||
// Launch the FMHA backward pass for head dimension 128.
// Dispatches on params.is_bf16 (bf16 vs fp16 element type) via FP16_SWITCH;
// the lambda indirection is a workaround for an MSVC compiler issue.
// `configure` is forwarded to run_fmha_bwd_loop — presumably a dry-run/setup
// pass vs. actual launch; confirm against the loop implementation.
void run_fmha_bwd_hdim128(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
    // work around for MSVC issue
    FP16_SWITCH(params.is_bf16, [&] {
        // Traits: seqlen tile 128, hdim 128, warps config (16, 1, 8), smem flag 0x100u.
        using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 8, 0x100u, elem_type>;
        run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
    });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
// Copyright (c) 2022, Tri Dao. | ||
|
||
// Splitting the different head dimensions to different files to speed up compilation.
|
||
#include "fmha_bwd_launch_template.h" | ||
|
||
// Launch the FMHA backward pass for head dimension 32.
// Selects kernel traits by key-sequence length: a 128-wide tile for
// seqlen_k == 128, a 256-wide tile for seqlen_k >= 256.
// NOTE(review): seqlen_k values below 128 or in (128, 256) launch nothing —
// presumably callers pad seqlen_k to one of these buckets; verify upstream.
void run_fmha_bwd_hdim32(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
    // work around for MSVC issue
    FP16_SWITCH(params.is_bf16, [&] {
        if (params.seqlen_k == 128) {
            using Kernel_traits = FMHA_kernel_traits<128, 32, 16, 1, 8, 0x08u, elem_type>;
            run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
        } else if (params.seqlen_k >= 256) {
            using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 8, 0x08u, elem_type>;
            run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
        }
    });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// Copyright (c) 2022, Tri Dao. | ||
|
||
// Splitting the different head dimensions to different files to speed up compilation.
|
||
#include "fmha_bwd_launch_template.h" | ||
|
||
// Launch the FMHA backward pass for head dimension 64.
// Dispatches on key-sequence length and on the device's compute capability,
// because the smem-vs-register tradeoff differs per architecture.
// NOTE(review): for seqlen_k >= 256 on devices other than sm80 (A100),
// sm8x (e.g. sm86), or sm75 (Turing) — e.g. sm70 V100 — no branch matches
// and nothing is launched; confirm callers never reach this combination.
void run_fmha_bwd_hdim64(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
    // work around for MSVC issue
    FP16_SWITCH(params.is_bf16, [&] {
        auto dprops = at::cuda::getCurrentDeviceProperties();
        if (params.seqlen_k == 128) {
            using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 8, 0x08u, elem_type>;
            run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
        } else if (params.seqlen_k >= 256) {
            if (dprops->major == 8 && dprops->minor == 0) {
                // Don't share smem for K & V, and don't keep V in registers
                // This speeds things up by 2-3% by avoiding register spills, but it
                // uses more shared memory, which is fine on A100 but not other GPUs.
                // For other GPUs, we keep V in registers.
                using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 8, 0x100u, elem_type>;
                run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
            } else if (dprops->major == 8 && dprops->minor > 0) {
                // sm86/sm89: less shared memory than A100, so keep V in registers.
                using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 8, 0x08u, elem_type>;
                run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
            } else if (dprops->major == 7 && dprops->minor == 5) {
                // Turing (sm75): fall back to the 128-wide tile.
                using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 8, 0x08u, elem_type>;
                run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
            }
        }
    });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.