diff --git a/cuda/fastermoe/smart_schedule.cpp b/cuda/fastermoe/smart_schedule.cpp
index ada1a2b..baa282e 100644
--- a/cuda/fastermoe/smart_schedule.cpp
+++ b/cuda/fastermoe/smart_schedule.cpp
@@ -155,7 +155,7 @@ torch::Tensor _smart_sch_backward(
     auto global_grad_in = grad_out.new_zeros({global_batch_size, d_model});
     auto grad_in = grad_out.new_zeros({buf_batch_size, d_model});
 
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_out.scalar_type(),
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16,grad_out.scalar_type(),
             "fmoe_cuda_smartsch_backward", ([&] {
         fmoe_cuda_fused_backward_impl(
             backward_fn,
diff --git a/fmoe/gates/gshard_gate.py b/fmoe/gates/gshard_gate.py
index 387836e..3a018a5 100644
--- a/fmoe/gates/gshard_gate.py
+++ b/fmoe/gates/gshard_gate.py
@@ -11,9 +11,9 @@
 
 class GShardGate(NaiveGate):
     def __init__(self, d_model, num_expert, world_size,
-            topk=2, capacity=(1.2, 2.4), random_routing=True):
-        assert topk == 2, 'topk should be 2 in gshard'
-        super().__init__(d_model, num_expert, world_size, top_k=2)
+            topk=2, capacity=(1, 1), random_routing=False):
+        # assert topk == 2, 'topk should be 2 in gshard'
+        super().__init__(d_model, num_expert, world_size, top_k=topk)
         self.capacity = capacity
         self.random_routing = random_routing
 
diff --git a/fmoe/layers.py b/fmoe/layers.py
index 9dd63cd..3d0d5f7 100644
--- a/fmoe/layers.py
+++ b/fmoe/layers.py
@@ -117,6 +117,7 @@ def __init__(
         gate_hook=None,
         mask=None,
         mask_dict=None,
+        gate_kwargs={}
     ):
         super().__init__()
         self.num_expert = num_expert
@@ -145,7 +146,7 @@ def __init__(
         else:
             self.experts_fused = True
 
-        self.gate = gate(d_model, num_expert, world_size, top_k)
+        self.gate = gate(d_model, num_expert, world_size, top_k, **gate_kwargs)
         self.gate_hook = gate_hook
         self.mask = mask
         self.mask_dict = mask_dict
diff --git a/setup.py b/setup.py
index 8421d03..a014fdd 100644
--- a/setup.py
+++ b/setup.py
@@ -7,11 +7,11 @@
 ext_libs = []
 
 authors = [
-        'Jiaao He',
-        'Jiezhong Qiu',
-        'Aohan Zeng',
-        'Tiago Antunes',
-        'Jinjun Peng',
+        'Jiaao He',
+        'Jiezhong Qiu',
+        'Aohan Zeng',
+        'Tiago Antunes',
+        'Jinjun Peng',
         'Qin Li',
         'Mingshu Zhai'
     ]
@@ -37,6 +37,11 @@
 else:
     define_macros=[]
 
+include_dirs = []
+if os.environ.get("NCCL_PATH"):
+    include_dirs.append(os.environ.get("NCCL_PATH")+'/include')
+    nccl_lib_path = os.environ.get("NCCL_PATH")+'/lib'
+    os.environ['LIBRARY_PATH'] = nccl_lib_path+':'+os.environ.get('LIBRARY_PATH','')
 
 if __name__ == '__main__':
     setuptools.setup(
@@ -50,7 +55,7 @@
         packages=['fmoe', 'fmoe.megatron', 'fmoe.gates', 'fmoe.fastermoe'],
         ext_modules=[
             CUDAExtension(
-                name='fmoe_cuda',
+                name='fmoe_cuda',
                 sources=[
                     'cuda/stream_manager.cpp',
                     'cuda/local_exchange.cu',
@@ -65,7 +70,8 @@
                     'cxx': cxx_flags,
                     'nvcc': cxx_flags
                 },
-                libraries=ext_libs
+                libraries=ext_libs,
+                include_dirs=include_dirs
             )
         ],
         cmdclass={