From a8c30c0511f0aefa138e9fe41a38831d9a9ba776 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 16 Sep 2024 23:30:40 +0000
Subject: [PATCH 01/13] clipping fp16/bf16 addition

---
 apps/nccl/src/allreduce.hpp | 39 ++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 1b85136ae..c05e19d15 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -28,19 +28,52 @@ __forceinline__ __device__ To bit_cast(const From& src) {
   return u.t;
 }
 
+template <typename T>
+__forceinline__ __device__ T clip(T val) {
+  return val;
+}
+
+template <>
+__forceinline__ __device__ __half clip(__half val) {
+  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
+  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
+  return val;
+}
+
+template <>
+__forceinline__ __device__ __half2 clip(__half2 val) {
+  val.x = clip(val.x);
+  val.y = clip(val.y);
+  return val;
+}
+
+template <>
+__forceinline__ __device__ __bfloat16 clip(__bfloat16 val) {
+  val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80));
+  val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80));
+  return val;
+}
+
+template <>
+__forceinline__ __device__ __bfloat162 clip(__bfloat162 val) {
+  val.x = clip(val.x);
+  val.y = clip(val.y);
+  return val;
+}
+
 template <typename T>
 __forceinline__ __device__ T add_elements(T a, T b) {
-  return a + b;
+  return clip(a + b);
 }
 
 template <>
 __forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) {
-  return __hadd2(a, b);
+  return clip(__hadd2(a, b));
 }
 
 template <>
 __forceinline__ __device__ __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) {
-  return __hadd2(a, b);
+  return clip(__hadd2(a, b));
 }
 
 template <typename T>

From 780f0f8303ce4b653e53baf4df2092f33f51b0b9 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 16 Sep 2024 23:35:12 +0000
Subject: [PATCH 02/13] rccl-tests pass

---
 apps/nccl/src/allreduce.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index c05e19d15..899df2aed 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -35,8 +35,8 @@ __forceinline__ __device__ T clip(T val) {
 
 template <>
 __forceinline__ __device__ __half clip(__half val) {
-  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
   val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
+  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
   return val;
 }
 
@@ -49,8 +49,8 @@ __forceinline__ __device__ __half2 clip(__half2 val) {
 
 template <>
 __forceinline__ __device__ __bfloat16 clip(__bfloat16 val) {
-  val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80));
   val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80));
+  val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80));
   return val;
 }
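Note on patches 01-02: the __half clip bounds are the largest finite fp16 magnitudes -- 0x7bff is +65504 and 0xfbff is -65504 -- so a sum that would otherwise round to +/-inf saturates at +/-65504 instead (the bf16 constants play the analogous role for __bfloat16). A minimal host-side sketch of the same saturating add, in plain C++ with a helper name of our own (not part of the patch):

    #include <algorithm>
    #include <cstdio>

    // Clamp a float into the finite fp16 range before narrowing; 65504.0f is
    // the value encoded by the patch's 0x7bff bit pattern.
    float saturate_to_half_range(float v) {
      constexpr float kMaxHalf = 65504.0f;
      return std::min(std::max(v, -kMaxHalf), kMaxHalf);
    }

    int main() {
      // 60000 + 60000 = 120000 overflows fp16 (it would become +inf);
      // the clipped sum stays at the largest finite value instead.
      std::printf("%f\n", saturate_to_half_range(60000.0f + 60000.0f));  // 65504.000000
    }
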
From c6cf40020e35b5788b8ab006732cc29676139ab4 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 25 Sep 2024 07:44:55 +0000
Subject: [PATCH 03/13] align with msccl clipping

---
 apps/nccl/src/allreduce.hpp | 43 ++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 899df2aed..748b6f33b 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -35,29 +35,33 @@ __forceinline__ __device__ T clip(T val) {
 
 template <>
 __forceinline__ __device__ __half clip(__half val) {
-  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
   val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
+  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
   return val;
 }
 
 template <>
 __forceinline__ __device__ __half2 clip(__half2 val) {
-  val.x = clip(val.x);
-  val.y = clip(val.y);
+  val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff));
+  val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff));
+  val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff));
+  val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff));
   return val;
 }
 
 template <>
 __forceinline__ __device__ __bfloat16 clip(__bfloat16 val) {
-  val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80));
   val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80));
+  val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80));
   return val;
 }
 
 template <>
 __forceinline__ __device__ __bfloat162 clip(__bfloat162 val) {
-  val.x = clip(val.x);
-  val.y = clip(val.y);
+  val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80));
+  val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80));
+  val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80));
+  val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80));
   return val;
 }
 
@@ -187,6 +191,17 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) {
   vectorSum(dst, src, nElem, blockIdx.x, gridDim.x);
 }
 
+// template <typename T>
+// __global__ void __launch_bounds__(32, 1)
+//     test(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
+//          size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize,
+//          size_t nelems, uint32_t flag) {
+//   // add 0.1f to all elements
+//   for (size_t i = threadIdx.x + blockIdx.x * gridDim.x; i < nelems; i += blockDim.x * gridDim.x) {
+//     buff[i] = add_elements(buff[i], T(0.1f));
+//   }
+// }
+
 template <typename T>
 __global__ void __launch_bounds__(32, 1)
     allreduceAllToAll(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
@@ -426,6 +441,10 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
                       size_t nelems, cudaStream_t stream) {
   static uint32_t flag = 1;
 
+  // test<<<7, 32, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
+  //                            channelScratchOffset, rank, nRanksPerNode, worldSize,
+  //                            nelems, flag++);
+
   if (sizeof(T) * nelems < worldSize * sizeof(int)) {
     int nBlocks = 7;
     int nThreadsPerBlock = 32;
     allreduceAllToAll<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
                                                                 channelScratchOffset, rank, nRanksPerNode, worldSize,
                                                                 nelems, flag++);
   } else if (sizeof(T) * nelems <= (1 << 20)) {
-    int nBlocks = 28;
-    int nThreadsPerBlock = 1024;
-    if (nelems >= 8192) {
-      nBlocks = 56;
-      nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
-    }
+    int nBlocks = 7;
+    int nThreadsPerBlock = 64 * 7;
+    // if (nelems >= 8192) {
+    //   nBlocks = 56;
+    //   nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
+    // }
     allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
                                                          channelScratchOffset, rank, nRanksPerNode, worldSize,
                                                          nelems, flag++);

From d929d250efac638d797fa4b0b9b0635cf5be09f3 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 25 Sep 2024 07:49:44 +0000
Subject: [PATCH 04/13] revert

---
 apps/nccl/src/allreduce.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 748b6f33b..1acc1cd1d 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -452,12 +452,12 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
                                                          channelScratchOffset, rank, nRanksPerNode, worldSize,
                                                          nelems, flag++);
   } else if (sizeof(T) * nelems <= (1 << 20)) {
-    int nBlocks = 7;
-    int nThreadsPerBlock = 64 * 7;
-    // if (nelems >= 8192) {
-    //   nBlocks = 56;
-    //   nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
-    // }
+    int nBlocks = 28;
+    int nThreadsPerBlock = 1024;
+    if (nelems >= 8192) {
+      nBlocks = 56;
+      nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
+    }
     allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
                                                          channelScratchOffset, rank, nRanksPerNode, worldSize,
                                                          nelems, flag++);

From dee8fe2b520c21c861d10dd09168d86eedc2c49a Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 25 Sep 2024 08:04:08 +0000
Subject: [PATCH 05/13] final barrier for allreduce8

---
 apps/nccl/src/allreduce.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 1acc1cd1d..91f536f2c 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -432,6 +432,10 @@ __global__ void __launch_bounds__(512, 1)
       }
     }
   }
+  if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
+    outChannels[threadIdx.x].signal();
+    outChannels[threadIdx.x].wait();
+  }
 }
 
 template <typename T>

From abe69b84a9eb881a467c0773a9b2f50dc3eb9ad7 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 25 Sep 2024 08:18:14 +0000
Subject: [PATCH 06/13] this is weird

---
 apps/nccl/src/allreduce.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 91f536f2c..2078cfae2 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -35,17 +35,17 @@ __forceinline__ __device__ T clip(T val) {
 
 template <>
 __forceinline__ __device__ __half clip(__half val) {
-  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
   val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
+  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
   return val;
 }
 
 template <>
 __forceinline__ __device__ __half2 clip(__half2 val) {
-  val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff));
   val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff));
-  val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff));
+  val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff));
   val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff));
+  val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff));
   return val;
 }
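Note on the commented-out test() kernel in patch 03: it is a debugging aid that perturbs every element so mismatches show up in rccl-tests. Its start index multiplies blockIdx.x by gridDim.x; the conventional grid-stride form multiplies by blockDim.x. A corrected sketch reusing the patch's own add_elements (the kernel name is ours):

    template <typename T>
    __global__ void addConstant(T* buff, size_t nelems) {
      // canonical grid-stride loop: start = threadIdx.x + blockIdx.x * blockDim.x
      for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < nelems;
           i += blockDim.x * gridDim.x) {
        buff[i] = add_elements(buff[i], T(0.1f));  // clipped add from patch 01
      }
    }
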
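Note on the __hmin/__hmax ordering that patches 02, 03, and 06 keep swapping (patch 07 below swaps it again): for finite sums the two orders are equivalent, but per the CUDA math API __hmax and __hmin return the other operand when one input is NaN, so the order decides which bound a NaN sum collapses to. That reading is ours and is worth verifying on the ROCm toolchain. A small device-side sketch, assuming cuda_fp16.h semantics:

    #include <cuda_fp16.h>

    // __ushort_as_half plays the role of the patches' bit_cast.
    __device__ __half clip_max_first(__half v) {
      v = __hmax(v, __ushort_as_half(0xfbff));  // NaN -> -65504 here
      return __hmin(v, __ushort_as_half(0x7bff));
    }

    __device__ __half clip_min_first(__half v) {
      v = __hmin(v, __ushort_as_half(0x7bff));  // NaN -> +65504 here
      return __hmax(v, __ushort_as_half(0xfbff));
    }
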
From ca6741c33a1eab14ba5e86f1cf62e73fb7bfa4fc Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Thu, 26 Sep 2024 12:54:48 -0500
Subject: [PATCH 07/13] apps/nccl: 16B LLPacket for allreduce7

This fixes the data correctness issue of rccl allreduce for half datatypes.
---
 apps/nccl/src/allreduce.hpp | 65 +++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 25 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 2078cfae2..a01f27c87 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -35,17 +35,19 @@ __forceinline__ __device__ T clip(T val) {
 
 template <>
 __forceinline__ __device__ __half clip(__half val) {
-  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
   val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
+  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
+
   return val;
 }
 
 template <>
 __forceinline__ __device__ __half2 clip(__half2 val) {
-  val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff));
   val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff));
-  val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff));
+  val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff));
+
   val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff));
+  val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff));
   return val;
 }
@@ -255,9 +257,9 @@ __global__ void __launch_bounds__(1024, 1)
   if (worldSize != nRanksPerNode) return;
   nelems = nelems / (sizeof(int) / sizeof(T));
   const int nPeers = nRanksPerNode - 1;
-  const size_t nPkts = nelems;
+  const size_t nPkts = nelems/2;
   const int nelemsPerRank = nelems / worldSize;
-  const int nPktsPerRank = nelemsPerRank;
+  const int nPktsPerRank = nelemsPerRank/2;
   // thread block & channel info
   const int nBlocksPerPeer = gridDim.x / nPeers;
   const int localBlockIdx = blockIdx.x % nBlocksPerPeer;
   const int peerIdx = blockIdx.x / nBlocksPerPeer;
   const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1;
   const int tid = threadIdx.x + localBlockIdx * blockDim.x;
   void* scratchBuff = (void*)((char*)scratch + channelScratchOffset);
-  size_t scratchOffset = channelScratchOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet);
-  size_t scratchResultOffset = channelScratchOffset + 2 * nPkts * sizeof(mscclpp::LL8Packet);
+  size_t scratchOffset = channelScratchOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket);
+  size_t scratchResultOffset = channelScratchOffset + 2 * nPkts * sizeof(mscclpp::LLPacket);
   size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int) + channelDataOffset;
-  uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int));
-  uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
+
+  uint2* src = (uint2*)((char*)buff + rank * nelemsPerRank * sizeof(int));
+  uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
 
   // Put channels into shared memory, read channel info from global memory is unexpectable slow.
   __shared__ mscclpp::DeviceHandle<mscclpp::SmChannel> channels[NRANKS_PER_NODE - 1];
   const int lid = tid % WARP_SIZE;
   if (lid < nPeers) {
     channels[lid] = smChannels[lid];
   }
   __syncwarp();
 
   // step 1: write to scratch buffer
-  channels[peerIdx].putPackets<mscclpp::LL8Packet>(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
+  channels[peerIdx].putPackets<mscclpp::LLPacket>(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
                                                    blockDim.x * nBlocksPerPeer, flag);
   // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
-    uint32_t data = 0;
+    //uint32_t data = 0;
+    uint2 data = make_uint2(0, 0);
     for (int index = 0; index < nPeers; index++) {
       const int remoteRank = index < rank ? index : index + 1;
-      mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)scratchBuff + remoteRank * nPktsPerRank;
-      uint32_t val = dstPkt[idx].read(flag, -1);
-      data = add_vectors<T>(val, data);
+      mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
+      //uint32_t val = dstPkt[idx].read(flag, -1);
+      uint2 val = dstPkt[idx].read(flag);
+      //data = add_vectors<T>(val, data);
+      data.x = add_vectors<T>(val.x, data.x);
+      data.y = add_vectors<T>(val.y, data.y);
     }
-    data = add_vectors<T>(data, src[idx]);
-    dst[idx] = data;
-
-    mscclpp::LL8Packet packet;
-    packet.data = data;
-    packet.flag = flag;
-    size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank);
+    data.x = add_vectors<T>(data.x, src[idx].x);
+    data.y = add_vectors<T>(data.y, src[idx].y);
+
+    dst[idx].x = data.x;
+    dst[idx].y = data.y;
+
+    mscclpp::LLPacket packet;
+    /*packet.data = data;
+    packet.flag = flag;*/
+    packet.data1 = data.x;
+    packet.flag1 = flag;
+    packet.data2 = data.y;
+    packet.flag2 = flag;
+    size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
     for (int index = 0; index < nPeers; index++) {
       channels[index].write(offset, packet);
     }
   }
 
   // step 3: get data result from scratch buffer
-  mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)((char*)scratch + scratchResultOffset);
+  mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset);
   const int dstOffset = remoteRank * nPktsPerRank;
-  uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int));
+  uint2* result = (uint2*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int));
   for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) {
-    uint32_t data = dstPkt[idx + dstOffset].read(flag, -1);
-    result[idx] = data;
+    uint2 data = dstPkt[idx + dstOffset].read(flag, -1);
+    result[idx].x = data.x;
+    result[idx].y = data.y;
   }
 }
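Note on patch 07: the old path moved one 4-byte word per 8-byte LL8Packet; the new path moves two 4-byte words per 16-byte LLPacket, each payload word paired with a flag word, so a reader can poll until both flags carry the current epoch and then consume the 8 bytes of payload without a separate barrier. A simplified sketch of the idea -- the field names follow the patch, but the spin loop is illustrative, not mscclpp's actual implementation (which issues a single vectorized 16-byte store):

    #include <cstdint>

    struct LLPacketSketch {
      uint32_t data1, flag1, data2, flag2;  // 16 bytes: two payloads + two flags

      __device__ void write(uint32_t v1, uint32_t v2, uint32_t flag) {
        data1 = v1; flag1 = flag; data2 = v2; flag2 = flag;
      }

      __device__ uint2 read(uint32_t flag) volatile {
        uint32_t d1, f1, d2, f2;
        do {  // poll until the producer's store for this epoch is visible
          d1 = data1; f1 = flag1; d2 = data2; f2 = flag2;
        } while (f1 != flag || f2 != flag);
        return make_uint2(d1, d2);
      }
    };
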
From 6484dce1478dbb2d2f3b15f12c9ac553b517d743 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Thu, 26 Sep 2024 14:56:01 -0500
Subject: [PATCH 08/13] apps/nccl: performance optimization for allreduce7

Add unroll and non-temporal store
---
 apps/nccl/src/allreduce.hpp       | 4 ++--
 apps/nccl/src/common.hpp          | 2 ++
 include/mscclpp/packet_device.hpp | 6 ++++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index a01f27c87..14d0dfbae 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -289,7 +289,7 @@ __global__ void __launch_bounds__(1024, 1)
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
     //uint32_t data = 0;
     uint2 data = make_uint2(0, 0);
-    for (int index = 0; index < nPeers; index++) {
+    for (int index = 0; index < NPEERS; index++) {
       const int remoteRank = index < rank ? index : index + 1;
       mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
       //uint32_t val = dstPkt[idx].read(flag, -1);
@@ -312,7 +312,7 @@ __global__ void __launch_bounds__(1024, 1)
     packet.data2 = data.y;
     packet.flag2 = flag;
     size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
-    for (int index = 0; index < nPeers; index++) {
+    for (int index = 0; index < NPEERS; index++) {
       channels[index].write(offset, packet);
     }
   }
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
index 25c74e71b..8acd3ffab 100644
--- a/apps/nccl/src/common.hpp
+++ b/apps/nccl/src/common.hpp
@@ -12,6 +12,8 @@
 #endif
 
 constexpr int NRANKS_PER_NODE = 8;
+constexpr int NPEERS = 7;
+
 constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
 
 #endif  // NCCL_COMMON_HPP_
diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp
index dc3886528..532676d43 100644
--- a/include/mscclpp/packet_device.hpp
+++ b/include/mscclpp/packet_device.hpp
@@ -49,8 +49,10 @@ union alignas(16) LL16Packet {
 #else   // !defined(MSCCLPP_DEVICE_CUDA)
     uint4 reg = make_uint4(val1, flag, val2, flag);
     ulonglong2* p = reinterpret_cast<ulonglong2*>(&reg);
-    atomicStore(&(raw_.x), p->x, memoryOrderRelaxed);
-    atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);
+    /*atomicStore(&(raw_.x), p->x, memoryOrderRelaxed);
+    atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);*/
+    __builtin_nontemporal_store(p->x, &(raw_.x));
+    __builtin_nontemporal_store(p->y, &(raw_.y));
 #endif
   }
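Note on patch 08: __builtin_nontemporal_store is a Clang/HIP builtin that emits a store hinted to bypass the local cache hierarchy. That suits write-once packet buffers: the payload is consumed by a peer GPU, so keeping it resident in the writer's caches is pure overhead. A standalone HIP-style use (behavior under non-Clang compilers is an assumption):

    __global__ void streamCopy(const ulonglong2* __restrict__ in,
                               ulonglong2* __restrict__ out, size_t n) {
      size_t i = threadIdx.x + blockIdx.x * blockDim.x;
      if (i < n) {
        ulonglong2 v = in[i];
        __builtin_nontemporal_store(v.x, &out[i].x);  // streaming 8-byte store
        __builtin_nontemporal_store(v.y, &out[i].y);
      }
    }
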
From 01e105ba88cab14dbea3837f9202e09f44c04329 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Mon, 30 Sep 2024 13:55:39 -0500
Subject: [PATCH 09/13] apps/nccl: add unroll in allred8

---
 apps/nccl/src/allreduce.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 14d0dfbae..55739e97f 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -382,7 +382,7 @@ __global__ void __launch_bounds__(512, 1)
   __syncthreads();
   // Starts allgather
   for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
-    for (int i = 0; i < nPeer; i++) {
+    for (int i = 0; i < NPEERS; i++) {
       const int peerIdx = (i + blockIdx.x) % nPeer;
       const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
       int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -399,13 +399,13 @@ __global__ void __launch_bounds__(512, 1)
 
   for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
     int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+    for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
       const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
       int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
       data = add_vectors<T>(val, data);
     }
     resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+    for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
       outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4),
                                  data);
     }
@@ -419,7 +419,7 @@ __global__ void __launch_bounds__(512, 1)
   }
   __syncthreads();
   for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
-    for (int i = 0; i < nPeer; i++) {
+    for (int i = 0; i < NPEERS; i++) {
      const int peerIdx = (i + blockIdx.x) % nPeer;
       const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
       int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -435,13 +435,13 @@ __global__ void __launch_bounds__(512, 1)
 
   for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
     int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+    for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
       const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
       int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
       data = add_vectors<T>(val, data);
     }
     resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+    for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
       outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4),
                                  data);
     }

From ea4f77d8ed013dd17e02a7781408f54231092c54 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 2 Oct 2024 09:50:55 -0700
Subject: [PATCH 10/13] Update allgather.hpp

---
 apps/nccl/src/allgather.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index 35c2b60c4..2e2e88487 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -102,6 +102,10 @@ __global__ void __launch_bounds__(1024, 1)
       }
     }
   }
+  if (threadIdx.x < nPeer) {
+    smChans[threadIdx.x].relaxedSignal();
+    smChans[threadIdx.x].wait();
+  }
 }
 
 template <typename T>

From 99b5997ab0892785a462898a60212fa857d68fa7 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Wed, 2 Oct 2024 16:11:11 -0500
Subject: [PATCH 11/13] apps/nccl: Add barrier in allgather

---
 apps/nccl/src/allgather.hpp | 3 +++
 apps/nccl/src/allreduce.hpp | 1 -
 apps/nccl/src/common.hpp    | 4 ++++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index 2e2e88487..b4ee696e4 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -102,6 +102,9 @@ __global__ void __launch_bounds__(1024, 1)
       }
     }
   }
+
+  deviceSyncer.sync(gridDim.x);
+
   if (threadIdx.x < nPeer) {
     smChans[threadIdx.x].relaxedSignal();
     smChans[threadIdx.x].wait();
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 55739e97f..f1f3b020e 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 #include "gpu_data_types.hpp"
 
-__device__ mscclpp::DeviceSyncer deviceSyncer;
 
 template <typename To, typename From>
 __forceinline__ __device__ To bit_cast(const From& src) {
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
index 8acd3ffab..af8ef1785 100644
--- a/apps/nccl/src/common.hpp
+++ b/apps/nccl/src/common.hpp
@@ -16,4 +16,8 @@ constexpr int NPEERS = 7;
 
 constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
 
+#include <mscclpp/concurrency_device.hpp>
+
+__device__ mscclpp::DeviceSyncer deviceSyncer;
+
 #endif  // NCCL_COMMON_HPP_

From f9def8525097479a92125886d0ea3dfc46b1cb34 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Sat, 5 Oct 2024 15:33:27 -0500
Subject: [PATCH 12/13] apps/nccl: fix allreduce7 correctness issue for non
 power of 2 sizes

---
 apps/nccl/src/allreduce.hpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index f1f3b020e..2ff23fb72 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -254,10 +254,18 @@ __global__ void __launch_bounds__(1024, 1)
                    size_t nelems, uint32_t flag) {
   // This version of allreduce only works for single nodes
   if (worldSize != nRanksPerNode) return;
-  nelems = nelems / (sizeof(int) / sizeof(T));
+
+  if (sizeof(T) == 2)
+    nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
+  else
+    nelems = nelems / (sizeof(int) / sizeof(T));
+
   const int nPeers = nRanksPerNode - 1;
   const size_t nPkts = nelems/2;
-  const int nelemsPerRank = nelems / worldSize;
+
+  int nelemsPerRank = nelems / worldSize;
+  if ((nelemsPerRank % 2)) nelemsPerRank = (nelemsPerRank * sizeof(T) + sizeof(T)) / sizeof(T);
+
   const int nPktsPerRank = nelemsPerRank/2;
   // thread block & channel info
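Note on patches 08-09: replacing the runtime bound nPeer with the compile-time constant NPEERS gives the compiler a known trip count, so these per-peer loops can be fully unrolled; the trade-off is hard-coding 8 ranks per node. A minimal illustration of the pattern (helper name is ours):

    constexpr int kPeers = 7;  // mirrors NPEERS = NRANKS_PER_NODE - 1

    __device__ float reducePeers(const float* vals) {
      float acc = 0.0f;
    #pragma unroll
      for (int i = 0; i < kPeers; ++i) {  // known trip count -> fully unrolled
        acc += vals[i];
      }
      return acc;
    }

Note on patch 12: for 2-byte types the kernel previously computed the 4-byte word count as nelems / 2, which truncates for odd nelems and silently drops the last element; the patch rounds up instead, and likewise rounds an odd per-rank count up to an even one so the data packs into whole 8-byte LL payloads. Worked through for 1023 fp16 values:

    #include <cstddef>
    #include <cstdio>

    int main() {
      std::size_t nelems = 1023;                    // odd fp16 element count
      std::size_t oldWords = nelems / (4 / 2);      // 511 words: last element lost
      std::size_t newWords = (nelems * 2 + 2) / 4;  // 512 words: covers all 1023
      std::printf("%zu -> %zu\n", oldWords, newWords);
    }

(The per-rank fix is the same idea: for odd n, (n * 2 + 2) / 2 is just n + 1.)
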
From cdbb2de3bd92557afeb954248b83d65c06a069c9 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Mon, 14 Oct 2024 11:51:07 -0500
Subject: [PATCH 13/13] apps/nccl: allgather tuning

---
 apps/nccl/src/allgather.hpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index b4ee696e4..4e7441249 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -103,19 +103,33 @@ __global__ void __launch_bounds__(1024, 1)
     }
   }
 
-  deviceSyncer.sync(gridDim.x);
+  //deviceSyncer.sync(gridDim.x);
 
   if (threadIdx.x < nPeer) {
     smChans[threadIdx.x].relaxedSignal();
     smChans[threadIdx.x].wait();
   }
+  __syncthreads();
 }
 
 template <typename T>
 cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff,
                       mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels, size_t channelOutOffset, int rank,
                       int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) {
-  allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize,
+
+  int nBlocks = 28;
+
+  if (nelems <= 4096) {
+    nBlocks = 7;
+  }
+  else if (nelems <= 32768) {
+    nBlocks = 14;
+  } else if (nelems >= 2097152) {
+    nBlocks = 35;
+  }
+
+
+  allgather6<<<nBlocks, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize,
                                            nRanksPerNode, nelems * sizeof(T) / sizeof(int));
   return cudaGetLastError();
 }
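Note on patch 13: the block-count thresholds trade launch and synchronization overhead against bandwidth -- few blocks for latency-bound small messages, more for bandwidth-bound large ones (that rationale is our reading; the patch itself states only the cutoffs, in elements of T). Restated as a testable host helper:

    #include <cstddef>

    inline int allgatherBlocks(std::size_t nelems) {
      if (nelems <= 4096) return 7;
      if (nelems <= 32768) return 14;
      if (nelems >= 2097152) return 35;  // ~2M elements and up
      return 28;                         // mid-range default
    }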