From a8c30c0511f0aefa138e9fe41a38831d9a9ba776 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 16 Sep 2024 23:30:40 +0000
Subject: [PATCH 01/13] clipping fp16/bf16 addition

---
 apps/nccl/src/allreduce.hpp | 39 ++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 1b85136ae..c05e19d15 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -28,19 +28,52 @@ __forceinline__ __device__ To bit_cast(const From& src) {
   return u.t;
 }
 
+template <typename T>
+__forceinline__ __device__ T clip(T val) {
+  return val;
+}
+
+template <>
+__forceinline__ __device__ __half clip(__half val) {
+  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
+  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
+  return val;
+}
+
+template <>
+__forceinline__ __device__ __half2 clip(__half2 val) {
+  val.x = clip(val.x);
+  val.y = clip(val.y);
+  return val;
+}
+
+template <>
+__forceinline__ __device__ __bfloat16 clip(__bfloat16 val) {
+  val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80));
+  val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80));
+  return val;
+}
+
+template <>
+__forceinline__ __device__ __bfloat162 clip(__bfloat162 val) {
+  val.x = clip(val.x);
+  val.y = clip(val.y);
+  return val;
+}
+
 template <typename T>
 __forceinline__ __device__ T add_elements(T a, T b) {
-  return a + b;
+  return clip(a + b);
 }
 
 template <>
 __forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) {
-  return __hadd2(a, b);
+  return clip(__hadd2(a, b));
 }
 
 template <>
 __forceinline__ __device__ __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) {
-  return __hadd2(a, b);
+  return clip(__hadd2(a, b));
 }
 
 template <typename T>

From 780f0f8303ce4b653e53baf4df2092f33f51b0b9 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 16 Sep 2024 23:35:12 +0000
Subject: [PATCH 02/13] rccl-tests pass

---
 apps/nccl/src/allreduce.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index c05e19d15..899df2aed 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -35,8 +35,8 @@ __forceinline__ __device__ T clip(T val) {
 
 template <>
 __forceinline__ __device__ __half clip(__half val) {
-  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
   val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
+  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
   return val;
 }
 
@@ -49,8 +49,8 @@ __forceinline__ __device__ __half2 clip(__half2 val) {
 
 template <>
 __forceinline__ __device__ __bfloat16 clip(__bfloat16 val) {
-  val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80));
   val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80));
+  val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80));
   return val;
 }
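Note on patches 01-02: the __half clip bounds are the largest finite fp16 magnitudes -- 0x7bff is +65504 and 0xfbff is -65504 -- so a sum that would otherwise round to +/-inf saturates at +/-65504 instead (the bf16 constants play the analogous role for __bfloat16). A minimal host-side sketch of the same saturating add, in plain C++ with a helper name of our own (not part of the patch):

    #include <algorithm>
    #include <cstdio>

    // Clamp a float into the finite fp16 range before narrowing; 65504.0f is
    // the value encoded by the patch's 0x7bff bit pattern.
    float saturate_to_half_range(float v) {
      constexpr float kMaxHalf = 65504.0f;
      return std::min(std::max(v, -kMaxHalf), kMaxHalf);
    }

    int main() {
      // 60000 + 60000 = 120000 overflows fp16 (it would become +inf);
      // the clipped sum stays at the largest finite value instead.
      std::printf("%f\n", saturate_to_half_range(60000.0f + 60000.0f));  // 65504.000000
    }
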
From c6cf40020e35b5788b8ab006732cc29676139ab4 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 25 Sep 2024 07:44:55 +0000
Subject: [PATCH 03/13] align with msccl clipping

---
 apps/nccl/src/allreduce.hpp | 43 ++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 899df2aed..748b6f33b 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -35,29 +35,33 @@ __forceinline__ __device__ T clip(T val) {
 
 template <>
 __forceinline__ __device__ __half clip(__half val) {
-  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
   val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
+  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
   return val;
 }
 
 template <>
 __forceinline__ __device__ __half2 clip(__half2 val) {
-  val.x = clip(val.x);
-  val.y = clip(val.y);
+  val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff));
+  val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff));
+  val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff));
+  val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff));
   return val;
 }
 
 template <>
 __forceinline__ __device__ __bfloat16 clip(__bfloat16 val) {
-  val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80));
   val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80));
+  val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80));
   return val;
 }
 
 template <>
 __forceinline__ __device__ __bfloat162 clip(__bfloat162 val) {
-  val.x = clip(val.x);
-  val.y = clip(val.y);
+  val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80));
+  val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80));
+  val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80));
+  val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80));
   return val;
 }
 
@@ -187,6 +191,17 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) {
   vectorSum(dst, src, nElem, blockIdx.x, gridDim.x);
 }
 
+// template <typename T>
+// __global__ void __launch_bounds__(32, 1)
+//     test(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
+//          size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize,
+//          size_t nelems, uint32_t flag) {
+//   // add 0.1f to all elements
+//   for (size_t i = threadIdx.x + blockIdx.x * gridDim.x; i < nelems; i += blockDim.x * gridDim.x) {
+//     buff[i] = add_elements(buff[i], T(0.1f));
+//   }
+// }
+
 template <typename T>
 __global__ void __launch_bounds__(32, 1)
     allreduceAllToAll(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
@@ -426,6 +441,10 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
                       size_t nelems, cudaStream_t stream) {
   static uint32_t flag = 1;
 
+  // test<<<7, 32, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
+  //                            channelScratchOffset, rank, nRanksPerNode, worldSize,
+  //                            nelems, flag++);
+
   if (sizeof(T) * nelems < worldSize * sizeof(int)) {
     int nBlocks = 7;
     int nThreadsPerBlock = 32;
     allreduceAllToAll<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
                                                                 channelScratchOffset, rank, nRanksPerNode, worldSize,
                                                                 nelems, flag++);
   } else if (sizeof(T) * nelems <= (1 << 20)) {
-    int nBlocks = 28;
-    int nThreadsPerBlock = 1024;
-    if (nelems >= 8192) {
-      nBlocks = 56;
-      nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
-    }
+    int nBlocks = 7;
+    int nThreadsPerBlock = 64 * 7;
+    // if (nelems >= 8192) {
+    //   nBlocks = 56;
+    //   nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
+    // }
     allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
                                                          channelScratchOffset, rank, nRanksPerNode, worldSize,
                                                          nelems, flag++);

From d929d250efac638d797fa4b0b9b0635cf5be09f3 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 25 Sep 2024 07:49:44 +0000
Subject: [PATCH 04/13] revert

---
 apps/nccl/src/allreduce.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 748b6f33b..1acc1cd1d 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -452,12 +452,12 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
                                                          channelScratchOffset, rank, nRanksPerNode, worldSize,
                                                          nelems, flag++);
   } else if (sizeof(T) * nelems <= (1 << 20)) {
-    int nBlocks = 7;
-    int nThreadsPerBlock = 64 * 7;
-    // if (nelems >= 8192) {
-    //   nBlocks = 56;
-    //   nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
-    // }
+    int nBlocks = 28;
+    int nThreadsPerBlock = 1024;
+    if (nelems >= 8192) {
+      nBlocks = 56;
+      nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
+    }
     allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
                                                          channelScratchOffset, rank, nRanksPerNode, worldSize,
                                                          nelems, flag++);

From dee8fe2b520c21c861d10dd09168d86eedc2c49a Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 25 Sep 2024 08:04:08 +0000
Subject: [PATCH 05/13] final barrier for allreduce8

---
 apps/nccl/src/allreduce.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 1acc1cd1d..91f536f2c 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -432,6 +432,10 @@ __global__ void __launch_bounds__(512, 1)
       }
     }
   }
+  if (threadIdx.x < static_cast<uint32_t>(nPeer)) {
+    outChannels[threadIdx.x].signal();
+    outChannels[threadIdx.x].wait();
+  }
 }
 
 template <typename T>

From abe69b84a9eb881a467c0773a9b2f50dc3eb9ad7 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 25 Sep 2024 08:18:14 +0000
Subject: [PATCH 06/13] this is weird

---
 apps/nccl/src/allreduce.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 91f536f2c..2078cfae2 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -35,17 +35,17 @@ __forceinline__ __device__ T clip(T val) {
 
 template <>
 __forceinline__ __device__ __half clip(__half val) {
-  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
   val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
+  val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
   return val;
 }
 
 template <>
 __forceinline__ __device__ __half2 clip(__half2 val) {
-  val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff));
   val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff));
-  val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff));
+  val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff));
   val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff));
+  val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff));
   return val;
 }
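Note on the commented-out test() kernel in patch 03: it is a debugging aid that perturbs every element so mismatches show up in rccl-tests. Its start index multiplies blockIdx.x by gridDim.x; the conventional grid-stride form multiplies by blockDim.x. A corrected sketch reusing the patch's own add_elements (the kernel name is ours):

    template <typename T>
    __global__ void addConstant(T* buff, size_t nelems) {
      // canonical grid-stride loop: start = threadIdx.x + blockIdx.x * blockDim.x
      for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < nelems;
           i += blockDim.x * gridDim.x) {
        buff[i] = add_elements(buff[i], T(0.1f));  // clipped add from patch 01
      }
    }
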
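Note on the __hmin/__hmax ordering that patches 02, 03, and 06 keep swapping (patch 07 below swaps it again): for finite sums the two orders are equivalent, but per the CUDA math API __hmax and __hmin return the other operand when one input is NaN, so the order decides which bound a NaN sum collapses to. That reading is ours and is worth verifying on the ROCm toolchain. A small device-side sketch, assuming cuda_fp16.h semantics:

    #include <cuda_fp16.h>

    // __ushort_as_half plays the role of the patches' bit_cast.
    __device__ __half clip_max_first(__half v) {
      v = __hmax(v, __ushort_as_half(0xfbff));  // NaN -> -65504 here
      return __hmin(v, __ushort_as_half(0x7bff));
    }

    __device__ __half clip_min_first(__half v) {
      v = __hmin(v, __ushort_as_half(0x7bff));  // NaN -> +65504 here
      return __hmax(v, __ushort_as_half(0xfbff));
    }
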
From ca6741c33a1eab14ba5e86f1cf62e73fb7bfa4fc Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Thu, 26 Sep 2024 12:54:48 -0500
Subject: [PATCH 07/13] apps/nccl: 16B LLPacket for allreduce7

This fixes the data correctness issue of rccl allreduce for half datatypes.
---
 apps/nccl/src/allreduce.hpp | 65 +++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 25 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 2078cfae2..a01f27c87 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -35,17 +35,19 @@ __forceinline__ __device__ T clip(T val) {
 
 template <>
 __forceinline__ __device__ __half clip(__half val) {
-  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
   val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff));
+  val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff));
+
   return val;
 }
 
 template <>
 __forceinline__ __device__ __half2 clip(__half2 val) {
-  val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff));
   val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff));
-  val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff));
+  val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff));
+
   val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff));
+  val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff));
   return val;
 }
@@ -255,9 +257,9 @@ __global__ void __launch_bounds__(1024, 1)
   if (worldSize != nRanksPerNode) return;
   nelems = nelems / (sizeof(int) / sizeof(T));
   const int nPeers = nRanksPerNode - 1;
-  const size_t nPkts = nelems;
+  const size_t nPkts = nelems/2;
   const int nelemsPerRank = nelems / worldSize;
-  const int nPktsPerRank = nelemsPerRank;
+  const int nPktsPerRank = nelemsPerRank/2;
   // thread block & channel info
   const int nBlocksPerPeer = gridDim.x / nPeers;
   const int localBlockIdx = blockIdx.x % nBlocksPerPeer;
   const int peerIdx = blockIdx.x / nBlocksPerPeer;
   const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1;
   const int tid = threadIdx.x + localBlockIdx * blockDim.x;
   void* scratchBuff = (void*)((char*)scratch + channelScratchOffset);
-  size_t scratchOffset = channelScratchOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet);
-  size_t scratchResultOffset = channelScratchOffset + 2 * nPkts * sizeof(mscclpp::LL8Packet);
+  size_t scratchOffset = channelScratchOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket);
+  size_t scratchResultOffset = channelScratchOffset + 2 * nPkts * sizeof(mscclpp::LLPacket);
   size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int) + channelDataOffset;
-  uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int));
-  uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
+
+  uint2* src = (uint2*)((char*)buff + rank * nelemsPerRank * sizeof(int));
+  uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
 
   // Put channels into shared memory, read channel info from global memory is unexpectable slow.
   __shared__ mscclpp::DeviceHandle<mscclpp::SmChannel> channels[NRANKS_PER_NODE - 1];
   const int lid = tid % WARP_SIZE;
   if (lid < nPeers) {
     channels[lid] = smChannels[lid];
   }
   __syncwarp();
 
   // step 1: write to scratch buffer
-  channels[peerIdx].putPackets<mscclpp::LL8Packet>(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
+  channels[peerIdx].putPackets<mscclpp::LLPacket>(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
                                                    blockDim.x * nBlocksPerPeer, flag);
   // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
-    uint32_t data = 0;
+    //uint32_t data = 0;
+    uint2 data = make_uint2(0, 0);
     for (int index = 0; index < nPeers; index++) {
       const int remoteRank = index < rank ? index : index + 1;
-      mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)scratchBuff + remoteRank * nPktsPerRank;
-      uint32_t val = dstPkt[idx].read(flag, -1);
-      data = add_vectors<T>(val, data);
+      mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
+      //uint32_t val = dstPkt[idx].read(flag, -1);
+      uint2 val = dstPkt[idx].read(flag);
+      //data = add_vectors<T>(val, data);
+      data.x = add_vectors<T>(val.x, data.x);
+      data.y = add_vectors<T>(val.y, data.y);
     }
-    data = add_vectors<T>(data, src[idx]);
-    dst[idx] = data;
-
-    mscclpp::LL8Packet packet;
-    packet.data = data;
-    packet.flag = flag;
-    size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank);
+    data.x = add_vectors<T>(data.x, src[idx].x);
+    data.y = add_vectors<T>(data.y, src[idx].y);
+
+    dst[idx].x = data.x;
+    dst[idx].y = data.y;
+
+    mscclpp::LLPacket packet;
+    /*packet.data = data;
+    packet.flag = flag;*/
+    packet.data1 = data.x;
+    packet.flag1 = flag;
+    packet.data2 = data.y;
+    packet.flag2 = flag;
+    size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
     for (int index = 0; index < nPeers; index++) {
       channels[index].write(offset, packet);
     }
   }
 
   // step 3: get data result from scratch buffer
-  mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)((char*)scratch + scratchResultOffset);
+  mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset);
   const int dstOffset = remoteRank * nPktsPerRank;
-  uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int));
+  uint2* result = (uint2*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int));
   for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) {
-    uint32_t data = dstPkt[idx + dstOffset].read(flag, -1);
-    result[idx] = data;
+    uint2 data = dstPkt[idx + dstOffset].read(flag, -1);
+    result[idx].x = data.x;
+    result[idx].y = data.y;
   }
 }
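Note on patch 07: the old path moved one 4-byte word per 8-byte LL8Packet; the new path moves two 4-byte words per 16-byte LLPacket, each payload word paired with a flag word, so a reader can poll until both flags carry the current epoch and then consume the 8 bytes of payload without a separate barrier. A simplified sketch of the idea -- the field names follow the patch, but the spin loop is illustrative, not mscclpp's actual implementation (which issues a single vectorized 16-byte store):

    #include <cstdint>

    struct LLPacketSketch {
      uint32_t data1, flag1, data2, flag2;  // 16 bytes: two payloads + two flags

      __device__ void write(uint32_t v1, uint32_t v2, uint32_t flag) {
        data1 = v1; flag1 = flag; data2 = v2; flag2 = flag;
      }

      __device__ uint2 read(uint32_t flag) volatile {
        uint32_t d1, f1, d2, f2;
        do {  // poll until the producer's store for this epoch is visible
          d1 = data1; f1 = flag1; d2 = data2; f2 = flag2;
        } while (f1 != flag || f2 != flag);
        return make_uint2(d1, d2);
      }
    };
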
From 6484dce1478dbb2d2f3b15f12c9ac553b517d743 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Thu, 26 Sep 2024 14:56:01 -0500
Subject: [PATCH 08/13] apps/nccl: performance optimization for allreduce7

Add unroll and non-temporal store
---
 apps/nccl/src/allreduce.hpp       | 4 ++--
 apps/nccl/src/common.hpp          | 2 ++
 include/mscclpp/packet_device.hpp | 6 ++++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index a01f27c87..14d0dfbae 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -289,7 +289,7 @@ __global__ void __launch_bounds__(1024, 1)
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
     //uint32_t data = 0;
     uint2 data = make_uint2(0, 0);
-    for (int index = 0; index < nPeers; index++) {
+    for (int index = 0; index < NPEERS; index++) {
       const int remoteRank = index < rank ? index : index + 1;
       mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
       //uint32_t val = dstPkt[idx].read(flag, -1);
@@ -312,7 +312,7 @@ __global__ void __launch_bounds__(1024, 1)
     packet.data2 = data.y;
     packet.flag2 = flag;
     size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
-    for (int index = 0; index < nPeers; index++) {
+    for (int index = 0; index < NPEERS; index++) {
       channels[index].write(offset, packet);
     }
   }
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
index 25c74e71b..8acd3ffab 100644
--- a/apps/nccl/src/common.hpp
+++ b/apps/nccl/src/common.hpp
@@ -12,6 +12,8 @@
 #endif
 
 constexpr int NRANKS_PER_NODE = 8;
+constexpr int NPEERS = 7;
+
 constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
 
 #endif  // NCCL_COMMON_HPP_
diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp
index dc3886528..532676d43 100644
--- a/include/mscclpp/packet_device.hpp
+++ b/include/mscclpp/packet_device.hpp
@@ -49,8 +49,10 @@ union alignas(16) LL16Packet {
 #else   // !defined(MSCCLPP_DEVICE_CUDA)
     uint4 reg = make_uint4(val1, flag, val2, flag);
     ulonglong2* p = reinterpret_cast<ulonglong2*>(&reg);
-    atomicStore(&(raw_.x), p->x, memoryOrderRelaxed);
-    atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);
+    /*atomicStore(&(raw_.x), p->x, memoryOrderRelaxed);
+    atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);*/
+    __builtin_nontemporal_store(p->x, &(raw_.x));
+    __builtin_nontemporal_store(p->y, &(raw_.y));
 #endif
   }
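Note on patch 08: __builtin_nontemporal_store is a Clang/HIP builtin that emits a store hinted to bypass the local cache hierarchy. That suits write-once packet buffers: the payload is consumed by a peer GPU, so keeping it resident in the writer's caches is pure overhead. A standalone HIP-style use (behavior under non-Clang compilers is an assumption):

    __global__ void streamCopy(const ulonglong2* __restrict__ in,
                               ulonglong2* __restrict__ out, size_t n) {
      size_t i = threadIdx.x + blockIdx.x * blockDim.x;
      if (i < n) {
        ulonglong2 v = in[i];
        __builtin_nontemporal_store(v.x, &out[i].x);  // streaming 8-byte store
        __builtin_nontemporal_store(v.y, &out[i].y);
      }
    }
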
From 01e105ba88cab14dbea3837f9202e09f44c04329 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Mon, 30 Sep 2024 13:55:39 -0500
Subject: [PATCH 09/13] apps/nccl: add unroll in allred8

---
 apps/nccl/src/allreduce.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 14d0dfbae..55739e97f 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -382,7 +382,7 @@ __global__ void __launch_bounds__(512, 1)
   __syncthreads();
   // Starts allgather
   for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
-    for (int i = 0; i < nPeer; i++) {
+    for (int i = 0; i < NPEERS; i++) {
       const int peerIdx = (i + blockIdx.x) % nPeer;
       const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
       int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -399,13 +399,13 @@ __global__ void __launch_bounds__(512, 1)
 
   for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
     int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+    for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
       const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
       int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
       data = add_vectors<T>(val, data);
     }
     resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+    for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
       outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4),
                                  data);
     }
@@ -419,7 +419,7 @@ __global__ void __launch_bounds__(512, 1)
   }
   __syncthreads();
   for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
-    for (int i = 0; i < nPeer; i++) {
+    for (int i = 0; i < NPEERS; i++) {
      const int peerIdx = (i + blockIdx.x) % nPeer;
       const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
       int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -435,13 +435,13 @@ __global__ void __launch_bounds__(512, 1)
 
   for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
     int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+    for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
       const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
       int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
       data = add_vectors<T>(val, data);
     }
     resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+    for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
       outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4),
                                  data);
     }

From ea4f77d8ed013dd17e02a7781408f54231092c54 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 2 Oct 2024 09:50:55 -0700
Subject: [PATCH 10/13] Update allgather.hpp

---
 apps/nccl/src/allgather.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index 35c2b60c4..2e2e88487 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -102,6 +102,10 @@ __global__ void __launch_bounds__(1024, 1)
       }
     }
   }
+  if (threadIdx.x < nPeer) {
+    smChans[threadIdx.x].relaxedSignal();
+    smChans[threadIdx.x].wait();
+  }
 }
 
 template <typename T>

From 99b5997ab0892785a462898a60212fa857d68fa7 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Wed, 2 Oct 2024 16:11:11 -0500
Subject: [PATCH 11/13] apps/nccl: Add barrier in allgather

---
 apps/nccl/src/allgather.hpp | 3 +++
 apps/nccl/src/allreduce.hpp | 1 -
 apps/nccl/src/common.hpp    | 4 ++++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index 2e2e88487..b4ee696e4 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -102,6 +102,9 @@ __global__ void __launch_bounds__(1024, 1)
       }
     }
   }
+
+  deviceSyncer.sync(gridDim.x);
+
   if (threadIdx.x < nPeer) {
     smChans[threadIdx.x].relaxedSignal();
     smChans[threadIdx.x].wait();
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 55739e97f..f1f3b020e 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 #include "gpu_data_types.hpp"
 
-__device__ mscclpp::DeviceSyncer deviceSyncer;
 
 template <typename To, typename From>
 __forceinline__ __device__ To bit_cast(const From& src) {
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
index 8acd3ffab..af8ef1785 100644
--- a/apps/nccl/src/common.hpp
+++ b/apps/nccl/src/common.hpp
@@ -16,4 +16,8 @@ constexpr int NPEERS = 7;
 
 constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
 
+#include <mscclpp/concurrency_device.hpp>
+
+__device__ mscclpp::DeviceSyncer deviceSyncer;
+
 #endif  // NCCL_COMMON_HPP_

From f9def8525097479a92125886d0ea3dfc46b1cb34 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Sat, 5 Oct 2024 15:33:27 -0500
Subject: [PATCH 12/13] apps/nccl: fix allreduce7 correctness issue for non
 power of 2 sizes

---
 apps/nccl/src/allreduce.hpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index f1f3b020e..2ff23fb72 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -254,10 +254,18 @@ __global__ void __launch_bounds__(1024, 1)
                    size_t nelems, uint32_t flag) {
   // This version of allreduce only works for single nodes
   if (worldSize != nRanksPerNode) return;
-  nelems = nelems / (sizeof(int) / sizeof(T));
+
+  if (sizeof(T) == 2)
+    nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
+  else
+    nelems = nelems / (sizeof(int) / sizeof(T));
+
   const int nPeers = nRanksPerNode - 1;
   const size_t nPkts = nelems/2;
-  const int nelemsPerRank = nelems / worldSize;
+
+  int nelemsPerRank = nelems / worldSize;
+  if ((nelemsPerRank % 2)) nelemsPerRank = (nelemsPerRank * sizeof(T) + sizeof(T)) / sizeof(T);
+
   const int nPktsPerRank = nelemsPerRank/2;
   // thread block & channel info
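Note on patches 08-09: replacing the runtime bound nPeer with the compile-time constant NPEERS gives the compiler a known trip count, so these per-peer loops can be fully unrolled; the trade-off is hard-coding 8 ranks per node. A minimal illustration of the pattern (helper name is ours):

    constexpr int kPeers = 7;  // mirrors NPEERS = NRANKS_PER_NODE - 1

    __device__ float reducePeers(const float* vals) {
      float acc = 0.0f;
    #pragma unroll
      for (int i = 0; i < kPeers; ++i) {  // known trip count -> fully unrolled
        acc += vals[i];
      }
      return acc;
    }

Note on patch 12: for 2-byte types the kernel previously computed the 4-byte word count as nelems / 2, which truncates for odd nelems and silently drops the last element; the patch rounds up instead, and likewise rounds an odd per-rank count up to an even one so the data packs into whole 8-byte LL payloads. Worked through for 1023 fp16 values:

    #include <cstddef>
    #include <cstdio>

    int main() {
      std::size_t nelems = 1023;                    // odd fp16 element count
      std::size_t oldWords = nelems / (4 / 2);      // 511 words: last element lost
      std::size_t newWords = (nelems * 2 + 2) / 4;  // 512 words: covers all 1023
      std::printf("%zu -> %zu\n", oldWords, newWords);
    }

(The per-rank fix is the same idea: for odd n, (n * 2 + 2) / 2 is just n + 1.)
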
From cdbb2de3bd92557afeb954248b83d65c06a069c9 Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Mon, 14 Oct 2024 11:51:07 -0500
Subject: [PATCH 13/13] apps/nccl: allgather tuning

---
 apps/nccl/src/allgather.hpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index b4ee696e4..4e7441249 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -103,19 +103,33 @@ __global__ void __launch_bounds__(1024, 1)
     }
   }
 
-  deviceSyncer.sync(gridDim.x);
+  //deviceSyncer.sync(gridDim.x);
 
   if (threadIdx.x < nPeer) {
     smChans[threadIdx.x].relaxedSignal();
     smChans[threadIdx.x].wait();
   }
+  __syncthreads();
 }
 
 template <typename T>
 cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff,
                       mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels, size_t channelOutOffset, int rank,
                       int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) {
-  allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize,
+
+  int nBlocks = 28;
+
+  if (nelems <= 4096) {
+    nBlocks = 7;
+  }
+  else if (nelems <= 32768) {
+    nBlocks = 14;
+  } else if (nelems >= 2097152) {
+    nBlocks = 35;
+  }
+
+
+  allgather6<<<nBlocks, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize,
                                            nRanksPerNode, nelems * sizeof(T) / sizeof(int));
   return cudaGetLastError();
 }
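Note on patch 13: the block-count thresholds trade launch and synchronization overhead against bandwidth -- few blocks for latency-bound small messages, more for bandwidth-bound large ones (that rationale is our reading; the patch itself states only the cutoffs, in elements of T). Restated as a testable host helper:

    #include <cstddef>

    inline int allgatherBlocks(std::size_t nelems) {
      if (nelems <= 4096) return 7;
      if (nelems <= 32768) return 14;
      if (nelems >= 2097152) return 35;  // ~2M elements and up
      return 28;                         // mid-range default
    }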