From 4c1bbc96f5b73415fd420b2c04e641e8f736e601 Mon Sep 17 00:00:00 2001
From: Alexander Efimov <efimov.alexander@gmail.com>
Date: Wed, 13 Mar 2024 19:29:00 +0000
Subject: [PATCH] [MFMA] MFMA 4x64 64x4 version 2

Extend the K dimension of the mfma4x64 and mfma64x4 dot operand layouts from 4 to 64.
---
 .../Dialect/TritonGPU/Transforms/Utility.h    |   6 +-
 lib/Analysis/Utility.cpp                      |   3 +-
 .../SharedToDotOperandMFMA.cpp                |  14 +-
 .../TritonGPUToLLVM/DotOpToLLVM/MFMA.cpp      | 178 ++++++++++++++++--
 lib/Dialect/TritonGPU/IR/Dialect.cpp          |  36 +++-
 .../Transforms/AccelerateAMDMatmul.cpp        |  41 ++--
 lib/Dialect/TritonGPU/Transforms/Utility.cpp  | 149 ++++++++-------
 python/test/unit/language/test_core_amd.py    |   5 +-
 8 files changed, 301 insertions(+), 131 deletions(-)
diff --git a/include/triton/Dialect/TritonGPU/Transforms/Utility.h b/include/triton/Dialect/TritonGPU/Transforms/Utility.h
index 9cb4de97d744..5afa922665ef 100644
--- a/include/triton/Dialect/TritonGPU/Transforms/Utility.h
+++ b/include/triton/Dialect/TritonGPU/Transforms/Utility.h
@@ -175,7 +175,8 @@ struct MfmaInsnAttr {
   unsigned n;
   unsigned k;
   // k_base refers to the number of elements per thread
-  unsigned k_base;
+  unsigned k_base_a;
+  unsigned k_base_b;
   llvm::StringRef insn;
 };
 
@@ -223,7 +224,8 @@ class MfmaInsn {
   unsigned getMDim();
   unsigned getNDim();
   StringRef getInsnName();
-  unsigned getKBase();
+  unsigned getKBaseA();
+  unsigned getKBaseB();
 };
 } // namespace mlir
 
diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
index 6dbc10b943a6..51c7ed03b1a9 100644
--- a/lib/Analysis/Utility.cpp
+++ b/lib/Analysis/Utility.cpp
@@ -571,7 +571,8 @@ bool isMfmaToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy) {
          dotOperandLayout.getOpIdx() == 0 &&
          dotOperandLayout.getKWidth() == 4 &&
          dotOperandLayout.getParent() == mfmaLayout &&
-         (mfmaLayout.getMDim() == 32 || mfmaLayout.getMDim() == 16) &&
+         (mfmaLayout.getMDim() == 32 || mfmaLayout.getMDim() == 16 ||
+          (mfmaLayout.getMDim() == 4 && mfmaLayout.getNDim() == 64)) &&
          mfmaLayout.getIsTransposed() &&
          (srcTy.getElementType().isF16() || srcTy.getElementType().isBF16());
 }
diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp
index 86a4153603b2..77d5f6ca5160 100644
--- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp
@@ -158,14 +158,12 @@ llvm::SmallVector<llvm::SmallVector<Value>> computeTensorElemMappingInBlock(
     if (iNonKDim == 32)
       laneHOffset = select(icmp_uge(laneId, _32), i32_val(numOfElems), _0);
     else {
-      // In this configuration wave contains 16 copies of same data
-      if ((iKDim == 1 || iKDim == 4) && iNonKDim == 4) {
+      // shortcut for 64x64 tile size.
+      // In this case the warp does not wrap, so this offset is not needed
+      if (iNonKDim == 64)
         laneHOffset = i32_val(0);
-      } else {
-        assert(iKDim * iNonKDim / numOfElems == 64 &&
-               "seems no all threads in wave contain unique elements");
+      else
         laneHOffset = mul(udiv(laneId, nonKDim), i32_val(numOfElems));
-      }
     }
 
     for (int loadId = 0; loadId < loadsPerThread; ++loadId) {
@@ -346,7 +344,7 @@ fastPathComputeOffsets(ConversionPatternRewriter &rewriter, Location loc,
         // 32 33 34 35 ... 63
         // 32 33 34 35 ... 63
         Value halfOffset;
-        if ((iKDim == 1 || iKDim == 4) && iNonKDim == 4)
+        if (iNonKDim == 64)
           halfOffset = i32_val(0);
         else
           halfOffset =
@@ -456,6 +454,8 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
   int numSubBlocks = 1;
   if ((mfmaInstrK == 4 || mfmaInstrK == 1) && mfmaInstrNonK == 4)
     numSubBlocks = 16;
+  assert(numSubBlocks == 1 &&
+         "after reworking layout, there should be no redundency");
   int numOfElems = mfmaInstrNonK * mfmaInstrK * numSubBlocks / iWaveSize;
   assert(numOfElems >= 1);
 
diff --git a/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/MFMA.cpp b/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/MFMA.cpp
index 10bec3614969..15ed28f593be 100644
--- a/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/MFMA.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/MFMA.cpp
@@ -60,16 +60,140 @@ struct DotOpMFMAConversionHelper {
     return rewriter.create<arith::TruncIOp>(loc, i32_ty, tid);
   }
 
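+  /**
+   * @param mfmaInsnName name of the MFMA operation to create
+   * @param valA first operand passed to the created operation
+   * @param valB second operand passed to the created operation
+   * @param valC accumulator operand; its type is used as the result type
+   * @param cbsz Control Broadcast Size modifier
+   * @param abid A-matrix Broadcast Identifier
+   * @param blgp B-matrix Lane Group Pattern modifier
+   */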
   Value generateMFMAOp(StringRef mfmaInsnName, Value valA, Value valB,
-                       Value valC) const {
+                       Value valC, int cbsz = 0, int abid = 0,
+                       int blgp = 0) const {
+    assert(cbsz >= 0 && cbsz <= 4);
+    assert(abid >= 0 && abid <= 15);
+    assert(blgp >= 0 && blgp <= 7);
     auto resType = valC.getType();
-    Value zeroFlag = i32_val(0);
+    Value zeroVal = i32_val(0);
+    Value cbszFlag = cbsz != 0 ? i32_val(cbsz) : zeroVal;
+    Value abidFlag = abid != 0 ? i32_val(abid) : zeroVal;
+    Value blgpFlag = blgp != 0 ? i32_val(blgp) : zeroVal;
     OperationState loweredOp(loc, mfmaInsnName);
     loweredOp.addTypes(resType);
-    loweredOp.addOperands({valA, valB, valC, zeroFlag, zeroFlag, zeroFlag});
+    loweredOp.addOperands({valA, valB, valC, cbszFlag, abidFlag, blgpFlag});
     return rewriter.create(loweredOp)->getResult(0);
   }
 
+  Value getSubVector(Value vec, int numSubVectors, int subVecId) const {
+    auto groupVecType = vec.getType().cast<VectorType>();
+    auto elemType = groupVecType.getElementType();
+    auto totalElems = groupVecType.getNumElements();
+    auto elemsPerRep = totalElems / numSubVectors;
+    VectorType repVecType = vec_ty(elemType, elemsPerRep);
+    Value repVec = undef(repVecType);
+    for (int i = 0; i < elemsPerRep; i++) {
+      Value elem =
+          extract_element(elemType, vec, i32_val(subVecId * elemsPerRep + i));
+      repVec = insert_element(repVecType, repVec, elem, i32_val(i));
+    }
+    return repVec;
+  }
+
+  Value getRepetitionValue(Value vec, int repId) const {
+    auto groupVecType = vec.getType().cast<VectorType>();
+    auto elemType = groupVecType.getElementType();
+    if (elemType.getIntOrFloatBitWidth() == 16) {
+      Value elem = getSubVector(vec, 16, repId);
+      return elem;
+    }
+    auto totalElems = groupVecType.getNumElements();
+    assert(repId < totalElems);
+    Value elem = extract_element(elemType, vec, i32_val(repId));
+    return elem;
+  }
+
+  Value broadcastGroup(Value val, int groupId, int numGroups) const {
+    constexpr int waveSize = 64;
+    const int groupSize = waveSize / numGroups;
+
+    Value lane = getThreadId();
+    // Multiply by 4, because permute requires offset in bytes
+    Value laneOffset = mul(urem(lane, i32_val(groupSize)), i32_val(4));
+    Value permuteAddr = add(laneOffset, i32_val(groupId * groupSize * 4));
+    Type valType = val.getType();
+    Value broadcasted;
+    if (valType.isInteger(32))
+      broadcasted = rewriter.create<ROCDL::DsBpermuteOp>(loc, val.getType(),
+                                                         permuteAddr, val);
+    if (valType.isF32()) {
+      val = bitcast(val, i32_ty);
+      broadcasted = rewriter.create<ROCDL::DsBpermuteOp>(loc, val.getType(),
+                                                         permuteAddr, val);
+      broadcasted = bitcast(broadcasted, f32_ty);
+    }
+    if (valType.isa<VectorType>()) {
+      auto vecTy = valType.dyn_cast<VectorType>();
+      auto vecBitSize = vecTy.getElementType().getIntOrFloatBitWidth() *
+                        vecTy.getNumElements();
+      const int int32VecSize = vecBitSize / 32;
+
+      Type int32VecTy = vec_ty(i32_ty, int32VecSize);
+      Value int32Val = bitcast(val, int32VecTy);
+      Value int32Broadcasted = undef(int32VecTy);
+      for (int i = 0; i < int32VecSize; ++i) {
+        Value int32Chunk = extract_element(i32_ty, int32Val, i32_val(i));
+        Value broadcastedChunk = rewriter.create<ROCDL::DsBpermuteOp>(
+            loc, i32_ty, permuteAddr, int32Chunk);
+        int32Broadcasted = insert_element(int32VecTy, int32Broadcasted,
+                                          broadcastedChunk, i32_val(i));
+      }
+      broadcasted = bitcast(int32Broadcasted, valType);
+    }
+    assert(broadcasted);
+    return broadcasted;
+  }
+
+  Value generateMFMATile(StringRef mfmaInsnName, Value valA, Value valB,
+                         Value valC, int mDim, int nDim, bool transpose) const {
+
+    Value acc;
+    if (mDim == nDim) {
+      acc = transpose ? generateMFMAOp(mfmaInsnName, valB, valA, valC)
+                      : generateMFMAOp(mfmaInsnName, valA, valB, valC);
+    }
+    if (mDim == 4 && nDim == 64 || mDim == 64 && nDim == 4) {
+      // broadcast selected kRep A operand matrix to all A matrices(2^4=16)
+      constexpr int broadcastCtrl = 4;
+      constexpr int numRepeats = 16;
+      acc = valC;
+      for (int kRep = 0; kRep < numRepeats; kRep++) {
+        if (mDim == 4 && !transpose) {
+          Value repVec = getRepetitionValue(valB, kRep);
+          acc = generateMFMAOp(mfmaInsnName, valA, repVec, acc, broadcastCtrl,
+                               kRep);
+        }
+        if (mDim == 4 && transpose) {
+          Value repValB = getRepetitionValue(valB, kRep);
+          Value broadcastValA = broadcastGroup(valA, kRep, numRepeats);
+          acc = generateMFMAOp(mfmaInsnName, repValB, broadcastValA, acc);
+        }
+        if (nDim == 4 && !transpose) {
+          Value repValA = getRepetitionValue(valA, kRep);
+          Value broadcastValB = broadcastGroup(valB, kRep, numRepeats);
+          acc = generateMFMAOp(mfmaInsnName, repValA, broadcastValB, acc);
+        }
+        if (nDim == 4 && transpose) {
+          Value repVec = getRepetitionValue(valA, kRep);
+          acc = generateMFMAOp(mfmaInsnName, valB, repVec, acc, broadcastCtrl,
+                               kRep);
+        }
+      }
+    }
+    return acc;
+  }
+
   int getNumSubmatrices(Type elementType, int mDim, int nDim) const {
     if (mDim == 64 && nDim == 4 || mDim == 4 && nDim == 64)
       return 1;
@@ -187,13 +311,14 @@ struct DotOpMFMAConversionHelper {
       llvm::report_fatal_error("No match found in MFMA database\n");
 
     mfmaInsnName = (*maybeMfmaInsn).getInsnName();
-    unsigned k_base = (*maybeMfmaInsn).getKBase();
+    unsigned kBaseA = (*maybeMfmaInsn).getKBaseA();
+    unsigned kBaseB = (*maybeMfmaInsn).getKBaseB();
 
     auto aEncoding = aTensorTy.getEncoding().cast<DotOperandEncodingAttr>();
     auto bEncoding = bTensorTy.getEncoding().cast<DotOperandEncodingAttr>();
 
-    auto kWidth = aEncoding.getKWidth();
-    assert(kWidth == bEncoding.getKWidth());
+    auto kWidthA = aEncoding.getKWidth();
+    auto kWidthB = bEncoding.getKWidth();
 
     auto repA = aEncoding.getMFMARep(aTensorTy.getShape());
     auto repB = bEncoding.getMFMARep(bTensorTy.getShape());
@@ -209,9 +334,9 @@ struct DotOpMFMAConversionHelper {
     auto numRepK = repA[1];
 
     auto operandA = getValuesFromDotOperandLayoutStruct(
-        loadedA, numRepM, numRepK, kWidth, k_base, aTensorTy.getElementType());
+        loadedA, numRepM, numRepK, kWidthA, kBaseA, aTensorTy.getElementType());
     auto operandB = getValuesFromDotOperandLayoutStruct(
-        loadedB, numRepN, numRepK, kWidth, k_base, aTensorTy.getElementType());
+        loadedB, numRepN, numRepK, kWidthB, kBaseB, aTensorTy.getElementType());
 
     auto dstElemTy = dTensorTy.getElementType();
     auto fc =
@@ -236,12 +361,10 @@ struct DotOpMFMAConversionHelper {
 
         acc = zeroAuxiliarBlocks(subBlocks, acc);
         for (size_t k = 0; k < numRepK; k++)
-          for (int kpack = 0; kpack < kWidth / k_base; ++kpack)
-            acc = mfmaLayout.getIsTransposed()
-                      ? generateMFMAOp(mfmaInsnName, operandB[kpack][{n, k}],
-                                       operandA[kpack][{m, k}], acc)
-                      : generateMFMAOp(mfmaInsnName, operandA[kpack][{m, k}],
-                                       operandB[kpack][{n, k}], acc);
+          for (int kpack = 0; kpack < kWidthA / kBaseA; ++kpack)
+            acc = generateMFMATile(mfmaInsnName, operandA[kpack][{m, k}],
+                                   operandB[kpack][{n, k}], acc, mDim, nDim,
+                                   mfmaLayout.getIsTransposed());
         acc = reduceSubBlocks(subBlocks, acc);
         for (unsigned v = 0; v < elemsPerVec; ++v) {
           fc[m * numRepN * elemsPerVec + n * elemsPerVec + v] =
@@ -276,12 +399,29 @@ struct DotOpMFMAConversionHelper {
             extract_element(type, rawElems, i32_val(elemId + k * k_base));
         vec = insert_element(vecTy, vec, val, i32_val(elemId));
       }
+      // if (64 == k_base) {
+      //   constexpr int numRepeats = 16;
+      //   const int oneOpKWidth = k_base / numRepeats;
+      //   assert(oneOpKWidth == 4);
+      //   auto repVecTy = vec_ty(type, oneOpKWidth);
+      //   auto operandVecTy = vec_ty(repVecTy, numRepeats);
+      //   results.push_back(bitcast(vec, operandVecTy));
+      // }
       if (type.getIntOrFloatBitWidth() == 8) {
         if (4 == k_base)
           // This is for int8 on pre- MI300 GPUs
           results.push_back(bitcast(vec, i32_ty));
         if (8 == k_base)
           results.push_back(bitcast(vec, i64_ty));
+        // In this case one tile is processed by several instructions
+        // repack flat vector into vector of vectors
+        if (64 == k_base) {
+          constexpr int numRepeats = 16;
+          assert(k_base / numRepeats == 4);
+          auto repVecTy = i32_ty;
+          auto operandVecTy = vec_ty(repVecTy, numRepeats);
+          results.push_back(bitcast(vec, operandVecTy));
+        }
       } else
         results.push_back(vec);
     }
@@ -305,8 +445,14 @@ struct DotOpMFMAConversionHelper {
         auto rawElems = elems[n1 * i + j];
 
         if (type.isF32()) {
-          for (int k = 0; k < kpack; ++k) {
-            dotOpVals[k][{i, j}] = extract_element(type, rawElems, i32_val(k));
+          if (k_base == 16) {
+            for (int k = 0; k < kpack; ++k)
+              dotOpVals[k][{i, j}] = getSubVector(rawElems, kpack, k);
+          } else {
+            for (int k = 0; k < kpack; ++k) {
+              dotOpVals[k][{i, j}] =
+                  extract_element(type, rawElems, i32_val(k));
+            }
           }
         } else {
           SmallVector<Value> vals;
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index 87e8bb218bc9..48171dc43822 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -304,12 +304,17 @@ SmallVector<unsigned> getSizePerThread(Attribute layout) {
         llvm::report_fatal_error("DotOperandEncodingAttr opIdx must be 0 or 1");
         return {};
       }
-    } else if (parentLayout.isa<MfmaEncodingAttr>()) {
+    } else if (auto mfmaLayout = parentLayout.dyn_cast<MfmaEncodingAttr>()) {
       auto opIdx = dotLayout.getOpIdx();
+      auto kWidth = dotLayout.getKWidth();
       if (opIdx == 0) {
-        return {4, 1};
+        int repeats =
+            (mfmaLayout.getMDim() == 64 && mfmaLayout.getNDim() == 4) ? 16 : 1;
+        return {1, kWidth * repeats};
       } else if (opIdx == 1) {
-        return {1, 4};
+        int repeats =
+            (mfmaLayout.getMDim() == 4 && mfmaLayout.getNDim() == 64) ? 16 : 1;
+        return {kWidth * repeats, 1};
       } else {
         assert(0 && "DotOperandEncodingAttr opIdx must be 0 or 1");
         return {};
@@ -458,6 +463,8 @@ SmallVector<unsigned> getShapePerCTATile(Attribute layout,
       auto parentShapePerCTA = getShapePerCTATile(parentLayout, tensorShape);
       auto opIdx = dotLayout.getOpIdx();
 
+      assert(parentMfmaLayout.getMDim() == 32);
+
       if (opIdx == 0) {
         return {parentShapePerCTA[0], 32};
       } else if (opIdx == 1) {
@@ -1102,16 +1109,13 @@ DotOperandEncodingAttr::getMFMAElemsPerInstr() const {
          (mDim == 64 && nDim == 4) || (mDim == 4 && nDim == 64));
   int64_t kWidth = getKWidth();
   constexpr int waveSize = 64; // MFMA is used on wave64 architectures only
-  int kGroups = -1;
-  if (mDim == nDim)
-    kGroups = waveSize / mDim;
-  if (mDim == 64 && nDim == 4 || mDim == 4 && nDim == 64)
-    kGroups = 1;
+  auto nonKDim = getOpIdx() == 0 ? mDim : nDim;
+  int kGroups = waveSize / nonKDim;
   int64_t kDim = kWidth * kGroups;
   if (getOpIdx() == 0)
-    return {mDim, kDim};
+    return {nonKDim, kDim};
   else
-    return {kDim, nDim};
+    return {kDim, nonKDim};
 }
 
 SmallVector<int64_t>
@@ -1902,6 +1906,18 @@ struct TritonGPUInferLayoutInterface
     // Verify that the encodings are valid.
     if (!aEncoding || !bEncoding)
       return op->emitError("mismatching encoding between A and B operands");
+#ifdef USE_ROCM
+    auto aParentEncoding =
+        aEncoding.getParent().dyn_cast_or_null<MfmaEncodingAttr>();
+    auto bParentEncoding =
+        bEncoding.getParent().dyn_cast_or_null<MfmaEncodingAttr>();
+    if (aParentEncoding != bParentEncoding)
+      return op->emitError(
+          "mismatching parent encoding between A and B operands");
+    if (aParentEncoding != nullptr &&
+        aParentEncoding.getMDim() != aParentEncoding.getNDim())
+      return success();
+#endif // USE_ROCM
     if (aEncoding.getKWidth() != bEncoding.getKWidth())
       return op->emitError("mismatching kWidth between A and B operands");
     return success();
diff --git a/lib/Dialect/TritonGPU/Transforms/AccelerateAMDMatmul.cpp b/lib/Dialect/TritonGPU/Transforms/AccelerateAMDMatmul.cpp
index 3f39248597bd..32303ca748bc 100644
--- a/lib/Dialect/TritonGPU/Transforms/AccelerateAMDMatmul.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/AccelerateAMDMatmul.cpp
@@ -158,9 +158,8 @@ class BlockedToMFMA : public mlir::RewritePattern {
 
   /// @brief Choose MFMA instruction parameters
   /// @param dot target dot operation
-  /// @return pair {nonKDim, kDim} sizes of one MFMA instruction arguments
-  std::tuple<int64_t, int64_t, int64_t>
-  chooseMfmaDimensions(tt::DotOp dot) const {
+  /// @return selected mfma instruction
+  MfmaInsn chooseMfmaDimensions(tt::DotOp dot) const {
     // number of matrix elements along k dim per one MFMA intruction
     unsigned kDim = 0;
     auto opType = dot.getA().getType().cast<RankedTensorType>();
@@ -200,6 +199,8 @@ class BlockedToMFMA : public mlir::RewritePattern {
         nDim = 16;
       }
       if (minSize < 16) {
+        assert(opType.getShape()[1] >= 64 &&
+               "k should be at least 64 to use this layout");
         if (resShape[0] < 16 && resShape[1] >= 64) {
           mDim = 4;
           nDim = 64;
@@ -207,8 +208,6 @@ class BlockedToMFMA : public mlir::RewritePattern {
           mDim = 64;
           nDim = 4;
         } else {
-          assert(opType.getShape()[1] >= 64 &&
-                 "k should be at least 64 to use this layout");
           mDim = 4;
           nDim = 4;
         }
@@ -227,7 +226,7 @@ class BlockedToMFMA : public mlir::RewritePattern {
     assert(mDim != 0 && nDim != 0);
     assert(resShape[0] % mDim == 0 && resShape[1] % nDim == 0);
     assert(opType.getShape()[1] % kDim == 0);
-    return {mDim, nDim, kDim};
+    return maybeMfmaInsn.value();
   }
 
   mlir::LogicalResult
@@ -259,7 +258,10 @@ class BlockedToMFMA : public mlir::RewritePattern {
 
     ttg::MfmaEncodingAttr mfmaEnc;
 
-    auto [mDim, nDim, kDim] = chooseMfmaDimensions(dotOp);
+    auto instr = chooseMfmaDimensions(dotOp);
+    auto mDim = instr.getMDim();
+    auto nDim = instr.getNDim();
+    auto kDim = instr.getKDim();
 
     auto warpsPerTile =
         warpsPerTileMFMA(dotOp, retShape, numWarps, {mDim, nDim});
@@ -290,33 +292,24 @@ class BlockedToMFMA : public mlir::RewritePattern {
 
     // kWidth is initialized as k_base, which is the number of elements hold by
     // one thread per mfma instruction
-    auto kWidth = -1;
-    // in mfma 32x32 case argument matrix groups elements in 2 groups
-    // in mfma 16x16 case argument matrix groups elements in 4 groups
-    // in mfma 4x4 case argument matrix groups in 16 groups
-    if (mDim == 32 && nDim == 32)
-      kWidth = kDim / 2;
-    if (mDim == 16 && nDim == 16)
-      kWidth = kDim / 4;
-    if (mDim == 4 && nDim == 4)
-      kWidth = kDim / 16;
-    if (mDim == 4 && nDim == 64 || mDim == 64 && nDim == 4)
-      kWidth = kDim;
-    assert(kWidth != -1);
+    auto kWidthA = instr.getKBaseA();
+    auto kWidthB = instr.getKBaseB();
 
     // We want to extend kWidth by kpack (kpack=1 means no extension)
     // to increase ds_read vector size
     // However, in FA, the second dot can only use kWidth = k_bse since it's
     // limited by the result of the first dot, which is of mfmaLayout.
-    if (!isSecondDot(dotOp))
-      kWidth *= kpack;
+    if (!isSecondDot(dotOp)) {
+      kWidthA *= kpack;
+      kWidthB *= kpack;
+    }
 
     auto newAType = RankedTensorType::get(
         oldAType.getShape(), oldAType.getElementType(),
-        ttg::DotOperandEncodingAttr::get(ctx, 0, mfmaEnc, kWidth));
+        ttg::DotOperandEncodingAttr::get(ctx, 0, mfmaEnc, kWidthA));
     auto newBType = RankedTensorType::get(
         oldBType.getShape(), oldBType.getElementType(),
-        ttg::DotOperandEncodingAttr::get(ctx, 1, mfmaEnc, kWidth));
+        ttg::DotOperandEncodingAttr::get(ctx, 1, mfmaEnc, kWidthB));
     a = rewriter.create<ttg::ConvertLayoutOp>(a.getLoc(), newAType, a);
     b = rewriter.create<ttg::ConvertLayoutOp>(b.getLoc(), newBType, b);
     auto newDot = rewriter.create<tt::DotOp>(dotOp.getLoc(), newRetType, a, b,
diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
index 23f1befd2617..5a5046b1f8f0 100644
--- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -669,173 +669,183 @@ using MfmaInsnGroupMap = llvm::DenseMap<MfmaInsnGroupSelectKey, MfmaInsnAttr,
 
 auto getMfmaInsnGroupAttrMap = []() -> const MfmaInsnGroupMap & {
   static MfmaInsnGroupMap MfmaInsnMap{
+      // MFMA tile description:
+      // M  N  K  k_base_a k_base_b instr_name
       // f32
       // mfma_f32_32x32x2f32
       {{32, 32, MfmaTypeId::Fp32TyId, 1},
-       {32, 32, 2, 1, ROCDL::mfma_f32_32x32x2f32::getOperationName()}},
+       {32, 32, 2, 1, 1, ROCDL::mfma_f32_32x32x2f32::getOperationName()}},
       {{32, 32, MfmaTypeId::Fp32TyId, 2},
-       {32, 32, 2, 1, ROCDL::mfma_f32_32x32x2f32::getOperationName()}},
+       {32, 32, 2, 1, 1, ROCDL::mfma_f32_32x32x2f32::getOperationName()}},
       {{32, 32, MfmaTypeId::Fp32TyId, 3},
-       {32, 32, 2, 1, ROCDL::mfma_f32_32x32x2f32::getOperationName()}},
+       {32, 32, 2, 1, 1, ROCDL::mfma_f32_32x32x2f32::getOperationName()}},
       // mfma_f32_16x16x4f32
       {{16, 16, MfmaTypeId::Fp32TyId, 1},
-       {16, 16, 4, 1, ROCDL::mfma_f32_16x16x4f32::getOperationName()}},
+       {16, 16, 4, 1, 1, ROCDL::mfma_f32_16x16x4f32::getOperationName()}},
       {{16, 16, MfmaTypeId::Fp32TyId, 2},
-       {16, 16, 4, 1, ROCDL::mfma_f32_16x16x4f32::getOperationName()}},
+       {16, 16, 4, 1, 1, ROCDL::mfma_f32_16x16x4f32::getOperationName()}},
       {{16, 16, MfmaTypeId::Fp32TyId, 3},
-       {16, 16, 4, 1, ROCDL::mfma_f32_16x16x4f32::getOperationName()}},
+       {16, 16, 4, 1, 1, ROCDL::mfma_f32_16x16x4f32::getOperationName()}},
       // mfma_f32_4x4x1f32
       {{4, 4, MfmaTypeId::Fp32TyId, 1},
-       {4, 4, 16, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {4, 4, 16, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       {{4, 4, MfmaTypeId::Fp32TyId, 2},
-       {4, 4, 16, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {4, 4, 16, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       {{4, 64, MfmaTypeId::Fp32TyId, 1},
-       {4, 64, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {4, 64, 16, 1, 16, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       {{4, 64, MfmaTypeId::Fp32TyId, 2},
-       {4, 64, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {4, 64, 16, 1, 16, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       {{64, 4, MfmaTypeId::Fp32TyId, 1},
-       {64, 4, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {64, 4, 16, 16, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       {{64, 4, MfmaTypeId::Fp32TyId, 2},
-       {64, 4, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {64, 4, 16, 16, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       // mfma_f32_4x4x1_16B_f32
       {{4, 4, MfmaTypeId::Fp32TyId, 3},
-       {4, 4, 16, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {4, 4, 16, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       {{4, 64, MfmaTypeId::Fp32TyId, 3},
-       {4, 64, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {4, 64, 16, 1, 16, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       {{64, 4, MfmaTypeId::Fp32TyId, 3},
-       {64, 4, 1, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
+       {64, 4, 16, 16, 1, ROCDL::mfma_f32_4x4x1f32::getOperationName()}},
       // f16
       // mfma_f32_32x32x8f16
       {{32, 32, MfmaTypeId::Fp16TyId, 1},
-       {32, 32, 8, 4, ROCDL::mfma_f32_32x32x8f16::getOperationName()}},
+       {32, 32, 8, 4, 4, ROCDL::mfma_f32_32x32x8f16::getOperationName()}},
       {{32, 32, MfmaTypeId::Fp16TyId, 2},
-       {32, 32, 8, 4, ROCDL::mfma_f32_32x32x8f16::getOperationName()}},
+       {32, 32, 8, 4, 4, ROCDL::mfma_f32_32x32x8f16::getOperationName()}},
       {{32, 32, MfmaTypeId::Fp16TyId, 3},
-       {32, 32, 8, 4, ROCDL::mfma_f32_32x32x8f16::getOperationName()}},
+       {32, 32, 8, 4, 4, ROCDL::mfma_f32_32x32x8f16::getOperationName()}},
       // mfma_f32_16x16x16xf16
       {{16, 16, MfmaTypeId::Fp16TyId, 1},
-       {16, 16, 16, 4, ROCDL::mfma_f32_16x16x16f16::getOperationName()}},
+       {16, 16, 16, 4, 4, ROCDL::mfma_f32_16x16x16f16::getOperationName()}},
       {{16, 16, MfmaTypeId::Fp16TyId, 2},
-       {16, 16, 16, 4, ROCDL::mfma_f32_16x16x16f16::getOperationName()}},
+       {16, 16, 16, 4, 4, ROCDL::mfma_f32_16x16x16f16::getOperationName()}},
       {{16, 16, MfmaTypeId::Fp16TyId, 3},
-       {16, 16, 16, 4, ROCDL::mfma_f32_16x16x16f16::getOperationName()}},
+       {16, 16, 16, 4, 4, ROCDL::mfma_f32_16x16x16f16::getOperationName()}},
       // mfma_f32_4x4x4f16
       {{4, 4, MfmaTypeId::Fp16TyId, 1},
-       {4, 4, 64, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {4, 4, 64, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       {{4, 4, MfmaTypeId::Fp16TyId, 2},
-       {4, 4, 64, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {4, 4, 64, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       {{4, 4, MfmaTypeId::Fp16TyId, 3},
-       {4, 4, 64, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {4, 4, 64, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       {{4, 64, MfmaTypeId::Fp16TyId, 1},
-       {4, 64, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {4, 64, 64, 4, 64, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       {{4, 64, MfmaTypeId::Fp16TyId, 2},
-       {4, 64, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {4, 64, 64, 4, 64, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       {{4, 64, MfmaTypeId::Fp16TyId, 3},
-       {4, 64, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {4, 64, 64, 4, 64, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       {{64, 4, MfmaTypeId::Fp16TyId, 1},
-       {64, 4, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {64, 4, 64, 64, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       {{64, 4, MfmaTypeId::Fp16TyId, 2},
-       {64, 4, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {64, 4, 64, 64, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       {{64, 4, MfmaTypeId::Fp16TyId, 3},
-       {64, 4, 4, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
+       {64, 4, 64, 64, 4, ROCDL::mfma_f32_4x4x4f16::getOperationName()}},
       // bf16
       // mfma_f32_32x32x4_bf16
       {{32, 32, MfmaTypeId::Bf16TyId, 1},
-       {32, 32, 4, 2, ROCDL::mfma_f32_32x32x4bf16::getOperationName()}},
+       {32, 32, 4, 2, 2, ROCDL::mfma_f32_32x32x4bf16::getOperationName()}},
       // mfma_f32_32x32x8_bf16_1K
       {{32, 32, MfmaTypeId::Bf16TyId, 2},
-       {32, 32, 8, 4, ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName()}},
+       {32, 32, 8, 4, 4, ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName()}},
       {{32, 32, MfmaTypeId::Bf16TyId, 3},
-       {32, 32, 8, 4, ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName()}},
+       {32, 32, 8, 4, 4, ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName()}},
       // mfma_f32_16x16x8_bf16
       {{16, 16, MfmaTypeId::Bf16TyId, 1},
-       {16, 16, 8, 2, ROCDL::mfma_f32_16x16x8bf16::getOperationName()}},
+       {16, 16, 8, 2, 2, ROCDL::mfma_f32_16x16x8bf16::getOperationName()}},
       // mfma_f32_16x16x16_bf16_1K
       {{16, 16, MfmaTypeId::Bf16TyId, 2},
-       {16, 16, 16, 4, ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName()}},
+       {16, 16, 16, 4, 4, ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName()}},
       {{16, 16, MfmaTypeId::Bf16TyId, 3},
-       {16, 16, 16, 4, ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName()}},
+       {16, 16, 16, 4, 4, ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName()}},
       // mfma_f32_4x4x2_bf16
       {{4, 4, MfmaTypeId::Bf16TyId, 1},
-       {4, 4, 32, 2, ROCDL::mfma_f32_4x4x2bf16::getOperationName()}},
+       {4, 4, 32, 2, 2, ROCDL::mfma_f32_4x4x2bf16::getOperationName()}},
       {{4, 64, MfmaTypeId::Bf16TyId, 1},
-       {4, 64, 2, 2, ROCDL::mfma_f32_4x4x2bf16::getOperationName()}},
+       {4, 64, 32, 2, 32, ROCDL::mfma_f32_4x4x2bf16::getOperationName()}},
       {{64, 4, MfmaTypeId::Bf16TyId, 1},
-       {64, 4, 2, 2, ROCDL::mfma_f32_4x4x2bf16::getOperationName()}},
+       {64, 4, 32, 32, 2, ROCDL::mfma_f32_4x4x2bf16::getOperationName()}},
       // mfma_f32_4x4x4_bf16_1K
       {{4, 4, MfmaTypeId::Bf16TyId, 2},
-       {4, 4, 64, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
+       {4, 4, 64, 4, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
       {{4, 4, MfmaTypeId::Bf16TyId, 3},
-       {4, 4, 64, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
+       {4, 4, 64, 4, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
       {{4, 64, MfmaTypeId::Bf16TyId, 2},
-       {4, 64, 4, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
+       {4, 64, 64, 4, 64, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
       {{4, 64, MfmaTypeId::Bf16TyId, 3},
-       {4, 64, 4, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
+       {4, 64, 64, 4, 64, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
       {{64, 4, MfmaTypeId::Bf16TyId, 2},
-       {64, 4, 4, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
+       {64, 4, 64, 64, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
       {{64, 4, MfmaTypeId::Bf16TyId, 3},
-       {64, 4, 4, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
+       {64, 4, 64, 64, 4, ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName()}},
       // int8
       // mfma_i32_32x32x8i8
       {{32, 32, MfmaTypeId::I8TyId, 1},
-       {32, 32, 8, 4, ROCDL::mfma_i32_32x32x8i8::getOperationName()}},
+       {32, 32, 8, 4, 4, ROCDL::mfma_i32_32x32x8i8::getOperationName()}},
       {{32, 32, MfmaTypeId::I8TyId, 2},
-       {32, 32, 8, 4, ROCDL::mfma_i32_32x32x8i8::getOperationName()}},
+       {32, 32, 8, 4, 4, ROCDL::mfma_i32_32x32x8i8::getOperationName()}},
       // mfma_i32_32x32x16i8
       {{32, 32, MfmaTypeId::I8TyId, 3},
-       {32, 32, 16, 8, ROCDL::mfma_i32_32x32x16_i8::getOperationName()}},
+       {32, 32, 16, 8, 8, ROCDL::mfma_i32_32x32x16_i8::getOperationName()}},
       // mfma_i32_16x16x16i8
       {{16, 16, MfmaTypeId::I8TyId, 1},
-       {16, 16, 16, 4, ROCDL::mfma_i32_16x16x16i8::getOperationName()}},
+       {16, 16, 16, 4, 4, ROCDL::mfma_i32_16x16x16i8::getOperationName()}},
       {{16, 16, MfmaTypeId::I8TyId, 2},
-       {16, 16, 16, 4, ROCDL::mfma_i32_16x16x16i8::getOperationName()}},
+       {16, 16, 16, 4, 4, ROCDL::mfma_i32_16x16x16i8::getOperationName()}},
       // mfma_i32_16x16x32i8
       {{16, 16, MfmaTypeId::I8TyId, 3},
-       {16, 16, 32, 8, ROCDL::mfma_i32_16x16x32_i8::getOperationName()}},
+       {16, 16, 32, 8, 8, ROCDL::mfma_i32_16x16x32_i8::getOperationName()}},
       // mfma_i32_4x4x4i8
       {{4, 4, MfmaTypeId::I8TyId, 1},
-       {4, 4, 64, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {4, 4, 64, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       {{4, 4, MfmaTypeId::I8TyId, 2},
-       {4, 4, 64, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {4, 4, 64, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       {{4, 4, MfmaTypeId::I8TyId, 3},
-       {4, 4, 64, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {4, 4, 64, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       {{4, 64, MfmaTypeId::I8TyId, 1},
-       {4, 64, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {4, 64, 64, 4, 64, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       {{4, 64, MfmaTypeId::I8TyId, 2},
-       {4, 64, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {4, 64, 64, 4, 64, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       {{4, 64, MfmaTypeId::I8TyId, 3},
-       {4, 64, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {4, 64, 64, 4, 64, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       {{64, 4, MfmaTypeId::I8TyId, 1},
-       {64, 4, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {64, 4, 64, 64, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       {{64, 4, MfmaTypeId::I8TyId, 2},
-       {64, 4, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {64, 4, 64, 64, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       {{64, 4, MfmaTypeId::I8TyId, 3},
-       {64, 4, 4, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
+       {64, 4, 64, 64, 4, ROCDL::mfma_i32_4x4x4i8::getOperationName()}},
       // fp8 * pf8
       // mfma_f32_32x32x16_FP8_FP8
       {{32, 32, MfmaTypeId::Fp8Fp8TyId, 3},
-       {32, 32, 16, 8, ROCDL::mfma_f32_32x32x16_fp8_fp8::getOperationName()}},
+       {32, 32, 16, 8, 8,
+        ROCDL::mfma_f32_32x32x16_fp8_fp8::getOperationName()}},
       // mfma_f32_16x16x32_FP8_FP8
       {{16, 16, MfmaTypeId::Fp8Fp8TyId, 3},
-       {16, 16, 32, 8, ROCDL::mfma_f32_16x16x32_fp8_fp8::getOperationName()}},
+       {16, 16, 32, 8, 8,
+        ROCDL::mfma_f32_16x16x32_fp8_fp8::getOperationName()}},
       // mfma_f32_32x32x16_FP8_BF8
       {{32, 32, MfmaTypeId::Fp8Bf8TyId, 3},
-       {32, 32, 16, 8, ROCDL::mfma_f32_32x32x16_fp8_bf8::getOperationName()}},
+       {32, 32, 16, 8, 8,
+        ROCDL::mfma_f32_32x32x16_fp8_bf8::getOperationName()}},
       // mfma_f32_16x16x32_FP8_BF8
       {{16, 16, MfmaTypeId::Fp8Bf8TyId, 3},
-       {16, 16, 32, 8, ROCDL::mfma_f32_16x16x32_fp8_bf8::getOperationName()}},
+       {16, 16, 32, 8, 8,
+        ROCDL::mfma_f32_16x16x32_fp8_bf8::getOperationName()}},
       // mfma_f32_32x32x16_BF8_FP8
       {{32, 32, MfmaTypeId::Bf8Fp8TyId, 3},
-       {32, 32, 16, 8, ROCDL::mfma_f32_32x32x16_bf8_fp8::getOperationName()}},
+       {32, 32, 16, 8, 8,
+        ROCDL::mfma_f32_32x32x16_bf8_fp8::getOperationName()}},
       // mfma_f32_16x16x32_BF8_FP8
       {{16, 16, MfmaTypeId::Bf8Fp8TyId, 3},
-       {16, 16, 32, 8, ROCDL::mfma_f32_16x16x32_bf8_fp8::getOperationName()}},
+       {16, 16, 32, 8, 8,
+        ROCDL::mfma_f32_16x16x32_bf8_fp8::getOperationName()}},
       // mfma_f32_32x32x16_BF8_BF8
       {{32, 32, MfmaTypeId::Bf8Bf8TyId, 3},
-       {32, 32, 16, 8, ROCDL::mfma_f32_32x32x16_bf8_bf8::getOperationName()}},
+       {32, 32, 16, 8, 8,
+        ROCDL::mfma_f32_32x32x16_bf8_bf8::getOperationName()}},
       // mfma_f32_16x16x32_BF8_BF8
       {{16, 16, MfmaTypeId::Bf8Bf8TyId, 3},
-       {16, 16, 32, 8, ROCDL::mfma_f32_16x16x32_bf8_bf8::getOperationName()}}};
+       {16, 16, 32, 8, 8,
+        ROCDL::mfma_f32_16x16x32_bf8_bf8::getOperationName()}}};
   return MfmaInsnMap;
 };
 
@@ -859,6 +869,7 @@ unsigned MfmaInsn::getKDim() { return attr.k; }
 unsigned MfmaInsn::getMDim() { return attr.m; }
 unsigned MfmaInsn::getNDim() { return attr.n; }
 StringRef MfmaInsn::getInsnName() { return attr.insn; }
-unsigned  MfmaInsn::getKBase() { return attr.k_base;}
+unsigned MfmaInsn::getKBaseA() { return attr.k_base_a; } // elements per thread
+unsigned MfmaInsn::getKBaseB() { return attr.k_base_b; } // elements per thread
 
 } // namespace mlir
diff --git a/python/test/unit/language/test_core_amd.py b/python/test/unit/language/test_core_amd.py
index 0a451d539453..d991b689c6c2 100644
--- a/python/test/unit/language/test_core_amd.py
+++ b/python/test/unit/language/test_core_amd.py
@@ -1706,8 +1706,9 @@ def kernel(X, stride_xm, stride_xn,
                                            [4, 32, 64, 4],
                                            [32, 4, 64, 2],
                                            [16, 4, 64, 8],
-                                           [64, 4, 16, 1],
-                                           [4, 64, 16, 1],
+                                           [64, 4, 64, 1],
+                                           [4, 64, 64, 1],
+                                           [4, 64, 64, 4],
                                            ]
                           for allow_tf32 in [False, True]
                           for col_a in [True, False]