
[Transform][Tiling] Add deep tile support for matmul #90

Open · zhczhong wants to merge 9 commits into main from zhicong/deep_tile_matmul
Conversation


@zhczhong commented May 20, 2024

Tracking #53

TODO:

  • the nested outer loop generation
  • partial reduction support
    • enhance the PartialReductionOpInterface to let the user control where the new parallel dims are inserted
    • erase the redundant linalg.fillOp in partial reduction
  • merge all parallel iterators into a single scf.forall until nested parallelism is ready
  • fuse the linalg.fillOp into the innermost loop body
  • replace all genericOps with linalg named ops
  • support 4Dx4D/5D->4D, 2Dx2D->2D, 2Dx4D/5D->2D
  • dtype support (f32, bf16)
  • fuse the f32->bf16 cast into the last loop along the K axis
  • support batch matmul
  • Balance211 support (see the sketch after this list)
  • tune a general matmul config based on the cost model
  • fuse the linalg.copy into the innermost loop
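
For reference, a minimal sketch of the Balance211 partitioning named in the list above: the even-split scheme (as used in oneDNN) that gives each of the nthr workers either floor(n/nthr) or floor(n/nthr)+1 jobs. The signature and names are illustrative, not this PR's code.

#include <algorithm>
#include <cstdint>

// Splits `n` jobs across `nthr` workers so that every worker gets either
// floor(n/nthr) or floor(n/nthr)+1 jobs; worker `ithr` receives [start, end).
void balance211(int64_t n, int64_t nthr, int64_t ithr, int64_t &start,
                int64_t &end) {
  int64_t base = n / nthr; // minimum jobs per worker
  int64_t rem = n % nthr;  // the first `rem` workers take one extra job
  start = ithr * base + std::min(ithr, rem);
  end = start + base + (ithr < rem ? 1 : 0);
}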

The inner-loop generation part depends on the easy-builder support (#62).
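
For context, this is roughly what generating one level of the nested tile loops looks like with the plain OpBuilder API; a hedged sketch only, with assumed names, that the easy-builder helpers from #62 are meant to condense.

#include "mlir/Dialect/SCF/IR/SCF.h"

using namespace mlir;

// Wraps `initTensor` in an scf.for over [lb, ub) with the given step and
// threads the tensor through as the loop-carried iter_arg, mirroring the
// scf.for nest in the IR dumps below.
static scf::ForOp createTileLoop(OpBuilder &b, Location loc, Value lb,
                                 Value ub, Value step, Value initTensor) {
  return b.create<scf::ForOp>(
      loc, lb, ub, step, ValueRange{initTensor},
      [](OpBuilder &nested, Location nestedLoc, Value iv,
         ValueRange iterArgs) {
        // The tensor.extract_slice, tiled computation, and matching
        // tensor.insert_slice go here; yield the carried tensor.
        nested.create<scf::YieldOp>(nestedLoc, iterArgs);
      });
}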

@zhczhong added the WIP (work in progress) label May 20, 2024
@zhczhong force-pushed the zhicong/deep_tile_matmul branch 3 times, most recently from 7c8cfbb to 927322a on May 23, 2024
@zhczhong force-pushed the zhicong/deep_tile_matmul branch 6 times, most recently from ea02416 to f261c3c on June 3, 2024
@zhczhong force-pushed the zhicong/deep_tile_matmul branch 5 times, most recently from 5ed4fc1 to 22d86d4 on June 5, 2024
@zhczhong (Author) commented Jun 5, 2024

Support using linalgx.batch_reduce_matmul_vnni (bf16xbf16->f32) and fuse the cast (f32->bf16) into the last loop along the K axis:

func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>, %arg1: tensor<128x128x16x32x2xbf16>) -> tensor<128x128x32x32xbf16> {
    %cst_0 = arith.constant 0.000000e+00 : bf16
    %0 = tensor.empty() : tensor<128x128x32x32xbf16>
    %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<128x128x32x32xbf16>) -> tensor<128x128x32x32xbf16>
    %2 = linalgx.mm4d_vnni ins(%arg0, %arg1 : tensor<128x128x32x32xbf16>, tensor<128x128x16x32x2xbf16>) outs(%1 : tensor<128x128x32x32xbf16>)  -> tensor<128x128x32x32xbf16>
    return %2 : tensor<128x128x32x32xbf16>
}

will be transformed into

#map = affine_map<(d0) -> (d0 * 64)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * 64 + s0 + s1)>
module {
  func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>, %arg1: tensor<128x128x16x32x2xbf16>) -> tensor<128x128x32x32xbf16> {
    %c1 = arith.constant 1 : index
    %c128 = arith.constant 128 : index
    %c2 = arith.constant 2 : index
    %c64 = arith.constant 64 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : bf16
    %0 = tensor.empty() : tensor<128x128x32x32xbf16>
    %1 = scf.forall (%arg2, %arg3) in (2, 2) shared_outs(%arg4 = %0) -> (tensor<128x128x32x32xbf16>) {
      %2 = affine.apply #map(%arg2)
      %3 = affine.apply #map(%arg3)
      %extracted_slice = tensor.extract_slice %arg4[%2, %3, 0, 0] [64, 64, 32, 32] [1, 1, 1, 1] : tensor<128x128x32x32xbf16> to tensor<64x64x32x32xbf16>
      %4 = scf.for %arg5 = %c0 to %c64 step %c2 iter_args(%arg6 = %extracted_slice) -> (tensor<64x64x32x32xbf16>) {
        %extracted_slice_0 = tensor.extract_slice %arg6[%arg5, 0, 0, 0] [2, 64, 32, 32] [1, 1, 1, 1] : tensor<64x64x32x32xbf16> to tensor<2x64x32x32xbf16>
        %7 = scf.for %arg7 = %c0 to %c64 step %c2 iter_args(%arg8 = %extracted_slice_0) -> (tensor<2x64x32x32xbf16>) {
          %extracted_slice_1 = tensor.extract_slice %arg8[0, %arg7, 0, 0] [2, 2, 32, 32] [1, 1, 1, 1] : tensor<2x64x32x32xbf16> to tensor<2x2x32x32xbf16>
          %8 = tensor.empty() : tensor<2x2x32x32xf32>
          %9 = scf.for %arg9 = %c0 to %c128 step %c2 iter_args(%arg10 = %8) -> (tensor<2x2x32x32xf32>) {
            %11 = scf.for %arg11 = %c0 to %c2 step %c1 iter_args(%arg12 = %arg10) -> (tensor<2x2x32x32xf32>) {
              %extracted_slice_3 = tensor.extract_slice %arg12[%arg11, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x2x32x32xf32>
              %12 = scf.for %arg13 = %c0 to %c2 step %c1 iter_args(%arg14 = %extracted_slice_3) -> (tensor<1x2x32x32xf32>) {
                %13 = affine.apply #map1(%arg2)[%arg11, %arg5]
                %extracted_slice_5 = tensor.extract_slice %arg0[%13, %arg9, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<128x128x32x32xbf16> to tensor<2x32x32xbf16>
                %14 = affine.apply #map1(%arg3)[%arg13, %arg7]
                %extracted_slice_6 = tensor.extract_slice %arg1[%14, %arg9, 0, 0, 0] [1, 2, 16, 32, 2] [1, 1, 1, 1, 1] : tensor<128x128x16x32x2xbf16> to tensor<2x16x32x2xbf16>
                %extracted_slice_7 = tensor.extract_slice %arg14[0, %arg13, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xf32> to tensor<32x32xf32>
                %15 = arith.cmpi eq, %arg9, %c0 : index
                %16 = scf.if %15 -> (tensor<32x32xf32>) {
                  %17 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_7 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  %18 = linalgx.batch_reduce_matmul_vnni ins(%extracted_slice_5, %extracted_slice_6 : tensor<2x32x32xbf16>, tensor<2x16x32x2xbf16>) outs(%17 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  scf.yield %18 : tensor<32x32xf32>
                } else {
                  %17 = linalgx.batch_reduce_matmul_vnni ins(%extracted_slice_5, %extracted_slice_6 : tensor<2x32x32xbf16>, tensor<2x16x32x2xbf16>) outs(%extracted_slice_7 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  scf.yield %17 : tensor<32x32xf32>
                }
                %inserted_slice_8 = tensor.insert_slice %16 into %arg14[0, %arg13, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<32x32xf32> into tensor<1x2x32x32xf32>
                scf.yield %inserted_slice_8 : tensor<1x2x32x32xf32>
              }
              %inserted_slice_4 = tensor.insert_slice %12 into %arg12[%arg11, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xf32> into tensor<2x2x32x32xf32>
              scf.yield %inserted_slice_4 : tensor<2x2x32x32xf32>
            }
            scf.yield %11 : tensor<2x2x32x32xf32>
          }
          %10 = linalg.copy ins(%9 : tensor<2x2x32x32xf32>) outs(%extracted_slice_1 : tensor<2x2x32x32xbf16>) -> tensor<2x2x32x32xbf16>
          %inserted_slice_2 = tensor.insert_slice %10 into %arg8[0, %arg7, 0, 0] [2, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xbf16> into tensor<2x64x32x32xbf16>
          scf.yield %inserted_slice_2 : tensor<2x64x32x32xbf16>
        }
        %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0, 0, 0] [2, 64, 32, 32] [1, 1, 1, 1] : tensor<2x64x32x32xbf16> into tensor<64x64x32x32xbf16>
        scf.yield %inserted_slice : tensor<64x64x32x32xbf16>
      }
      %5 = affine.apply #map(%arg2)
      %6 = affine.apply #map(%arg3)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %4 into %arg4[%5, %6, 0, 0] [64, 64, 32, 32] [1, 1, 1, 1] : tensor<64x64x32x32xbf16> into tensor<128x128x32x32xbf16>
      }
    }
    return %1 : tensor<128x128x32x32xbf16>
  }
}

return idxList;
}

MatmulConfig getDefaultMatmulConfig(linalg::LinalgOp &linalgOp) {
@yifeizh2 commented Jun 13, 2024

Can you expose this method in the future? It would be used in the layout-inference logic of the global layout analysis pass.

@zhczhong (Author)
OK, I will expose this method later.
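
For illustration, exposing it could be as simple as moving a declaration into a shared header. The header shape and struct fields below are assumptions, not the actual definition:

#include <cstdint>

#include "mlir/Dialect/Linalg/IR/Linalg.h"

// Hypothetical public header content; field names are illustrative.
struct MatmulConfig {
  uint32_t MBlock, NBlock, KBlock;       // cache-level block sizes
  uint32_t MThreads, NThreads, KThreads; // thread split per dimension
};

// Declared in a shared header so other passes (e.g. the global layout
// analysis mentioned above) can query the default config for a matmul.
MatmulConfig getDefaultMatmulConfig(mlir::linalg::LinalgOp &linalgOp);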


namespace {

struct SystemDesc {


Will this struct be replaced with the target description in the future?

@zhczhong (Author) commented Jun 13, 2024

Yes, this is just a mock class and will be replaced with the target description when it is ready.
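
For reference, a minimal sketch of what such a mock could hold until the target description lands; the fields, methods, and default values are assumptions:

#include <cstddef>
#include <cstdint>

// Mock hardware description; fields and defaults are placeholders.
struct SystemDesc {
  // Threads available for the outermost parallel loop.
  uint32_t getNumThreads() const { return numThreads; }
  // Capacity in bytes of the given cache level (1-based); the cost model
  // sizes the tile at each loop level against this.
  size_t getCacheSize(uint8_t cacheLevel) const {
    return cacheSizes[cacheLevel - 1];
  }

  uint32_t numThreads = 1;
  size_t cacheSizes[3] = {32 * 1024, 1024 * 1024, 32 * 1024 * 1024};
};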

@zhczhong (Author) commented

Update: fuse the cast (f32->bf16) into the innermost loop.

The same matmul_4Dx4D_bf16 input shown above will now be transformed to

#map = affine_map<(d0) -> (d0 * 64)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * 64 + s0 + s1)>
module {
  func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>, %arg1: tensor<128x128x16x32x2xbf16>) -> tensor<128x128x32x32xbf16> {
    %c1 = arith.constant 1 : index
    %c128 = arith.constant 128 : index
    %c2 = arith.constant 2 : index
    %c64 = arith.constant 64 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : bf16
    %0 = tensor.empty() : tensor<128x128x32x32xbf16>
    %1 = scf.forall (%arg2, %arg3) in (2, 2) shared_outs(%arg4 = %0) -> (tensor<128x128x32x32xbf16>) {
      %2 = affine.apply #map(%arg2)
      %3 = affine.apply #map(%arg3)
      %extracted_slice = tensor.extract_slice %arg4[%2, %3, 0, 0] [64, 64, 32, 32] [1, 1, 1, 1] : tensor<128x128x32x32xbf16> to tensor<64x64x32x32xbf16>
      %4 = scf.for %arg5 = %c0 to %c64 step %c2 iter_args(%arg6 = %extracted_slice) -> (tensor<64x64x32x32xbf16>) {
        %extracted_slice_0 = tensor.extract_slice %arg6[%arg5, 0, 0, 0] [2, 64, 32, 32] [1, 1, 1, 1] : tensor<64x64x32x32xbf16> to tensor<2x64x32x32xbf16>
        %7 = scf.for %arg7 = %c0 to %c64 step %c2 iter_args(%arg8 = %extracted_slice_0) -> (tensor<2x64x32x32xbf16>) {
          %extracted_slice_1 = tensor.extract_slice %arg8[0, %arg7, 0, 0] [2, 2, 32, 32] [1, 1, 1, 1] : tensor<2x64x32x32xbf16> to tensor<2x2x32x32xbf16>
          %8 = tensor.empty() : tensor<2x2x32x32xf32>
          %9:2 = scf.for %arg9 = %c0 to %c128 step %c2 iter_args(%arg10 = %8, %arg11 = %extracted_slice_1) -> (tensor<2x2x32x32xf32>, tensor<2x2x32x32xbf16>) {
            %10:2 = scf.for %arg12 = %c0 to %c2 step %c1 iter_args(%arg13 = %arg10, %arg14 = %arg11) -> (tensor<2x2x32x32xf32>, tensor<2x2x32x32xbf16>) {
              %extracted_slice_3 = tensor.extract_slice %arg13[%arg12, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x2x32x32xf32>
              %extracted_slice_4 = tensor.extract_slice %arg14[%arg12, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xbf16> to tensor<1x2x32x32xbf16>
              %11:2 = scf.for %arg15 = %c0 to %c2 step %c1 iter_args(%arg16 = %extracted_slice_3, %arg17 = %extracted_slice_4) -> (tensor<1x2x32x32xf32>, tensor<1x2x32x32xbf16>) {
                %12 = affine.apply #map1(%arg2)[%arg12, %arg5]
                %extracted_slice_7 = tensor.extract_slice %arg0[%12, %arg9, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<128x128x32x32xbf16> to tensor<2x32x32xbf16>
                %13 = affine.apply #map1(%arg3)[%arg15, %arg7]
                %extracted_slice_8 = tensor.extract_slice %arg1[%13, %arg9, 0, 0, 0] [1, 2, 16, 32, 2] [1, 1, 1, 1, 1] : tensor<128x128x16x32x2xbf16> to tensor<2x16x32x2xbf16>
                %extracted_slice_9 = tensor.extract_slice %arg16[0, %arg15, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xf32> to tensor<32x32xf32>
                %extracted_slice_10 = tensor.extract_slice %arg17[0, %arg15, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<32x32xbf16>
                %14 = arith.cmpi eq, %arg9, %c0 : index
                %15 = scf.if %14 -> (tensor<32x32xf32>) {
                  %18 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_9 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  %19 = linalgx.batch_reduce_matmul_vnni ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x32x32xbf16>, tensor<2x16x32x2xbf16>) outs(%18 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  scf.yield %19 : tensor<32x32xf32>
                } else {
                  %18 = linalgx.batch_reduce_matmul_vnni ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x32x32xbf16>, tensor<2x16x32x2xbf16>) outs(%extracted_slice_9 : tensor<32x32xf32>) -> tensor<32x32xf32>
                  scf.yield %18 : tensor<32x32xf32>
                }
                %16 = arith.cmpi eq, %arg9, %c0 : index
                %17 = scf.if %16 -> (tensor<32x32xbf16>) {
                  %18 = linalg.copy ins(%15 : tensor<32x32xf32>) outs(%extracted_slice_10 : tensor<32x32xbf16>) -> tensor<32x32xbf16>
                  scf.yield %18 : tensor<32x32xbf16>
                } else {
                  scf.yield %extracted_slice_10 : tensor<32x32xbf16>
                }
                %inserted_slice_11 = tensor.insert_slice %15 into %arg16[0, %arg15, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<32x32xf32> into tensor<1x2x32x32xf32>
                %inserted_slice_12 = tensor.insert_slice %17 into %arg17[0, %arg15, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<32x32xbf16> into tensor<1x2x32x32xbf16>
                scf.yield %inserted_slice_11, %inserted_slice_12 : tensor<1x2x32x32xf32>, tensor<1x2x32x32xbf16>
              }
              %inserted_slice_5 = tensor.insert_slice %11#0 into %arg13[%arg12, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xf32> into tensor<2x2x32x32xf32>
              %inserted_slice_6 = tensor.insert_slice %11#1 into %arg14[%arg12, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> into tensor<2x2x32x32xbf16>
              scf.yield %inserted_slice_5, %inserted_slice_6 : tensor<2x2x32x32xf32>, tensor<2x2x32x32xbf16>
            }
            scf.yield %10#0, %10#1 : tensor<2x2x32x32xf32>, tensor<2x2x32x32xbf16>
          }
          %inserted_slice_2 = tensor.insert_slice %9#1 into %arg8[0, %arg7, 0, 0] [2, 2, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xbf16> into tensor<2x64x32x32xbf16>
          scf.yield %inserted_slice_2 : tensor<2x64x32x32xbf16>
        }
        %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0, 0, 0] [2, 64, 32, 32] [1, 1, 1, 1] : tensor<2x64x32x32xbf16> into tensor<64x64x32x32xbf16>
        scf.yield %inserted_slice : tensor<64x64x32x32xbf16>
      }
      %5 = affine.apply #map(%arg2)
      %6 = affine.apply #map(%arg3)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %4 into %arg4[%5, %6, 0, 0] [64, 64, 32, 32] [1, 1, 1, 1] : tensor<64x64x32x32xbf16> into tensor<128x128x32x32xbf16>
      }
    }
    return %1 : tensor<128x128x32x32xbf16>
  }
}


} // namespace impl

impl::IfSimulator makeIfRange(const EasyBuilder &s, Operation *op) {


Missing an inline here; without it, this header-defined function will trigger multiple-definition (ODR) errors once the header is included in more than one translation unit.

}

#define DEF_EASYBUILD_CMP_OPERATOR(OP, OPCLASS, TYPE, PRED) \
EBUnsigned operator OP(const TYPE &a, const TYPE &b) { \


Missing an inline here as well; the macro expands to a function definition in a header, so it needs inline for the same reason.

Labels
WIP work in progress
Projects
None yet
Development

None yet

2 participants