Commit b57a58e

[mxfp8 moe training] integrate new cuda kernel for blocked layout for groups along K
stack-info: PR: #3505, branch: danielvegamyhre/stack/87

1 parent: 0dfef18

4 files changed: +11 -13 lines

torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp

Lines changed: 5 additions & 7 deletions

@@ -114,13 +114,11 @@ mxfp8_quantize(const at::Tensor& input, bool rowwise, bool colwise,
   if (colwise) {
     const int64_t num_row_blocks = (rows + scale_dim_y - 1) / scale_dim_y;
     output_colwise = at::empty_strided({rows, cols}, {1, rows}, options_fp8);
-    // Need scales_colwise to be this shape so the 'col' dim stride is 1,
-    // for colwise scaling, we can avoid uncoalesced writes to global memory.
-    // This is because each of the 32 threads in a warp will be computing
-    // a scale for a different column of 32 input data values, then each writing
-    // that scale to global memory - so the stride along this `col` dim should be 1
-    // so writes can be coalesced into a single transaction.
-    scales_colwise = at::empty_strided({cols, num_row_blocks}, {1, cols}, options_scale);
+
+    // Accept uncoalesced global stores for scale tensor, since row-major is much more favorable for the subsequent
+    // per-group blocked format kernel.
+    // Microbenchmarks show the memory bandwidth utilization is virtually identical to coalesced global stores.
+    scales_colwise = at::empty({cols, num_row_blocks}, options_scale);
   } else {
     output_colwise = at::empty({0}, options_fp8);
     scales_colwise = at::empty({0}, options_scale);
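
For illustration, a minimal PyTorch sketch (not part of the commit) contrasting the two scale layouts; the shapes are made up and uint8 stands in for the e8m0 scale dtype:

import torch

cols, num_row_blocks = 4096, 128

# Old layout: stride 1 along the 'col' dim (column-major), so the 32 scale
# writes from a warp land in consecutive addresses and coalesce.
old_scales = torch.empty_strided(
    (cols, num_row_blocks), (1, cols), dtype=torch.uint8
)

# New layout: default contiguous row-major. Stores from the quantize kernel
# are uncoalesced, but the downstream per-group blocked-format kernel reads
# row-major far more efficiently, and measured bandwidth is about the same.
new_scales = torch.empty((cols, num_row_blocks), dtype=torch.uint8)

print(old_scales.stride())  # (1, 4096)
print(new_scales.stride())  # (128, 1)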

torchao/prototype/moe_training/kernels/mxfp8/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 from torchao.prototype.moe_training.kernels.mxfp8.quant import (
+    mx_block_rearrange_2d_K_groups_cuda,  # noqa: F401
     mxfp8_quantize_cuda_3d,  # noqa: F401
     torch_to_blocked_2d_K_groups,  # noqa: F401
     torch_to_blocked_2d_M_groups,  # noqa: F401

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 3 additions & 3 deletions

@@ -17,8 +17,8 @@
     triton_fp8_rowwise_3d_transpose_rhs,
 )
 from torchao.prototype.moe_training.kernels.mxfp8 import (
+    mx_block_rearrange_2d_K_groups_cuda,
     mxfp8_quantize_cuda_3d,
-    triton_mx_block_rearrange_2d_K_groups,
     triton_mx_block_rearrange_2d_M_groups,
     triton_mx_block_rearrange_per_group_3d,
 )
@@ -437,11 +437,11 @@ def backward(ctx, grad_out: torch.Tensor):

         # Convert scales to blocked format for 2d-2d grouped mm
         scale_group_offsets = offs // block_size
-        grad_out_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
+        grad_out_t_scales_blocked = mx_block_rearrange_2d_K_groups_cuda(
             grad_out_t_scales,
             scale_group_offsets,
         )
-        A_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
+        A_t_scales_blocked = mx_block_rearrange_2d_K_groups_cuda(
             A_t_scales,
             scale_group_offsets,
         )
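
As a sanity check on the scale_group_offsets arithmetic above, a small worked example (values are hypothetical, not from the commit): each block_size elements along K produce one scale, so integer-dividing the K-dim group offsets by block_size gives the matching group boundaries in the scale tensor.

import torch

block_size = 32  # MX scaling block size along K
offs = torch.tensor([64, 160, 256])  # hypothetical group-end offsets along K

# Group boundaries in scale space line up with the element boundaries,
# shrunk by the per-block scaling factor.
scale_group_offsets = offs // block_size
print(scale_group_offsets)  # tensor([2, 5, 8])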

torchao/prototype/mx_formats/kernels.py

Lines changed: 2 additions & 3 deletions

@@ -1181,10 +1181,9 @@ def _fake_mxfp8_quantize(
         (rows, cols), (1, rows), dtype=torch.float8_e4m3fn, device=x.device
     )

-    # colwise scales are written in column-major format to avoid uncoalesced global memory accesses
-    scales_colwise = torch.empty_strided(
+    # Row-major scales are more favorable for the subsequent blocked format kernel, and microbenchmarks show bandwidth is virtually identical to coalesced stores
+    scales_colwise = torch.empty(
         (cols, num_row_blocks),
-        (1, cols),
         dtype=torch.float8_e8m0fnu,
         device=x.device,
     )
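
A quick sketch (shapes invented, uint8 in place of float8_e8m0fnu) of why this fake op must change in lockstep with the CUDA kernel: under torch.compile, the traced output strides come from the fake implementation, so they need to match the contiguous layout the real kernel now produces.

import torch

rows, cols, scale_dim_y = 256, 512, 32
num_row_blocks = (rows + scale_dim_y - 1) // scale_dim_y

# Matches the new kernel output: contiguous row-major scales.
scales_colwise = torch.empty(
    (cols, num_row_blocks), dtype=torch.uint8, device="meta"
)
assert scales_colwise.is_contiguous()
assert scales_colwise.stride() == (num_row_blocks, 1)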
