Commit c076eb1

[mxfp8 moe training] integrate new cuda kernel for blocked layout for groups along K
stack-info: PR: #3505, branch: danielvegamyhre/stack/87
1 parent 492b8ce commit c076eb1

4 files changed (+62, -64 lines)

torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp

Lines changed: 5 additions & 7 deletions
@@ -114,13 +114,11 @@ mxfp8_quantize(const at::Tensor& input, bool rowwise, bool colwise,
   if (colwise) {
     const int64_t num_row_blocks = (rows + scale_dim_y - 1) / scale_dim_y;
     output_colwise = at::empty_strided({rows, cols}, {1, rows}, options_fp8);
-    // Need scales_colwise to be this shape so the 'col' dim stride is 1,
-    // for colwise scaling, we can avoid uncoalesced writes to global memory.
-    // This is because each of the 32 threads in a warp will be computing
-    // a scale for a different column of 32 input data values, then each writing
-    // that scale to global memory - so the stride along this `col` dim should be 1
-    // so writes can be coalesced into a single transaction.
-    scales_colwise = at::empty_strided({cols, num_row_blocks}, {1, cols}, options_scale);
+
+    // Accept uncoalesced global stores for the scale tensor, since row-major is much more favorable
+    // for the subsequent per-group blocked format kernel.
+    // Microbenchmarks show the memory bandwidth utilization is virtually identical to coalesced global stores.
+    scales_colwise = at::empty({cols, num_row_blocks}, options_scale);
   } else {
     output_colwise = at::empty({0}, options_fp8);
     scales_colwise = at::empty({0}, options_scale);
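For intuition, a minimal PyTorch sketch of the layout change this hunk makes (sizes are illustrative only, and uint8 stands in for the e8m0 scale dtype so the sketch runs anywhere): the old empty_strided allocation gave the scale tensor a column-major layout with unit stride along the cols dim, while plain empty gives a contiguous row-major layout.

import torch

cols, num_row_blocks = 8, 4  # illustrative sizes only

# Old layout: column-major (stride 1 along dim 0), so per-warp scale writes coalesce.
col_major = torch.empty_strided((cols, num_row_blocks), (1, cols), dtype=torch.uint8)

# New layout: contiguous row-major, which the subsequent per-group blocked-format kernel prefers.
row_major = torch.empty((cols, num_row_blocks), dtype=torch.uint8)

print(col_major.stride())  # (1, 8)
print(row_major.stride())  # (4, 1)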

torchao/prototype/moe_training/kernels/mxfp8/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 from torchao.prototype.moe_training.kernels.mxfp8.quant import (
+    mx_block_rearrange_2d_K_groups_cuda,  # noqa: F401
     mxfp8_quantize_cuda_3d,  # noqa: F401
     torch_to_blocked_2d_K_groups,  # noqa: F401
     torch_to_blocked_2d_M_groups,  # noqa: F401

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 54 additions & 54 deletions
@@ -17,8 +17,8 @@
     triton_fp8_rowwise_3d_transpose_rhs,
 )
 from torchao.prototype.moe_training.kernels.mxfp8 import (
+    mx_block_rearrange_2d_K_groups_cuda,
     mxfp8_quantize_cuda_3d,
-    triton_mx_block_rearrange_2d_K_groups,
     triton_mx_block_rearrange_2d_M_groups,
     triton_mx_block_rearrange_per_group_3d,
 )
@@ -92,28 +92,28 @@ def forward(
         assert A.ndim == 2 or A.ndim == 3, "A must be 2D or 3D"
         assert B_t.ndim == 3, "B must be 3D"

-        assert A.size(-1) % 16 == 0, (
-            f"A must have a last dim divisible by 16, but got shape: {A.shape}"
-        )
-        assert B_t.size(-2) % 16 == 0 and B_t.size(-1) % 16 == 0, (
-            f"B must have last 2 dims divisible by 16, but got shape: {B_t.shape}"
-        )
+        assert (
+            A.size(-1) % 16 == 0
+        ), f"A must have a last dim divisible by 16, but got shape: {A.shape}"
+        assert (
+            B_t.size(-2) % 16 == 0 and B_t.size(-1) % 16 == 0
+        ), f"B must have last 2 dims divisible by 16, but got shape: {B_t.shape}"

         # Assert input tensors are in high-precision dtypes.
-        assert A.dtype == torch.float32 or A.dtype == torch.bfloat16, (
-            "A must be float32 or bfloat16"
-        )
-        assert B_t.dtype == torch.float32 or B_t.dtype == torch.bfloat16, (
-            "B must be float32 or bfloat16"
-        )
-        assert offs is None or offs.dtype == torch.int32, (
-            "offs must be int32 tensor or None"
-        )
+        assert (
+            A.dtype == torch.float32 or A.dtype == torch.bfloat16
+        ), "A must be float32 or bfloat16"
+        assert (
+            B_t.dtype == torch.float32 or B_t.dtype == torch.bfloat16
+        ), "B must be float32 or bfloat16"
+        assert (
+            offs is None or offs.dtype == torch.int32
+        ), "offs must be int32 tensor or None"

         # Assert A and B dims are compatible for a scaled grouped GEMM.
-        assert A.size(-1) == B_t.size(-2), (
-            f"shape {A.shape} and {B_t.shape} are not compatible for _quantize_then_scaled_grouped_mm"
-        )
+        assert A.size(-1) == B_t.size(
+            -2
+        ), f"shape {A.shape} and {B_t.shape} are not compatible for _quantize_then_scaled_grouped_mm"

         # The left operand in the scaled grouped GEMM must be row-major due to hardware requirements.
         assert not _is_column_major(A), "A must be row-major"
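The row-major/column-major requirements asserted here and in the hunks below come down to tensor strides. A hypothetical stride check for intuition only (not necessarily how the repo's _is_column_major helper is implemented):

import torch

def looks_column_major(x: torch.Tensor) -> bool:
    # Column-major over the last two dims: elements of a column are adjacent in memory,
    # i.e. unit stride along dim -2 (size-1 edge cases ignored in this sketch).
    return x.stride(-2) == 1

a = torch.randn(4, 8)      # contiguous, row-major
b = torch.randn(8, 4).t()  # transposed view of shape (4, 8), column-major
print(looks_column_major(a), looks_column_major(b))  # False True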
@@ -154,12 +154,12 @@ def forward(

         # Perform scaled grouped GEMM and return result.
         # output shape: scaled grouped mm of (M,K) @ (B,K,N) = (M,N)
-        assert not _is_column_major(A_data_row_major), (
-            "A must be row-major for output = A @ B"
-        )
-        assert _is_column_major(B_t_data_col_major), (
-            "B must be column-major for output = A @ B"
-        )
+        assert not _is_column_major(
+            A_data_row_major
+        ), "A must be row-major for output = A @ B"
+        assert _is_column_major(
+            B_t_data_col_major
+        ), "B must be column-major for output = A @ B"

         # Squeeze empty dims out of scales, to comply with grouped mm API.
         # A_scales shape: (M,1) or (B, M, 1)
@@ -209,12 +209,12 @@ def backward(ctx, grad_output: torch.Tensor):
         # Compute grad_A.
         # grad_A = grad_output @ B
         # grad_A = scaled grouped mm of (M,N) @ (B,N,K) = (M,K)
-        assert not _is_column_major(grad_output_data_row_major), (
-            "grad_output must be row-major for grad_A = grad_output @ B"
-        )
-        assert _is_column_major(B_data_col_major), (
-            "B must be column-major for grad_A = grad_output @ B"
-        )
+        assert not _is_column_major(
+            grad_output_data_row_major
+        ), "grad_output must be row-major for grad_A = grad_output @ B"
+        assert _is_column_major(
+            B_data_col_major
+        ), "B must be column-major for grad_A = grad_output @ B"

         # Squeeze empty dims out of scales, to comply with grouped mm API.
         # grad_output_scales shape: (M,1) or (B, M, 1)
@@ -259,12 +259,12 @@ def backward(ctx, grad_output: torch.Tensor):

         # Compute grad_B = grad_output_t @ A.
         # grad_B = grad_output_t @ A
-        assert not _is_column_major(grad_output_t_data_row_major), (
-            "grad_output_t must be row-major for grad_B = grad_output_t @ A"
-        )
-        assert _is_column_major(A_data_col_major), (
-            "A must be column-major for grad_B = grad_output_t @ A"
-        )
+        assert not _is_column_major(
+            grad_output_t_data_row_major
+        ), "grad_output_t must be row-major for grad_B = grad_output_t @ A"
+        assert _is_column_major(
+            A_data_col_major
+        ), "A must be column-major for grad_B = grad_output_t @ A"

         # Per-token group scales computed via triton kernels above do not have
         # the empty dim like the scales computed via tensor_to_scale, so we need
@@ -449,11 +449,11 @@ def backward(ctx, grad_out: torch.Tensor):

         # Convert scales to blocked format for 2d-2d grouped mm
         scale_group_offsets = offs // block_size
-        grad_out_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
+        grad_out_t_scales_blocked = mx_block_rearrange_2d_K_groups_cuda(
             grad_out_t_scales,
             scale_group_offsets,
         )
-        A_t_scales_blocked = triton_mx_block_rearrange_2d_K_groups(
+        A_t_scales_blocked = mx_block_rearrange_2d_K_groups_cuda(
             A_t_scales,
             scale_group_offsets,
         )
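To make the offsets arithmetic above concrete: each scale column covers block_size elements along K, so dividing the per-group K offsets by the block size yields the group boundaries in the scale tensor, which both rearrange calls then consume (now via the CUDA kernel instead of the Triton one). The offset values below are illustrative only.

import torch

block_size = 32  # MX block size along K
offs = torch.tensor([64, 192, 320], dtype=torch.int32)  # illustrative per-group K offsets

scale_group_offsets = offs // block_size
print(scale_group_offsets)  # tensor([ 2,  6, 10], dtype=torch.int32)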
@@ -518,21 +518,21 @@ def _emulated_mxfp8_scaled_grouped_mm_2d_3d(
 ) -> torch.Tensor:
     assert A_data.ndim == 2, f"A must be 2D, got {A_data.ndim}"
     assert B_data.ndim == 3, f"B must be 3D, got {B_data.ndim}"
-    assert A_scale.shape[0] == A_data.shape[0], (
-        f"A_scale must have same M dim as A_data, got A={A_data.shape} and A_scale={A_scale.shape}"
-    )
-    assert A_scale.shape[1] == A_data.shape[1] // block_size, (
-        f"A_scale dim1 should be size K//block_size, got A={A_data.shape} and A_scale={A_scale.shape}"
-    )
-    assert B_scale.shape[0] == B_data.shape[0], (
-        f"B_scale must have same E dim as B_data, got B={B_data.shape} and B_scale={B_scale.shape}"
-    )
-    assert B_scale.shape[1] == B_data.shape[1], (
-        f"B_scale must have same N dim as B_data, got B={B_data.shape} and B_scale={B_scale.shape}"
-    )
-    assert B_scale.shape[2] == B_data.shape[2] // block_size, (
-        f"B_scale dim2 should be size K//block_size, got B={B_data.shape} and B_scale={B_scale.shape}"
-    )
+    assert (
+        A_scale.shape[0] == A_data.shape[0]
+    ), f"A_scale must have same M dim as A_data, got A={A_data.shape} and A_scale={A_scale.shape}"
+    assert (
+        A_scale.shape[1] == A_data.shape[1] // block_size
+    ), f"A_scale dim1 should be size K//block_size, got A={A_data.shape} and A_scale={A_scale.shape}"
+    assert (
+        B_scale.shape[0] == B_data.shape[0]
+    ), f"B_scale must have same E dim as B_data, got B={B_data.shape} and B_scale={B_scale.shape}"
+    assert (
+        B_scale.shape[1] == B_data.shape[1]
+    ), f"B_scale must have same N dim as B_data, got B={B_data.shape} and B_scale={B_scale.shape}"
+    assert (
+        B_scale.shape[2] == B_data.shape[2] // block_size
+    ), f"B_scale dim2 should be size K//block_size, got B={B_data.shape} and B_scale={B_scale.shape}"

     # Dequantize input
     # A_data shape: (M, K)
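Spelled out with illustrative sizes (all names and values here are chosen for the example only), the shape contract these asserts encode is:

import torch

M, K, E, N, block_size = 128, 256, 4, 64, 32  # illustrative sizes

A_data = torch.randn(M, K)                    # 2D activations
A_scale = torch.randn(M, K // block_size)     # one scale per 32-element block along K
B_data = torch.randn(E, N, K)                 # 3D expert weights
B_scale = torch.randn(E, N, K // block_size)  # one scale per 32-element block along K

assert A_scale.shape == (M, K // block_size)
assert B_scale.shape == (E, N, K // block_size)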

torchao/prototype/mx_formats/kernels.py

Lines changed: 2 additions & 3 deletions
@@ -1219,10 +1219,9 @@ def _fake_mxfp8_quantize(
         (rows, cols), (1, rows), dtype=torch.float8_e4m3fn, device=x.device
     )

-    # colwise scales are written in column-major format to avoid uncoalesced global memory accesses
-    scales_colwise = torch.empty_strided(
+    # colwise scales now use a contiguous row-major layout; microbenchmarks show memory bandwidth is virtually identical to coalesced stores
+    scales_colwise = torch.empty(
         (cols, num_row_blocks),
-        (1, cols),
         dtype=torch.float8_e8m0fnu,
         device=x.device,
     )
