20 | 20 | from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate, MLIRStableSortTemplate |
21 | 21 | from PyTorchSimFrontend.mlir.mlir_sdpa_template import ( |
22 | 22 | MLIRFlashSDPATemplate, |
23 | | - MLIRDecodeGQASDPAPartialTemplate, |
24 | | - MLIRDecodeGQASDPAReduceTemplate, |
25 | 23 | flash_sdpa_args, |
26 | 24 | calculate_scale, |
27 | 25 | ) |
@@ -51,56 +49,27 @@ def tuned_bmm(mat1, mat2, *, layout=None): |
51 | 49 |
52 | 50 |
53 | 51 | def tuned_flash_sdpa( |
54 | | - query : TensorBox, |
55 | | - key : TensorBox, |
56 | | - value : TensorBox, |
| 52 | + query : TensorBox, |
| 53 | + key : TensorBox, |
| 54 | + value : TensorBox, |
57 | 55 | attn_bias : Optional[TensorBox] = None, |
58 | | - dropout_p : float = 0.0, |
59 | | - is_causal : bool = False, |
| 56 | + dropout_p : float = 0.0, |
| 57 | + is_causal : bool = False, |
60 | 58 | return_debug_mask : bool = False, |
61 | | - scale : Optional[float] = None) -> tuple: |
62 | | - |
63 | | - |
| 59 | + scale : Optional[float] = None, |
| 60 | + enable_gqa : bool = False) -> tuple: |
| 61 | + # _fused_sdp_choice in C++ already guarantees: |
| 62 | + # L == S (prefill), Hq == H (non-GQA), dropout_p == 0.0 |
| 63 | + # before routing here via SDPBackend::overrideable. |
| 64 | + # Non-matching shapes fall back to SDPBackend::math in C++ and decompose |
| 65 | + # into primitive ops (matmul/softmax) before reaching this lowering. |
64 | 66 | scale = calculate_scale(query, scale) |
65 | 67 | N, Hq, H, L, S, E, Ev, layout, query, key, value = flash_sdpa_args(query, key, value) |
66 | | - |
67 | | - # Decode-only GQA fast path: q is (B,Hq,1,Dh), B==1, Hq!=H, Hq%H==0. |
68 | | - # Always use the 2-kernel decode path: |
69 | | - # 1) block partials over (kv head, sequence block) |
70 | | - # 2) reduce/merge across blocks |
71 | | - # This keeps KV shared across qsub, avoids dh0-outer duplication, and |
72 | | - # stores compact partials instead of full score/prob tensors in DRAM. |
73 | | - if L == 1 and Hq != H and N == 1 and (Hq % H) == 0: |
74 | | - g = Hq // H |
75 | | - vector_lane = extension_config.vpu_num_lanes |
76 | | - tile_e = vector_lane |
77 | | - dh_tiles = E // tile_e |
78 | | - decode_gqa_block_size = 512 |
79 | | - BlkS = decode_gqa_block_size if S >= decode_gqa_block_size else int(S) |
80 | | - # Padding-based tail handling: allow S not divisible by BlkS. |
81 | | - nblk = (S + BlkS - 1) // BlkS |
82 | | - HgDhTiles = H * g * dh_tiles |
83 | | - tile_pack = tile_e * 2 |
84 | | - |
85 | | - partial_layout = ir.FixedLayout( |
86 | | - query.get_device(), |
87 | | - torch.float32, |
88 | | - [HgDhTiles, nblk, tile_pack], |
89 | | - ) |
90 | | - partial_tmpl = MLIRDecodeGQASDPAPartialTemplate([query, key, value], partial_layout, scale, BlkS=BlkS) |
91 | | - partial = partial_tmpl.generate().output_node() |
92 | | - partial.realize() |
93 | | - reduce_tmpl = MLIRDecodeGQASDPAReduceTemplate([partial], layout, BlkS=BlkS) |
94 | | - out_node = reduce_tmpl.generate().output_node() |
95 | | - return (out_node, None, None, None, None, None, None, None, None) |
96 | | - |
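Reviewer note: for readers without the removed templates at hand, the math the deleted fast path describes (per-block partials over the KV sequence, then a merge across blocks) is the standard two-pass flash-decode recurrence. A minimal PyTorch sketch of that recurrence is below; the function name, tensor shapes, and the `blk=512` blocking mirror the comments above but are illustrative only, not the actual MLIR codegen of `MLIRDecodeGQASDPAPartialTemplate` / `MLIRDecodeGQASDPAReduceTemplate`.

```python
import torch

def decode_sdpa_two_pass(q, k, v, scale, blk=512):
    # q: (Hq, E) one decode query per head; k, v: (H, S, E) shared KV heads (float32 assumed).
    # Pass 1: per (kv head, sequence block) partials -- running max, exp-sum, exp-weighted V sum.
    # Pass 2: merge the block partials with a log-sum-exp rescale.
    Hq, E = q.shape
    H, S, _ = k.shape
    g = Hq // H                           # GQA group size: g query heads share one kv head
    out = torch.empty(Hq, v.shape[-1])
    for hq in range(Hq):
        kv = hq // g                      # kv head shared by this query-head group
        m = torch.tensor(float("-inf"))   # running max of scores
        l = torch.tensor(0.0)             # running sum of exp(scores - m)
        acc = torch.zeros(v.shape[-1])    # running exp-weighted sum of V rows
        for s0 in range(0, S, blk):       # tail blocks handled by slicing, S need not divide by blk
            ks = k[kv, s0:s0 + blk]       # (blk, E)
            vs = v[kv, s0:s0 + blk]       # (blk, Ev)
            scores = (ks @ q[hq]) * scale # (blk,)
            m_blk = scores.max()
            p = torch.exp(scores - m_blk)
            # merge this block's partial (m_blk, sum, weighted V sum) into the running state
            m_new = torch.maximum(m, m_blk)
            alpha = torch.exp(m - m_new)      # rescale the old accumulator
            beta = torch.exp(m_blk - m_new)   # rescale the new block
            l = alpha * l + beta * p.sum()
            acc = alpha * acc + beta * (p @ vs)
            m = m_new
        out[hq] = acc / l
    return out
```

Because only the compact (max, sum, weighted-sum) partials cross the block boundary, nothing the size of the full score/probability matrix ever needs to be materialized, which is the property the removed comment block is describing.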
97 | 68 | mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale) |
98 | | - |
99 | | - # _scaled_dot_product_flash_attention has to return a tuple which has 9 values |
100 | | - # since its backward(_scaled_dot_product_flash_attention_backward) needs that values. |
101 | | - # (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor rng_state, Tensor unused, Tensor debug_attn_mask) |
102 | 69 | return (mlir_template.generate().output_node(), None, None, None, None, None, None, None, None) |
103 | 70 |
| 71 | + |
| 72 | + |
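Reviewer note: the new comments state the preconditions as facts about the C++ `_fused_sdp_choice`; a Python-level restatement of that guard may help when reading this lowering in isolation. The helper name below is hypothetical, the actual decision is made in C++ before dispatch ever reaches `tuned_flash_sdpa`.

```python
def can_use_overrideable_sdpa(query, key, value, dropout_p=0.0):
    # Mirrors the guarantees described in tuned_flash_sdpa's comment: only the
    # prefill, non-GQA, dropout-free case is routed to the overrideable fused
    # kernel; every other shape is expected to take SDPBackend::math and
    # decompose into matmul/softmax before reaching this lowering.
    n, hq, l, _ = query.shape   # (B, Hq, L, E)
    _, h, s, _ = key.shape      # (B, H,  S, E)
    return (
        l == s                  # prefill: query length equals kv length
        and hq == h             # non-GQA: one kv head per query head
        and dropout_p == 0.0    # no dropout inside the fused kernel
    )
```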
104 | 73 | def conv_layout( |
105 | 74 | x: TensorBox, |
106 | 75 | weight: TensorBox, |
@@ -345,5 +314,4 @@ def _sort_layouts(x: TensorBox, dim: int, descending: bool): |
345 | 314 |
346 | 315 | if extension_config.CONFIG_USE_TIMING_POOLING: |
347 | 316 | lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template |
348 | | - |
349 | 317 | lowerings.update({getattr(aten._scaled_dot_product_fused_attention_overrideable, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_fused_attention_overrideable.overloads()}) |
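Reviewer note: for context on how this registration is exercised, a usage sketch follows. Whether the call actually routes to `aten._scaled_dot_product_fused_attention_overrideable` (and hence to `tuned_flash_sdpa`) depends on the simulator backend being active and on `_fused_sdp_choice` selecting the overrideable backend; on stock CUDA/CPU the same call would take one of the built-in SDPA backends instead.

```python
import torch
import torch.nn.functional as F

class Attn(torch.nn.Module):
    def forward(self, q, k, v):
        # With the overrideable backend selected, this lowers to
        # aten._scaled_dot_product_fused_attention_overrideable during
        # Inductor lowering and lands in tuned_flash_sdpa.
        return F.scaled_dot_product_attention(q, k, v)

q = k = v = torch.randn(1, 8, 128, 64)   # (B, H, L, E): prefill, non-GQA, no dropout
out = torch.compile(Attn())(q, k, v)
```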