PyFR · WillTrojak · Dec 2, 2025 · Apr 24, 2026 · Apr 29, 2026 · May 11, 2026
diff --git a/gimmik/__init__.py b/gimmik/__init__.py
@@ -8,6 +8,7 @@
 from gimmik.hip import HIPMatMul
 from gimmik.metal import MetalMatMul
 from gimmik.opencl import OpenCLMatMul
+from gimmik.ptx import PTXMatMul
 
 
 def generate_mm(mat, dtype, platform, alpha=1.0, beta=0.0, funcn='gimmik_mm',
@@ -22,7 +23,8 @@ def generate_mm(mat, dtype, platform, alpha=1.0, beta=0.0, funcn='gimmik_mm',
         'cuda': CUDAMatMul,
         'ispc': ISPCMatMul,
         'hip': HIPMatMul,
-        'opencl': OpenCLMatMul
+        'opencl': OpenCLMatMul,
+        'ptx': PTXMatMul
     }
 
     mm = platmap[platform](alpha*mat, beta, None, n, ldb, ldc)

diff --git a/gimmik/base.py b/gimmik/base.py
@@ -144,7 +144,8 @@ def _render_kernel(self, dtype, tplname, tplargs):
         src = tpl.render(**tplargs)
 
         # At single precision suffix all floating point constants by 'f'
-        if dtype == 'float':
+        # (PTX doesn't use an 'f' suffix for FP literals)
+        if dtype == 'float' and self.platform != 'ptx':
             src = re.sub(r'(?=\d*[.eE])(?=\.?\d)\d*\.?\d*(?:[eE][+-]?\d+)?',
                          r'\g<0>f', src)
 

diff --git a/gimmik/cuda.py b/gimmik/cuda.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 
+import numpy as np
+
 from gimmik.base import MatMul
 
 
@@ -8,7 +10,15 @@ class CUDAMatMul(MatMul):
     basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0,
                 'dynamic_shared': 0}
 
-    def _kernel_generators(self, dtype, dsize, *, compute_capability=None):
+    @staticmethod
+    def is_suitable(arr):
+        nnz = np.count_nonzero(arr)
+        nuq = len(np.unique(np.abs(arr)))
+        density = nnz / arr.size
+        return (nuq <= 28) or (density <= 0.15)
+
+    def _kernel_generators(self, dtype, dsize, *, compute_capability=None,
+                           **kwargs):
         # B loading, C streaming kernel
         yield ('cstream', {}, {})
 

diff --git a/gimmik/kernels/ptx/base.mako b/gimmik/kernels/ptx/base.mako
@@ -0,0 +1,4 @@
+.version ${ptx[0]}.${ptx[1]}<%doc> PTX ISA version written as major.minor from the ptx pair </%doc>
+.target sm_${cc[0]}${cc[1]}${'a' if cc[0] >= 9 else ''}<%doc> target built from cc; an 'a' suffix is appended when the major version is 9 or more </%doc>
+.address_size 64
+${next.body()}<%doc> body of the template that inherits from this one </%doc>
diff --git a/gimmik/kernels/ptx/bstream-msplit.mako b/gimmik/kernels/ptx/bstream-msplit.mako
@@ -0,0 +1,261 @@
+<%inherit file='base'/>
+<%doc> Template locals used by the kernel body below; partition and chunk are helpers supplied by the rendering context </%doc>
+<%
+mx = partition(A, into=msplit, by='rows')  # groups of row indices of A, one group per cid / tid_y value
+bchunks = chunk(bix, bsz)  # bix split into chunks; entries are used as column indices of A (presumably at most bsz per chunk)
+m_per_group = max(len(mcx) for mcx in mx)  # size of the csub accumulator register array
+bsub_bytes = 2 * bsz * blockx * dwidth_i  # bytes of shared memory: two buffers of bsz rows of blockx elements
+def bsub_off(buf, idx):  # byte offset of row idx inside buffer buf
+    return (buf * bsz + idx) * blockx * dwidth_i
+use_cpasync = cc is not None and (cc[0], cc[1]) >= (8, 0) and dwidth_i in (4, 8)  # selects the cp.async fill path
+%>
+
+% if n is None:
+.visible .entry ${kname}(.param .u32 _n,
+                         .param .u64 _b,
+                         .param .u32 _ldb,
+                         .param .u64 _c,
+                         .param .u32 _ldc)
+{
+    .reg .u32 ldb, ldc;
+    ld.param.u32 ldb, [_ldb];
+    ld.param.u32 ldc, [_ldc];
+% else:
+.visible .entry ${kname}(.param .u64 _b,
+                         .param .u64 _c)
+{
+% endif
+    .reg .u32 n, id, tid_x, tid_y;
+    .reg .u64 b, c, b_base, c_base, bsub_thread;
+% if use_cpasync:
+    .reg .u32 bsub_sm_thread;
+% endif
+    .reg .${pftype} bv, csub<${m_per_group}>;
+    .reg .pred p1, p_skip;
+    .shared .align 8 .b8 _bsub[${bsub_bytes}];
+<%doc> Fetch n (kernel parameter when the template n is None, otherwise a literal) and the B and C pointers </%doc>
+% if n is None:
+    ld.param.u32 n, [_n];
+% else:
+    mov.u32 n, ${n};
+% endif
+    ld.param.u64 b, [_b];
+    ld.param.u64 c, [_c];
+<%doc> id = ctaid.x * blockx + tid.x; tid_y later selects the row group </%doc>
+    {
+        .reg .u32 _ctaid_x;
+        mov.u32 _ctaid_x, %ctaid.x;
+        mov.u32 tid_x, %tid.x;
+        mov.u32 tid_y, %tid.y;
+        mad.lo.u32 id, _ctaid_x, ${blockx}, tid_x;
+    }
+<%doc> Threads with id >= n branch to the exit label. NOTE(review): they therefore skip every later bar.sync - confirm this is safe for a partially filled CTA </%doc>
+    setp.ge.u32 p1, id, n;
+    @p1 bra $L_EXIT;
+<%doc> Convert the b and c pointers to global-space addresses </%doc>
+    cvta.to.global.u64 b, b;
+    cvta.to.global.u64 c, c;
+<%doc> b_base and c_base are the pointers advanced by id * dwidth_i bytes </%doc>
+    {
+        .reg .u64 _id64;
+        cvt.u64.u32 _id64, id;
+        mad.lo.u64 b_base, _id64, ${dwidth_i}, b;
+        mad.lo.u64 c_base, _id64, ${dwidth_i}, c;
+    }
+<%doc> bsub_thread = _bsub + tid_x * dwidth_i, i.e. this thread's slot inside every row of the shared buffers </%doc>
+    {
+        .reg .u64 _tx_off;
+        mul.wide.u32 _tx_off, tid_x, ${dwidth_i};
+        mov.u64 bsub_thread, _bsub;
+        add.u64 bsub_thread, bsub_thread, _tx_off;
+    }
+% if use_cpasync:
+    {
+        .reg .u64 _sm64;
+        cvta.to.shared.u64 _sm64, bsub_thread;
+        cvt.u32.u64 bsub_sm_thread, _sm64;
+    }
+% endif
+<%doc> One unrolled code section per row group; a thread runs only the section whose cid equals its tid_y </%doc>
+% for cid, mcx in enumerate(mx):
+## Section for threads with tid_y == cid; mcx lists the rows of A that this group accumulates
+    setp.ne.u32 p_skip, tid_y, ${cid};
+    @p_skip bra $L_END_CID_${cid};
+<%doc> NOTE(review): with a runtime ldb/ldc the products ldb * kx and ldc * row_j are formed by mul.lo.u32 and keep only the low 32 bits before widening - confirm the sizes stay in range </%doc>
+%  if use_cpasync:
+## Prime buffer 0 with cp.async: this group copies the chunk-0 entries with idx % msplit == cid, then commits, waits and syncs
+%   for idx, kx in [(i, k) for i, k in enumerate(bchunks[0]) if i % msplit == cid]:
+%    if n is None:
+    {
+        .reg .u32 _boff;
+        .reg .u64 _bptr;
+        mul.lo.u32 _boff, ldb, ${kx};
+        mad.wide.u32 _bptr, ${dwidth_i}, _boff, b_base;
+        cp.async.ca.shared::cta.global [bsub_sm_thread + ${bsub_off(0, idx)}], [_bptr], ${dwidth_i};
+    }
+%    else:
+    cp.async.ca.shared::cta.global [bsub_sm_thread + ${bsub_off(0, idx)}], [b_base + ${ldb*kx*dwidth_i}], ${dwidth_i};
+%    endif
+%   endfor
+    cp.async.commit_group;
+    cp.async.wait_all;
+    bar.sync 0;
+%  else:
+## Prime buffer 0 through a register (global load then st.shared): same idx % msplit == cid split, followed by bar.sync
+%   for idx, kx in [(i, k) for i, k in enumerate(bchunks[0]) if i % msplit == cid]:
+    {
+        .reg .${pftype} _bv;
+%    if n is None:
+        .reg .u32 _boff;
+        .reg .u64 _bptr;
+        mul.lo.u32 _boff, ldb, ${kx};
+        mad.wide.u32 _bptr, ${dwidth_i}, _boff, b_base;
+        ld.weak.global.cg.${pftype} _bv, [_bptr];
+%    else:
+        ld.weak.global.cg.${pftype} _bv, [b_base + ${ldb*kx*dwidth_i}];
+%    endif
+        st.shared.${pftype} [bsub_thread + ${bsub_off(0, idx)}], _bv;
+    }
+%   endfor
+    bar.sync 0;
+%  endif
+
+## Unrolled loop over B-chunks: fill buffer (bb + 1) % 2 with this group's share of the next chunk (not on the last chunk), then consume buffer bb % 2
+%  for bb in range(len(bchunks)):
+<%
+        buf_cur = bb % 2  # buffer read during this iteration
+        buf_next = (bb + 1) % 2  # buffer filled for the following iteration
+%>
+%   if not loop.last:
+%    for idx, kx in [(i, k) for i, k in enumerate(bchunks[bb + 1]) if i % msplit == cid]:
+%     if use_cpasync:
+%      if n is None:
+    {
+        .reg .u32 _boff;
+        .reg .u64 _bptr;
+        mul.lo.u32 _boff, ldb, ${kx};
+        mad.wide.u32 _bptr, ${dwidth_i}, _boff, b_base;
+        cp.async.ca.shared::cta.global [bsub_sm_thread + ${bsub_off(buf_next, idx)}], [_bptr], ${dwidth_i};
+    }
+%      else:
+    cp.async.ca.shared::cta.global [bsub_sm_thread + ${bsub_off(buf_next, idx)}], [b_base + ${ldb*kx*dwidth_i}], ${dwidth_i};
+%      endif
+%     else:
+    {
+        .reg .${pftype} _bv;
+%      if n is None:
+        .reg .u32 _boff;
+        .reg .u64 _bptr;
+        mul.lo.u32 _boff, ldb, ${kx};
+        mad.wide.u32 _bptr, ${dwidth_i}, _boff, b_base;
+        ld.weak.global.cg.${pftype} _bv, [_bptr];
+%      else:
+        ld.weak.global.cg.${pftype} _bv, [b_base + ${ldb*kx*dwidth_i}];
+%      endif
+        st.shared.${pftype} [bsub_thread + ${bsub_off(buf_next, idx)}], _bv;
+    }
+%     endif
+%    endfor
+%    if use_cpasync:
+    cp.async.commit_group;
+%    endif
+%   endif
+<%doc> Consume the current chunk: one shared load per entry, then mul (at a row's afix column) or fma into csub; the row is written to C at its alix column </%doc>
+%   for idx, kx in enumerate(bchunks[bb]):
+    ld.shared.${pftype} bv, [bsub_thread + ${bsub_off(buf_cur, idx)}];
+%    for j, row_j in enumerate(mcx):
+<%    jx = A[row_j, kx] %>
+%     if jx != 0 and kx == afix[row_j]:
+    mul.${pftype} csub${j}, bv, ${jx};
+%     elif jx != 0:
+    fma.rn.${pftype} csub${j}, bv, ${jx}, csub${j};
+%     endif
+%     if kx == alix[row_j]:
+%      if beta_zero:
+%       if n is None:
+    {
+        .reg .u32 _coff;
+        .reg .u64 _cptr;
+        mul.lo.u32 _coff, ldc, ${row_j};
+        mad.wide.u32 _cptr, ${dwidth_i}, _coff, c_base;
+        st.weak.global.cg.${pftype} [_cptr], csub${j};
+    }
+%       else:
+    st.weak.global.cg.${pftype} [c_base + ${ldc*row_j*dwidth_i}], csub${j};
+%       endif
+%      else:
+    {
+        .reg .${pftype} _ctmp;
+%       if n is None:
+        .reg .u32 _coff;
+        .reg .u64 _cptr;
+        mul.lo.u32 _coff, ldc, ${row_j};
+        mad.wide.u32 _cptr, ${dwidth_i}, _coff, c_base;
+        ld.weak.global.cg.${pftype} _ctmp, [_cptr];
+        fma.rn.${pftype} _ctmp, _ctmp, ${float(beta)}, csub${j};
+        st.weak.global.${pftype} [_cptr], _ctmp;
+%       else:
+        ld.weak.global.cg.${pftype} _ctmp, [c_base + ${ldc*row_j*dwidth_i}];
+        fma.rn.${pftype} _ctmp, _ctmp, ${float(beta)}, csub${j};
+        st.weak.global.${pftype} [c_base + ${ldc*row_j*dwidth_i}], _ctmp;
+%       endif
+    }
+%      endif
+%     endif
+%    endfor
+%   endfor
+%   if use_cpasync:
+%    if not loop.last:
+    cp.async.wait_all;
+%    endif
+%   endif
+    bar.sync 0;
+%  endfor
+## End of Main loop over B-chunks
+
+## Rows of this group with afix == -1 (presumably all-zero rows of A): store fzero when beta_zero, otherwise scale C by beta unless beta == 1
+%  if has_zero_rows:
+%   for row_j in mcx:
+%    if afix[row_j] == -1:
+%     if beta_zero:
+    {
+        .reg .${pftype} _tmp;
+        mov.${pftype} _tmp, ${fzero};
+%      if n is None:
+        .reg .u32 _coff;
+        .reg .u64 _cptr;
+        mul.lo.u32 _coff, ldc, ${row_j};
+        mad.wide.u32 _cptr, ${dwidth_i}, _coff, c_base;
+        st.weak.global.cg.${pftype} [_cptr], _tmp;
+%      else:
+        st.weak.global.cg.${pftype} [c_base + ${ldc*row_j*dwidth_i}], _tmp;
+%      endif
+    }
+%     elif beta != 1:
+    {
+        .reg .${pftype} _tmp;
+%      if n is None:
+        .reg .u32 _coff;
+        .reg .u64 _cptr;
+        mul.lo.u32 _coff, ldc, ${row_j};
+        mad.wide.u32 _cptr, ${dwidth_i}, _coff, c_base;
+        ld.weak.global.cg.${pftype} _tmp, [_cptr];
+        mul.${pftype} _tmp, _tmp, ${float(beta)};
+        st.weak.global.${pftype} [_cptr], _tmp;
+%      else:
+        ld.weak.global.cg.${pftype} _tmp, [c_base + ${ldc*row_j*dwidth_i}];
+        mul.${pftype} _tmp, _tmp, ${float(beta)};
+        st.weak.global.${pftype} [c_base + ${ldc*row_j*dwidth_i}], _tmp;
+%      endif
+    }
+%     endif
+%    endif
+%   endfor
+%  endif
+
+$L_END_CID_${cid}:
+% endfor
+
+$L_EXIT:
+    ret;
+}