Skip to content

Commit 0fb422f

Browse files
authored
Manual merge of #16307 (#16328)
Summary: As title. This PR contains all the changes from the #16307 PR stack. There was an issue creating a cherry-pick PR after the changes landed via Phabricator.
1 parent ab02129 commit 0fb422f

File tree

11 files changed

+168
-83
lines changed

11 files changed

+168
-83
lines changed

backends/vulkan/runtime/graph/Logging.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,11 @@ std::string make_arg_json(ComputeGraph* const compute_graph, ValueRef arg) {
5252
} else if (compute_graph->val_is_value_list(arg)) {
5353
ValueListPtr val_list = compute_graph->get_value_list(arg);
5454
ss << ", \"values\": [";
55-
for (const ValueRef& value : *val_list) {
56-
ss << value << ", ";
55+
for (size_t i = 0; i < val_list->size(); ++i) {
56+
ss << val_list->at(i);
57+
if (i + 1 < val_list->size()) {
58+
ss << ", ";
59+
}
5760
}
5861
ss << "]";
5962
} else if (compute_graph->val_is_int_list(arg)) {

backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,7 @@ void main() {
9494
return;
9595
}
9696

97-
TensorIndex outp_tidx;
98-
linear_idx_to_tensor_idx(outp, out_bufi, outp_tidx);
97+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(outp, out_bufi);
9998

10099
TensorIndex inp_tidx = outp_tidx;
101100
clamp_tensor_idx(inp, inp_tidx);

backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@ void main() {
2929
return;
3030
}
3131

32-
TensorIndex inp_tidx;
33-
linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx);
32+
TensorIndex inp_tidx = linear_idx_to_tensor_idx(inp, inp_bufi);
3433

3534
uint nchwi = tensor_idx_to_contiguous_idx(inp, inp_tidx);
3635

backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,12 @@ void main() {
3333
return;
3434
}
3535

36-
TensorIndex outp_tidx;
37-
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
36+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(outp, outp_bufi);
3837

3938
// Map output tensor index to input tensor index by taking modulo
4039
// with input tensor sizes for each dimension
4140
TensorIndex inp_tidx = outp_tidx;
42-
for (int d = 0; d < ndim(inp); ++d) {
41+
for (int d = 0; d < ndim(outp); ++d) {
4342
uint inp_size = size_at(inp, d);
4443
uint outp_idx = idx_at(outp_tidx, d);
4544
inp_tidx.data[div_4(d)][mod_4(d)] = outp_idx % inp_size;

backends/vulkan/runtime/graph/ops/glsl/indexing.glslh

Lines changed: 105 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,59 @@
1111

1212
#include "common.glslh"
1313

14+
#extension GL_EXT_control_flow_attributes : require
15+
1416
#define DIMLIMIT 8
1517
#define DIMLIMIT_DIV4 2
1618

19+
//
20+
// Hashed layout utils
21+
//
22+
23+
/*
24+
* The hashed layout is a packed int32 where each group of 4 bits contain some
25+
* information about the memory layout of a tensor buffer or texture. It is
26+
* passed into shaders as a specialization constant and allows shader compilers
27+
* to select optimized code paths suited for a particular memory layout.
28+
*
29+
* Currently the following information is packed into the layout integer:
30+
* - bits 0-15: first 4 elements of the dim order array
31+
* - bits 0-3: dim_order[0]
32+
* - bits 4-7: dim_order[1]
33+
* - bits 8-11: dim_order[2]
34+
* - bits 12-15: dim_order[3]
35+
*/
36+
37+
// Extracts the 4-bit packed value at the given position (0..7) from a 32-bit
38+
// int. Position 0 corresponds to the least-significant 4 bits; position 7 to
39+
// the most-significant.
40+
int extract_4b(const int packed, const int pos) {
41+
return (packed >> (pos * 4)) & 0xF;
42+
}
43+
44+
45+
// Corresponds to dim_order[:4] = [0, 1, 2, 3]
46+
#define CONTIGUOUS_BUFFER_LAYOUT_ID 12816
47+
// Corresponds to dim_order[:4] = [2, 0, 1, 3]
48+
#define CHANNELS_LAST_BUFFER_LAYOUT_ID 12546
49+
50+
// Used as a default value for hashed layout ints, representing the most common
51+
// layout used for buffer-backed tensors (i.e. contiguous buffers)
52+
#define CONTIG_LAYOUT_INT 12816
53+
54+
int layout_id(const int hashed_layout) {
55+
// Extract the first 16 bits
56+
return hashed_layout & 0xFFFF;
57+
}
58+
59+
bool is_contiguous(const int hashed_layout) {
60+
return layout_id(hashed_layout) == CONTIGUOUS_BUFFER_LAYOUT_ID;
61+
}
62+
63+
bool is_channels_last(const int hashed_layout) {
64+
return layout_id(hashed_layout) == CHANNELS_LAST_BUFFER_LAYOUT_ID;
65+
}
66+
1767
//
1868
// BufferMetadata
1969
//
@@ -126,15 +176,6 @@ uint idx_at(const TensorIndex tidx, const uint dim) {
126176
return tidx.data[div_4(dim)][mod_4(dim)];
127177
}
128178

129-
void permute(inout TensorIndex tidx, const ivec4 permute_order[DIMLIMIT_DIV4]) {
130-
TensorIndex new_tidx = tidx;
131-
for (int d = 0; d < DIMLIMIT; ++d) {
132-
int src_dim = permute_order[div_4(d)][mod_4(d)];
133-
new_tidx.data[div_4(d)][mod_4(d)] = idx_at(tidx, src_dim);
134-
}
135-
tidx = new_tidx;
136-
}
137-
138179
uint x(const TensorIndex tidx) {
139180
return tidx.data[0][0];
140181
}
@@ -174,83 +215,110 @@ struct TextureElementIndex {
174215
// Index Conversions
175216
//
176217

177-
void contiguous_idx_to_tensor_idx(
218+
TensorIndex contiguous_idx_to_tensor_idx(
178219
const BufferMetadata meta,
179-
uint contiguous_idx,
180-
out TensorIndex tidx) {
181-
initialize(tidx);
182-
int dim = int_ndim(meta);
183-
int i = 0;
220+
uint contiguous_idx) {
221+
TensorIndex tidx;
184222

185223
uint contiguous_strides[DIMLIMIT];
224+
186225
contiguous_strides[0] = 1;
187-
for (int d = 1; d < DIMLIMIT; ++d) {
226+
[[unroll]] for (int d = 1; d < DIMLIMIT; ++d) {
188227
contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1];
189228
}
190229

191-
for (int d = max(dim - 1, 0); d >= 0; d--) {
192-
uint dim_stride = contiguous_strides[d];
193-
194-
tidx.data[div_4(d)][mod_4(d)] = contiguous_idx / dim_stride;
195-
contiguous_idx = contiguous_idx % dim_stride;
230+
[[unroll]] for (int d = DIMLIMIT - 1; d >= 0; --d) {
231+
tidx.data[div_4(d)][mod_4(d)] = contiguous_idx / contiguous_strides[d];
232+
contiguous_idx = contiguous_idx % contiguous_strides[d];
196233
}
197-
}
198234

199-
TensorIndex contiguous_idx_to_tensor_idx(
200-
const BufferMetadata meta,
201-
uint contiguous_idx) {
202-
TensorIndex tidx;
203-
contiguous_idx_to_tensor_idx(meta, contiguous_idx, tidx);
204235
return tidx;
205236
}
206237

207238
uint tensor_idx_to_contiguous_idx(
208239
const BufferMetadata meta,
209240
const TensorIndex tidx) {
210241
uint contiguous_strides[DIMLIMIT];
242+
211243
contiguous_strides[0] = 1;
212-
for (int d = 1; d < DIMLIMIT; ++d) {
244+
[[unroll]] for (int d = 1; d < DIMLIMIT; ++d) {
213245
contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1];
214246
}
215247

216248
uint contig_idx = 0;
217-
for (int d = 0; d < ndim(meta); ++d) {
249+
[[unroll]] for (int d = 0; d < DIMLIMIT; ++d) {
218250
contig_idx += contiguous_strides[d] * idx_at(tidx, d);
219251
}
252+
220253
return contig_idx;
221254
}
222255

223-
void linear_idx_to_tensor_idx(
256+
TensorIndex linear_idx_to_tensor_idx(
224257
const BufferMetadata meta,
225-
uint linear_idx,
226-
out TensorIndex tidx) {
258+
uint linear_idx) {
259+
TensorIndex tidx;
227260
initialize(tidx);
228261
int dim = int_ndim(meta);
229262
int i = 0;
230263
for (int d = max(dim - 1, 0); d >= 0; d--) {
231-
uint dim_idx = dim_order_at(meta, d);
232-
uint dim_stride = stride_at(meta, dim_idx);
264+
uint dim_idx = meta.dim_order[div_4(d)][mod_4(d)];
265+
uint dim_stride = meta.strides[div_4(dim_idx)][mod_4(dim_idx)];
233266

234267
tidx.data[div_4(dim_idx)][mod_4(dim_idx)] = linear_idx / dim_stride;
235268
linear_idx = linear_idx % dim_stride;
236269
}
270+
return tidx;
237271
}
238272

239-
TensorIndex linear_idx_to_tensor_idx(
273+
TensorIndex linear_idx_to_tensor_idx_contig_case(
274+
const BufferMetadata meta,
275+
uint linear_idx) {
276+
TensorIndex tidx;
277+
278+
[[unroll]] for (int d = DIMLIMIT - 1; d >= 0; --d) {
279+
tidx.data[div_4(d)][mod_4(d)] = linear_idx / stride_at(meta, d);
280+
linear_idx = linear_idx % stride_at(meta, d);
281+
}
282+
283+
return tidx;
284+
}
285+
286+
TensorIndex linear_idx_to_tensor_idx_channelslast_case(
240287
const BufferMetadata meta,
241288
uint linear_idx) {
242289
TensorIndex tidx;
243-
linear_idx_to_tensor_idx(meta, linear_idx, tidx);
290+
291+
const uint dim_order[DIMLIMIT] = uint[DIMLIMIT](2, 0, 1, 3, 6, 5, 4, 7);
292+
293+
[[unroll]] for (int d = DIMLIMIT - 1; d >= 0; --d) {
294+
uint dim = dim_order[d];
295+
tidx.data[div_4(dim)][mod_4(dim)] = linear_idx / stride_at(meta, dim);
296+
linear_idx = linear_idx % stride_at(meta, dim);
297+
}
298+
244299
return tidx;
245300
}
246301

302+
TensorIndex linear_idx_to_tensor_idx(
303+
const BufferMetadata meta,
304+
uint linear_idx,
305+
int hashed_layout) {
306+
if (is_contiguous(hashed_layout)) {
307+
return linear_idx_to_tensor_idx_contig_case(meta, linear_idx);
308+
} else if (is_channels_last(hashed_layout)) {
309+
return linear_idx_to_tensor_idx_channelslast_case(meta, linear_idx);
310+
}
311+
return linear_idx_to_tensor_idx(meta, linear_idx);
312+
}
313+
247314
uint tensor_idx_to_linear_idx(
248315
const BufferMetadata meta,
249316
const TensorIndex tidx) {
250317
uint lin_idx = 0;
251-
for (int d = 0; d < ndim(meta); ++d) {
318+
[[unroll]] for (int d = 0; d < DIMLIMIT; ++d) {
252319
lin_idx += stride_at(meta, d) * idx_at(tidx, d);
253320
}
321+
254322
return lin_idx;
255323
}
256324

backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,9 @@ void main() {
2929
return;
3030
}
3131

32-
TensorIndex outp_tidx;
32+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(outp, outp_bufi);
3333
uint nchwi;
3434

35-
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
36-
3735
if (transpose_hw == 1) {
3836
BufferMetadata transposed_meta = outp;
3937
transposed_meta.sizes[0].xy = transposed_meta.sizes[0].yx;

backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,38 @@ ${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")}
2626
${layout_declare_ubo(B, "BufferMetadata", "outp")}
2727
${layout_declare_ubo(B, "BufferMetadata", "inp")}
2828

29-
${layout_declare_ubo(B, "ivec4[DIMLIMIT_DIV4]", "permute_order")}
29+
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
30+
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
31+
${layout_declare_spec_const(C, "int", "permute_order", "0")}
3032

3133
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
3234

35+
TensorIndex permute(TensorIndex tidx) {
36+
TensorIndex new_tidx = tidx;
37+
38+
new_tidx.data[0][0] = idx_at(tidx, extract_4b(permute_order, 0));
39+
new_tidx.data[0][1] = idx_at(tidx, extract_4b(permute_order, 1));
40+
new_tidx.data[0][2] = idx_at(tidx, extract_4b(permute_order, 2));
41+
new_tidx.data[0][3] = idx_at(tidx, extract_4b(permute_order, 3));
42+
43+
new_tidx.data[1][0] = idx_at(tidx, extract_4b(permute_order, 4));
44+
new_tidx.data[1][1] = idx_at(tidx, extract_4b(permute_order, 5));
45+
new_tidx.data[1][2] = idx_at(tidx, extract_4b(permute_order, 6));
46+
new_tidx.data[1][3] = idx_at(tidx, extract_4b(permute_order, 7));
47+
48+
return new_tidx;
49+
}
50+
3351
void main() {
3452
const uint inp_bufi = gl_GlobalInvocationID.x;
3553
if (inp_bufi >= numel(inp)) {
3654
return;
3755
}
3856

39-
TensorIndex inp_tidx;
40-
linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx);
41-
42-
TensorIndex outp_tidx = inp_tidx;
43-
permute(outp_tidx, permute_order);
44-
57+
TensorIndex inp_tidx = linear_idx_to_tensor_idx(inp, inp_bufi, inp_layout);
58+
TensorIndex outp_tidx = permute(inp_tidx);
4559
const uint outp_bufi = tensor_idx_to_linear_idx(outp, outp_tidx);
60+
4661
// Copy data from input to output
4762
t_outp[outp_bufi] = t_inp[inp_bufi];
4863
}

backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ ${layout_declare_ubo(B, "BufferMetadata", "inp")}
1818

1919
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2020

21-
${layout_declare_spec_const(C, "int", "all_contiguous", "0")}
21+
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
22+
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
2223

2324
/*
2425
* The insight behind the view operation is that the contiguous index of each
@@ -31,16 +32,15 @@ void main() {
3132
}
3233

3334
uint inp_bufi = outp_bufi;
34-
if (all_contiguous == 0) {
35-
TensorIndex outp_tidx;
36-
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
35+
if (!is_contiguous(outp_layout) || !is_contiguous(inp_layout)) {
36+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(
37+
outp, outp_bufi, outp_layout);
3738

3839
// To map the output to the input, find the input element that has the same
3940
// contiguous index as the output element.
4041
const uint contig_idx = tensor_idx_to_contiguous_idx(outp, outp_tidx);
4142

42-
TensorIndex inp_tidx;
43-
contiguous_idx_to_tensor_idx(inp, contig_idx, inp_tidx);
43+
TensorIndex inp_tidx = contiguous_idx_to_tensor_idx(inp, contig_idx);
4444

4545
inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
4646
}

backends/vulkan/runtime/graph/ops/glsl/view_convert_buffer.glsl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ ${layout_declare_ubo(B, "BufferMetadata", "inp")}
2020

2121
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2222

23-
${layout_declare_spec_const(C, "int", "all_contiguous", "0")}
23+
${layout_declare_spec_const(C, "int", "outp_layout", "0")}
24+
${layout_declare_spec_const(C, "int", "inp_layout", "0")}
2425

2526
/*
2627
* The insight behind the view_convert operation is that the contiguous index of each
@@ -35,16 +36,15 @@ void main() {
3536

3637
uint inp_bufi = outp_bufi;
3738

38-
if (all_contiguous == 0) {
39-
TensorIndex outp_tidx;
40-
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
39+
if (!is_contiguous(outp_layout) || !is_contiguous(inp_layout)) {
40+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(
41+
outp, outp_bufi, outp_layout);
4142

4243
// To map the output to the input, find the input element that has the same
4344
// contiguous index as the output element.
4445
const uint contig_idx = tensor_idx_to_contiguous_idx(outp, outp_tidx);
4546

46-
TensorIndex inp_tidx;
47-
contiguous_idx_to_tensor_idx(inp, contig_idx, inp_tidx);
47+
TensorIndex inp_tidx = contiguous_idx_to_tensor_idx(inp, contig_idx);
4848

4949
inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
5050
}

0 commit comments

Comments
 (0)