Skip to content

Commit 0fb422f

Browse files
authored
Manual merge of #16307 (#16328)
Summary: As title. This PR contains all the changes from the #16307 PR stack. There was an issue creating a cherry-pick PR after the changes landed via Phabricator.
1 parent ab02129 commit 0fb422f

File tree

11 files changed

+168
-83
lines changed

11 files changed

+168
-83
lines changed

backends/vulkan/runtime/graph/Logging.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,11 @@ std::string make_arg_json(ComputeGraph* const compute_graph, ValueRef arg) {
5252
} else if (compute_graph->val_is_value_list(arg)) {
5353
ValueListPtr val_list = compute_graph->get_value_list(arg);
5454
ss << ", \"values\": [";
55-
for (const ValueRef& value : *val_list) {
56-
ss << value << ", ";
55+
for (size_t i = 0; i < val_list->size(); ++i) {
56+
ss << val_list->at(i);
57+
if (i + 1 < val_list->size()) {
58+
ss << ", ";
59+
}
5760
}
5861
ss << "]";
5962
} else if (compute_graph->val_is_int_list(arg)) {

backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,7 @@ void main() {
9494
return;
9595
}
9696

97-
TensorIndex outp_tidx;
98-
linear_idx_to_tensor_idx(outp, out_bufi, outp_tidx);
97+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(outp, out_bufi);
9998

10099
TensorIndex inp_tidx = outp_tidx;
101100
clamp_tensor_idx(inp, inp_tidx);

backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@ void main() {
2929
return;
3030
}
3131

32-
TensorIndex inp_tidx;
33-
linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx);
32+
TensorIndex inp_tidx = linear_idx_to_tensor_idx(inp, inp_bufi);
3433

3534
uint nchwi = tensor_idx_to_contiguous_idx(inp, inp_tidx);
3635

backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,12 @@ void main() {
3333
return;
3434
}
3535

36-
TensorIndex outp_tidx;
37-
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
36+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(outp, outp_bufi);
3837

3938
// Map output tensor index to input tensor index by taking modulo
4039
// with input tensor sizes for each dimension
4140
TensorIndex inp_tidx = outp_tidx;
42-
for (int d = 0; d < ndim(inp); ++d) {
41+
for (int d = 0; d < ndim(outp); ++d) {
4342
uint inp_size = size_at(inp, d);
4443
uint outp_idx = idx_at(outp_tidx, d);
4544
inp_tidx.data[div_4(d)][mod_4(d)] = outp_idx % inp_size;

backends/vulkan/runtime/graph/ops/glsl/indexing.glslh

Lines changed: 105 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,59 @@
1111

1212
#include "common.glslh"
1313

14+
#extension GL_EXT_control_flow_attributes : require
15+
1416
#define DIMLIMIT 8
1517
#define DIMLIMIT_DIV4 2
1618

19+
//
20+
// Hashed layout utils
21+
//
22+
23+
/*
24+
* The hashed layout is a packed int32 where each group of 4 bits contain some
25+
* information about the memory layout of a tensor buffer or texture. It is
26+
* passed into shaders as a specialization constant and allows shader compilers
27+
* to select optimized code paths suited for a particular memory layout.
28+
*
29+
* Currently the following information is packed into the layout integer:
30+
* - bits 0-15: first 4 elements of the dim order array
31+
* - bits 0-3: dim_order[0]
32+
* - bits 4-7: dim_order[1]
33+
* - bits 8-11: dim_order[2]
34+
* - bits 12-15: dim_order[3]
35+
*/
36+
37+
// Extracts the 4-bit packed value at the given position (0..7) from a 32-bit
38+
// int. Position 0 corresponds to the least-significant 4 bits; position 7 to
39+
// the most-significant.
40+
int extract_4b(const int packed, const int pos) {
41+
return (packed >> (pos * 4)) & 0xF;
42+
}
43+
44+
45+
// Corresponds to dim_order[:4] = [0, 1, 2, 3]
46+
#define CONTIGUOUS_BUFFER_LAYOUT_ID 12816
47+
// Corresponds to dim_order[:4] = [2, 0, 1, 3]
48+
#define CHANNELS_LAST_BUFFER_LAYOUT_ID 12546
49+
50+
// Used as a default value for hashed layout ints, representing the most common
51+
// layout used for buffer-backed tensors (i.e. contiguous buffers)
52+
#define CONTIG_LAYOUT_INT 12816
53+
54+
int layout_id(const int hashed_layout) {
55+
// Extract the first 16 bits
56+
return hashed_layout & 0xFFFF;
57+
}
58+
59+
bool is_contiguous(const int hashed_layout) {
60+
return layout_id(hashed_layout) == CONTIGUOUS_BUFFER_LAYOUT_ID;
61+
}
62+
63+
bool is_channels_last(const int hashed_layout) {
64+
return layout_id(hashed_layout) == CHANNELS_LAST_BUFFER_LAYOUT_ID;
65+
}
66+
1767
//
1868
// BufferMetadata
1969
//
@@ -126,15 +176,6 @@ uint idx_at(const TensorIndex tidx, const uint dim) {
126176
return tidx.data[div_4(dim)][mod_4(dim)];
127177
}
128178

129-
void permute(inout TensorIndex tidx, const ivec4 permute_order[DIMLIMIT_DIV4]) {
130-
TensorIndex new_tidx = tidx;
131-
for (int d = 0; d < DIMLIMIT; ++d) {
132-
int src_dim = permute_order[div_4(d)][mod_4(d)];
133-
new_tidx.data[div_4(d)][mod_4(d)] = idx_at(tidx, src_dim);
134-
}
135-
tidx = new_tidx;
136-
}
137-
138179
uint x(const TensorIndex tidx) {
139180
return tidx.data[0][0];
140181
}
@@ -174,83 +215,110 @@ struct TextureElementIndex {
174215
// Index Conversions
175216
//
176217

177-
void contiguous_idx_to_tensor_idx(
218+
TensorIndex contiguous_idx_to_tensor_idx(
178219
const BufferMetadata meta,
179-
uint contiguous_idx,
180-
out TensorIndex tidx) {
181-
initialize(tidx);
182-
int dim = int_ndim(meta);
183-
int i = 0;
220+
uint contiguous_idx) {
221+
TensorIndex tidx;
184222

185223
uint contiguous_strides[DIMLIMIT];
224+
186225
contiguous_strides[0] = 1;
187-
for (int d = 1; d < DIMLIMIT; ++d) {
226+
[[unroll]] for (int d = 1; d < DIMLIMIT; ++d) {
188227
contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1];
189228
}
190229

191-
for (int d = max(dim - 1, 0); d >= 0; d--) {
192-
uint dim_stride = contiguous_strides[d];
193-
194-
tidx.data[div_4(d)][mod_4(d)] = contiguous_idx / dim_stride;
195-
contiguous_idx = contiguous_idx % dim_stride;
230+
[[unroll]] for (int d = DIMLIMIT - 1; d >= 0; --d) {
231+
tidx.data[div_4(d)][mod_4(d)] = contiguous_idx / contiguous_strides[d];
232+
contiguous_idx = contiguous_idx % contiguous_strides[d];
196233
}
197-
}
198234

199-
TensorIndex contiguous_idx_to_tensor_idx(
200-
const BufferMetadata meta,
201-
uint contiguous_idx) {
202-
TensorIndex tidx;
203-
contiguous_idx_to_tensor_idx(meta, contiguous_idx, tidx);
204235
return tidx;
205236
}
206237

207238
uint tensor_idx_to_contiguous_idx(
208239
const BufferMetadata meta,
209240
const TensorIndex tidx) {
210241
uint contiguous_strides[DIMLIMIT];
242+
211243
contiguous_strides[0] = 1;
212-
for (int d = 1; d < DIMLIMIT; ++d) {
244+
[[unroll]] for (int d = 1; d < DIMLIMIT; ++d) {
213245
contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1];
214246
}
215247

216248
uint contig_idx = 0;
217-
for (int d = 0; d < ndim(meta); ++d) {
249+
[[unroll]] for (int d = 0; d < DIMLIMIT; ++d) {
218250
contig_idx += contiguous_strides[d] * idx_at(tidx, d);
219251
}
252+
220253
return contig_idx;
221254
}
222255

223-
void linear_idx_to_tensor_idx(
256+
TensorIndex linear_idx_to_tensor_idx(
224257
const BufferMetadata meta,
225-
uint linear_idx,
226-
out TensorIndex tidx) {
258+
uint linear_idx) {
259+
TensorIndex tidx;
227260
initialize(tidx);
228261
int dim = int_ndim(meta);
229262
int i = 0;
230263
for (int d = max(dim - 1, 0); d >= 0; d--) {
231-
uint dim_idx = dim_order_at(meta, d);
232-
uint dim_stride = stride_at(meta, dim_idx);
264+
uint dim_idx = meta.dim_order[div_4(d)][mod_4(d)];
265+
uint dim_stride = meta.strides[div_4(dim_idx)][mod_4(dim_idx)];
233266

234267
tidx.data[div_4(dim_idx)][mod_4(dim_idx)] = linear_idx / dim_stride;
235268
linear_idx = linear_idx % dim_stride;
236269
}
270+
return tidx;
237271
}
238272

239-
TensorIndex linear_idx_to_tensor_idx(
273+
TensorIndex linear_idx_to_tensor_idx_contig_case(
274+
const BufferMetadata meta,
275+
uint linear_idx) {
276+
TensorIndex tidx;
277+
278+
[[unroll]] for (int d = DIMLIMIT - 1; d >= 0; --d) {
279+
tidx.data[div_4(d)][mod_4(d)] = linear_idx / stride_at(meta, d);
280+
linear_idx = linear_idx % stride_at(meta, d);
281+
}
282+
283+
return tidx;
284+
}
285+
286+
TensorIndex linear_idx_to_tensor_idx_channelslast_case(
240287
const BufferMetadata meta,
241288
uint linear_idx) {
242289
TensorIndex tidx;
243-
linear_idx_to_tensor_idx(meta, linear_idx, tidx);
290+
291+
const uint dim_order[DIMLIMIT] = uint[DIMLIMIT](2, 0, 1, 3, 6, 5, 4, 7);
292+
293+
[[unroll]] for (int d = DIMLIMIT - 1; d >= 0; --d) {
294+
uint dim = dim_order[d];
295+
tidx.data[div_4(dim)][mod_4(dim)] = linear_idx / stride_at(meta, dim);
296+
linear_idx = linear_idx % stride_at(meta, dim);
297+
}
298+
244299
return tidx;
245300
}
246301

302+
TensorIndex linear_idx_to_tensor_idx(
303+
const BufferMetadata meta,
304+
uint linear_idx,
305+
int hashed_layout) {
306+
if (is_contiguous(hashed_layout)) {
307+
return linear_idx_to_tensor_idx_contig_case(meta, linear_idx);
308+
} else if (is_channels_last(hashed_layout)) {
309+
return linear_idx_to_tensor_idx_channelslast_case(meta, linear_idx);
310+
}
311+
return linear_idx_to_tensor_idx(meta, linear_idx);
312+
}
313+
247314
uint tensor_idx_to_linear_idx(
248315
const BufferMetadata meta,
249316
const TensorIndex tidx) {
250317
uint lin_idx = 0;
251-
for (int d = 0; d < ndim(meta); ++d) {
318+
[[unroll]] for (int d = 0; d < DIMLIMIT; ++d) {
252319
lin_idx += stride_at(meta, d) * idx_at(tidx, d);
253320
}
321+
254322
return lin_idx;
255323
}
256324

backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,9 @@ void main() {
2929
return;
3030
}
3131

32-
TensorIndex outp_tidx;
32+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(outp, outp_bufi);
3333
uint nchwi;
3434

35-
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
36-
3735
if (transpose_hw == 1) {
3836
BufferMetadata transposed_meta = outp;
3937
transposed_meta.sizes[0].xy = transposed_meta.sizes[0].yx;

backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,38 @@ ${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")}
2626
${layout_declare_ubo(B, "BufferMetadata", "outp")}
2727
${layout_declare_ubo(B, "BufferMetadata", "inp")}
2828

29-
${layout_declare_ubo(B, "ivec4[DIMLIMIT_DIV4]", "permute_order")}
29+
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
30+
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
31+
${layout_declare_spec_const(C, "int", "permute_order", "0")}
3032

3133
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
3234

35+
TensorIndex permute(TensorIndex tidx) {
36+
TensorIndex new_tidx = tidx;
37+
38+
new_tidx.data[0][0] = idx_at(tidx, extract_4b(permute_order, 0));
39+
new_tidx.data[0][1] = idx_at(tidx, extract_4b(permute_order, 1));
40+
new_tidx.data[0][2] = idx_at(tidx, extract_4b(permute_order, 2));
41+
new_tidx.data[0][3] = idx_at(tidx, extract_4b(permute_order, 3));
42+
43+
new_tidx.data[1][0] = idx_at(tidx, extract_4b(permute_order, 4));
44+
new_tidx.data[1][1] = idx_at(tidx, extract_4b(permute_order, 5));
45+
new_tidx.data[1][2] = idx_at(tidx, extract_4b(permute_order, 6));
46+
new_tidx.data[1][3] = idx_at(tidx, extract_4b(permute_order, 7));
47+
48+
return new_tidx;
49+
}
50+
3351
void main() {
3452
const uint inp_bufi = gl_GlobalInvocationID.x;
3553
if (inp_bufi >= numel(inp)) {
3654
return;
3755
}
3856

39-
TensorIndex inp_tidx;
40-
linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx);
41-
42-
TensorIndex outp_tidx = inp_tidx;
43-
permute(outp_tidx, permute_order);
44-
57+
TensorIndex inp_tidx = linear_idx_to_tensor_idx(inp, inp_bufi, inp_layout);
58+
TensorIndex outp_tidx = permute(inp_tidx);
4559
const uint outp_bufi = tensor_idx_to_linear_idx(outp, outp_tidx);
60+
4661
// Copy data from input to output
4762
t_outp[outp_bufi] = t_inp[inp_bufi];
4863
}

backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ ${layout_declare_ubo(B, "BufferMetadata", "inp")}
1818

1919
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2020

21-
${layout_declare_spec_const(C, "int", "all_contiguous", "0")}
21+
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
22+
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
2223

2324
/*
2425
* The insight behind the view operation is that the contiguous index of each
@@ -31,16 +32,15 @@ void main() {
3132
}
3233

3334
uint inp_bufi = outp_bufi;
34-
if (all_contiguous == 0) {
35-
TensorIndex outp_tidx;
36-
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
35+
if (!is_contiguous(outp_layout) || !is_contiguous(inp_layout)) {
36+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(
37+
outp, outp_bufi, outp_layout);
3738

3839
// To map the output to the input, find the input element that has the same
3940
// contiguous index as the output element.
4041
const uint contig_idx = tensor_idx_to_contiguous_idx(outp, outp_tidx);
4142

42-
TensorIndex inp_tidx;
43-
contiguous_idx_to_tensor_idx(inp, contig_idx, inp_tidx);
43+
TensorIndex inp_tidx = contiguous_idx_to_tensor_idx(inp, contig_idx);
4444

4545
inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
4646
}

backends/vulkan/runtime/graph/ops/glsl/view_convert_buffer.glsl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ ${layout_declare_ubo(B, "BufferMetadata", "inp")}
2020

2121
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2222

23-
${layout_declare_spec_const(C, "int", "all_contiguous", "0")}
23+
${layout_declare_spec_const(C, "int", "outp_layout", "0")}
24+
${layout_declare_spec_const(C, "int", "inp_layout", "0")}
2425

2526
/*
2627
* The insight behind the view_convert operation is that the contiguous index of each
@@ -35,16 +36,15 @@ void main() {
3536

3637
uint inp_bufi = outp_bufi;
3738

38-
if (all_contiguous == 0) {
39-
TensorIndex outp_tidx;
40-
linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
39+
if (!is_contiguous(outp_layout) || !is_contiguous(inp_layout)) {
40+
TensorIndex outp_tidx = linear_idx_to_tensor_idx(
41+
outp, outp_bufi, outp_layout);
4142

4243
// To map the output to the input, find the input element that has the same
4344
// contiguous index as the output element.
4445
const uint contig_idx = tensor_idx_to_contiguous_idx(outp, outp_tidx);
4546

46-
TensorIndex inp_tidx;
47-
contiguous_idx_to_tensor_idx(inp, contig_idx, inp_tidx);
47+
TensorIndex inp_tidx = contiguous_idx_to_tensor_idx(inp, contig_idx);
4848

4949
inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
5050
}

0 commit comments

Comments
 (0)