Commit cdbada8

vulkan: Add perf logger mode with concurrency (#17944)
This implements a variation of the perf logger where, rather than timing each operation individually with what is effectively a barrier in between, we place the timing boundaries where we already synchronize and time the groups of work that normally overlap. This can be useful for understanding whether individual operations need to be optimized or whether the group is already running efficiently.

GGML_VK_PERF_LOGGER_CONCURRENT=1 enables the new mode (when GGML_VK_PERF_LOGGER is also set).
GGML_VK_SYNC_LOGGER=1 replaces the ENABLE_SYNC_LOGGING compile-time switch.
1 parent 8ea958d commit cdbada8
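
A rough, self-contained sketch of the bookkeeping the concurrent mode implies (hypothetical `Op`/`GroupLogger` names invented for this illustration; the real logic lives in `vk_perf_logger` in the diff below): ops that execute between the same pair of sync points share one timestamp interval, so their names are concatenated into a single key and their FLOP estimates are summed under it.

```cpp
// Illustrative sketch only: mirrors the idea behind the concurrent perf logger
// mode, not the actual ggml-vulkan implementation. Op and GroupLogger are
// hypothetical names invented for this example.
#include <cstdint>
#include <iostream>
#include <map>
#include <numeric>
#include <string>
#include <vector>

struct Op {
    std::string name;  // e.g. "MUL_MAT m=4096 n=1 k=4096"
    uint64_t    flops; // 0 if no FLOP estimate applies to this op
};

struct GroupLogger {
    std::map<std::string, std::vector<uint64_t>> timings; // ns per group key
    std::map<std::string, std::vector<uint64_t>> flops;   // FLOPs per group key

    // One call per group of ops that ran between two sync points.
    void log_group(const std::vector<Op> & ops, uint64_t time_ns) {
        std::string key;
        uint64_t total_flops = 0;
        for (size_t i = 0; i < ops.size(); ++i) {
            key         += ops[i].name;
            total_flops += ops[i].flops;
            if (i + 1 < ops.size()) {
                key += ", ";
            }
        }
        if (total_flops) {
            flops[key].push_back(total_flops);
        }
        timings[key].push_back(time_ns);
    }

    void print() const {
        for (const auto & [key, times] : timings) {
            const uint64_t total_ns = std::accumulate(times.begin(), times.end(), uint64_t{0});
            std::cout << key << ": " << times.size() << " calls, "
                      << total_ns / 1000.0 << " us total" << std::endl;
        }
    }
};

int main() {
    GroupLogger logger;
    // Two ops that overlapped between the same pair of sync points end up in one entry.
    logger.log_group({{"MUL_MAT m=4096 n=1 k=4096", uint64_t(4096) * 1 * (2 * 4096 - 1)},
                      {"RMS_NORM(4096,1,1,1)", 0}}, 125000 /* ns */);
    logger.print();
    return 0;
}
```

In the actual patch this corresponds to the new `log_timing(const std::vector<ggml_tensor *> &, const std::vector<const char *> &, uint64_t)` overload, which builds the per-node parts of the key via `get_node_fusion_name()`.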

1 file changed: 104 additions, 42 deletions

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 104 additions & 42 deletions
@@ -1527,6 +1527,8 @@ class vk_memory_logger {
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
 static bool vk_perf_logger_enabled = false;
+static bool vk_perf_logger_concurrent = false;
+static bool vk_enable_sync_logger = false;
 // number of calls between perf logger prints
 static uint32_t vk_perf_logger_frequency = 1;
 
@@ -1577,14 +1579,14 @@ class vk_perf_logger {
         flops.clear();
     }
 
-    void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
+    std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) {
+        *n_flops = 0;
         std::string fusion_str;
         if (fusion_name) {
             fusion_str = fusion_name + std::string(" ");
         }
         if (node->op == GGML_OP_UNARY) {
-            timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
-            return;
+            return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node));
         }
         if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
             const uint64_t m = node->ne[0];
@@ -1606,9 +1608,8 @@ class vk_perf_logger {
                 name += " batch=" + std::to_string(batch);
             }
             name = fusion_str + name;
-            timings[name].push_back(time);
-            flops[name].push_back(m * n * (k + (k - 1)) * batch);
-            return;
+            *n_flops = m * n * (k + (k - 1)) * batch;
+            return name;
         }
         if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
             std::string name = ggml_op_name(node->op);
@@ -1624,20 +1625,17 @@ class vk_perf_logger {
             uint64_t size_M = Cout;
             uint64_t size_K = Cin * KW * KH;
             uint64_t size_N = N * OW * OH;
-            uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
+            *n_flops = size_M * size_N * (size_K + (size_K - 1));
             name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
                     ", N=N*OW*OH=" + std::to_string(size_N);
             name = fusion_str + name;
-            flops[name].push_back(n_flops);
-            timings[name].push_back(time);
-            return;
+            return name;
         }
         if (node->op == GGML_OP_RMS_NORM) {
             std::string name = ggml_op_name(node->op);
             name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
             name = fusion_str + name;
-            timings[name].push_back(time);
-            return;
+            return name;
         }
         if (node->op == GGML_OP_FLASH_ATTN_EXT) {
             const ggml_tensor * dst = node;
@@ -1653,20 +1651,46 @@ class vk_perf_logger {
                 " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
                 " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
                 " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
-            timings[name.str()].push_back(time);
-            return;
+            return name.str();
         }
         if (node->op == GGML_OP_TOP_K) {
             std::stringstream name;
             name << fusion_str;
             name << ggml_op_name(node->op) <<
                 " K=" << node->ne[0] <<
                 " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
-            timings[name.str()].push_back(time);
-            return;
+            return name.str();
+        }
+        return fusion_str + ggml_op_name(node->op);
+    }
+
+    void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
+        uint64_t n_flops;
+        std::string name = get_node_fusion_name(node, fusion_name, &n_flops);
+        if (n_flops) {
+            flops[name].push_back(n_flops);
+        }
+        timings[name].push_back(time);
+    }
+
+    void log_timing(const std::vector<ggml_tensor *> &nodes, const std::vector<const char *> &names, uint64_t time) {
+        uint64_t total_flops = 0;
+        std::string name;
+        for (size_t n = 0; n < nodes.size(); ++n) {
+            uint64_t n_flops = 0;
+            name += get_node_fusion_name(nodes[n], names[n], &n_flops);
+            total_flops += n_flops;
+
+            if (n != nodes.size() - 1) {
+                name += ", ";
+            }
         }
-        timings[fusion_str + ggml_op_name(node->op)].push_back(time);
+        if (total_flops) {
+            flops[name].push_back(total_flops);
+        }
+        timings[name].push_back(time);
     }
+
 private:
     std::map<std::string, std::vector<uint64_t>> timings;
     std::map<std::string, std::vector<uint64_t>> flops;
@@ -1729,7 +1753,9 @@ struct ggml_backend_vk_context {
     std::unique_ptr<vk_perf_logger> perf_logger;
     vk::QueryPool query_pool;
     std::vector<const char *> query_fusion_names;
+    std::vector<int> query_fusion_node_count;
     std::vector<ggml_tensor *> query_nodes;
+    std::vector<int> query_node_idx;
     int32_t num_queries {};
     int32_t query_idx {};
 };
@@ -5194,6 +5220,8 @@ static void ggml_vk_instance_init() {
     }
 
     vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
+    vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
+    vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
     const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
 
     if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
@@ -11820,15 +11848,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
         }
     }
 
-#define ENABLE_SYNC_LOGGING 0
-
     if (need_sync) {
-#if ENABLE_SYNC_LOGGING
-        std::cerr << "sync" << std::endl;
-#endif
+        if (vk_enable_sync_logger) {
+            std::cerr << "sync" << std::endl;
+        }
         ctx->unsynced_nodes_written.clear();
         ctx->unsynced_nodes_read.clear();
         ggml_vk_sync_buffers(ctx, compute_ctx);
+
+        if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
+            ctx->query_node_idx[ctx->query_idx] = node_idx;
+            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+        }
     }
     // Add all fused nodes to the unsynchronized lists.
     for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
@@ -11845,20 +11876,20 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
             }
         }
     }
-#if ENABLE_SYNC_LOGGING
-    for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
-        auto *n = cgraph->nodes[node_idx + i];
-        std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
-        if (n->op == GGML_OP_GLU) {
-            std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
-        }
-        if (n->op == GGML_OP_ROPE) {
-            const int mode = ((const int32_t *) n->op_params)[2];
-            std::cerr << " rope mode: " << mode;
+    if (vk_enable_sync_logger) {
+        for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+            auto *n = cgraph->nodes[node_idx + i];
+            std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
+            if (n->op == GGML_OP_GLU) {
+                std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
+            }
+            if (n->op == GGML_OP_ROPE) {
+                const int mode = ((const int32_t *) n->op_params)[2];
+                std::cerr << " rope mode: " << mode;
+            }
+            std::cerr << std::endl;
         }
-        std::cerr << std::endl;
     }
-#endif
 
     switch (node->op) {
     case GGML_OP_REPEAT:
@@ -13138,12 +13169,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
             ctx->num_queries = query_create_info.queryCount;
             ctx->query_fusion_names.resize(ctx->num_queries);
+            ctx->query_fusion_node_count.resize(ctx->num_queries);
             ctx->query_nodes.resize(ctx->num_queries);
+            ctx->query_node_idx.resize(ctx->num_queries);
         }
 
         ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
         std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
+        std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0);
         std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
+        std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);
 
         GGML_ASSERT(ctx->compute_ctx.expired());
         compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
@@ -13272,9 +13307,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             } else {
                 compute_ctx = ctx->compute_ctx.lock();
             }
-            ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
-            ctx->query_fusion_names[ctx->query_idx] = fusion_string;
-            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+            if (!vk_perf_logger_concurrent) {
+                // track a single node/fusion for the current query
+                ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
+                ctx->query_fusion_names[ctx->query_idx] = fusion_string;
+                compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+            } else {
+                // track a fusion string and number of fused ops for the current node_idx
+                ctx->query_fusion_names[i] = fusion_string;
+                ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops;
+            }
         }
 
         if (enqueued) {
@@ -13316,12 +13358,32 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         // Get the results and pass them to the logger
         std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
         VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
-        for (int i = 1; i < ctx->query_idx; i++) {
-            auto node = ctx->query_nodes[i];
-            auto name = ctx->query_fusion_names[i];
-            ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+        if (!vk_perf_logger_concurrent) {
+            // Log each op separately
+            for (int i = 1; i < ctx->query_idx; i++) {
+                auto node = ctx->query_nodes[i];
+                auto name = ctx->query_fusion_names[i];
+                ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+            }
+        } else {
+            // Log each group of nodes
+            int prev_node_idx = 0;
+            for (int i = 1; i < ctx->query_idx; i++) {
+                auto cur_node_idx = ctx->query_node_idx[i];
+                std::vector<ggml_tensor *> nodes;
+                std::vector<const char *> names;
+                for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) {
+                    if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) {
+                        continue;
+                    }
+                    nodes.push_back(cgraph->nodes[node_idx]);
+                    names.push_back(ctx->query_fusion_names[node_idx]);
+                    node_idx += ctx->query_fusion_node_count[node_idx];
+                }
+                prev_node_idx = cur_node_idx;
+                ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+            }
         }
-
         ctx->perf_logger->print_timings();
     }
 
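For reference, the readback loop in the final hunk reduces to differencing consecutive timestamps written at each sync point and scaling by the device's `timestampPeriod` (nanoseconds per tick). A minimal standalone sketch of just that arithmetic, with hardcoded sample values and no Vulkan calls:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Raw 64-bit tick values as they would come back from the query pool;
    // group i covers all work recorded between timestamp i-1 and timestamp i.
    std::vector<uint64_t> timestamps = { 1000, 1450, 2100, 2180 };
    const double timestamp_period_ns = 52.08; // ns per tick, device dependent

    for (size_t i = 1; i < timestamps.size(); ++i) {
        const uint64_t ticks = timestamps[i] - timestamps[i - 1];
        std::printf("group %zu: %.1f us\n", i, ticks * timestamp_period_ns / 1000.0);
    }
    return 0;
}
```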