@@ -1527,6 +1527,8 @@ class vk_memory_logger {
15271527#endif // GGML_VULKAN_MEMORY_DEBUG
15281528
15291529static bool vk_perf_logger_enabled = false;
1530+ static bool vk_perf_logger_concurrent = false;
1531+ static bool vk_enable_sync_logger = false;
15301532// number of calls between perf logger prints
15311533static uint32_t vk_perf_logger_frequency = 1;
15321534
@@ -1577,14 +1579,14 @@ class vk_perf_logger {
15771579 flops.clear();
15781580 }
15791581
1580- void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
1582+ std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) {
1583+ *n_flops = 0;
15811584 std::string fusion_str;
15821585 if (fusion_name) {
15831586 fusion_str = fusion_name + std::string(" ");
15841587 }
15851588 if (node->op == GGML_OP_UNARY) {
1586- timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
1587- return;
1589+ return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node));
15881590 }
15891591 if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
15901592 const uint64_t m = node->ne[0];
@@ -1606,9 +1608,8 @@ class vk_perf_logger {
16061608 name += " batch=" + std::to_string(batch);
16071609 }
16081610 name = fusion_str + name;
1609- timings[name].push_back(time);
1610- flops[name].push_back(m * n * (k + (k - 1)) * batch);
1611- return;
1611+ *n_flops = m * n * (k + (k - 1)) * batch;
1612+ return name;
16121613 }
16131614 if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
16141615 std::string name = ggml_op_name(node->op);
@@ -1624,20 +1625,17 @@ class vk_perf_logger {
16241625 uint64_t size_M = Cout;
16251626 uint64_t size_K = Cin * KW * KH;
16261627 uint64_t size_N = N * OW * OH;
1627- uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
1628+ *n_flops = size_M * size_N * (size_K + (size_K - 1));
16281629 name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
16291630 ", N=N*OW*OH=" + std::to_string(size_N);
16301631 name = fusion_str + name;
1631- flops[name].push_back(n_flops);
1632- timings[name].push_back(time);
1633- return;
1632+ return name;
16341633 }
16351634 if (node->op == GGML_OP_RMS_NORM) {
16361635 std::string name = ggml_op_name(node->op);
16371636 name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
16381637 name = fusion_str + name;
1639- timings[name].push_back(time);
1640- return;
1638+ return name;
16411639 }
16421640 if (node->op == GGML_OP_FLASH_ATTN_EXT) {
16431641 const ggml_tensor * dst = node;
@@ -1653,20 +1651,46 @@ class vk_perf_logger {
16531651 " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
16541652 " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
16551653 " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
1656- timings[name.str()].push_back(time);
1657- return;
1654+ return name.str();
16581655 }
16591656 if (node->op == GGML_OP_TOP_K) {
16601657 std::stringstream name;
16611658 name << fusion_str;
16621659 name << ggml_op_name(node->op) <<
16631660 " K=" << node->ne[0] <<
16641661 " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
1665- timings[name.str()].push_back(time);
1666- return;
1662+ return name.str();
1663+ }
1664+ return fusion_str + ggml_op_name(node->op);
1665+ }
1666+
1667+ void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
1668+ uint64_t n_flops;
1669+ std::string name = get_node_fusion_name(node, fusion_name, &n_flops);
1670+ if (n_flops) {
1671+ flops[name].push_back(n_flops);
1672+ }
1673+ timings[name].push_back(time);
1674+ }
1675+
1676+ void log_timing(const std::vector<ggml_tensor *> &nodes, const std::vector<const char *> &names, uint64_t time) {
1677+ uint64_t total_flops = 0;
1678+ std::string name;
1679+ for (size_t n = 0; n < nodes.size(); ++n) {
1680+ uint64_t n_flops = 0;
1681+ name += get_node_fusion_name(nodes[n], names[n], &n_flops);
1682+ total_flops += n_flops;
1683+
1684+ if (n != nodes.size() - 1) {
1685+ name += ", ";
1686+ }
16671687 }
1668- timings[fusion_str + ggml_op_name(node->op)].push_back(time);
1688+ if (total_flops) {
1689+ flops[name].push_back(total_flops);
1690+ }
1691+ timings[name].push_back(time);
16691692 }
1693+
16701694 private:
16711695 std::map<std::string, std::vector<uint64_t>> timings;
16721696 std::map<std::string, std::vector<uint64_t>> flops;
@@ -1729,7 +1753,9 @@ struct ggml_backend_vk_context {
17291753 std::unique_ptr<vk_perf_logger> perf_logger;
17301754 vk::QueryPool query_pool;
17311755 std::vector<const char *> query_fusion_names;
1756+ std::vector<int> query_fusion_node_count;
17321757 std::vector<ggml_tensor *> query_nodes;
1758+ std::vector<int> query_node_idx;
17331759 int32_t num_queries {};
17341760 int32_t query_idx {};
17351761};
@@ -5194,6 +5220,8 @@ static void ggml_vk_instance_init() {
51945220 }
51955221
51965222 vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
5223+ vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
5224+ vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
51975225 const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
51985226
51995227 if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
@@ -11820,15 +11848,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
1182011848 }
1182111849 }
1182211850
11823- #define ENABLE_SYNC_LOGGING 0
11824-
1182511851 if (need_sync) {
11826- # if ENABLE_SYNC_LOGGING
11827- std::cerr << "sync" << std::endl;
11828- #endif
11852+ if (vk_enable_sync_logger) {
11853+ std::cerr << "sync" << std::endl;
11854+ }
1182911855 ctx->unsynced_nodes_written.clear();
1183011856 ctx->unsynced_nodes_read.clear();
1183111857 ggml_vk_sync_buffers(ctx, compute_ctx);
11858+
11859+ if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
11860+ ctx->query_node_idx[ctx->query_idx] = node_idx;
11861+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
11862+ }
1183211863 }
1183311864 // Add all fused nodes to the unsynchronized lists.
1183411865 for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
@@ -11845,20 +11876,20 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
1184511876 }
1184611877 }
1184711878 }
11848- #if ENABLE_SYNC_LOGGING
11849- for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
11850- auto *n = cgraph->nodes[node_idx + i];
11851- std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
11852- if (n->op == GGML_OP_GLU) {
11853- std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
11854- }
11855- if (n->op == GGML_OP_ROPE) {
11856- const int mode = ((const int32_t *) n->op_params)[2];
11857- std::cerr << " rope mode: " << mode;
11879+ if (vk_enable_sync_logger) {
11880+ for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
11881+ auto *n = cgraph->nodes[node_idx + i];
11882+ std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
11883+ if (n->op == GGML_OP_GLU) {
11884+ std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
11885+ }
11886+ if (n->op == GGML_OP_ROPE) {
11887+ const int mode = ((const int32_t *) n->op_params)[2];
11888+ std::cerr << " rope mode: " << mode;
11889+ }
11890+ std::cerr << std::endl;
1185811891 }
11859- std::cerr << std::endl;
1186011892 }
11861- #endif
1186211893
1186311894 switch (node->op) {
1186411895 case GGML_OP_REPEAT:
@@ -13138,12 +13169,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1313813169 ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
1313913170 ctx->num_queries = query_create_info.queryCount;
1314013171 ctx->query_fusion_names.resize(ctx->num_queries);
13172+ ctx->query_fusion_node_count.resize(ctx->num_queries);
1314113173 ctx->query_nodes.resize(ctx->num_queries);
13174+ ctx->query_node_idx.resize(ctx->num_queries);
1314213175 }
1314313176
1314413177 ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
1314513178 std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
13179+ std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0);
1314613180 std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
13181+ std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);
1314713182
1314813183 GGML_ASSERT(ctx->compute_ctx.expired());
1314913184 compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
@@ -13272,9 +13307,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1327213307 } else {
1327313308 compute_ctx = ctx->compute_ctx.lock();
1327413309 }
13275- ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
13276- ctx->query_fusion_names[ctx->query_idx] = fusion_string;
13277- compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
13310+ if (!vk_perf_logger_concurrent) {
13311+ // track a single node/fusion for the current query
13312+ ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
13313+ ctx->query_fusion_names[ctx->query_idx] = fusion_string;
13314+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
13315+ } else {
13316+ // track a fusion string and number of fused ops for the current node_idx
13317+ ctx->query_fusion_names[i] = fusion_string;
13318+ ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops;
13319+ }
1327813320 }
1327913321
1328013322 if (enqueued) {
@@ -13316,12 +13358,32 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1331613358 // Get the results and pass them to the logger
1331713359 std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
1331813360 VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13319- for (int i = 1; i < ctx->query_idx; i++) {
13320- auto node = ctx->query_nodes[i];
13321- auto name = ctx->query_fusion_names[i];
13322- ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
13361+ if (!vk_perf_logger_concurrent) {
13362+ // Log each op separately
13363+ for (int i = 1; i < ctx->query_idx; i++) {
13364+ auto node = ctx->query_nodes[i];
13365+ auto name = ctx->query_fusion_names[i];
13366+ ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
13367+ }
13368+ } else {
13369+ // Log each group of nodes
13370+ int prev_node_idx = 0;
13371+ for (int i = 1; i < ctx->query_idx; i++) {
13372+ auto cur_node_idx = ctx->query_node_idx[i];
13373+ std::vector<ggml_tensor *> nodes;
13374+ std::vector<const char *> names;
13375+ for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) {
13376+ if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) {
13377+ continue;
13378+ }
13379+ nodes.push_back(cgraph->nodes[node_idx]);
13380+ names.push_back(ctx->query_fusion_names[node_idx]);
13381+ node_idx += ctx->query_fusion_node_count[node_idx];
13382+ }
13383+ prev_node_idx = cur_node_idx;
13384+ ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
13385+ }
1332313386 }
13324-
1332513387 ctx->perf_logger->print_timings();
1332613388 }
1332713389
0 commit comments