From 2bfe4ff9ca96e3967b94e6c91559e164d2189dfa Mon Sep 17 00:00:00 2001
From: Piotr Wilkin
Date: Tue, 19 May 2026 11:52:22 +0200
Subject: [PATCH] tentative Metal support

---
 ggml/src/ggml-metal/ggml-metal-context.h |  26 ++++
 ggml/src/ggml-metal/ggml-metal-context.m | 117 ++++++++++++++++++
 ggml/src/ggml-metal/ggml-metal-device.h  |  40 ++++++
 ggml/src/ggml-metal/ggml-metal-device.m  | 149 +++++++++++++++++++++++
 ggml/src/ggml-metal/ggml-metal-ops.cpp   |  40 ++++++
 ggml/src/ggml-metal/ggml-metal-ops.h     |  23 ++++
 ggml/src/ggml-metal/ggml-metal.cpp       | 101 +++++++++++++++
 7 files changed, 496 insertions(+)

diff --git a/ggml/src/ggml-metal/ggml-metal-context.h b/ggml/src/ggml-metal/ggml-metal-context.h
index abf4b06ed2..8f61a826be 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.h
+++ b/ggml/src/ggml-metal/ggml-metal-context.h
@@ -36,6 +36,32 @@ void ggml_metal_set_abort_callback (ggml_metal_t ctx, ggml_abort_callback abort
 bool ggml_metal_supports_family     (ggml_metal_t ctx, int family);
 void ggml_metal_capture_next_compute(ggml_metal_t ctx);
 
+//
+// Profiling
+//
+
+// Opaque profiler state, owned by the C++ backend layer (ggml-metal.cpp). Holds the std::vector of
+// records returned via the ggml_backend_profiler interface. ggml_metal_t keeps a borrowed pointer
+// so that graph_compute can push records when sampling is active.
+struct ggml_metal_profiler_state;
+
+// Inject (or clear, with NULL) the profiler state pointer. Called once at backend init.
+void ggml_metal_set_profiler_state(ggml_metal_t ctx, struct ggml_metal_profiler_state * state);
+
+// Bridge implemented in ggml-metal.cpp: appends one op record (no-op if state or node is NULL).
+void ggml_metal_profiler_push_record(
+        struct ggml_metal_profiler_state * state,
+        const struct ggml_tensor * node,
+        uint64_t start_ns,
+        uint64_t end_ns);
+
+// Query whether the injected profiler state is currently enabled.
+// (Avoids exposing the C++ struct layout to the .m file.)
+bool ggml_metal_profiler_is_enabled(struct ggml_metal_profiler_state * state);
+
+// Query the split-id currently set on the profiler state.
+int ggml_metal_profiler_get_split_id(struct ggml_metal_profiler_state * state);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index 32d97cd5d0..4f8e4ce34f 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -6,6 +6,7 @@
 #import "ggml-metal-impl.h"
 #import "ggml-metal-common.h"
 #import "ggml-metal-ops.h"
+#import "ggml-profiler.h"
 
 #import <Foundation/Foundation.h>
 
@@ -79,6 +80,19 @@ struct ggml_metal {
     // error state - set when a command buffer fails during synchronize
     // once set, graph_compute will return GGML_STATUS_FAILED until the backend is recreated
     bool has_error;
+
+    // Profiling
+    // Borrowed; owned by the C++ backend layer (ggml-metal.cpp). NULL when profiling is unavailable.
+    struct ggml_metal_profiler_state * profiler_state;
+
+    // Per-graph-compute scratch state populated when profiling is active for the current invocation.
+    // Lifetime: from start of ggml_metal_graph_compute until its end.
+    bool profiler_active;
+    ggml_metal_sample_buf_t profiler_sample_buf;
+    size_t profiler_total_slots;
+    uint64_t profiler_cpu_anchor_ns;
+    uint64_t profiler_gpu_anchor_ns;
+    struct ggml_metal_op_sample_slot * profiler_slot_map; // size = gf->n_nodes
 };
 
 ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
@@ -450,6 +464,34 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
     // keep the memory wired
     ggml_metal_device_rsets_keep_alive(ctx->dev);
 
+    // Decide whether profiling is active for this invocation. Activation is sticky for the whole
+    // call: allocate sample buffer + slot map up front so the encode_async block can use them.
+    ctx->profiler_active = false;
+    if (ctx->profiler_state != NULL && ggml_metal_profiler_is_enabled(ctx->profiler_state)) {
+        if (ggml_metal_device_supports_profiling(ctx->dev) && gf->n_nodes > 0) {
+            const size_t total_slots = 2 * (size_t) gf->n_nodes;
+            ctx->profiler_sample_buf = ggml_metal_device_create_sample_buf(ctx->dev, total_slots);
+            if (ctx->profiler_sample_buf != NULL) {
+                ctx->profiler_total_slots = total_slots;
+                ctx->profiler_slot_map = (struct ggml_metal_op_sample_slot *) calloc(
+                        (size_t) gf->n_nodes, sizeof(struct ggml_metal_op_sample_slot));
+                if (ctx->profiler_slot_map != NULL) {
+                    // Mark all entries unused (node_idx < 0).
+                    for (int i = 0; i < gf->n_nodes; ++i) {
+                        ctx->profiler_slot_map[i].node_idx = -1;
+                    }
+                    ggml_metal_device_sample_timestamps(ctx->dev,
+                            &ctx->profiler_cpu_anchor_ns,
+                            &ctx->profiler_gpu_anchor_ns);
+                    ctx->profiler_active = true;
+                } else {
+                    ggml_metal_sample_buf_free(ctx->profiler_sample_buf);
+                    ctx->profiler_sample_buf = NULL;
+                }
+            }
+        }
+    }
+
     // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
     // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
     // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
@@ -609,6 +651,66 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
 
             ctx->capture_started = false;
         }
+
+        // Profiling drain: wait for all command buffers to complete, resolve timestamps, push records.
+        // This forces a synchronous wait — Vulkan does the same when its profiler is active.
+        if (ctx->profiler_active) {
+            {
+                id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
+                if (cmd_buf) {
+                    [cmd_buf waitUntilCompleted];
+                }
+            }
+            for (int i = 0; i < n_cb; ++i) {
+                id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
+                if (cmd_buf) {
+                    // Ensure cmd_bufs that were not auto-enqueued get committed.
+                    if ([cmd_buf status] == MTLCommandBufferStatusNotEnqueued) {
+                        [cmd_buf commit];
+                    }
+                    [cmd_buf waitUntilCompleted];
+                }
+            }
+
+            uint64_t * ns = (uint64_t *) calloc(ctx->profiler_total_slots, sizeof(uint64_t));
+            if (ns != NULL) {
+                ggml_metal_sample_buf_resolve(ctx->profiler_sample_buf,
+                        /*base=*/0,
+                        ctx->profiler_total_slots,
+                        ctx->profiler_cpu_anchor_ns,
+                        ctx->profiler_gpu_anchor_ns,
+                        ns);
+
+                for (int i = 0; i < gf->n_nodes; ++i) {
+                    const struct ggml_metal_op_sample_slot * slot = &ctx->profiler_slot_map[i];
+                    if (slot->node_idx < 0) {
+                        continue;
+                    }
+                    if (slot->slot_start >= ctx->profiler_total_slots ||
+                        slot->slot_end >= ctx->profiler_total_slots) {
+                        continue;
+                    }
+                    const uint64_t t0 = ns[slot->slot_start];
+                    const uint64_t t1 = ns[slot->slot_end];
+                    if (t0 == 0 || t1 == 0 || t1 < t0) {
+                        continue;
+                    }
+                    ggml_metal_profiler_push_record(ctx->profiler_state,
+                            ggml_graph_node(gf, slot->node_idx),
+                            t0,
+                            t1);
+                }
+
+                free(ns);
+            }
+
+            free(ctx->profiler_slot_map);
+            ctx->profiler_slot_map = NULL;
+            ggml_metal_sample_buf_free(ctx->profiler_sample_buf);
+            ctx->profiler_sample_buf = NULL;
+            ctx->profiler_total_slots = 0;
+            ctx->profiler_active = false;
+        }
     }
 
     return GGML_STATUS_SUCCESS;
@@ -660,6 +762,10 @@ ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) {
     return ctx->ev_cpy;
 }
 
+void ggml_metal_set_profiler_state(ggml_metal_t ctx, struct ggml_metal_profiler_state * state) {
+    ctx->profiler_state = state;
+}
+
 void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
     if (ctx->n_cb != n_cb) {
         ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
@@ -704,6 +810,17 @@ void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
             ctx->debug_graph,
             ctx->debug_fusion);
 
+        if (ctx->profiler_active) {
+            // Base slot for this command buffer is 2 * (first graph-node index it processes).
+            // The encoder uses one (start, end) pair per encoded op group; we over-allocate so
+            // that empty/filtered nodes simply leave gaps in the sample buffer.
+            const size_t base_slot = 2 * (size_t) idx_start;
+            ggml_metal_op_enable_profiling(ctx_op,
+                    ctx->profiler_sample_buf,
+                    base_slot,
+                    ctx->profiler_slot_map);
+        }
+
         for (int idx = 0; idx < ggml_metal_op_n_nodes(ctx_op); ++idx) {
             const int res = ggml_metal_op_encode(ctx_op, idx);
             if (res == 0) {
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h
index 4a3ebb5569..191c150fae 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -91,6 +91,17 @@ void ggml_metal_encoder_memory_barrier(ggml_metal_encoder_t encoder);
 
 void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder);
 
+//
+// MTLCounterSampleBuffer wrapper (used by the profiler)
+//
+
+typedef struct ggml_metal_sample_buf * ggml_metal_sample_buf_t;
+
+// Insert a GPU timestamp sample on the encoder at the given slot.
+// Caller must ensure (a) the encoder belongs to a command buffer using a counter sample buffer that
+// supports MTLCounterSamplingPointAtDispatchBoundary, and (b) `index` is unique within the buffer.
+void ggml_metal_encoder_sample_timestamp(ggml_metal_encoder_t encoder, ggml_metal_sample_buf_t buf, size_t index);
+
 //
 // MTLLibrary wrapper
 //
@@ -292,6 +303,35 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
 
 const struct ggml_metal_device_props * ggml_metal_device_get_props(ggml_metal_device_t dev);
 
+//
+// Profiling helpers
+//
+
+// Returns true if the device supports MTLCounterSamplingPointAtDispatchBoundary on compute encoders
+// AND exposes the MTLCommonCounterSetTimestamp counter set.
+bool ggml_metal_device_supports_profiling(ggml_metal_device_t dev);
+
+// Allocate a counter sample buffer with `sample_count` slots backed by the timestamp counter set.
+// Returns NULL on failure (e.g. unsupported device).
+ggml_metal_sample_buf_t ggml_metal_device_create_sample_buf(ggml_metal_device_t dev, size_t sample_count);
+
+void ggml_metal_sample_buf_free(ggml_metal_sample_buf_t buf);
+
+// Capture correlated CPU/GPU timestamps (both in Mach-absolute units, i.e. nanoseconds on
+// current Apple Silicon; the conversion via mach_timebase_info is applied internally).
+// Used to anchor the GPU timestamp domain to the CPU clock returned by ggml_profiler_time_ns().
+void ggml_metal_device_sample_timestamps(ggml_metal_device_t dev, uint64_t * cpu_ns, uint64_t * gpu_ns);
+
+// Resolve `count` consecutive samples starting at `base` into nanosecond timestamps anchored against
+// `cpu_anchor_ns` / `gpu_anchor_ns` (obtained from ggml_metal_device_sample_timestamps).
+// `out_ns` must hold at least `count` uint64_t entries.
+void ggml_metal_sample_buf_resolve(ggml_metal_sample_buf_t buf,
+        size_t base,
+        size_t count,
+        uint64_t cpu_anchor_ns,
+        uint64_t gpu_anchor_ns,
+        uint64_t * out_ns);
+
 //
 // device buffers
 //
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 05d7f43051..f5866a7367 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -8,6 +8,7 @@
 #include <Metal/Metal.h>
 
 #include <stdatomic.h>
+#include <mach/mach_time.h>
 
 #ifndef TARGET_OS_VISION
 #define TARGET_OS_VISION 0
@@ -517,6 +518,21 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder) {
     [encoder->obj endEncoding];
 }
 
+//
+// MTLCounterSampleBuffer wrapper
+//
+
+struct ggml_metal_sample_buf {
+    id<MTLCounterSampleBuffer> obj;
+    size_t sample_count;
+};
+
+void ggml_metal_encoder_sample_timestamp(ggml_metal_encoder_t encoder, ggml_metal_sample_buf_t buf, size_t index) {
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        [encoder->obj sampleCountersInBuffer:buf->obj atSampleIndex:index withBarrier:YES];
+    }
+}
+
 struct ggml_metal_device {
     id<MTLDevice> mtl_device;
 
@@ -1341,6 +1357,152 @@ const struct ggml_metal_device_props * ggml_metal_device_get_props(ggml_metal_de
     return &dev->props;
 }
 
+//
+// Profiling helpers
+//
+
+// Look up the MTLCommonCounterSetTimestamp counter set, or nil if the device doesn't expose it.
+static id<MTLCounterSet> ggml_metal_device_get_timestamp_counter_set(ggml_metal_device_t dev) {
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        NSArray<id<MTLCounterSet>> * sets = [dev->mtl_device counterSets];
+        for (id<MTLCounterSet> cs in sets) {
+            if ([[cs name] isEqualToString:MTLCommonCounterSetTimestamp]) {
+                return cs;
+            }
+        }
+    }
+    return nil;
+}
+
+bool ggml_metal_device_supports_profiling(ggml_metal_device_t dev) {
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        if (![dev->mtl_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) {
+            return false;
+        }
+        return ggml_metal_device_get_timestamp_counter_set(dev) != nil;
+    }
+    return false;
+}
+
+ggml_metal_sample_buf_t ggml_metal_device_create_sample_buf(ggml_metal_device_t dev, size_t sample_count) {
+    if (sample_count == 0) {
+        return NULL;
+    }
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        id<MTLCounterSet> cs = ggml_metal_device_get_timestamp_counter_set(dev);
+        if (cs == nil) {
+            return NULL;
+        }
+
+        MTLCounterSampleBufferDescriptor * desc = [[MTLCounterSampleBufferDescriptor alloc] init];
+        desc.counterSet = cs;
+        desc.label = @"ggml-metal-profiler";
+        desc.storageMode = MTLStorageModeShared;
+        desc.sampleCount = sample_count;
+
+        NSError * error = nil;
+        id<MTLCounterSampleBuffer> sb = [dev->mtl_device newCounterSampleBufferWithDescriptor:desc error:&error];
+        [desc release];
+
+        if (sb == nil) {
+            GGML_LOG_WARN("%s: failed to create counter sample buffer: %s\n", __func__,
+                    error ? [[error localizedDescription] UTF8String] : "unknown");
+            return NULL;
+        }
+
+        ggml_metal_sample_buf_t res = calloc(1, sizeof(struct ggml_metal_sample_buf));
+        if (res == NULL) {
+            // Allocation failed: release the Metal object so it does not leak.
+            [sb release];
+            return NULL;
+        }
+        res->obj = sb;
+        res->sample_count = sample_count;
+        return res;
+    }
+    return NULL;
+}
+
+void ggml_metal_sample_buf_free(ggml_metal_sample_buf_t buf) {
+    if (buf == NULL) {
+        return;
+    }
+    [buf->obj release];
+    free(buf);
+}
+
+void ggml_metal_device_sample_timestamps(ggml_metal_device_t dev, uint64_t * cpu_ns, uint64_t * gpu_ns) {
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        MTLTimestamp cpu_t = 0;
+        MTLTimestamp gpu_t = 0;
+        [dev->mtl_device sampleTimestamps:&cpu_t gpuTimestamp:&gpu_t];
+
+        // Both values are scaled with the Mach timebase below to obtain nanoseconds.
+        // NOTE(review): this assumes the GPU timestamp uses Mach-absolute units too - confirm against Apple's docs.
+        static mach_timebase_info_data_t tb = {0, 0};
+        if (tb.denom == 0) {
+            mach_timebase_info(&tb);
+        }
+        if (cpu_ns) *cpu_ns = (uint64_t)((__uint128_t)cpu_t * tb.numer / tb.denom);
+        if (gpu_ns) *gpu_ns = (uint64_t)((__uint128_t)gpu_t * tb.numer / tb.denom);
+    } else {
+        if (cpu_ns) *cpu_ns = 0;
+        if (gpu_ns) *gpu_ns = 0;
+    }
+}
+
+void ggml_metal_sample_buf_resolve(ggml_metal_sample_buf_t buf,
+        size_t base,
+        size_t count,
+        uint64_t cpu_anchor_ns,
+        uint64_t gpu_anchor_ns,
+        uint64_t * out_ns) {
+    if (buf == NULL || count == 0 || out_ns == NULL) {
+        return;
+    }
+    // Refuse ranges outside the sample buffer; report them as dropped samples (all zeros).
+    if (base > buf->sample_count || count > buf->sample_count - base) {
+        for (size_t i = 0; i < count; ++i) {
+            out_ns[i] = 0;
+        }
+        return;
+    }
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        NSRange range = NSMakeRange(base, count);
+        NSData * data = [buf->obj resolveCounterRange:range];
+        if (data == nil || [data length] < count * sizeof(MTLCounterResultTimestamp)) {
+            // Failed resolve: fill with zeros so the caller can detect dropped samples.
+            for (size_t i = 0; i < count; ++i) {
+                out_ns[i] = 0;
+            }
+            return;
+        }
+
+        static mach_timebase_info_data_t tb = {0, 0};
+        if (tb.denom == 0) {
+            mach_timebase_info(&tb);
+        }
+
+        const MTLCounterResultTimestamp * src = (const MTLCounterResultTimestamp *) [data bytes];
+        for (size_t i = 0; i < count; ++i) {
+            // MTLCounterErrorValue (~0ULL) marks an unrecorded sample slot; map to 0.
+            if (src[i].timestamp == MTLCounterErrorValue) {
+                out_ns[i] = 0;
+                continue;
+            }
+            const uint64_t gpu_ns = (uint64_t)((__uint128_t) src[i].timestamp * tb.numer / tb.denom);
+            // Anchor to CPU clock returned by ggml_profiler_time_ns().
+            out_ns[i] = (gpu_ns >= gpu_anchor_ns)
+                ? cpu_anchor_ns + (gpu_ns - gpu_anchor_ns)
+                : cpu_anchor_ns - (gpu_anchor_ns - gpu_ns);
+        }
+    } else {
+        for (size_t i = 0; i < count; ++i) {
+            out_ns[i] = 0;
+        }
+    }
+}
+
 //
 // device buffers
 //
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index e2ce56e9e2..0a55535f68 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -77,6 +77,11 @@ struct ggml_metal_op {
         return ggml_graph_node(gf, idxs[i]);
     }
 
+    int node_global_idx(int i) const {
+        assert(i >= 0 && i < (int) idxs.size());
+        return idxs[i];
+    }
+
     bool can_fuse(int i0, const ggml_op * ops, int n_ops) const {
         assert(use_fusion);
         assert(i0 >= 0 && i0 < n_nodes());
@@ -100,6 +105,12 @@ struct ggml_metal_op {
     int debug_graph;
     int debug_fusion;
 
+    // Profiling: when sample_buf is non-null, ggml_metal_op_encode brackets each impl call with
+    // two timestamp samples. The (node_idx, slot_start, slot_end) tuple is recorded in slot_map.
+    ggml_metal_sample_buf_t sample_buf = nullptr;
+    size_t next_slot = 0;
+    ggml_metal_op_sample_slot * slot_map = nullptr;
+
 private:
     ggml_cgraph * gf;
 
@@ -144,6 +155,17 @@ int ggml_metal_op_n_nodes(ggml_metal_op_t ctx) {
     return ctx->n_nodes();
 }
 
+// Attach a sample buffer and slot map to this encoder; samples are written starting at slot_base.
+void ggml_metal_op_enable_profiling(
+        ggml_metal_op_t ctx,
+        ggml_metal_sample_buf_t sample_buf,
+        size_t slot_base,
+        ggml_metal_op_sample_slot * slot_map) {
+    ctx->sample_buf = sample_buf;
+    ctx->next_slot = slot_base;
+    ctx->slot_map = slot_map;
+}
+
 static bool ggml_metal_op_concurrency_reset(ggml_metal_op_t ctx) {
     if (!ctx->mem_ranges) {
         return true;
@@ -500,12 +522,33 @@ int ggml_metal_op_encode(ggml_metal_op_t ctx, int idx) {
         ggml_metal_encoder_debug_group_push(ctx->enc, ggml_op_desc(ctx->node(idx)));
     }
 
+    // Profiling: sample a timestamp before encoding; slots are consumed sequentially from next_slot.
+    const size_t slot_start = ctx->next_slot;
+    const bool profiling = (ctx->sample_buf != nullptr && ctx->slot_map != nullptr);
+    if (profiling) {
+        ggml_metal_encoder_sample_timestamp(ctx->enc, ctx->sample_buf, slot_start);
+        ctx->next_slot++;
+    }
+
     int res = ggml_metal_op_encode_impl(ctx, idx);
     if (idx + res > ctx->n_nodes()) {
         GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
                 "https://github.com/ggml-org/llama.cpp/pull/14849");
     }
 
+    if (profiling) {
+        // Sample again after encoding and remember which slot pair belongs to this node.
+        const size_t slot_end = ctx->next_slot;
+        ggml_metal_encoder_sample_timestamp(ctx->enc, ctx->sample_buf, slot_end);
+        ctx->next_slot++;
+
+        const int gidx = ctx->node_global_idx(idx);
+        ctx->slot_map[gidx].node_idx = gidx;
+        ctx->slot_map[gidx].n_fused = res; // value returned by ggml_metal_op_encode_impl
+        ctx->slot_map[gidx].slot_start = slot_start;
+        ctx->slot_map[gidx].slot_end = slot_end;
+    }
+
     if (ctx->use_capture) {
         ggml_metal_encoder_debug_group_pop(ctx->enc);
     }
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h
index 36c61071b4..83fa4a320c 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.h
+++ b/ggml/src/ggml-metal/ggml-metal-ops.h
@@ -26,6 +26,29 @@ int ggml_metal_op_n_nodes(ggml_metal_op_t ctx);
 
 int ggml_metal_op_encode(ggml_metal_op_t ctx, int idx);
 
+//
+// Profiling: per-op timestamp sampling
+//
+
+// Entry recorded for each encoded op group: graph node index, n_fused, and the start/end sample slots.
+// node_idx < 0 marks an unused slot.
+struct ggml_metal_op_sample_slot {
+    int node_idx;
+    int n_fused;
+    size_t slot_start;
+    size_t slot_end;
+};
+
+// Enable profiling on this op encoder. `slot_base` is the first index in `sample_buf` available
+// to this encoder; `slot_map` is an array (size = number of graph nodes) into which
+// the encoder will record one entry per non-empty encoded op group.
+// Must be called before any ggml_metal_op_encode().
+void ggml_metal_op_enable_profiling(
+        ggml_metal_op_t ctx,
+        ggml_metal_sample_buf_t sample_buf,
+        size_t slot_base,
+        struct ggml_metal_op_sample_slot * slot_map);
+
 //
 // available ops:
 //
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index a537136210..c87eb7830e 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -2,6 +2,7 @@
 
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
+#include "ggml-profiler.h"
 
 #include "ggml-metal-device.h"
 #include "ggml-metal-context.h"
@@ -9,6 +10,7 @@
 
 #include <mutex>
 #include <string>
+#include <vector>
 
 #define GGML_METAL_NAME "MTL"
 #define GGML_METAL_MAX_DEVICES 16
@@ -20,6 +22,59 @@ static int g_devices = 1;
 // forward declaration
 static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer);
 
+//
+// Profiler state (mirrors the per-backend profiler state from the Vulkan integration)
+//
+// Owned by ggml_backend_metal_init / device_init_backend; lifetime is the backend's lifetime.
+// The Metal context holds a borrowed pointer and calls into the bridge functions below to push
+// records during graph_compute.
+//
+
+struct ggml_metal_profiler_state {
+    bool enabled = false;
+    int split_id = -1;
+    std::vector<ggml_profile_record> records;
+
+    void reset() {
+        records.clear();
+        split_id = -1;
+    }
+};
+
+extern "C" {
+
+void ggml_metal_profiler_push_record(
+        ggml_metal_profiler_state * state,
+        const ggml_tensor * node,
+        uint64_t start_ns,
+        uint64_t end_ns) {
+    if (state == nullptr || node == nullptr) {
+        return;
+    }
+
+    ggml_profile_record rec = {}; // value-initialise so fields not assigned below are never indeterminate
+    rec.type = GGML_PROFILE_EVENT_OP;
+    rec.name = ggml_op_name(node->op);
+    rec.backend_id = -1;
+    rec.split_id = state->split_id;
+    rec.start_ns = start_ns;
+    rec.end_ns = end_ns;
+    rec.bytes = ggml_nbytes(node);
+    rec.extra = nullptr;
+    ggml_profile_record_from_tensor(&rec, node);
+    state->records.push_back(rec);
+}
+
+bool ggml_metal_profiler_is_enabled(ggml_metal_profiler_state * state) {
+    return state != nullptr && state->enabled;
+}
+
+int ggml_metal_profiler_get_split_id(ggml_metal_profiler_state * state) {
+    return state != nullptr ? state->split_id : -1;
+}
+
+} // extern "C"
+
 ////////////////////////////////////////////////////////////////////////////////
 // backend interface
 ////////////////////////////////////////////////////////////////////////////////
@@ -589,6 +644,50 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
     return &guid;
 }
 
+// Register the per-backend ggml_backend_profiler interface on `backend` and inject the borrowed
+// state pointer into the Metal context so graph_compute can push records.
+static void ggml_backend_metal_register_profiler(ggml_backend_t backend, ggml_metal_t ctx) {
+    auto * state = new ggml_metal_profiler_state();
+    ggml_metal_set_profiler_state(ctx, state);
+
+    static auto metal_prof_enable = [](void * context, bool enable) {
+        auto * state = (ggml_metal_profiler_state *) context;
+        state->enabled = enable;
+        if (!enable) {
+            state->reset();
+        }
+    };
+    static auto metal_prof_reset = [](void * context) {
+        auto * state = (ggml_metal_profiler_state *) context;
+        state->reset();
+    };
+    static auto metal_prof_set_split_id = [](void * context, int split_id) {
+        auto * state = (ggml_metal_profiler_state *) context;
+        state->split_id = split_id;
+    };
+    static auto metal_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int {
+        auto * state = (ggml_metal_profiler_state *) context;
+        *out = state->records.data();
+        return (int) state->records.size();
+    };
+    static auto metal_prof_free = [](void * context) {
+        // NOTE(review): the Metal context still holds a borrowed pointer to this state; confirm the
+        // profiler is only freed once graph_compute can no longer run on the owning backend.
+        auto * state = (ggml_metal_profiler_state *) context;
+        delete state;
+    };
+
+    auto * profiler = new ggml_backend_profiler {
+        /* .context      = */ state,
+        /* .enable       = */ metal_prof_enable,
+        /* .reset        = */ metal_prof_reset,
+        /* .set_split_id = */ metal_prof_set_split_id,
+        /* .get_records  = */ metal_prof_get_records,
+        /* .free_context = */ metal_prof_free,
+    };
+    ggml_backend_set_profiler(backend, profiler);
+}
+
 ggml_backend_t ggml_backend_metal_init(void) {
     ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_metal_reg(), 0);
     ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
@@ -611,6 +710,8 @@ ggml_backend_t ggml_backend_metal_init(void) {
 
     ggml_backend_metal_set_n_cb(backend, 1);
 
+    ggml_backend_metal_register_profiler(backend, ctx);
+
     return backend;
 }
 
@@ -706,6 +807,8 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t
 
     ggml_backend_metal_set_n_cb(backend, 1);
 
+    ggml_backend_metal_register_profiler(backend, ctx);
+
     return backend;
 
     GGML_UNUSED(params);