tentative Metal support

2026-06-27 23:50:20 -05:00 · 2026-05-19 11:52:22 +02:00 · 2026-05-19 11:52:22 +02:00 · 2bfe4ff9ca
commit 2bfe4ff9ca
parent 28ef941775
7 changed files with 496 additions and 0 deletions
--- a/ggml/src/ggml-metal/ggml-metal-context.h
+++ b/ggml/src/ggml-metal/ggml-metal-context.h
@ -36,6 +36,32 @@ void ggml_metal_set_abort_callback  (ggml_metal_t ctx, ggml_abort_callback abort
 bool ggml_metal_supports_family     (ggml_metal_t ctx, int family);
 void ggml_metal_capture_next_compute(ggml_metal_t ctx);

+//
+// Profiling
+//
+
+// Opaque profiler state, owned by the C++ backend layer (ggml-metal.cpp). Holds the std::vector of
+// records returned via the ggml_backend_profiler interface. ggml_metal_t keeps a borrowed pointer
+// so that graph_compute can push records when sampling is active.
+struct ggml_metal_profiler_state;
+
+// Inject (or clear, with NULL) the profiler state pointer. Called once at backend init.
+void ggml_metal_set_profiler_state(ggml_metal_t ctx, struct ggml_metal_profiler_state * state);
+
+// Bridge function implemented in ggml-metal.cpp. Used by graph_compute (in .m) to push records.
+void ggml_metal_profiler_push_record(
+        struct ggml_metal_profiler_state * state,
+        const struct ggml_tensor * node,
+        uint64_t start_ns,
+        uint64_t end_ns);
+
+// Query whether the injected profiler state is currently enabled.
+// (Avoids exposing the C++ struct layout to the .m file.)
+bool ggml_metal_profiler_is_enabled(struct ggml_metal_profiler_state * state);
+
+// Query the split-id currently set on the profiler state.
+int  ggml_metal_profiler_get_split_id(struct ggml_metal_profiler_state * state);
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@ -6,6 +6,7 @@
 #import "ggml-metal-impl.h"
 #import "ggml-metal-common.h"
 #import "ggml-metal-ops.h"
+#import "ggml-profiler.h"

 #import <Foundation/Foundation.h>

@ -79,6 +80,19 @@ struct ggml_metal {
    // error state - set when a command buffer fails during synchronize
    // once set, graph_compute will return GGML_STATUS_FAILED until the backend is recreated
    bool has_error;
+
+    // Profiling
+    // Borrowed; owned by the C++ backend layer (ggml-metal.cpp). NULL when profiling is unavailable.
+    struct ggml_metal_profiler_state * profiler_state;
+
+    // Per-graph-compute scratch state populated when profiling is active for the current invocation.
+    // Lifetime: from start of ggml_metal_graph_compute until its end.
+    bool                              profiler_active;
+    ggml_metal_sample_buf_t           profiler_sample_buf;
+    size_t                            profiler_total_slots;
+    uint64_t                          profiler_cpu_anchor_ns;
+    uint64_t                          profiler_gpu_anchor_ns;
+    struct ggml_metal_op_sample_slot * profiler_slot_map;  // size = gf->n_nodes
 };

 ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
@ -450,6 +464,34 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
    // keep the memory wired
    ggml_metal_device_rsets_keep_alive(ctx->dev);

+    // Decide whether profiling is active for this invocation. Activation is sticky for the whole
+    // call: allocate sample buffer + slot map up front so the encode_async block can use them.
+    ctx->profiler_active = false;
+    if (ctx->profiler_state != NULL && ggml_metal_profiler_is_enabled(ctx->profiler_state)) {
+        if (ggml_metal_device_supports_profiling(ctx->dev) && gf->n_nodes > 0) {
+            const size_t total_slots = 2 * (size_t) gf->n_nodes;
+            ctx->profiler_sample_buf = ggml_metal_device_create_sample_buf(ctx->dev, total_slots);
+            if (ctx->profiler_sample_buf != NULL) {
+                ctx->profiler_total_slots = total_slots;
+                ctx->profiler_slot_map = (struct ggml_metal_op_sample_slot *) calloc(
+                    (size_t) gf->n_nodes, sizeof(struct ggml_metal_op_sample_slot));
+                if (ctx->profiler_slot_map != NULL) {
+                    // Mark all entries unused (node_idx < 0).
+                    for (int i = 0; i < gf->n_nodes; ++i) {
+                        ctx->profiler_slot_map[i].node_idx = -1;
+                    }
+                    ggml_metal_device_sample_timestamps(ctx->dev,
+                                                       &ctx->profiler_cpu_anchor_ns,
+                                                       &ctx->profiler_gpu_anchor_ns);
+                    ctx->profiler_active = true;
+                } else {
+                    ggml_metal_sample_buf_free(ctx->profiler_sample_buf);
+                    ctx->profiler_sample_buf = NULL;
+                }
+            }
+        }
+    }
+
    // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
    // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
    // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
@ -609,6 +651,66 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *

            ctx->capture_started = false;
        }
+
+        // Profiling drain: wait for all command buffers to complete, resolve timestamps, push records.
+        // This forces a synchronous wait — Vulkan does the same when its profiler is active.
+        if (ctx->profiler_active) {
+            {
+                id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
+                if (cmd_buf) {
+                    [cmd_buf waitUntilCompleted];
+                }
+            }
+            for (int i = 0; i < n_cb; ++i) {
+                id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
+                if (cmd_buf) {
+                    // Ensure cmd_bufs that were not auto-enqueued get committed.
+                    if ([cmd_buf status] == MTLCommandBufferStatusNotEnqueued) {
+                        [cmd_buf commit];
+                    }
+                    [cmd_buf waitUntilCompleted];
+                }
+            }
+
+            uint64_t * ns = (uint64_t *) calloc(ctx->profiler_total_slots, sizeof(uint64_t));
+            if (ns != NULL) {
+                ggml_metal_sample_buf_resolve(ctx->profiler_sample_buf,
+                                              /*base=*/0,
+                                              ctx->profiler_total_slots,
+                                              ctx->profiler_cpu_anchor_ns,
+                                              ctx->profiler_gpu_anchor_ns,
+                                              ns);
+
+                for (int i = 0; i < gf->n_nodes; ++i) {
+                    const struct ggml_metal_op_sample_slot * slot = &ctx->profiler_slot_map[i];
+                    if (slot->node_idx < 0) {
+                        continue;
+                    }
+                    if (slot->slot_start >= ctx->profiler_total_slots ||
+                        slot->slot_end   >= ctx->profiler_total_slots) {
+                        continue;
+                    }
+                    const uint64_t t0 = ns[slot->slot_start];
+                    const uint64_t t1 = ns[slot->slot_end];
+                    if (t0 == 0 || t1 == 0 || t1 < t0) {
+                        continue;
+                    }
+                    ggml_metal_profiler_push_record(ctx->profiler_state,
+                                                    ggml_graph_node(gf, slot->node_idx),
+                                                    t0,
+                                                    t1);
+                }
+
+                free(ns);
+            }
+
+            free(ctx->profiler_slot_map);
+            ctx->profiler_slot_map = NULL;
+            ggml_metal_sample_buf_free(ctx->profiler_sample_buf);
+            ctx->profiler_sample_buf = NULL;
+            ctx->profiler_total_slots = 0;
+            ctx->profiler_active = false;
+        }
    }

    return GGML_STATUS_SUCCESS;
@ -660,6 +762,10 @@ ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) {
    return ctx->ev_cpy;
 }

+// Store the profiler state pointer on the context; passing NULL clears it.
+// The pointer is only recorded here: this function neither copies nor frees `state`.
+void ggml_metal_set_profiler_state(ggml_metal_t ctx, struct ggml_metal_profiler_state * state) {
+    ctx->profiler_state = state;
+}
+
 void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
    if (ctx->n_cb != n_cb) {
        ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
@ -704,6 +810,17 @@ void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
            ctx->debug_graph,
            ctx->debug_fusion);

+        if (ctx->profiler_active) {
+            // Base slot for this command buffer is 2 * (first graph-node index it processes).
+            // The encoder uses one (start, end) pair per encoded op group; we over-allocate so
+            // that empty/filtered nodes simply leave gaps in the sample buffer.
+            const size_t base_slot = 2 * (size_t) idx_start;
+            ggml_metal_op_enable_profiling(ctx_op,
+                                           ctx->profiler_sample_buf,
+                                           base_slot,
+                                           ctx->profiler_slot_map);
+        }
+
        for (int idx = 0; idx < ggml_metal_op_n_nodes(ctx_op); ++idx) {
            const int res = ggml_metal_op_encode(ctx_op, idx);
            if (res == 0) {
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@ -91,6 +91,17 @@ void ggml_metal_encoder_memory_barrier(ggml_metal_encoder_t encoder);

 void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder);

+//
+// MTLCounterSampleBuffer wrapper (used by the profiler)
+//
+
+typedef struct ggml_metal_sample_buf * ggml_metal_sample_buf_t;
+
+// Insert a GPU timestamp sample on the encoder at the given slot.
+// Caller must ensure (a) the encoder belongs to a command buffer using a counter sample buffer that
+// supports MTLCounterSamplingPointAtDispatchBoundary, and (b) `index` is unique within the buffer.
+void ggml_metal_encoder_sample_timestamp(ggml_metal_encoder_t encoder, ggml_metal_sample_buf_t buf, size_t index);
+
 //
 // MTLLibrary wrapper
 //
@ -292,6 +303,35 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te

 const struct ggml_metal_device_props * ggml_metal_device_get_props(ggml_metal_device_t dev);

+//
+// Profiling helpers
+//
+
+// Returns true if the device supports MTLCounterSamplingPointAtDispatchBoundary on compute encoders
+// AND exposes the MTLCommonCounterSetTimestamp counter set.
+bool ggml_metal_device_supports_profiling(ggml_metal_device_t dev);
+
+// Allocate a counter sample buffer with `sample_count` slots backed by the timestamp counter set.
+// Returns NULL on failure (e.g. unsupported device).
+ggml_metal_sample_buf_t ggml_metal_device_create_sample_buf(ggml_metal_device_t dev, size_t sample_count);
+
+void ggml_metal_sample_buf_free(ggml_metal_sample_buf_t buf);
+
+// Capture a correlated CPU/GPU timestamp pair. Both values are converted from Mach-absolute
+// ticks to nanoseconds internally (via mach_timebase_info) before being returned.
+// Used to anchor the GPU timestamp domain to the CPU clock returned by ggml_profiler_time_ns().
+void ggml_metal_device_sample_timestamps(ggml_metal_device_t dev, uint64_t * cpu_ns, uint64_t * gpu_ns);
+
+// Resolve `count` consecutive samples starting at `base` into nanosecond timestamps anchored against
+// `cpu_anchor_ns` / `gpu_anchor_ns` (obtained from ggml_metal_device_sample_timestamps).
+// `out_ns` must hold at least `count` uint64_t entries.
+void ggml_metal_sample_buf_resolve(ggml_metal_sample_buf_t buf,
+                                   size_t   base,
+                                   size_t   count,
+                                   uint64_t cpu_anchor_ns,
+                                   uint64_t gpu_anchor_ns,
+                                   uint64_t * out_ns);
+
 //
 // device buffers
 //
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@ -8,6 +8,7 @@
 #include <Metal/Metal.h>

 #include <stdatomic.h>
+#include <mach/mach_time.h>

 #ifndef TARGET_OS_VISION
 #define TARGET_OS_VISION 0
@ -517,6 +518,21 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder) {
    [encoder->obj endEncoding];
 }

+//
+// MTLCounterSampleBuffer wrapper
+//
+
+// Pairs a Metal counter sample buffer with the number of sample slots it was created with.
+struct ggml_metal_sample_buf {
+    id<MTLCounterSampleBuffer> obj; // underlying Metal object
+    size_t sample_count;            // number of sample slots in `obj`
+};
+
+// Encode a counter sample into slot `index` of `buf` on `encoder` (sampled with a barrier).
+//
+// The request is silently ignored when `encoder` or `buf` is NULL, when `index` lies outside
+// the buffer, or when the OS does not provide the sampling API. An ignored sample simply
+// leaves its slot unrecorded, which the resolve step reports as a dropped sample.
+void ggml_metal_encoder_sample_timestamp(ggml_metal_encoder_t encoder, ggml_metal_sample_buf_t buf, size_t index) {
+    // Never hand Metal a missing buffer or an out-of-range slot index.
+    if (encoder == NULL || buf == NULL || index >= buf->sample_count) {
+        return;
+    }
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        [encoder->obj sampleCountersInBuffer:buf->obj atSampleIndex:index withBarrier:YES];
+    }
+}
+
 struct ggml_metal_device {
    id<MTLDevice> mtl_device;

@ -1341,6 +1357,139 @@ const struct ggml_metal_device_props * ggml_metal_device_get_props(ggml_metal_de
    return &dev->props;
 }

+//
+// Profiling helpers
+//
+
+// Find the device counter set whose name equals MTLCommonCounterSetTimestamp.
+// Returns nil when no such set is listed or when the OS is too old to query counter sets.
+static id<MTLCounterSet> ggml_metal_device_get_timestamp_counter_set(ggml_metal_device_t dev) {
+    id<MTLCounterSet> found = nil;
+
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        for (id<MTLCounterSet> candidate in [dev->mtl_device counterSets]) {
+            if ([[candidate name] isEqualToString:MTLCommonCounterSetTimestamp]) {
+                found = candidate;
+                break;
+            }
+        }
+    }
+
+    return found;
+}
+
+// True only when the device reports support for sampling at dispatch boundaries and also
+// exposes the timestamp counter set. Always false on OS versions older than macOS 11 / iOS 14.
+bool ggml_metal_device_supports_profiling(ggml_metal_device_t dev) {
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        const bool can_sample =
+            [dev->mtl_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary];
+
+        // The counter-set lookup is skipped when boundary sampling is unsupported.
+        return can_sample && ggml_metal_device_get_timestamp_counter_set(dev) != nil;
+    }
+    return false;
+}
+
+// Create a shared-storage counter sample buffer with `sample_count` slots, backed by the
+// device's timestamp counter set.
+//
+// Returns NULL when `sample_count` is 0, when the timestamp counter set is unavailable, when
+// Metal fails to create the buffer (a warning is logged), when the wrapper allocation fails,
+// or when the OS is too old. A non-NULL result must be released with ggml_metal_sample_buf_free().
+ggml_metal_sample_buf_t ggml_metal_device_create_sample_buf(ggml_metal_device_t dev, size_t sample_count) {
+    if (sample_count == 0) {
+        return NULL;
+    }
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        id<MTLCounterSet> cs = ggml_metal_device_get_timestamp_counter_set(dev);
+        if (cs == nil) {
+            return NULL;
+        }
+
+        MTLCounterSampleBufferDescriptor * desc = [[MTLCounterSampleBufferDescriptor alloc] init];
+        desc.counterSet = cs;
+        desc.label = @"ggml-metal-profiler";
+        desc.storageMode = MTLStorageModeShared;
+        desc.sampleCount = sample_count;
+
+        NSError * error = nil;
+        id<MTLCounterSampleBuffer> sb = [dev->mtl_device newCounterSampleBufferWithDescriptor:desc error:&error];
+        [desc release];
+
+        if (sb == nil) {
+            GGML_LOG_WARN("%s: failed to create counter sample buffer: %s\n", __func__,
+                          error ? [[error localizedDescription] UTF8String] : "unknown");
+            return NULL;
+        }
+
+        ggml_metal_sample_buf_t res = calloc(1, sizeof(struct ggml_metal_sample_buf));
+        if (res == NULL) {
+            // Do not leak the Metal object (or dereference NULL) when the wrapper cannot be allocated.
+            GGML_LOG_WARN("%s: failed to allocate counter sample buffer wrapper\n", __func__);
+            [sb release];
+            return NULL;
+        }
+        res->obj = sb;
+        res->sample_count = sample_count;
+        return res;
+    }
+    return NULL;
+}
+
+// Release the wrapped Metal sample buffer and free the wrapper itself. NULL is accepted and ignored.
+void ggml_metal_sample_buf_free(ggml_metal_sample_buf_t buf) {
+    if (buf != NULL) {
+        [buf->obj release];
+        free(buf);
+    }
+}
+
+// Sample a CPU/GPU timestamp pair from the device and write both, scaled by the Mach timebase
+// ratio, to whichever of `cpu_ns` / `gpu_ns` is non-NULL. On OS versions without the API both
+// outputs are written as 0.
+void ggml_metal_device_sample_timestamps(ggml_metal_device_t dev, uint64_t * cpu_ns, uint64_t * gpu_ns) {
+    uint64_t cpu_scaled = 0;
+    uint64_t gpu_scaled = 0;
+
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        MTLTimestamp cpu_ticks = 0;
+        MTLTimestamp gpu_ticks = 0;
+        [dev->mtl_device sampleTimestamps:&cpu_ticks gpuTimestamp:&gpu_ticks];
+
+        // NOTE(review): both values are treated as Mach-absolute ticks and scaled by
+        // numer/denom — confirm against Apple's documentation for the GPU timestamp.
+        static mach_timebase_info_data_t tb = {0, 0};
+        if (tb.denom == 0) {
+            mach_timebase_info(&tb);
+        }
+
+        // 128-bit intermediate so that ticks * numer cannot overflow before the division.
+        cpu_scaled = (uint64_t)((__uint128_t)cpu_ticks * tb.numer / tb.denom);
+        gpu_scaled = (uint64_t)((__uint128_t)gpu_ticks * tb.numer / tb.denom);
+    }
+
+    if (cpu_ns) {
+        *cpu_ns = cpu_scaled;
+    }
+    if (gpu_ns) {
+        *gpu_ns = gpu_scaled;
+    }
+}
+
+// Resolve `count` samples starting at slot `base` of `buf` into nanosecond timestamps expressed
+// relative to the CPU anchor: out = cpu_anchor_ns + (gpu_ns - gpu_anchor_ns).
+//
+// `out_ns` must hold at least `count` entries. An entry is written as 0 ("dropped sample") when:
+//   - `buf` is NULL or [base, base + count) does not fit inside the buffer,
+//   - the OS lacks the API, the resolve fails, or it returns too little data,
+//   - the slot holds MTLCounterErrorValue,
+//   - the anchored value would fall below zero.
+// Nothing is written when `count` is 0 or `out_ns` is NULL.
+void ggml_metal_sample_buf_resolve(ggml_metal_sample_buf_t buf,
+                                   size_t   base,
+                                   size_t   count,
+                                   uint64_t cpu_anchor_ns,
+                                   uint64_t gpu_anchor_ns,
+                                   uint64_t * out_ns) {
+    if (count == 0 || out_ns == NULL) {
+        return;
+    }
+
+    // Start from "all dropped" so every early-out below leaves a well-defined result.
+    for (size_t i = 0; i < count; ++i) {
+        out_ns[i] = 0;
+    }
+
+    // Reject ranges outside the buffer here instead of passing them on to Metal.
+    // Written so that base + count cannot overflow.
+    if (buf == NULL || base > buf->sample_count || count > buf->sample_count - base) {
+        return;
+    }
+
+    if (@available(macOS 10.15, iOS 14.0, *)) {
+        NSData * data = [buf->obj resolveCounterRange:NSMakeRange(base, count)];
+        if (data == nil || [data length] / sizeof(MTLCounterResultTimestamp) < count) {
+            return;
+        }
+
+        static mach_timebase_info_data_t tb = {0, 0};
+        if (tb.denom == 0) {
+            mach_timebase_info(&tb);
+        }
+        if (tb.denom == 0) {
+            // Timebase unavailable: scaling would divide by zero, so report everything as dropped.
+            return;
+        }
+
+        const MTLCounterResultTimestamp * src = (const MTLCounterResultTimestamp *) [data bytes];
+        for (size_t i = 0; i < count; ++i) {
+            // MTLCounterErrorValue marks an unrecorded sample slot; leave it at 0.
+            if (src[i].timestamp == MTLCounterErrorValue) {
+                continue;
+            }
+
+            const uint64_t gpu_ns = (uint64_t)((__uint128_t) src[i].timestamp * tb.numer / tb.denom);
+
+            if (gpu_ns >= gpu_anchor_ns) {
+                out_ns[i] = cpu_anchor_ns + (gpu_ns - gpu_anchor_ns);
+            } else {
+                // The sample precedes the anchor. Clamp rather than let the unsigned
+                // subtraction wrap around to a huge bogus timestamp.
+                const uint64_t behind = gpu_anchor_ns - gpu_ns;
+                out_ns[i] = (behind <= cpu_anchor_ns) ? cpu_anchor_ns - behind : 0;
+            }
+        }
+    }
+}
+
 //
 // device buffers
 //
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@ -77,6 +77,11 @@ struct ggml_metal_op {
        return ggml_graph_node(gf, idxs[i]);
    }

+    // Return the entry of `idxs` at local position `i`, i.e. the graph node index that
+    // node(i) passes to ggml_graph_node(). `i` must be within [0, idxs.size()).
+    int node_global_idx(int i) const {
+        assert(i >= 0 && i < (int) idxs.size());
+        return idxs[i];
+    }
+
    bool can_fuse(int i0, const ggml_op * ops, int n_ops) const {
        assert(use_fusion);
        assert(i0 >= 0 && i0 < n_nodes());
@ -100,6 +105,12 @@ struct ggml_metal_op {
    int debug_graph;
    int debug_fusion;

+    // Profiling: when sample_buf is non-null, ggml_metal_op_encode brackets each impl call with
+    // two timestamp samples. The (node_idx, slot_start, slot_end) tuple is recorded in slot_map.
+    ggml_metal_sample_buf_t sample_buf      = nullptr;
+    size_t                  next_slot       = 0;
+    ggml_metal_op_sample_slot * slot_map    = nullptr;
+
 private:
    ggml_cgraph * gf;

@ -144,6 +155,16 @@ int ggml_metal_op_n_nodes(ggml_metal_op_t ctx) {
    return ctx->n_nodes();
 }

+// Arm per-op timestamp sampling on this op encoder: remember the sample buffer and the slot
+// map, and start handing out sample slots from `slot_base`.
+void ggml_metal_op_enable_profiling(
+        ggml_metal_op_t ctx,
+        ggml_metal_sample_buf_t sample_buf,
+        size_t slot_base,
+        ggml_metal_op_sample_slot * slot_map) {
+    ctx->slot_map   = slot_map;
+    ctx->sample_buf = sample_buf;
+    ctx->next_slot  = slot_base; // first slot index this encoder will use
+}
+
 static bool ggml_metal_op_concurrency_reset(ggml_metal_op_t ctx) {
    if (!ctx->mem_ranges) {
        return true;
@ -500,12 +521,31 @@ int ggml_metal_op_encode(ggml_metal_op_t ctx, int idx) {
        ggml_metal_encoder_debug_group_push(ctx->enc, ggml_op_desc(ctx->node(idx)));
    }

+    const size_t slot_start = ctx->next_slot;
+    const bool   profiling  = (ctx->sample_buf != nullptr && ctx->slot_map != nullptr);
+    if (profiling) {
+        ggml_metal_encoder_sample_timestamp(ctx->enc, ctx->sample_buf, slot_start);
+        ctx->next_slot++;
+    }
+
    int res = ggml_metal_op_encode_impl(ctx, idx);
    if (idx + res > ctx->n_nodes()) {
        GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
                "https://github.com/ggml-org/llama.cpp/pull/14849");
    }

+    if (profiling) {
+        const size_t slot_end = ctx->next_slot;
+        ggml_metal_encoder_sample_timestamp(ctx->enc, ctx->sample_buf, slot_end);
+        ctx->next_slot++;
+
+        const int gidx = ctx->node_global_idx(idx);
+        ctx->slot_map[gidx].node_idx   = gidx;
+        ctx->slot_map[gidx].n_fused    = res;
+        ctx->slot_map[gidx].slot_start = slot_start;
+        ctx->slot_map[gidx].slot_end   = slot_end;
+    }
+
    if (ctx->use_capture) {
        ggml_metal_encoder_debug_group_pop(ctx->enc);
    }
--- a/ggml/src/ggml-metal/ggml-metal-ops.h
+++ b/ggml/src/ggml-metal/ggml-metal-ops.h
@ -26,6 +26,29 @@ int ggml_metal_op_n_nodes(ggml_metal_op_t ctx);

 int ggml_metal_op_encode(ggml_metal_op_t ctx, int idx);

+//
+// Profiling: per-op timestamp sampling
+//
+
+// Per-op-group profiling record: (node_idx, n_fused, slot_start, slot_end).
+// node_idx < 0 marks an unused entry.
+struct ggml_metal_op_sample_slot {
+    int    node_idx;   // graph node index of the first node in the encoded group
+    int    n_fused;    // value returned by the encode call for this group (presumably the number of nodes it consumed — confirm)
+    size_t slot_start; // sample-buffer slot of the timestamp taken before the group is encoded
+    size_t slot_end;   // sample-buffer slot of the timestamp taken after the group is encoded
+};
+
+// Enable profiling on this op encoder. `slot_base` is the first index in `sample_buf` available
+// to this encoder; `slot_map` is an array (size = number of graph nodes) into which
+// the encoder will record one entry per non-empty encoded op group.
+// Must be called before any ggml_metal_op_encode().
+void ggml_metal_op_enable_profiling(
+        ggml_metal_op_t ctx,
+        ggml_metal_sample_buf_t sample_buf,
+        size_t slot_base,
+        struct ggml_metal_op_sample_slot * slot_map);
+
 //
 // available ops:
 //
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@ -2,6 +2,7 @@

 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
+#include "ggml-profiler.h"

 #include "ggml-metal-device.h"
 #include "ggml-metal-context.h"
@ -9,6 +10,7 @@

 #include <mutex>
 #include <string>
+#include <vector>

 #define GGML_METAL_NAME "MTL"
 #define GGML_METAL_MAX_DEVICES 16
@ -20,6 +22,59 @@ static int g_devices = 1;
 // forward declaration
 static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer);

+//
+// Profiler state (mirrors the per-backend profiler state from the Vulkan integration)
+//
+// Owned by ggml_backend_metal_init / device_init_backend; lifetime is the backend's lifetime.
+// The Metal context holds a borrowed pointer and calls into the bridge functions below to push
+// records during graph_compute.
+//
+
+// Profiler state shared between the backend profiler callbacks and the record bridge functions.
+struct ggml_metal_profiler_state {
+    bool enabled  = false; // profiling is off until switched on through the enable callback
+    int  split_id = -1;    // split id stamped onto each pushed record; -1 until set
+    std::vector<ggml_profile_record> records; // records accumulated since the last reset
+
+    // Drop all collected records and return split_id to -1. Does not change `enabled`.
+    void reset() {
+        records.clear();
+        split_id = -1;
+    }
+};
+
+extern "C" {
+
+// Append one op record for `node`, covering [start_ns, end_ns], to the profiler state.
+// Does nothing when `state` or `node` is null. The record takes its split id from the state,
+// its name from the node's op, and its byte count from ggml_nbytes(node); backend_id is set to -1.
+void ggml_metal_profiler_push_record(
+        ggml_metal_profiler_state * state,
+        const ggml_tensor * node,
+        uint64_t start_ns,
+        uint64_t end_ns) {
+    if (state == nullptr || node == nullptr) {
+        return;
+    }
+
+    // Value-initialize so that any field not assigned below (or by the helper) is zero
+    // rather than indeterminate when the record is copied into the vector.
+    ggml_profile_record rec = {};
+    rec.type       = GGML_PROFILE_EVENT_OP;
+    rec.name       = ggml_op_name(node->op);
+    rec.backend_id = -1;
+    rec.split_id   = state->split_id;
+    rec.start_ns   = start_ns;
+    rec.end_ns     = end_ns;
+    rec.bytes      = ggml_nbytes(node);
+    rec.extra      = nullptr;
+    ggml_profile_record_from_tensor(&rec, node);
+    state->records.push_back(rec);
+}
+
+// Report the state's `enabled` flag; a null state counts as disabled.
+bool ggml_metal_profiler_is_enabled(ggml_metal_profiler_state * state) {
+    if (state == nullptr) {
+        return false;
+    }
+    return state->enabled;
+}
+
+// Return the split id stored on the state, or -1 when no state is provided.
+int ggml_metal_profiler_get_split_id(ggml_metal_profiler_state * state) {
+    if (state == nullptr) {
+        return -1;
+    }
+    return state->split_id;
+}
+
+} // extern "C"
+
 ////////////////////////////////////////////////////////////////////////////////
 // backend interface
 ////////////////////////////////////////////////////////////////////////////////
@ -589,6 +644,48 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
    return &guid;
 }

+// Register the per-backend ggml_backend_profiler interface on `backend` and inject the borrowed
+// state pointer into the Metal context so graph_compute can push records.
+//
+// A fresh ggml_metal_profiler_state is allocated per call; the same pointer is stored on `ctx`
+// and used as the `context` of the profiler callbacks below.
+static void ggml_backend_metal_register_profiler(ggml_backend_t backend, ggml_metal_t ctx) {
+    // Heap-allocated state; it is deleted by the free_context callback defined below.
+    auto * state = new ggml_metal_profiler_state();
+    ggml_metal_set_profiler_state(ctx, state);
+
+    // Turn profiling on or off; turning it off also discards collected records and the split id.
+    static auto metal_prof_enable = [](void * context, bool enable) {
+        auto * state = (ggml_metal_profiler_state *) context;
+        state->enabled = enable;
+        if (!enable) {
+            state->reset();
+        }
+    };
+    // Discard collected records and the split id without touching the enabled flag.
+    static auto metal_prof_reset = [](void * context) {
+        auto * state = (ggml_metal_profiler_state *) context;
+        state->reset();
+    };
+    // Set the split id that subsequently pushed records are stamped with.
+    static auto metal_prof_set_split_id = [](void * context, int split_id) {
+        auto * state = (ggml_metal_profiler_state *) context;
+        state->split_id = split_id;
+    };
+    // Expose the record array. `*out` aliases the vector's storage, so it is only valid until
+    // the vector is next modified (push or reset).
+    static auto metal_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int {
+        auto * state = (ggml_metal_profiler_state *) context;
+        *out = state->records.data();
+        return (int) state->records.size();
+    };
+    // Destroy the state allocated at the top of this function.
+    // NOTE(review): `ctx` still holds the same pointer afterwards — confirm it is not used
+    // after this callback runs.
+    static auto metal_prof_free = [](void * context) {
+        auto * state = (ggml_metal_profiler_state *) context;
+        delete state;
+    };
+
+    // NOTE(review): `profiler` is heap-allocated and handed to ggml_backend_set_profiler;
+    // presumably the backend takes ownership and frees it — confirm.
+    auto * profiler = new ggml_backend_profiler {
+        /* .context      = */ state,
+        /* .enable       = */ metal_prof_enable,
+        /* .reset        = */ metal_prof_reset,
+        /* .set_split_id = */ metal_prof_set_split_id,
+        /* .get_records  = */ metal_prof_get_records,
+        /* .free_context = */ metal_prof_free,
+    };
+    ggml_backend_set_profiler(backend, profiler);
+}
+
 ggml_backend_t ggml_backend_metal_init(void) {
    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_metal_reg(), 0);
    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
@ -611,6 +708,8 @@ ggml_backend_t ggml_backend_metal_init(void) {

    ggml_backend_metal_set_n_cb(backend, 1);

+    ggml_backend_metal_register_profiler(backend, ctx);
+
    return backend;
 }

@ -706,6 +805,8 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t

    ggml_backend_metal_set_n_cb(backend, 1);

+    ggml_backend_metal_register_profiler(backend, ctx);
+
    return backend;

    GGML_UNUSED(params);