From ef98016946b595bf6aa06fbf972e1e44d4e3d041 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Apr 2023 18:15:26 +0300 Subject: [PATCH] Updated GGML Tips & Tricks (markdown) --- GGML-Tips-&-Tricks.md | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/GGML-Tips-&-Tricks.md b/GGML-Tips-&-Tricks.md index cd92499..3d5d8f4 100644 --- a/GGML-Tips-&-Tricks.md +++ b/GGML-Tips-&-Tricks.md @@ -1,11 +1,37 @@ ## Measuring the performance of the inference -- Build with `GGML_PERF` -- Uncomment `ggml_graph_print(&gf);` in `llama.cpp` +- Build with `LLAMA_PERF`: + ```bash + make clean + LLAMA_PERF=1 make + ``` + + This adds `-DGGML_PERF` to the compile flags, which enables the internal `ggml` performance timers. You will see output like this: ```java +n_nodes = 1188 + - 0: [ 4096, 1, 1] GET_ROWS ( 1) cpu = 0.019 / 0.019 ms, wall = 0.006 / 0.006 ms + - 1: [ 4096, 1, 1] RMS_NORM ( 1) cpu = 0.008 / 0.008 ms, wall = 0.008 / 0.008 ms + - 2: [ 4096, 1, 1] MUL ( 1) cpu = 0.001 / 0.001 ms, wall = 0.001 / 0.001 ms + - 3: [ 4096, 1, 1] MUL_MAT ( 1) cpu = 0.814 / 0.814 ms, wall = 0.817 / 0.817 ms + - 4: [ 128, 32, 1] RESHAPE ( 1) cpu = 0.000 / 0.000 ms, wall = 0.001 / 0.001 ms + - 5: [ 128, 32, 1] ROPE ( 1) cpu = 0.011 / 0.011 ms, wall = 0.011 / 0.011 ms + - 6: [ 4096, 1, 1] VIEW ( 1) cpu = 0.000 / 0.000 ms, wall = 0.000 / 0.000 ms + - 7: [ 4096, 1, 1] CPY ( 1) cpu = 0.004 / 0.004 ms, wall = 0.004 / 0.004 ms + - 8: [ 4096, 1, 1] MUL_MAT ( 1) cpu = 3.273 / 3.273 ms, wall = 0.356 / 0.356 ms + - 9: [ 4096, 1, 1] RESHAPE ( 1) cpu = 0.000 / 0.000 ms, wall = 0.001 / 0.001 ms + - 10: [ 1, 4096, 1] TRANSPOSE ( 1) cpu = 0.000 / 0.000 ms, wall = 0.001 / 0.001 ms + - 11: [ 1, 4096, 1] VIEW ( 1) cpu = 0.000 / 0.000 ms, wall = 0.001 / 0.001 ms + - 12: [ 1, 4096, 1] CPY ( 1) cpu = 0.022 / 0.022 ms, wall = 0.023 / 0.023 ms + - 13: [ 17, 128, 32] VIEW ( 1) cpu = 0.001 / 0.001 ms, wall = 0.001 / 0.001 ms + - 14: [ 69632, 1, 1] VIEW ( 1) cpu = 0.000 / 
0.000 ms, wall = 0.000 / 0.000 ms + - 15: [ 128, 32, 17] RESHAPE ( 1) cpu = 0.002 / 0.002 ms, wall = 0.000 / 0.000 ms + - 16: [ 128, 17, 32] PERMUTE ( 1) cpu = 0.000 / 0.000 ms, wall = 0.000 / 0.000 ms + - 17: [ 4096, 1, 1] MUL_MAT ( 1) cpu = 0.744 / 0.744 ms, wall = 0.246 / 0.246 ms + - 18: [ 128, 32, 1] RESHAPE ( 1) cpu = 0.001 / 0.001 ms, wall = 0.000 / 0.000 ms +... perf_total_per_op_us[ NONE] = 0.000 ms perf_total_per_op_us[ DUP] = 0.000 ms perf_total_per_op_us[ ADD] = 0.339 ms