mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
* hex-mm: new weight layout and fusion updates * hvx-mm: unroll the new tiled vec_dots to optimize hvx register util * hex-mm: optimize dyn.quant format for q8_0 and q8_1 to reduce overhead in vec_dots. * hvx-mm: parallel quantizer per block for large rows * hvx-mm: simplify and futher optimize dyn.quant and vec_dots * hvx-mm: keep intermediate per tile accumulators in fp16 * hmx-mm: optimize weight dequant by aligning the repacked tiles with the DMA * hmx-mm: remove qweight scratch and just use vtcm_weight * hmx-mm: remove all unused and obsolete code * hmx-mm: the new tiled repack format is here to stay -- rename all x4x2 to _tiled * hmx-mm: improve activation processing with dma prefetch * hex-mm: fix hmx/hvx fallback logic and MUL_MAT_ID allocation (unbreaks OLMoE) * hex-mm: align the weight tiles with dma just like we did in hmx-mm * hex-mm: factor out common mm bits into htp/matmul-ops.h * hex-mm: start moving mm kernel selection to the host * hex-mm: move all of the matmul param compute into the host * hmx-mm: restore pipelined mode * hmx-mm: unroll the dequant functions to optimize register usage * hmx-mm: further improve activation process * hex-mm: use vtcm_seq_alloc for all vtcm allocations and define more common functions * hex-mm: improve mm optimizer to acount for number of activation threads * hex-mm: fix matmul-id kernel params selection (unbreaks OLMoE and LFM) * hexagon: remove support for arch < v73 since HMX is now required for most use-cases * hex-mm: cleanup naming for consistency * hex-mm: make sure matmul fusion accounts for vtcm allocation * hex-mm: minor cleanup for kernel_params definition * hex-mm: replace hardcoded limits with proper checks for vtcm requirements * hex-mm: add support for non-tiled mm as a fallback option and factor out hvx kernels into separate header * hex-mm: remove unused functions * hex-mm: add shorthand for MM_SELECT in run-tool script * hvx-mm: factor out hvx/hmx microkernels and unify matmul entry and dispatch * 
hex-mm: further cleanup matmul fallback path * hex-mm: refactor matmul entry point and dispatch a bit further * hexagon: update cmake build to enable hmx for everything * hex-ops: optimize kernel_param updates and include summary in the logs * hex-mm: add support for GGML_HEXAGON_MM_SELECT * hex-mm: add hex-common header * hex-mm: pass correct number of tasks to workpool * hex-mm: add proper checks for no-work in dyn.quant tasks * hex-mm: convert all quantizers into a macro * hex-mm: fix hvx-flat fallback to pass all MUL_MAT tests * hex-mm: vectorize q8_1 quantizer * hex-mm: improve fused ffn mm stride handling * hex-mm: consistent use of n_threads and pipeline in kernel_params * hexagon: minor formatting * hex-mm: update MUL_MAT_ID kernel_param handling to make sure host/npu are in sync * hvx-mm: go back to accumulating in fp32 in tiled hvx kernels, more accurate and same perf * hvx-mm: unroll the loops and remove masking that is not needed for tiled accums * hmx-mm: optimize activation processing (slit loops, some unrolling, etc) * hmx-mm: minor optimization for output processing * hex-mm: consistent use of uint32_t and size_t in mm kernels * hex-mm: remove legacy restrictions for rows to be multiple of 256 * hexagon: replace sprintf with snprintf * hex-mm: relax hardcoded nrows checks and rely on VTCM size requirements * hexagon: minor alignment fix * hexagon: fix trailing spaces * hex-mm: relax padding from 256 to 128 (leftovers) * hex-mm: remove redundant checks for weight align to 128 we always use 2D dma for the weights and align them properly * hmx-mm: MUL_MAT_ID better work distribution between hvx threads and hmx tracing * hex-mm: specialize per-token mmid activation handling * hex-profile: update python scripts to handle kernel-params section in the logging output * hex-mm: move n_prefetch (aka dma_depth) into kernel params and remove unused fields * hex-trace: use easier to parse format, simply and fix post-proc scripts * hmx-mm: relax 32 row limit for 
output processing which helps utilization * hmx-mm: use start-chunk idx for tracing info * hmx-mm: parameterize activation dma pipeline * hexagon: add support for simple graph caching to avoid recomputing kernel-params * hex-mm: remove left-over repack functions * hex-mm: tighten n_prefetch asserts * hex-mm: remove duplicate round/align_up helper * hexagon: cleanup common header used in host/npu * hexagon: update early wakeup threshold * hmx-mm: define cost constants and update solver to assume that repacked ne[1] is padded to 32 * hmx-mm: make precompute_matmul a bit more readable (split into smaller functions, etc) * hex-mm: remove n_threads constraint * hex-mm: minor formatting updates * hex-mm: remove obsolete profiling logs * hex-mm: restore hardcode gate to refuse lm-head to avoid repacking that tensor
470 lines
17 KiB
Python
Executable File
470 lines
17 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
import argparse
|
|
import statistics
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from collections import defaultdict
|
|
|
|
# Table columns, keyed by the name accepted on the command line.
# Each value is a 3-tuple:
#   [0] key of the formatted (string) value inside a report row
#   [1] header text printed for the column
#   [2] key of the value used when sorting rows by this column
COL_MAP = {
    "tot-usec":   ("tot_usec",   "Tot usec",   "_sort_tot_usec"),
    "op":         ("op",         "Op",         "op"),
    "dims":       ("dims",       "Dims",       "dims"),
    "dtypes":     ("dtypes",     "DTypes",     "dtypes"),
    "count":      ("count",      "Count",      "_sort_count"),
    "max-usec":   ("max_usec",   "Max usec",   "_sort_max_usec"),
    "avg-usec":   ("avg_usec",   "Avg usec",   "_sort_avg_usec"),
    "max-cycles": ("max_cycles", "Max Cycles", "_sort_max_cycles"),
    "avg-cycles": ("avg_cycles", "Avg Cycles", "_sort_avg_cycles"),
    "max-pmu":    ("max_pmu",    "Max PMU",    "_sort_max_pmu"),
    "avg-pmu":    ("avg_pmu",    "Avg PMU",    "_sort_avg_pmu"),
}
|
|
|
|
# Matches a colon-separated "profile-op" log line.  The pattern is assembled
# from adjacent raw-string pieces (one per field) purely for readability; the
# concatenated text is a single regular expression.
op_pattern = re.compile(
    r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):"   # op name, followed by ':'
    r"\s+.*?\s+:"                               # field that is skipped
    r"\s+(?P<dims>[\d:x\s\->!]+)\s+:"           # dims field
    r"\s+(?P<types>[a-z\d_\s\->x]+)\s+:"        # types field
    r"\s+.*?\s+:"                               # field that is skipped
    r"\s+(?:op-)?usec\s+(?P<usec>\d+)"          # "usec N" or "op-usec N"
    r"\s+(?:op-)?cycles\s+(?P<cycles>\d+)"      # "cycles N" or "op-cycles N"
    r"(?:\s+start\s+(?P<start>\d+))?"           # optional "start N"
    r"(?:\s+mhz\s+(?P<mhz>[\d.]+))?"            # optional "mhz F"
    r"(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"      # optional "pmu [a, b, ...]"
    r"(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"      # optional "evt [a, b, ...]"
)
|
|
|
|
# Matches a "trace-op" log line: one start/stop marker emitted by a thread.
# Written as adjacent raw-string pieces (one per field); the concatenation is
# a single regular expression.
trace_pattern = re.compile(
    r"trace-op\s+(?P<op_name>[A-Z_0-9+]+):"     # op name, followed by ':'
    r"\s+thread\s+(?P<thread>\d+)"              # "thread N"
    r"\s+event\s+(?P<event>[A-Z_0-9\-]+)"       # "event NAME"
    r"\s+info\s+(?P<info>\d+)"                  # "info N"
    r"\s+(?P<state>start|stop)"                 # marker kind
    r"\s+(?P<cycles>\d+)"                       # cycle counter value
)
|
|
|
|
# Module-wide logger; every report and diagnostic line in this script is
# emitted through it.
logger = logging.getLogger("ggml-hexagon-profile")
|
|
|
|
|
|
def normalize_event_name(evt_type):
    """Return the short display name for a raw trace event name.

    "HVX_COMP" and "HMX_COMP" map to "V-COMP" and "M-COMP".  Any other name
    has a leading "HVX_" or "HMX_" removed (if present) and every remaining
    underscore replaced by a hyphen.
    """
    for raw_name, short_name in (("HVX_COMP", "V-COMP"), ("HMX_COMP", "M-COMP")):
        if evt_type == raw_name:
            return short_name

    # Both prefixes are four characters long, so one slice handles either.
    has_unit_prefix = evt_type.startswith(("HVX_", "HMX_"))
    stripped = evt_type[4:] if has_unit_prefix else evt_type
    return stripped.replace("_", "-")
|
|
|
|
|
|
class CycleUnwrapper:
    """Extend a wrapping 32-bit counter into a continuous value.

    Readings are fed one at a time to unwrap().  A jump of more than 2**31
    between consecutive readings is treated as the counter crossing its
    2**32 boundary (backwards for a large negative jump, forwards for a large
    positive one) and the running offset is adjusted accordingly.
    """

    def __init__(self):
        # Offset added to raw readings; always a multiple of 2**32.
        self.high_part = 0
        # Most recent raw reading, or None before the first call to unwrap().
        self.last_raw = None

    def unwrap(self, raw):
        """Return *raw* plus the accumulated wrap offset and remember *raw*."""
        previous, self.last_raw = self.last_raw, raw
        if previous is None:
            # The first reading only establishes the reference point.
            return raw

        delta = raw - previous
        if delta > 0x80000000:
            # Large jump up: this reading belongs to the previous 2**32 span.
            self.high_part -= 0x100000000
        elif delta < -0x80000000:
            # Large jump down: the counter wrapped past 2**32.
            self.high_part += 0x100000000
        return raw + self.high_part
|
|
|
|
|
|
def parse_log(file_path, pmu_index=None):
    """Parse a profiling log into a list of per-op records.

    Two layouts of "profile-op" lines are recognised: a '|'-separated layout
    with 6 or 7 fields (dims and types are fields 2 and 3, timings are the
    last field) and the colon-separated layout matched by ``op_pattern``.
    "trace-op" lines that follow an op line and carry the same op name are
    attached to that op.

    Args:
        file_path: Path of the log file, or "-" to read from file descriptor 0.
        pmu_index: Position inside the ``pmu [...]`` list whose value is
            stored as ``pmu_val``.  When None, no PMU value is extracted.

    Returns:
        A list of dicts in log order, one per recognised op line, with keys
        'name', 'dims', 'types', 'op_text', 'usec', 'cycles', 'cycles_start',
        'unwrapped_cycles_start', 'pmu_val', 'evt_val', 'abs_usec' and
        'trace_events'.  Each trace event is a dict with keys 'thread',
        'event', 'info', 'cycles', 'unwrapped_cycles' and 'state'.

    Exits the process with status 1 when the input cannot be opened.
    """
    try:
        if file_path != "-":
            log_file = open(file_path, 'r', encoding='utf-8', errors='ignore')
        else:
            log_file = os.fdopen(0, 'r', encoding='utf-8', errors='ignore')
    except FileNotFoundError:
        logger.error(f"file '{file_path}' not found.")
        sys.exit(1)
    except OSError as err:
        # e.g. permission denied or a directory: report instead of a traceback.
        logger.error(f"cannot open '{file_path}': {err}")
        sys.exit(1)

    # Compiled once here rather than on every log line.
    timestamp_pattern = re.compile(r"^(?P<min>\d+)\.(?P<sec>\d+)\.(?P<ms>\d+)\.(?P<us>\d+)\s+[A-Z]\s+")
    prefix_pattern = re.compile(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)")
    timing_pattern = re.compile(
        r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
    )

    all_ops: List[Dict[str, Any]] = []
    current_op: Optional[Dict[str, Any]] = None
    # One unwrapper is shared by op start values and trace events so that all
    # unwrapped cycle values in the log lie on the same continuous scale.
    unwrapper = CycleUnwrapper()

    # The context manager closes the file even if parsing raises.
    with log_file:
        for line in log_file:
            op_match = None
            op_name = dims = types = None

            if "|" in line and "profile-op" in line:
                parts = [p.strip() for p in line.split("|")]
                prefix_match = prefix_pattern.search(parts[0])
                if not prefix_match or len(parts) not in (6, 7):
                    continue
                # In both the 6- and 7-field layouts the timings come last.
                op_match = timing_pattern.search(parts[-1])
                if not op_match:
                    continue
                op_name = prefix_match.group("op_name")
                dims, types = parts[2], parts[3]
            else:
                op_match = op_pattern.search(line)
                if op_match:
                    op_name = op_match.group('op_name')
                    dims = op_match.group('dims').strip()
                    types = op_match.group('types').strip()

            if op_match:
                fields = op_match.groupdict()

                # Timestamp prefix "min.sec.ms.us", converted to microseconds;
                # 0 when the line has no such prefix.
                abs_usec = 0
                ts_match = timestamp_pattern.match(line)
                if ts_match:
                    abs_usec = (
                        (int(ts_match.group('min')) * 60 + int(ts_match.group('sec'))) * 1000000
                        + int(ts_match.group('ms')) * 1000
                        + int(ts_match.group('us'))
                    )

                pmu_val = None
                pmu_raw = fields.get('pmu')
                if pmu_raw and pmu_index is not None:
                    try:
                        pmu_list = [int(x.strip()) for x in pmu_raw.split(',')]
                        if len(pmu_list) > pmu_index:
                            pmu_val = pmu_list[pmu_index]
                    except (ValueError, IndexError):
                        # Malformed list or index out of range: no PMU value.
                        pmu_val = None

                evt_val = None
                evt_raw = fields.get('evt')
                if evt_raw:
                    try:
                        evt_val = [int(x.strip()) for x in evt_raw.split(',')]
                    except ValueError:
                        evt_val = None

                start_raw = fields.get('start')
                cycles_start = int(start_raw) if start_raw else None
                unwrapped_cycles_start = None
                if cycles_start is not None:
                    unwrapped_cycles_start = unwrapper.unwrap(cycles_start)

                # Everything after the "profile-op " marker; used by --filter.
                idx = line.find("profile-op ")
                op_text = line[idx + 11:].strip() if idx != -1 else line.strip()

                current_op = {
                    'name': op_name,
                    'dims': dims,
                    'types': types,
                    'op_text': op_text,
                    'usec': int(op_match.group('usec')),
                    'cycles': int(op_match.group('cycles')),
                    'cycles_start': cycles_start,
                    'unwrapped_cycles_start': unwrapped_cycles_start,
                    'pmu_val': pmu_val,
                    'evt_val': evt_val,
                    'abs_usec': abs_usec,
                    'trace_events': [],
                }
                all_ops.append(current_op)
                continue

            trace_match = trace_pattern.search(line)
            if trace_match and current_op and trace_match.group('op_name') == current_op['name']:
                raw_cyc = int(trace_match.group('cycles'))
                current_op['trace_events'].append({
                    'thread': int(trace_match.group('thread')),
                    'event': trace_match.group('event'),
                    'info': int(trace_match.group('info')),
                    'cycles': raw_cyc,
                    'unwrapped_cycles': unwrapper.unwrap(raw_cyc),
                    'state': trace_match.group('state'),
                })

    return all_ops
|
|
|
|
|
|
# Single-character marker shown in the timeline for each normalized event
# name; events not listed here are shown as '?'.
_TIMELINE_EVENT_CHARS = {
    'V-COMP': 'V',
    'M-COMP': 'H',
    'A-QUANT': 'Q',
    'A-PREP': 'A',
    'W-DEQUANT': 'D',
    'O-PROC': 'O',
    'W-PREP': 'P',
    'DMA': 'M',
}


def print_ascii_timeline(op_name, dims, types, usec, cycles, events, evt_val=None):
    """Log an ASCII timing diagram of one op's trace events.

    One row is logged per event, in cycle order, showing the cycle offset
    from the first event, a description of the event, and one column per
    thread 0..9 plus a final column for thread 10 (labelled HMX).  Each
    column shows the marker of the innermost event currently open on that
    thread.  Events whose thread is outside 0..10 are skipped.

    Args:
        op_name, dims, types, usec, cycles: Values shown in the banner.
        events: Trace event dicts with keys 'thread', 'event', 'info',
            'cycles', 'state' and, optionally, 'unwrapped_cycles'.
        evt_val: Optional list of values appended to the banner as "evt [..]".
    """
    evt_str = ""
    if evt_val:
        evt_str = " - evt [" + ",".join(str(x) for x in evt_val) + "]"
    logger.info("=" * 100)
    logger.info(f"{op_name} ({dims} : {types}) - {usec} usec {cycles} cycles{evt_str}")
    logger.info("=" * 100)

    def event_cycles(e):
        # Prefer the unwrapped counter so that events recorded after a 32-bit
        # wrap are ordered after, not before, the events that preceded it.
        return e.get('unwrapped_cycles', e['cycles'])

    events = sorted(events, key=event_cycles)
    if not events:
        logger.info(" No trace events recorded.")
        return

    min_cycles = event_cycles(events[0])

    logger.info(f"Cycles {'EventDetails':<30}" + " ".join(f"T{i:<2}" for i in range(10)) + " HMX")
    logger.info("-" * 100)

    # Per-thread stack of markers for events that have started but not stopped.
    thread_stacks = [[] for _ in range(11)]

    for e in events:
        t = e['thread']
        if t < 0 or t > 10:
            continue

        rel_cycles = event_cycles(e) - min_cycles
        state = e['state']
        evt_type = e['event']
        char = _TIMELINE_EVENT_CHARS.get(normalize_event_name(evt_type), '?')

        stack = thread_stacks[t]
        if state == 'start':
            stack.append(char)
        elif state == 'stop' and stack:
            if stack[-1] != char and char in stack:
                # Out-of-order stop: close the matching open event.
                stack.remove(char)
            else:
                # Matching top, or no matching start at all: drop the top.
                stack.pop()

        cols = [f"[{s[-1]}]" if s else " | " for s in thread_stacks]

        evt_desc = f"T{t}: {evt_type} {state} ({e['info']})"
        logger.info(f"{rel_cycles:10d} {evt_desc:<30}" + " ".join(cols[:10]) + " " + cols[10])
    logger.info("-" * 100)
|
|
|
|
|
|
def print_ascii_summary(op_name, dims, types, usec, cycles, events, evt_val=None):
    """Log, per thread, the total cycles spent in each kind of trace event.

    Start and stop events are paired on (thread, event, info).  The duration
    of each pair is added to the thread's total for the normalized event
    name, and each total is also shown as a percentage of *cycles*.  A stop
    without a preceding start is ignored.

    Args:
        op_name, dims, types, usec: Values shown in the banner.
        cycles: Op cycle count; shown in the banner and used as the
            denominator of the percentages (0% when it is not positive).
        events: Trace event dicts with keys 'thread', 'event', 'info',
            'cycles', 'state' and, optionally, 'unwrapped_cycles'.
        evt_val: Optional list of values appended to the banner as "evt [..]".
    """
    evt_str = ""
    if evt_val:
        evt_str = " - evt [" + ",".join(str(x) for x in evt_val) + "]"
    logger.info("=" * 100)
    logger.info(f"{op_name} ({dims} : {types}) - {usec} usec {cycles} cycles{evt_str}")
    logger.info("=" * 100)

    def event_cycles(e):
        # Prefer the unwrapped counter: with raw 32-bit values a span that
        # crosses the wrap would have its stop sorted before its start and
        # would never be paired.
        return e.get('unwrapped_cycles', e['cycles'])

    events = sorted(events, key=event_cycles)
    if not events:
        logger.info(" No trace events recorded.")
        return

    active_starts = {}
    thread_totals = defaultdict(lambda: defaultdict(int))

    for e in events:
        key = (e['thread'], e['event'], e['info'])
        cyc = event_cycles(e)
        state = e['state']

        if state == 'start':
            active_starts[key] = cyc
        elif state == 'stop' and key in active_starts:
            # Events are in cycle order, so the stop is never before the start.
            dur = cyc - active_starts.pop(key)
            thread_totals[e['thread']][normalize_event_name(e['event'])] += dur

    for t in sorted(thread_totals):
        thread_name = f"Thread {t} (HVX)" if t != 10 else "Thread 10 (HMX)"

        evt_strs = []
        for evt, dur in sorted(thread_totals[t].items(), key=lambda item: item[0]):
            pct = (dur / cycles * 100) if cycles > 0 else 0
            evt_strs.append(f"{evt} {dur} ({pct:.1f}%)")

        logger.info(f" {thread_name:<16}: " + " | ".join(evt_strs))
|
|
|
|
|
|
def _fit_cell(text, width):
    """Return *text* cut to at most *width* characters, ending in '...' when there is room."""
    if len(text) <= width:
        return text
    if width < 3:
        # No room for the ellipsis; a plain cut keeps the cell within width.
        return text[:width]
    return text[:width - 3] + "..."


def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None):
    """Log a table of aggregated statistics for the given ops.

    Ops are grouped by (name, dims, types).  For each group the count and the
    total/average/maximum usec, average/maximum cycles and (when available)
    average/maximum PMU value are computed.  Groups are sorted by *sort_col*
    (descending for numeric columns, ascending for text columns) and the
    first *top_n* are printed.

    Args:
        ops: Op records with keys 'name', 'dims', 'types', 'usec', 'cycles'
            and 'pmu_val'.
        top_n: Number of groups to print.
        width_overrides: Mapping of column name (a COL_MAP key) to column
            width.  A width of zero or less hides the column; cells longer
            than the width are truncated.
        sort_col: COL_MAP key of the column to sort by.
        pmu_name: When given, the PMU columns are shown and "PMU" in their
            headers is replaced by this name.
    """
    if not ops:
        logger.info("No valid records found.")
        return

    grouped = defaultdict(list)
    for op in ops:
        grouped[(op['name'], op['dims'], op['types'])].append(op)

    group_stats = []
    for (name, dims, types), group_ops in grouped.items():
        usecs = [o['usec'] for o in group_ops]
        cycles = [o['cycles'] for o in group_ops]
        pmu_vals = [o['pmu_val'] for o in group_ops if o['pmu_val'] is not None]

        count_val = len(group_ops)
        tot_usec_val = sum(usecs)  # exact total, no float round trip
        avg_usec_val = statistics.mean(usecs)
        avg_cycles_val = statistics.mean(cycles)
        max_pmu_val = max(pmu_vals) if pmu_vals else 0
        avg_pmu_val = statistics.mean(pmu_vals) if pmu_vals else 0

        group_stats.append({
            'op': name,
            'dims': dims,
            'dtypes': types,
            'count': str(count_val),
            'max_usec': str(max(usecs)),
            'avg_usec': f"{avg_usec_val:.2f}",
            'tot_usec': f"{tot_usec_val:.2f}",
            'max_cycles': str(max(cycles)),
            'avg_cycles': f"{avg_cycles_val:.2f}",
            'max_pmu': str(max_pmu_val),
            'avg_pmu': f"{avg_pmu_val:.2f}",
            # Numeric values for accurate sorting
            '_sort_count': count_val,
            '_sort_max_usec': max(usecs),
            '_sort_avg_usec': avg_usec_val,
            '_sort_tot_usec': tot_usec_val,
            '_sort_max_cycles': max(cycles),
            '_sort_avg_cycles': avg_cycles_val,
            '_sort_max_pmu': max_pmu_val,
            '_sort_avg_pmu': avg_pmu_val,
        })

    # Numeric sort keys all start with "_" and are sorted largest first;
    # text columns are sorted ascending.
    actual_sort_key = COL_MAP[sort_col][2]
    is_numeric = actual_sort_key.startswith("_")
    sorted_groups = sorted(group_stats, key=lambda x: x[actual_sort_key], reverse=is_numeric)[:top_n]

    # Define initial column order
    active_cols = ["op", "dims", "dtypes"]
    if pmu_name:
        active_cols += ["max-pmu", "avg-pmu"]
    active_cols += ["tot-usec", "avg-usec", "avg-cycles", "max-usec", "max-cycles", "count"]

    final_headers, final_keys, final_widths = [], [], []

    for col_name in active_cols:
        data_key, header_text, _ = COL_MAP[col_name]
        if "pmu" in col_name and pmu_name:
            header_text = header_text.replace("PMU", pmu_name)

        natural_width = max([len(str(row[data_key])) for row in sorted_groups] + [len(header_text)])
        target_width = width_overrides.get(col_name, natural_width)

        # Zero hides the column; a negative width is not a valid format
        # width, so it is treated the same way.
        if target_width <= 0:
            continue

        final_headers.append(header_text)
        final_keys.append(data_key)
        final_widths.append(target_width)

    # Print Report
    logger.info(f"\n# Profile Report (Top {top_n} Ops sorted by {sort_col})\n")
    header_line = "| " + " | ".join(f"{h:<{w}}" for h, w in zip(final_headers, final_widths)) + " |"
    sep_line = "| " + " | ".join("-" * w for w in final_widths) + " |"
    logger.info(header_line)
    logger.info(sep_line)

    for group in sorted_groups:
        row_vals = [
            f"{_fit_cell(str(group[key]), width):<{width}}"
            for key, width in zip(final_keys, final_widths)
        ]
        logger.info("| " + " | ".join(row_vals) + " |")
|
|
|
|
|
|
def main():
    """Command-line entry point: parse a log and print a report or timeline.

    Parses the arguments, reads the log with parse_log(), applies the
    optional --filter regex and --head/--tail limits, then either prints
    per-op timelines (--timeline) or the aggregated table.  Exits with
    status 1 on an invalid --filter regex or when sorting by a PMU column
    without --pmu-index.
    """
    parser = argparse.ArgumentParser(description="Post-process Op profile info.")
    parser.add_argument("logfile")
    parser.add_argument("-n", "--top", type=int, default=100)
    parser.add_argument("--sort", type=str, default="tot-usec", choices=list(COL_MAP.keys()))
    parser.add_argument("--pmu-index", type=int)
    parser.add_argument("--pmu-name", type=str)
    # The built-in 'dims:40' default is applied below rather than through
    # argparse, which would append user values to a shared default list.
    parser.add_argument("--width", action='append', default=None, help="Override column width, e.g. --width dims:50")
    parser.add_argument("--timeline", type=str, nargs='?', const='summary', choices=["summary", "diagram"],
                        help="Output ASCII art event summary or timing diagram (default: summary)")
    parser.add_argument("--filter", type=str, help="Regex filter matching against the original profile-op line")

    group = parser.add_mutually_exclusive_group()
    group.add_argument("--head", type=int, help="Limit to first N ops")
    group.add_argument("--tail", type=int, help="Limit to last N ops")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(message)s')

    if "pmu" in args.sort and args.pmu_index is None:
        logger.error(f"Cannot sort by '{args.sort}' without --pmu-index.")
        sys.exit(1)

    # Later entries win, so user-supplied widths override the default.
    overrides = {}
    for w in ["dims:40"] + (args.width or []):
        try:
            name, val = w.split(':')
            width = int(val)
        except ValueError:
            logger.warning(f"Invalid width format '{w}'")
            continue
        name = name.lower()
        if name not in COL_MAP:
            logger.warning(f"Unknown column '{name}' in width override '{w}'")
            continue
        overrides[name] = width

    final_pmu_name = (args.pmu_name or f"#{args.pmu_index}") if args.pmu_index is not None else None
    ops = parse_log(args.logfile, pmu_index=args.pmu_index)

    if args.filter:
        try:
            filter_re = re.compile(args.filter)
        except re.error as e:
            logger.error(f"Invalid regex filter: {e}")
            sys.exit(1)
        ops = [op for op in ops if filter_re.search(op['op_text'])]

    if args.head is not None:
        ops = ops[:args.head]
    elif args.tail is not None:
        # ops[-0:] would be the whole list, so "last 0 ops" is handled
        # explicitly to mirror what --head 0 does.
        ops = ops[-args.tail:] if args.tail != 0 else []

    if args.timeline:
        logger.info(f"\n# ASCII Timing {args.timeline.capitalize()}\n")
        render = print_ascii_summary if args.timeline == "summary" else print_ascii_timeline
        # Same [:top] limit as the report table, so --top 0 prints nothing.
        for op in ops[:args.top]:
            render(op['name'], op['dims'], op['types'], op['usec'], op['cycles'], op['trace_events'], op.get('evt_val'))
    else:
        generate_report(ops, args.top, overrides, args.sort, pmu_name=final_pmu_name)
|
|
|
|
|
|
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|