mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
* hex-mm: new weight layout and fusion updates * hvx-mm: unroll the new tiled vec_dots to optimize hvx register util * hex-mm: optimize dyn.quant format for q8_0 and q8_1 to reduce overhead in vec_dots. * hvx-mm: parallel quantizer per block for large rows * hvx-mm: simplify and further optimize dyn.quant and vec_dots * hvx-mm: keep intermediate per tile accumulators in fp16 * hmx-mm: optimize weight dequant by aligning the repacked tiles with the DMA * hmx-mm: remove qweight scratch and just use vtcm_weight * hmx-mm: remove all unused and obsolete code * hmx-mm: the new tiled repack format is here to stay -- rename all x4x2 to _tiled * hmx-mm: improve activation processing with dma prefetch * hex-mm: fix hmx/hvx fallback logic and MUL_MAT_ID allocation (unbreaks OLMoE) * hex-mm: align the weight tiles with dma just like we did in hmx-mm * hex-mm: factor out common mm bits into htp/matmul-ops.h * hex-mm: start moving mm kernel selection to the host * hex-mm: move all of the matmul param compute into the host * hmx-mm: restore pipelined mode * hmx-mm: unroll the dequant functions to optimize register usage * hmx-mm: further improve activation process * hex-mm: use vtcm_seq_alloc for all vtcm allocations and define more common functions * hex-mm: improve mm optimizer to account for number of activation threads * hex-mm: fix matmul-id kernel params selection (unbreaks OLMoE and LFM) * hexagon: remove support for arch < v73 since HMX is now required for most use-cases * hex-mm: cleanup naming for consistency * hex-mm: make sure matmul fusion accounts for vtcm allocation * hex-mm: minor cleanup for kernel_params definition * hex-mm: replace hardcoded limits with proper checks for vtcm requirements * hex-mm: add support for non-tiled mm as a fallback option and factor out hvx kernels into separate header * hex-mm: remove unused functions * hex-mm: add shorthand for MM_SELECT in run-tool script * hvx-mm: factor out hvx/hmx microkernels and unify matmul entry and dispatch * 
hex-mm: further cleanup matmul fallback path * hex-mm: refactor matmul entry point and dispatch a bit further * hexagon: update cmake build to enable hmx for everything * hex-ops: optimize kernel_param updates and include summary in the logs * hex-mm: add support for GGML_HEXAGON_MM_SELECT * hex-mm: add hex-common header * hex-mm: pass correct number of tasks to workpool * hex-mm: add proper checks for no-work in dyn.quant tasks * hex-mm: convert all quantizers into a macro * hex-mm: fix hvx-flat fallback to pass all MUL_MAT tests * hex-mm: vectorize q8_1 quantizer * hex-mm: improve fused ffn mm stride handling * hex-mm: consistent use of n_threads and pipeline in kernel_params * hexagon: minor formatting * hex-mm: update MUL_MAT_ID kernel_param handling to make sure host/npu are in sync * hvx-mm: go back to accumulating in fp32 in tiled hvx kernels, more accurate and same perf * hvx-mm: unroll the loops and remove masking that is not needed for tiled accums * hmx-mm: optimize activation processing (split loops, some unrolling, etc) * hmx-mm: minor optimization for output processing * hex-mm: consistent use of uint32_t and size_t in mm kernels * hex-mm: remove legacy restrictions for rows to be multiple of 256 * hexagon: replace sprintf with snprintf * hex-mm: relax hardcoded nrows checks and rely on VTCM size requirements * hexagon: minor alignment fix * hexagon: fix trailing spaces * hex-mm: relax padding from 256 to 128 (leftovers) * hex-mm: remove redundant checks for weight align to 128 we always use 2D dma for the weights and align them properly * hmx-mm: MUL_MAT_ID better work distribution between hvx threads and hmx tracing * hex-mm: specialize per-token mmid activation handling * hex-profile: update python scripts to handle kernel-params section in the logging output * hex-mm: move n_prefetch (aka dma_depth) into kernel params and remove unused fields * hex-trace: use easier to parse format, simplify and fix post-proc scripts * hmx-mm: relax 32 row limit for 
output processing which helps utilization * hmx-mm: use start-chunk idx for tracing info * hmx-mm: parameterize activation dma pipeline * hexagon: add support for simple graph caching to avoid recomputing kernel-params * hex-mm: remove left-over repack functions * hex-mm: tighten n_prefetch asserts * hex-mm: remove duplicate round/align_up helper * hexagon: cleanup common header used in host/npu * hexagon: update early wakeup threshold * hmx-mm: define cost constants and update solver to assume that repacked ne[1] is padded to 32 * hmx-mm: make precompute_matmul a bit more readable (split into smaller functions, etc) * hex-mm: remove n_threads constraint * hex-mm: minor formatting updates * hex-mm: remove obsolete profiling logs * hex-mm: restore hardcode gate to refuse lm-head to avoid repacking that tensor
156 lines
5.4 KiB
CMake
156 lines
5.4 KiB
CMake
# Include guard: the first pass records a marker variable; any later pass
# that still sees the marker stops processing this file immediately.
if(NOT HEXAGON_TOOLCHAIN_INCLUDED)
  set(HEXAGON_TOOLCHAIN_INCLUDED true)
else()
  return()
endif()

# Cross-compilation target description for Hexagon.
set(HEXAGON TRUE)

set(CMAKE_SYSTEM_NAME      QURT)
set(CMAKE_SYSTEM_PROCESSOR Hexagon)
# The system version is pinned to "1"; HEXAGON_PLATFORM_LEVEL is not consulted.
set(CMAKE_SYSTEM_VERSION   "1")

# find_*() search policy: programs are never looked up under the find root,
# while libraries, headers and packages are looked up only there.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

# No custom run-elf location by default.
set(CUSTOM_RUNELF_PATH "")

# Resolve the Hexagon SDK location.
# A value that is already set (for example via -DHEXAGON_SDK_ROOT=...) wins;
# otherwise the HEXAGON_SDK_ROOT environment variable is used.
if(NOT HEXAGON_SDK_ROOT)
  set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
endif()

# The SDK root is used later in this file to build the path of an included
# SDK script and of the QuRT directories. Fail here with a clear message
# instead of letting an empty root produce paths such as "/build/cmake/...".
if(NOT HEXAGON_SDK_ROOT)
  message(FATAL_ERROR
    "HEXAGON_SDK_ROOT is not set. Pass -DHEXAGON_SDK_ROOT=<path> or export "
    "the HEXAGON_SDK_ROOT environment variable.")
endif()

# Resolve the Hexagon tools location, in order of precedence:
#   1. an already-set HEXAGON_TOOLS_ROOT variable,
#   2. the HEXAGON_TOOLS_ROOT environment variable,
#   3. the DEFAULT_HEXAGON_TOOLS_ROOT environment variable.
if(NOT HEXAGON_TOOLS_ROOT)
  if(DEFINED ENV{HEXAGON_TOOLS_ROOT})
    set(HEXAGON_TOOLS_ROOT $ENV{HEXAGON_TOOLS_ROOT})
  endif()
  if(NOT HEXAGON_TOOLS_ROOT)
    set(HEXAGON_TOOLS_ROOT $ENV{DEFAULT_HEXAGON_TOOLS_ROOT})
  endif()
endif()

# Normalise both roots to CMake-style paths (forward slashes).
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)

message(DEBUG "CMAKE_HOST_SYSTEM_NAME:${CMAKE_HOST_SYSTEM_NAME}")

# Toolchain executables get an ".exe" suffix when the build host is Windows;
# on any other host the suffix variable is not touched here.
if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
  set(HEXAGON_TOOLCHAIN_SUFFIX .exe)
endif()

# Architecture helper script shipped with the SDK.
# NOTE(review): presumably this defines HEXAGON_ARCH (and related variables)
# consumed further down - confirm against the SDK.
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_arch.cmake)

# Locations derived from the tools root.
set(HEXAGON_TOOLCHAIN ${HEXAGON_TOOLS_ROOT})
set(HEXAGON_LIB_DIR   "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib")
set(HEXAGON_ISS_DIR   "${HEXAGON_TOOLCHAIN}/Tools/lib/iss")

# Variables that CMake forwards into try_compile() projects, so that this
# toolchain file sees the same SDK and tools roots there.
set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES HEXAGON_SDK_ROOT HEXAGON_TOOLS_ROOT)

# QuRT related include and library locations.
set(V_ARCH ${HEXAGON_ARCH})

# Default QuRT install directory inside the SDK.
# (An earlier "ADSP${V_ARCH}MP${V_ARCH_EXTN}" assignment was dropped: it was
# overwritten unconditionally by this one and never took effect.)
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}")

# When TREE matches PAKMAN the QuRT image comes from QURT_IMAGE_DIR instead.
# Both operands are quoted so that an unset or empty TREE is compared as an
# empty string rather than disappearing from the condition.
if("${TREE}" MATCHES "PAKMAN")
  set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}")
endif()
message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}")

set(RTOS_DIR ${_QURT_INSTALL_DIR})
# HEXAGON_QCC_DIR is not set in this file; it must come from elsewhere.
set(QCC_DIR "${HEXAGON_QCC_DIR}/${V_ARCH}/G0")
set(TARGET_DIR "${HEXAGON_LIB_DIR}/${V_ARCH}/G0")

# QuRT headers: top-level, qurt and posix include directories, added in
# that order for everything configured with this toolchain.
include_directories(
    ${_QURT_INSTALL_DIR}/include
    ${_QURT_INSTALL_DIR}/include/qurt
    ${_QURT_INSTALL_DIR}/include/posix
)

# Objects and archives placed at the start of a QuRT link line.
# The list is assembled first and then flattened into a single
# space-separated string.
set(QURT_START_LINK_LIBS
    "${TARGET_DIR}/init.o"
    "${RTOS_DIR}/lib/crt1.o"
    "${RTOS_DIR}/lib/debugmon.o"
    "${RTOS_DIR}/lib/libqurt.a"
    "${TARGET_DIR}/libc.a"
    "${TARGET_DIR}/libqcc.a"
    "${TARGET_DIR}/libhexagon.a"
    "${RTOS_DIR}/lib/libqurtcfs.a"
    "${RTOS_DIR}/lib/libtimer_island.a"
    "${RTOS_DIR}/lib/libtimer_main.a"
    "${RTOS_DIR}/lib/libposix.a"
)
string(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}")

# Object placed at the end of a QuRT link line.
set(QURT_END_LINK_LIBS ${TARGET_DIR}/fini.o)

# Non-QuRT related includes and linker flags.
set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}")

# Unless NO_WRAP_MEM_API is set, define linker --wrap options for the
# memory allocation entry points. When it is set, none of the WRAP_*
# variables are defined by this file.
if(NOT NO_WRAP_MEM_API)
  set(WRAP_MALLOC   -Wl,--wrap=malloc)
  set(WRAP_CALLOC   -Wl,--wrap=calloc)
  set(WRAP_FREE     -Wl,--wrap=free)
  set(WRAP_REALLOC  -Wl,--wrap=realloc)
  set(WRAP_MEMALIGN -Wl,--wrap=memalign)
endif()

# Architecture selection flags (CPU, HVX and HMX), all keyed off V_ARCH.
set(ARCH_FLAGS "-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -mhmx")

# Argument template for linking a position-independent shared object.
# The <...> tokens are left verbatim in the string as placeholders.
# The WRAP_* entries expand to nothing when those variables are undefined.
# The list is flattened to one space-separated string right after it is built.
set(PIC_SHARED_LD_FLAGS
    ${ARCH_FLAGS}
    -G0
    -fpic
    -Wl,-Bsymbolic
    -Wl,-L${TARGET_DIR_NOOS}/G0/pic
    -Wl,-L${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/
    -Wl,--no-threads
    ${WRAP_MALLOC}
    ${WRAP_CALLOC}
    ${WRAP_FREE}
    ${WRAP_REALLOC}
    ${WRAP_MEMALIGN}
    -shared
    "-o <TARGET> <SONAME_FLAG><TARGET_SONAME>"
    "<LINK_FLAGS>"
    -Wl,--start-group
    "<OBJECTS>"
    "<LINK_LIBRARIES>"
    -Wl,--end-group
    -lc
)
string(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}")

# Same flattened string, published under the name used by the link rules.
set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}")

# SDK headers, registered as SYSTEM include directories:
# general SDK includes, the stddef headers, and the FastRPC IPC headers.
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs)
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef)
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs)

# LLVM toolchain setup: every tool lives in <toolchain>/Tools/bin and takes
# the host-dependent executable suffix.
set(CMAKE_C_COMPILER   ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
# Assembly sources are driven through the C++ compiler driver.
set(CMAKE_ASM_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_AR           ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX})

# Linking is performed by invoking the C compiler driver.
set(HEXAGON_LINKER ${CMAKE_C_COMPILER})

set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon)

# Flag prefix used to pass the soname to the linker, for C and C++.
set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG   "-Wl,-soname,")
set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")

# Compiler options.
# COMMON_FLAGS is shared by every language and configuration; XQF_ARGS is
# appended as-is and is not set in this file.
set(COMMON_FLAGS "${ARCH_FLAGS} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")

# C++ per-configuration flags.
set(CMAKE_CXX_FLAGS_DEBUG          "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
set(CMAKE_CXX_FLAGS_RELEASE        "${COMMON_FLAGS} -O2")

# C per-configuration flags (same values as for C++).
set(CMAKE_C_FLAGS_DEBUG          "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
set(CMAKE_C_FLAGS_RELEASE        "${COMMON_FLAGS} -O2")

# Assembly reuses the C++ flags of the matching configuration. Those strings
# already begin with COMMON_FLAGS, so it is not prepended a second time;
# doing so only repeated every common flag on the assembler command line.
set(CMAKE_ASM_FLAGS_DEBUG          "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_ASM_FLAGS_RELEASE        "${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_ASM_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")

# Linker options: the rule used to create shared libraries is the Hexagon
# linker driver followed by the PIC shared-object argument template, and it
# is the same for C and C++.
set(CMAKE_C_CREATE_SHARED_LIBRARY
    "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}")
set(CMAKE_CXX_CREATE_SHARED_LIBRARY
    "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}")