diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 0ec62a3773..dd354db3ec 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -340,9 +340,6 @@ set(GGML_PUBLIC_HEADERS include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") -#if (GGML_METAL) -# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal") -#endif() install(TARGETS ggml LIBRARY PUBLIC_HEADER) install(TARGETS ggml-base LIBRARY) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index 42054d841a..f02654a2e7 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -24,62 +24,119 @@ if (GGML_METAL_NDEBUG) endif() set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h") +set(METALLIB_KERNELS_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/kernels/common.h") +set(METALLIB_KERNELS_DEQUANTIZE "${CMAKE_CURRENT_SOURCE_DIR}/kernels/dequantize.h") +set(METALLIB_KERNELS_QUANTIZE "${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantize.h") + +set(METALLIB_KERNEL_SOURCES + kernels/fa.metal + kernels/mul_mv.metal + kernels/mul_mm.metal + kernels/quantize.metal + kernels/softmax.metal + kernels/norm.metal + kernels/unary.metal + kernels/binbcast.metal + kernels/reduce.metal + kernels/tri.metal + kernels/ssm.metal + kernels/wkv.metal + kernels/gated_delta_net.metal + kernels/solve_tri.metal + kernels/rope.metal + kernels/conv.metal + kernels/upscale.metal + kernels/argsort.metal + kernels/pool.metal + kernels/misc.metal +) + if (GGML_METAL_EMBED_LIBRARY) enable_language(ASM) add_compile_definitions(GGML_METAL_EMBED_LIBRARY) - set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") - set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h") + set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h") file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated") - # merge ggml-common.h and ggml-metal.metal into a single file - set(METALLIB_EMBED_ASM "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.s") - set(METALLIB_SOURCE_EMBED "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") - set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp") + set(METALLIB_EMBED_ASM_FILES "") + foreach(src ${METALLIB_KERNEL_SOURCES}) + get_filename_component(kind ${src} NAME_WE) + # symbol names must be valid C identifiers ('-' is not allowed) + string(REPLACE "-" "_" kind_sym ${kind}) - add_custom_command( - OUTPUT "${METALLIB_EMBED_ASM}" - COMMAND echo "Embedding Metal library" - COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}" - COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}" - COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}" - COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}" - COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}" - COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}" - COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}" - COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}" - DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h - COMMENT "Generate assembly for embedded Metal library" - VERBATIM - ) + set(SRC "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kind}.metal") + set(EMBED "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed-${kind}.metal") + set(ASM "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed-${kind}.s") - target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}") + # only prepend headers that this source actually includes + set(HEADERS_FOR_SRC ${METALLIB_KERNELS_COMMON}) + file(STRINGS ${SRC} _has_dequantize REGEX "#include \"dequantize\\.h\"") + file(STRINGS ${SRC} _has_quantize REGEX "#include \"quantize\\.h\"") + if(_has_dequantize) + list(APPEND HEADERS_FOR_SRC ${METALLIB_KERNELS_DEQUANTIZE}) + endif() + if(_has_quantize) + list(APPEND HEADERS_FOR_SRC ${METALLIB_KERNELS_QUANTIZE}) + endif() + + add_custom_command( + OUTPUT "${ASM}" + # Step 1: concatenate shared headers + this kernel source + COMMAND cat ${HEADERS_FOR_SRC} ${SRC} > "${EMBED}.tmp1" + # Step 2: remove internal #include and #pragma once + COMMAND sed -e "/\#include \"common.h\"/d" -e "/\#include \"dequantize.h\"/d" -e "/\#include \"quantize.h\"/d" -e "/\#pragma once/d" < "${EMBED}.tmp1" > "${EMBED}.tmp2" + # Step 3: inline ggml-common.h (replacing __embed_ggml-common.h__ sentinel) + COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${EMBED}.tmp2" > "${EMBED}.tmp3" + # Step 4: inline ggml-metal-impl.h + COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${EMBED}.tmp3" > "${EMBED}" + # Step 5: emit an asm chunk with kind-specific start/end symbols + # note: '-' is illegal in C symbols, so we use kind_sym; the macOS + # section name is limited to 16 chars so we keep it shared + # across kinds (__ggml_metallib) and only vary the global symbols. + COMMAND echo ".section __DATA,__ggml_metallib" > "${ASM}" + COMMAND echo ".globl _ggml_metallib_${kind_sym}_start" >> "${ASM}" + COMMAND echo "_ggml_metallib_${kind_sym}_start:" >> "${ASM}" + COMMAND echo .incbin "\"${EMBED}\"" >> "${ASM}" + COMMAND echo ".globl _ggml_metallib_${kind_sym}_end" >> "${ASM}" + COMMAND echo "_ggml_metallib_${kind_sym}_end:" >> "${ASM}" + DEPENDS ../ggml-common.h ggml-metal-impl.h + kernels/common.h kernels/dequantize.h kernels/quantize.h + kernels/${kind}.metal + COMMENT "Generate embedded Metal library for ${kind}" + VERBATIM + ) + + list(APPEND METALLIB_EMBED_ASM_FILES "${ASM}") + endforeach() + + target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM_FILES}) else() - # copy metal files to bin directory + # copy header files to bin directory configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) - configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY) + file(MAKE_DIRECTORY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels") + configure_file(kernels/common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels/common.h COPYONLY) + configure_file(kernels/dequantize.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels/dequantize.h COPYONLY) + configure_file(kernels/quantize.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels/quantize.h COPYONLY) + + foreach(src ${METALLIB_KERNEL_SOURCES}) + configure_file(${src} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${src} COPYONLY) + endforeach() + if (GGML_METAL_SHADER_DEBUG) - # custom command to do the following: - # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib - # - # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works - # disabling fast math is needed in order to pass tests/test-backend-ops + # note: disabling fast math is needed in order to pass tests/test-backend-ops # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 # note: unfortunately, we have to call it default.metallib instead of ggml.metallib # ref: https://github.com/ggml-org/whisper.cpp/issues/1720 # note: adding -g causes segmentation fault during compile - #set(XC_FLAGS -fno-fast-math -fno-inline -g) set(XC_FLAGS -fno-fast-math -fno-inline) else() set(XC_FLAGS -O3) endif() - # Append macOS metal versioning flags if (GGML_METAL_MACOSX_VERSION_MIN) message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation") list (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN}) @@ -90,35 +147,46 @@ else() list (APPEND XC_FLAGS -std=${GGML_METAL_STD}) endif() + # Compile each kernel source to .air, then link into default.metallib + set(AIR_FILES "") + foreach(src ${METALLIB_KERNEL_SOURCES}) + get_filename_component(name ${src} NAME_WE) + set(AIR "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name}.air") + list(APPEND AIR_FILES ${AIR}) + add_custom_command( + OUTPUT ${AIR} + COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -I ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${src} -o ${AIR} + DEPENDS ${src} kernels/common.h kernels/dequantize.h kernels/quantize.h ${METALLIB_COMMON} ggml-metal-impl.h + COMMENT "Compiling ${src}" + VERBATIM + ) + endforeach() + add_custom_command( OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - | - xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND xcrun -sdk macosx metallib ${AIR_FILES} -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal - DEPENDS ggml-metal.metal ${METALLIB_COMMON} - COMMENT "Compiling Metal kernels" - ) + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h + COMMAND rm -rf ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels + DEPENDS ${AIR_FILES} + COMMENT "Linking Metal kernels into default.metallib" + ) - # FIXME: only add to the ggml-metal target? add_custom_target( ggml-metal-lib ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - ) + ) endif() # GGML_METAL_EMBED_LIBRARY if (NOT GGML_METAL_EMBED_LIBRARY) install( - FILES src/ggml-metal/ggml-metal.metal - PERMISSIONS - OWNER_READ - OWNER_WRITE - GROUP_READ - WORLD_READ - DESTINATION ${CMAKE_INSTALL_BINDIR}) + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ + DESTINATION ${CMAKE_INSTALL_BINDIR}/kernels + FILES_MATCHING PATTERN "*.metal" PATTERN "*.h" + ) - install( - FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - DESTINATION ${CMAKE_INSTALL_BINDIR} - ) + install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DESTINATION ${CMAKE_INSTALL_BINDIR} + ) endif() diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index a7cbc60ebe..c5b0a79af9 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -94,8 +94,63 @@ int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_wi return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup; } +// +// MTLLibrary collection (one library per op-source, compiled separately) +// + +// Single source of truth for the per-kind metal libraries. The order here +// defines the enum values and every per-kind table below, so adding a library +// is a one-line change here (plus adding its source to CMakeLists.txt). +// X(suffix, name): name is both the kernels/.metal basename and the +// ggml_metallib__{start,end} embed-symbol stem. +#define GGML_METAL_LIBS \ + X(FA, fa) \ + X(MUL_MV, mul_mv) \ + X(MUL_MM, mul_mm) \ + X(QUANTIZE, quantize) \ + X(SOFTMAX, softmax) \ + X(NORM, norm) \ + X(UNARY, unary) \ + X(BINBCAST, binbcast) \ + X(REDUCE, reduce) \ + X(TRI, tri) \ + X(SSM, ssm) \ + X(WKV, wkv) \ + X(GATED_DELTA_NET, gated_delta_net)\ + X(SOLVE_TRI, solve_tri) \ + X(ROPE, rope) \ + X(CONV, conv) \ + X(UPSCALE, upscale) \ + X(ARGSORT, argsort) \ + X(POOL, pool) \ + X(MISC, misc) + +enum ggml_metal_lib_kind { +#define X(e, s) GGML_METAL_LIB_##e, + GGML_METAL_LIBS +#undef X + GGML_METAL_LIB_COUNT, +}; + +static const char * const k_lib_names[GGML_METAL_LIB_COUNT] = { +#define X(e, s) [GGML_METAL_LIB_##e] = #s, + GGML_METAL_LIBS +#undef X +}; + struct ggml_metal_library { - id obj; + // Per-kind compiled libraries. When single_library is true, the whole library + // (e.g. a pre-compiled default.metallib or a from-source build) lives at + // objs[0] and the remaining slots are nil. + id objs[GGML_METAL_LIB_COUNT]; + bool single_library; // true: combined library at objs[0]; false: per-kind libs in objs[*] + + // Routing table: kernel function name -> objs[] index, populated from each + // compiled library's -[MTLLibrary functionNames]. The actual compiled + // libraries are the single source of truth for which library owns a kernel, + // so adding kernels later requires no manual routing maintenance. + // nil in single_library mode (everything resolves to objs[0]). + NSMutableDictionary * fn_to_lib; ggml_metal_device_t dev; ggml_metal_pipelines_t pipelines; // cache of compiled pipelines @@ -103,160 +158,376 @@ struct ggml_metal_library { NSLock * lock; }; -ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) { - id library = nil; - id device = ggml_metal_device_get_obj(dev); +// Build the fn_to_lib routing table by querying each compiled library's public +// function names. Call once after all per-kind libraries have been compiled. +static void ggml_metal_library_build_index(ggml_metal_library_t lib) { + @autoreleasepool { + NSMutableDictionary * index = [[NSMutableDictionary alloc] init]; + for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) { + for (NSString * fname in [lib->objs[kind] functionNames]) { + index[fname] = @(kind); + } + } + lib->fn_to_lib = index; + } +} - // load library - // - // - first check if the library is embedded - // - then check if the library is in the bundle - // - if not found, load the source and compile it - // - if that fails, return NULL - // - // TODO: move to a function - { - const int64_t t_start = ggml_time_us(); +// Parse a `#include "name"` line. Returns the quoted name in *include_name on +// success. Whitespace-tolerant; ignores `#include <...>` (system headers). +static bool ggml_metal_library_parse_quoted_include(NSString * line, NSString ** include_name) { + NSScanner * scanner = [NSScanner scannerWithString:line]; + scanner.charactersToBeSkipped = [NSCharacterSet whitespaceCharacterSet]; - NSError * error = nil; - NSString * src = nil; + if (![scanner scanString:@"#" intoString:NULL] || + ![scanner scanString:@"include" intoString:NULL] || + ![scanner scanString:@"\"" intoString:NULL]) { + return false; + } -#if GGML_METAL_EMBED_LIBRARY - GGML_LOG_INFO("%s: using embedded metal library\n", __func__); + NSString * name = nil; + if (![scanner scanUpToString:@"\"" intoString:&name]) { + return false; + } - extern const char ggml_metallib_start[]; - extern const char ggml_metallib_end[]; + if (include_name) { + *include_name = name; + } + return true; +} - src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; -#else +// Recursively inline `#include "name"` directives. System includes (<...>), +// `#if/#else/#endif`, and other preprocessor lines are passed through to the +// Metal compiler unchanged. `#pragma once` is dropped since `seen` already +// guards against double-inclusion. +static bool ggml_metal_library_flatten_file(NSMutableString * dst, NSString * path, + NSArray * search_paths, + NSMutableSet * seen, NSError ** error) { + NSString * key = [path stringByStandardizingPath]; + if ([seen containsObject:key]) { + return true; + } + [seen addObject:key]; -#ifdef SWIFT_PACKAGE - NSBundle * bundle = SWIFTPM_MODULE_BUNDLE; -#else - NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; -#endif + NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:error]; + if (!src) { + return false; + } - NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; - if (path_lib == nil) { - // Try to find the resource in the directory where the current binary located. - NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0]; - NSString * bin_dir = [bin_cur stringByDeletingLastPathComponent]; + NSFileManager * fm = [NSFileManager defaultManager]; + for (NSString * line in [src componentsSeparatedByString:@"\n"]) { + NSString * trimmed = [line stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceCharacterSet]]; + if ([trimmed isEqualToString:@"#pragma once"]) { + continue; + } - NSString * path_lib_default = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]]; - if ([[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) { - GGML_LOG_INFO("%s: found '%s'\n", __func__, [path_lib_default UTF8String]); - - NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:path_lib_default error:&error]; - if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) { - // Optionally, if this is a symlink, try to resolve it. - path_lib_default = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:path_lib_default error:&error]; - if (path_lib_default && [path_lib_default length] > 0 && ![[path_lib_default substringToIndex:1] isEqualToString:@"/"]) { - // It is a relative path, adding the binary directory as directory prefix. - path_lib_default = [NSString pathWithComponents:@[bin_dir, path_lib_default]]; - } - if (!path_lib_default || ![[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) { - // Link to the resource could not be resolved. - path_lib_default = nil; - } else { - GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [path_lib_default UTF8String]); - } + NSString * include_name = nil; + if (ggml_metal_library_parse_quoted_include(line, &include_name)) { + NSString * resolved = nil; + for (NSString * dir in search_paths) { + NSString * candidate = [dir stringByAppendingPathComponent:include_name]; + if ([fm isReadableFileAtPath:candidate]) { + resolved = candidate; + break; } - } else { - // The resource couldn't be found in the binary's directory. - path_lib_default = nil; } - - path_lib = path_lib_default; + if (!resolved) { + if (error) { + NSString * msg = [NSString stringWithFormat:@"could not resolve include \"%@\" from '%@'", include_name, path]; + *error = [NSError errorWithDomain:@"ggml-metal-source-flatten" code:1 + userInfo:@{NSLocalizedDescriptionKey: msg}]; + } + return false; + } + if (!ggml_metal_library_flatten_file(dst, resolved, search_paths, seen, error)) { + return false; + } + continue; } - if (path_lib != nil) { - // pre-compiled library found - NSURL * libURL = [NSURL fileURLWithPath:path_lib]; - GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); + [dst appendString:line]; + [dst appendString:@"\n"]; + } - library = [device newLibraryWithURL:libURL error:&error]; - if (error) { - GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return nil; - } - } else { - GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); + return true; +} - NSString * path_source; - NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; +static NSString * ggml_metal_library_flatten_source(NSString * path_source, NSError ** error) { + // Search paths cover both runtime layout (build/bin/kernels + build/bin) + // and source-tree layout (ggml/src/ggml-metal/kernels + ggml/src/ggml-metal + ggml/src). + NSString * path_kernels = [path_source stringByDeletingLastPathComponent]; + NSString * path_base = [path_kernels stringByDeletingLastPathComponent]; + NSArray * search_paths = @[ + path_kernels, + path_base, + [path_base stringByDeletingLastPathComponent], + ]; - GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil"); + NSMutableString * src = [[NSMutableString alloc] init]; + NSMutableSet * seen = [NSMutableSet set]; - if (path_resource) { - path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"]; - } else { - path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + if (!ggml_metal_library_flatten_file(src, path_source, search_paths, seen, error)) { + [src release]; + return nil; + } + return src; +} + +// Compile all per-kind libraries in parallel. `source_for_kind` returns the MSL +// source for a kind (the helper takes ownership and releases it), or nil with +// *err set on failure. On success the objs[] slots are populated and the routing +// index is built; on any failure every error is logged and false is returned +// (the caller is responsible for freeing `res`). +static bool ggml_metal_library_compile_all( + ggml_metal_library_t res, + id device, + NSDictionary * prep, + NSString * (^source_for_kind)(int kind, NSError ** err), + const char * origin) { + const int64_t t_start = ggml_time_us(); + + int64_t * t_per_lib = calloc(GGML_METAL_LIB_COUNT, sizeof(int64_t)); + NSError ** err_per_lib = calloc(GGML_METAL_LIB_COUNT, sizeof(NSError *)); + __block atomic_bool any_failure = false; + + dispatch_group_t group = dispatch_group_create(); + dispatch_queue_t queue = dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0); + + for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) { + dispatch_group_async(group, queue, ^{ + + const int64_t t0 = ggml_time_us(); + + NSError * error = nil; + + NSString * src = source_for_kind(kind, &error); + if (!src) { + err_per_lib[kind] = [error retain]; + atomic_store(&any_failure, true); + return; } - if (path_source == nil) { - GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); - path_source = @"ggml-metal.metal"; - } + id lib = nil; - GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); - - src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; - if (error) { - GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return nil; - } - } -#endif - - if (!library) { @autoreleasepool { - // dictionary of preprocessor macros - NSMutableDictionary * prep = [NSMutableDictionary dictionary]; - - if (ggml_metal_device_get_props(dev)->has_bfloat) { - [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"]; - } - - if (ggml_metal_device_get_props(dev)->has_tensor) { - [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"]; - } - -#if GGML_METAL_EMBED_LIBRARY - [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"]; -#endif - MTLCompileOptions * options = [MTLCompileOptions new]; options.preprocessorMacros = prep; - //[options setFastMathEnabled:false]; + lib = [device newLibraryWithSource:src options:options error:&error]; - library = [device newLibraryWithSource:src options:options error:&error]; - if (error) { - GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return nil; - } - -#if !__has_feature(objc_arc) [options release]; -#endif + + // retain the error before the autorelease pool drains it + if (!lib) { + err_per_lib[kind] = [error retain]; + } + } + + [src release]; + + t_per_lib[kind] = ggml_time_us() - t0; + + if (!lib) { + atomic_store(&any_failure, true); + return; + } + + res->objs[kind] = lib; + }); + } + dispatch_group_wait(group, DISPATCH_TIME_FOREVER); + dispatch_release(group); + + const bool ok = !atomic_load(&any_failure); + + if (ok) { + const int64_t t_total = ggml_time_us() - t_start; + int64_t t_max = 0; + for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) { + GGML_LOG_DEBUG("%s: compiled '%s' library in %.3f sec\n", + __func__, k_lib_names[kind], t_per_lib[kind] / 1e6); + if (t_per_lib[kind] > t_max) t_max = t_per_lib[kind]; + } + GGML_LOG_INFO("%s: loaded %d libraries from %s in %.3f sec (max single = %.3f sec)\n", + __func__, GGML_METAL_LIB_COUNT, origin, t_total / 1e6, t_max / 1e6); + + ggml_metal_library_build_index(res); + } else { + for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) { + if (err_per_lib[kind]) { + GGML_LOG_ERROR("%s: failed to build '%s' library: %s\n", __func__, + k_lib_names[kind], [[err_per_lib[kind] description] UTF8String]); + [err_per_lib[kind] release]; } } - -#if GGML_METAL_EMBED_LIBRARY - [src release]; -#endif // GGML_METAL_EMBED_LIBRARY - - GGML_LOG_INFO("%s: loaded in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6); } - ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library)); + free(err_per_lib); + free(t_per_lib); - res->obj = library; + return ok; +} + +ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) { + id device = ggml_metal_device_get_obj(dev); + + ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library)); res->dev = dev; res->pipelines = ggml_metal_pipelines_init(); res->lock = [NSLock new]; + // shared MTLCompileOptions preprocessor macros (matches the build-time defines) + NSMutableDictionary * prep = [NSMutableDictionary dictionary]; + if (ggml_metal_device_get_props(dev)->has_bfloat) { + [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"]; + } + if (ggml_metal_device_get_props(dev)->has_tensor) { + [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"]; + } +#if GGML_METAL_EMBED_LIBRARY + [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"]; +#endif + +#if GGML_METAL_EMBED_LIBRARY + GGML_LOG_INFO("%s: using embedded metal library\n", __func__); + + // start/end symbols emitted by CMake (see CMakeLists.txt), one pair per kind +#define X(e, s) extern const char ggml_metallib_##s##_start[]; extern const char ggml_metallib_##s##_end[]; + GGML_METAL_LIBS +#undef X + + static const char * const lib_start[GGML_METAL_LIB_COUNT] = { +#define X(e, s) [GGML_METAL_LIB_##e] = ggml_metallib_##s##_start, + GGML_METAL_LIBS +#undef X + }; + static const char * const lib_end[GGML_METAL_LIB_COUNT] = { +#define X(e, s) [GGML_METAL_LIB_##e] = ggml_metallib_##s##_end, + GGML_METAL_LIBS +#undef X + }; + + const bool ok = ggml_metal_library_compile_all(res, device, prep, + ^NSString * (int kind, NSError ** err) { + (void) err; + return [[NSString alloc] initWithBytes:lib_start[kind] + length:(lib_end[kind] - lib_start[kind]) + encoding:NSUTF8StringEncoding]; + }, "embedded data"); + + if (!ok) { + ggml_metal_library_free(res); + return NULL; + } + return res; +#else +#ifdef SWIFT_PACKAGE + NSBundle * bundle = SWIFTPM_MODULE_BUNDLE; +#else + NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; +#endif + + const int64_t t_start = ggml_time_us(); + + NSError * error = nil; + NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (path_lib == nil) { + // Try to find the resource in the directory where the current binary located. + NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0]; + NSString * bin_dir = [bin_cur stringByDeletingLastPathComponent]; + + NSString * path_lib_default = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]]; + if ([[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) { + GGML_LOG_INFO("%s: found '%s'\n", __func__, [path_lib_default UTF8String]); + + NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:path_lib_default error:&error]; + if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) { + // Optionally, if this is a symlink, try to resolve it. + path_lib_default = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:path_lib_default error:&error]; + if (path_lib_default && [path_lib_default length] > 0 && ![[path_lib_default substringToIndex:1] isEqualToString:@"/"]) { + // It is a relative path, adding the binary directory as directory prefix. + path_lib_default = [NSString pathWithComponents:@[bin_dir, path_lib_default]]; + } + if (!path_lib_default || ![[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) { + // Link to the resource could not be resolved. + path_lib_default = nil; + } else { + GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [path_lib_default UTF8String]); + } + } + } else { + // The resource couldn't be found in the binary's directory. + path_lib_default = nil; + } + + path_lib = path_lib_default; + } + + if (path_lib != nil) { + // pre-compiled library found: a single combined default.metallib + NSURL * libURL = [NSURL fileURLWithPath:path_lib]; + GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); + + res->objs[0] = [device newLibraryWithURL:libURL error:&error]; + res->single_library = true; + if (!res->objs[0]) { + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + ggml_metal_library_free(res); + return NULL; + } + + GGML_LOG_INFO("%s: loaded in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6); + return res; + } + + // no pre-compiled metallib: fall back to compiling each kernel source separately + GGML_LOG_INFO("%s: default.metallib not found, loading kernel sources\n", __func__); + + NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; + if (path_resource) { + GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, [path_resource UTF8String]); + } + + // resolve each kind's source path up front (file lookup/logging stays on the calling thread) + NSString ** path_per_kind = calloc(GGML_METAL_LIB_COUNT, sizeof(NSString *)); + for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) { + NSString * rel = [NSString stringWithFormat:@"kernels/%s.metal", k_lib_names[kind]]; + + NSString * path_source = nil; + if (path_resource) { + path_source = [path_resource stringByAppendingPathComponent:rel]; + } else { + NSString * stem = [NSString stringWithFormat:@"kernels/%s", k_lib_names[kind]]; + path_source = [bundle pathForResource:stem ofType:@"metal"]; + } + + if (path_source == nil || ![[NSFileManager defaultManager] isReadableFileAtPath:path_source]) { + GGML_LOG_WARN("%s: could not locate %s in bundle, falling back to cwd\n", __func__, [rel UTF8String]); + path_source = rel; + } + + GGML_LOG_DEBUG("%s: loading '%s'\n", __func__, [path_source UTF8String]); + + path_per_kind[kind] = [path_source retain]; + } + + const bool ok = ggml_metal_library_compile_all(res, device, prep, + ^NSString * (int kind, NSError ** err) { + return ggml_metal_library_flatten_source(path_per_kind[kind], err); + }, "source"); + + for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) { + [path_per_kind[kind] release]; + } + free(path_per_kind); + + if (!ok) { + ggml_metal_library_free(res); + return NULL; + } + + return res; +#endif } ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) { @@ -318,10 +589,11 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev return NULL; } - res->obj = library; - res->dev = dev; - res->pipelines = ggml_metal_pipelines_init(); - res->lock = [NSLock new]; + res->objs[0] = library; + res->single_library = true; + res->dev = dev; + res->pipelines = ggml_metal_pipelines_init(); + res->lock = [NSLock new]; return res; } @@ -331,8 +603,14 @@ void ggml_metal_library_free(ggml_metal_library_t lib) { return; } - if (lib->obj) { - [lib->obj release]; + for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) { + if (lib->objs[kind]) { + [lib->objs[kind] release]; + } + } + + if (lib->fn_to_lib) { + [lib->fn_to_lib release]; } ggml_metal_pipelines_free(lib->pipelines); @@ -393,11 +671,28 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_ GGML_LOG_DEBUG("%s: compiling pipeline: base = '%s', name = '%s'\n", __func__, base, name); + // route to the library that actually defines this kernel; fn_to_lib is + // built from -[MTLLibrary functionNames] so it's always in sync + int lib_idx = 0; + if (!lib->single_library) { + NSNumber * idx = lib->fn_to_lib[base_func]; + if (!idx) { + [lib->lock unlock]; + + GGML_LOG_ERROR("%s: kernel not found in any metal library: base = '%s', name = '%s'\n", __func__, base, name); + + return res; + } + lib_idx = [idx intValue]; + } + + id mtl_lib = lib->objs[lib_idx]; + id mtl_function; if (!cv) { - mtl_function = [lib->obj newFunctionWithName:base_func]; + mtl_function = [mtl_lib newFunctionWithName:base_func]; } else { - mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error]; + mtl_function = [mtl_lib newFunctionWithName:base_func constantValues:cv->obj error:&error]; } if (!mtl_function) { [lib->lock unlock]; diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal deleted file mode 100644 index 25e78e1008..0000000000 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ /dev/null @@ -1,10754 +0,0 @@ -#define GGML_COMMON_DECL_METAL -#define GGML_COMMON_IMPL_METAL -#if defined(GGML_METAL_EMBED_LIBRARY) -__embed_ggml-common.h__ -#else -#include "ggml-common.h" -#endif -#include "ggml-metal-impl.h" - -#include - -#ifdef GGML_METAL_HAS_TENSOR -#include - -#include -#endif - -using namespace metal; - -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#define MIN(x, y) ((x) < (y) ? (x) : (y)) -#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; } - -#define PAD2(x, n) (((x) + (n) - 1) & ~((n) - 1)) - -#define FOR_UNROLL(x) _Pragma("clang loop unroll(full)") for (x) - -#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 - -// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf -// -// cmd: -// .../usr/bin/metal -dM -E -c ggml/src/ggml-metal/ggml-metal.metal -// .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal/ggml-metal.metal -// -#if __METAL_VERSION__ < 310 && defined(GGML_METAL_HAS_BF16) -#undef GGML_METAL_HAS_BF16 -#endif - -#if defined(GGML_METAL_HAS_BF16) -typedef matrix bfloat4x4; -typedef matrix bfloat2x4; -#endif - -constexpr constant static float kvalues_iq4nl_f[16] = { - -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f -}; - -constexpr constant static float kvalues_mxfp4_f[16] = { - 0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f -}; - -static inline int best_index_int8(int n, constant float * val, float x) { - if (x <= val[0]) return 0; - if (x >= val[n-1]) return n-1; - int ml = 0, mu = n-1; - while (mu-ml > 1) { - int mav = (ml+mu)/2; - if (x < val[mav]) mu = mav; else ml = mav; - } - return x - val[mu-1] < val[mu] - x ? mu-1 : mu; -} - -static inline float e8m0_to_fp32(uint8_t x) { - uint32_t bits; - - if (x == 0) { - bits = 0x00400000; - } else { - bits = (uint32_t) x << 23; - } - - return as_type(bits); -} - -static inline float dot(float x, float y) { - return x*y; -} - -static inline float sum(float x) { - return x; -} - -static inline float sum(float4 x) { - return x[0] + x[1] + x[2] + x[3]; -} - -// NOTE: this is not dequantizing - we are simply fitting the template -template -void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { - reg = (type4x4)(*src); -} - -template -void dequantize_f32_t4(device const float4 * src, short il, thread type4 & reg) { - reg = (type4)(*src); -} - -template -void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { - reg = (type4x4)(*src); -} - -template -void dequantize_f16_t4(device const half4 * src, short il, thread type4 & reg) { - reg = (type4)(*(src)); -} - -#if defined(GGML_METAL_HAS_BF16) -template -void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) { - reg = (type4x4)(*src); -} - -template -void dequantize_bf16_t4(device const bfloat4 * src, short il, thread type4 & reg) { - reg = (type4)(*(src)); -} -#endif - -template -void dequantize_q1_0(device const block_q1_0 * xb, short il, thread type4x4 & reg) { - device const uint8_t * qs = xb->qs; - const float d = xb->d; - const float neg_d = -d; - - const int byte_offset = il * 2; // il*16 bits = il*2 bytes - const uint8_t b0 = qs[byte_offset]; - const uint8_t b1 = qs[byte_offset + 1]; - - float4x4 reg_f; - - reg_f[0][0] = select(neg_d, d, bool(b0 & 0x01)); - reg_f[0][1] = select(neg_d, d, bool(b0 & 0x02)); - reg_f[0][2] = select(neg_d, d, bool(b0 & 0x04)); - reg_f[0][3] = select(neg_d, d, bool(b0 & 0x08)); - reg_f[1][0] = select(neg_d, d, bool(b0 & 0x10)); - reg_f[1][1] = select(neg_d, d, bool(b0 & 0x20)); - reg_f[1][2] = select(neg_d, d, bool(b0 & 0x40)); - reg_f[1][3] = select(neg_d, d, bool(b0 & 0x80)); - - reg_f[2][0] = select(neg_d, d, bool(b1 & 0x01)); - reg_f[2][1] = select(neg_d, d, bool(b1 & 0x02)); - reg_f[2][2] = select(neg_d, d, bool(b1 & 0x04)); - reg_f[2][3] = select(neg_d, d, bool(b1 & 0x08)); - reg_f[3][0] = select(neg_d, d, bool(b1 & 0x10)); - reg_f[3][1] = select(neg_d, d, bool(b1 & 0x20)); - reg_f[3][2] = select(neg_d, d, bool(b1 & 0x40)); - reg_f[3][3] = select(neg_d, d, bool(b1 & 0x80)); - - reg = (type4x4) reg_f; -} - -template -void dequantize_q1_0_t4(device const block_q1_0 * xb, short il, thread type4 & reg) { - const float d = xb->d; - const float neg_d = -d; - const int base = il * 4; - const uint8_t byte = xb->qs[base / 8]; - const int s = base % 8; - - float4 reg_f; - reg_f[0] = select(neg_d, d, bool((byte >> (s )) & 1)); - reg_f[1] = select(neg_d, d, bool((byte >> (s + 1)) & 1)); - reg_f[2] = select(neg_d, d, bool((byte >> (s + 2)) & 1)); - reg_f[3] = select(neg_d, d, bool((byte >> (s + 3)) & 1)); - - reg = (type4) reg_f; -} - -template -void dequantize_q4_0(device const block_q4_0 * xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 1); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float md = -8.h * xb->d; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; - - float4x4 reg_f; - - for (int i = 0; i < 8; i++) { - reg_f[i/2][2*(i%2) + 0] = d1 * (qs[i] & mask0) + md; - reg_f[i/2][2*(i%2) + 1] = d2 * (qs[i] & mask1) + md; - } - - reg = (type4x4) reg_f; -} - -template -void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 1); - const float d1 = (il/4) ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float md = -8.h * xb->d; - const ushort mask0 = (il/4) ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; - - for (int i = 0; i < 2; i++) { - reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + md; - reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + md; - } -} - -void quantize_q1_0(device const float * src, device block_q1_0 & dst) { - float sum_abs = 0.0f; - for (int j = 0; j < QK1_0; j++) { - sum_abs += fabs(src[j]); - } - dst.d = sum_abs / QK1_0; - - for (int j = 0; j < QK1_0 / 8; j++) { - dst.qs[j] = 0; - } - for (int j = 0; j < QK1_0; j++) { - if (src[j] >= 0.0f) { - dst.qs[j / 8] |= (1 << (j % 8)); - } - } -} - -void quantize_q4_0(device const float * src, device block_q4_0 & dst) { -#pragma METAL fp math_mode(safe) - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK4_0; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / -8; - const float id = d ? 1.0f/d : 0.0f; - - dst.d = d; - - for (int j = 0; j < QK4_0/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK4_0/2 + j]*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - - dst.qs[j] = xi0; - dst.qs[j] |= xi1 << 4; - } -} - -void quantize_q4_1(device const float * src, device block_q4_1 & dst) { -#pragma METAL fp math_mode(safe) - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < QK4_1; j++) { - const float v = src[j]; - if (min > v) min = v; - if (max < v) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dst.d = d; - dst.m = min; - - for (int j = 0; j < QK4_1/2; ++j) { - const float x0 = (src[0 + j] - min)*id; - const float x1 = (src[QK4_1/2 + j] - min)*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - - dst.qs[j] = xi0; - dst.qs[j] |= xi1 << 4; - } -} - -void quantize_q5_0(device const float * src, device block_q5_0 & dst) { -#pragma METAL fp math_mode(safe) - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK5_0; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / -16; - const float id = d ? 1.0f/d : 0.0f; - - dst.d = d; - - uint32_t qh = 0; - for (int j = 0; j < QK5_0/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK5_0/2 + j]*id; - - const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); - const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - - dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); - } - - thread const uint8_t * qh8 = (thread const uint8_t *)&qh; - - for (int j = 0; j < 4; ++j) { - dst.qh[j] = qh8[j]; - } -} - -void quantize_q5_1(device const float * src, device block_q5_1 & dst) { -#pragma METAL fp math_mode(safe) - float max = src[0]; - float min = src[0]; - - for (int j = 1; j < QK5_1; j++) { - const float v = src[j]; - min = v < min ? v : min; - max = v > max ? v : max; - } - - const float d = (max - min) / 31; - const float id = d ? 1.0f/d : 0.0f; - - dst.d = d; - dst.m = min; - - uint32_t qh = 0; - for (int j = 0; j < QK5_1/2; ++j) { - const float x0 = (src[0 + j] - min)*id; - const float x1 = (src[QK5_1/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - - dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); - } - - thread const uint8_t * qh8 = (thread const uint8_t *)&qh; - - for (int j = 0; j < 4; ++j) { - dst.qh[j] = qh8[j]; - } -} - -void quantize_q8_0(device const float * src, device block_q8_0 & dst) { -#pragma METAL fp math_mode(safe) - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = src[j]; - amax = MAX(amax, fabs(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dst.d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = src[j]*id; - - dst.qs[j] = round(x0); - } -} - -void quantize_iq4_nl(device const float * src, device block_iq4_nl & dst) { -#pragma METAL fp math_mode(safe) - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK4_NL; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / kvalues_iq4nl_f[0]; - const float id = d ? 1.0f/d : 0.0f; - - float sumqx = 0, sumq2 = 0; - for (int j = 0; j < QK4_NL/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK4_NL/2 + j]*id; - - const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0); - const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1); - - dst.qs[j] = xi0 | (xi1 << 4); - - const float v0 = kvalues_iq4nl_f[xi0]; - const float v1 = kvalues_iq4nl_f[xi1]; - const float w0 = src[0 + j]*src[0 + j]; - const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j]; - sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j]; - sumq2 += w0*v0*v0 + w1*v1*v1; - - } - - dst.d = sumq2 > 0 ? sumqx/sumq2 : d; -} - -template -void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 2); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float m = xb->m; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; - - float4x4 reg_f; - - for (int i = 0; i < 8; i++) { - reg_f[i/2][2*(i%2) + 0] = ((qs[i] & mask0) * d1) + m; - reg_f[i/2][2*(i%2) + 1] = ((qs[i] & mask1) * d2) + m; - } - - reg = (type4x4) reg_f; -} - -template -void dequantize_q4_1_t4(device const block_q4_1 * xb, short il, thread type4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 2); - const float d1 = (il/4) ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float m = xb->m; - const ushort mask0 = (il/4) ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; - - for (int i = 0; i < 2; i++) { - reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + m; - reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + m; - } -} - -template -void dequantize_q5_0(device const block_q5_0 * xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 3); - const float d = xb->d; - const float md = -16.h * xb->d; - const ushort mask = il ? 0x00F0 : 0x000F; - - const uint32_t qh = *((device const uint32_t *)xb->qh); - - const int x_mv = il ? 4 : 0; - - const int gh_mv = il ? 12 : 0; - const int gh_bk = il ? 0 : 4; - - float4x4 reg_f; - - for (int i = 0; i < 8; i++) { - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - - reg_f[i/2][2*(i%2) + 0] = d * x0 + md; - reg_f[i/2][2*(i%2) + 1] = d * x1 + md; - } - - reg = (type4x4) reg_f; -} - -template -void dequantize_q5_0_t4(device const block_q5_0 * xb, short il, thread type4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 3); - const float d = xb->d; - const float md = -16.h * xb->d; - const ushort mask = (il/4) ? 0x00F0 : 0x000F; - - const uint32_t qh = *((device const uint32_t *)xb->qh); - - const int x_mv = (il/4) ? 4 : 0; - - const int gh_mv = (il/4) ? 12 : 0; - const int gh_bk = (il/4) ? 0 : 4; - - for (int ii = 0; ii < 2; ii++) { - int i = 2*(il%4) + ii; - - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - - reg[2*ii + 0] = d * x0 + md; - reg[2*ii + 1] = d * x1 + md; - } -} - -template -void dequantize_q5_1(device const block_q5_1 * xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 4); - const float d = xb->d; - const float m = xb->m; - const ushort mask = il ? 0x00F0 : 0x000F; - - const uint32_t qh = *((device const uint32_t *)xb->qh); - - const int x_mv = il ? 4 : 0; - - const int gh_mv = il ? 12 : 0; - const int gh_bk = il ? 0 : 4; - - float4x4 reg_f; - - for (int i = 0; i < 8; i++) { - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - - reg_f[i/2][2*(i%2) + 0] = d * x0 + m; - reg_f[i/2][2*(i%2) + 1] = d * x1 + m; - } - - reg = (type4x4) reg_f; -} - -template -void dequantize_q5_1_t4(device const block_q5_1 * xb, short il, thread type4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 4); - const float d = xb->d; - const float m = xb->m; - const ushort mask = (il/4) ? 0x00F0 : 0x000F; - - const uint32_t qh = *((device const uint32_t *)xb->qh); - - const int x_mv = (il/4) ? 4 : 0; - - const int gh_mv = (il/4) ? 12 : 0; - const int gh_bk = (il/4) ? 0 : 4; - - for (int ii = 0; ii < 2; ii++) { - int i = 2*(il%4) + ii; - - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - - reg[2*ii + 0] = d * x0 + m; - reg[2*ii + 1] = d * x1 + m; - } -} - -template -void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { - device const int8_t * qs = ((device const int8_t *)xb->qs); - const float d = xb->d; - - float4x4 reg_f; - - for (int i = 0; i < 16; i++) { - reg_f[i/4][i%4] = (qs[i + 16*il] * d); - } - - reg = (type4x4) reg_f; -} - -template -void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & reg) { - device const int8_t * qs = ((device const int8_t *)xb->qs); - const float d = xb->d; - - for (int i = 0; i < 4; i++) { - reg[i] = (qs[4*(il%4) + i + 16*(il/4)] * d); - } -} - -template -void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) { - device const uint8_t * q2 = (device const uint8_t *)xb->qs; - - const float d = e8m0_to_fp32(xb->e); - const uint8_t shr = il >= 1 ? 4 : 0; - - for (int i = 0; i < 4; ++i) { - reg[i][0] = d * kvalues_mxfp4_f[(q2[4*i + 0] >> shr) & 0x0F]; - reg[i][1] = d * kvalues_mxfp4_f[(q2[4*i + 1] >> shr) & 0x0F]; - reg[i][2] = d * kvalues_mxfp4_f[(q2[4*i + 2] >> shr) & 0x0F]; - reg[i][3] = d * kvalues_mxfp4_f[(q2[4*i + 3] >> shr) & 0x0F]; - } -} - -template -void dequantize_mxfp4_t4(device const block_mxfp4 * xb, short il, thread type4 & reg) { - device const uint8_t * q2 = (device const uint8_t *)xb->qs; - - const float d = e8m0_to_fp32(xb->e); - const short il4 = il%4; - - const uint8_t shr = il >= 4 ? 4 : 0; - - reg[0] = d * kvalues_mxfp4_f[(q2[4*il4 + 0] >> shr) & 0x0F]; - reg[1] = d * kvalues_mxfp4_f[(q2[4*il4 + 1] >> shr) & 0x0F]; - reg[2] = d * kvalues_mxfp4_f[(q2[4*il4 + 2] >> shr) & 0x0F]; - reg[3] = d * kvalues_mxfp4_f[(q2[4*il4 + 3] >> shr) & 0x0F]; -} - -template -void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { - const float d = xb->d; - const float min = xb->dmin; - device const uint8_t * q = (device const uint8_t *)xb->qs; - float dl, ml; - uint8_t sc = xb->scales[il]; - - q = q + 32*(il/8) + 16*(il&1); - il = (il/2)%4; - - half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - ml; - } -} - -template -void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { - const half d_all = xb->d; - device const uint8_t * q = (device const uint8_t *)xb->qs; - device const uint8_t * h = (device const uint8_t *)xb->hmask; - device const int8_t * scales = (device const int8_t *)xb->scales; - - q = q + 32 * (il/8) + 16 * (il&1); - h = h + 16 * (il&1); - uint8_t m = 1 << (il/2); - uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ - ((il/4)>0 ? 12 : 3); - uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; - uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; - int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) - : (scale_2&kmask2) | ((scale_1&kmask1) << 4); - float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); - const float ml = 4.f * dl; - - il = (il/2) & 3; - const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - dl *= coef; - - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); - } -} - -static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { - return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} - : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; -} - -template -void dequantize_q4_K(device const block_q4_K * xb, short il, thread type4x4 & reg) { - device const uchar * q = xb->qs; - - short is = (il/4) * 2; - q = q + (il/4) * 32 + 16 * (il&1); - il = il & 3; - const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); - const float d = il < 2 ? xb->d : xb->d / 16.h; - const float min = xb->dmin; - const float dl = d * sc[0]; - const float ml = min * sc[1]; - - const ushort mask = il < 2 ? 0x0F : 0xF0; - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - ml; - } -} - -template -void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { - device const uint8_t * q = xb->qs; - device const uint8_t * qh = xb->qh; - - short is = (il/4) * 2; - q = q + 32 * (il/4) + 16 * (il&1); - qh = qh + 16 * (il&1); - uint8_t ul = 1 << (il/2); - il = il & 3; - const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); - const float d = il < 2 ? xb->d : xb->d / 16.f; - const float min = xb->dmin; - const float dl = d * sc[0]; - const float ml = min * sc[1]; - - const ushort mask = il<2 ? 0x0F : 0xF0; - const float qh_val = il<2 ? 16.f : 256.f; - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; - } -} - -template -void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { - const half d_all = xb->d; - device const uint16_t * ql = (device const uint16_t *)xb->ql; - device const uint16_t * qh = (device const uint16_t *)xb->qh; - device const int8_t * scales = (device const int8_t *)xb->scales; - - ql = ql + 32*(il/8) + 16*((il/2)&1) + 8*(il&1); - qh = qh + 16*(il/8) + 8*(il&1); - float sc = scales[(il%2) + 2 * ((il/2))]; - il = (il/2) & 3; - - const uint32_t kmask1 = il>1 ? (il>2 ? 0xC0C0C0C0 : 0x30303030) : (il>0 ? 0x0C0C0C0C : 0x03030303); - const uint32_t kmask2 = il>1 ? 0xF0F0F0F0 : 0x0F0F0F0F; - const float ml = d_all * sc * 32.f; - const float dl0 = d_all * sc; - const float dl1 = dl0 / 256.f; - const float dl2 = dl0 / (256.f * 256.f); - const float dl3 = dl0 / (256.f * 256.f * 256.f); - const uint8_t shr_h = il>2 ? 2 : 0; - const uint8_t shl_h = il>1 ? 0 : (il>0 ? 2 : 4); - const uint8_t shr_l = il>1 ? 4 : 0; - for (int i = 0; i < 4; ++i) { - const uint32_t low = (ql[2*i] | (uint32_t)(ql[2*i+1] << 16)) & kmask2; - const uint32_t high = (qh[2*i] | (uint32_t)(qh[2*i+1] << 16)) & kmask1; - const uint32_t q = ((high << shl_h) >> shr_h) | (low >> shr_l); - reg[i][0] = dl0 * ((half)(q & 0xFF)) - ml; - reg[i][1] = dl1 * ((float)(q & 0xFF00)) - ml; - reg[i][2] = dl2 * ((float)(q & 0xFF0000)) - ml; - reg[i][3] = dl3 * ((float)(q & 0xFF000000)) - ml; - } -} - -template -void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's. - device const uint16_t * q2 = xb->qs + 4*ib32; - const uint32_t aux32_g = q2[0] | (q2[1] << 16); - const uint32_t aux32_s = q2[2] | (q2[3] << 16); - thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; - const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; - constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); - uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; - for (int i = 0; i < 8; ++i) { - reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } - grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); - signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; - for (int i = 0; i < 8; ++i) { - reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } -} - -template -void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint16_t * q2 = xb->qs + 4*ib32; - const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; - constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); - uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; - for (int i = 0; i < 8; ++i) { - reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } - grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); - signs = ksigns_iq2xs[q2[2*il+1] >> 9]; - for (int i = 0; i < 8; ++i) { - reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } -} - -template -void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * q3 = xb->qs + 8*ib32; - device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32; - const uint32_t aux32 = gas[0] | (gas[1] << 16); - const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f; - constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]); - constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]); - uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127]; - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); - reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); - } - grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]); - grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]); - signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127]; - for (int i = 0; i < 4; ++i) { - reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); - reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); - } -} - -template -void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * qs = xb->qs + 8*ib32; - device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; - const uint8_t qh = xb->qh[ib32] >> 4*il; - const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)); - constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256))); - constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); - reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); - } - grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256))); - grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256))); - for (int i = 0; i < 4; ++i) { - reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); - reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); - } -} - -template -void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint8_t * signs = qs + QK_K/8; - const uint8_t qh = xb->qh[ib32] >> 4*il; - const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; - constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300))); - constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300))); - for (int i = 0; i < 8; ++i) { - reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]); - reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]); - } -} - -template -void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - const float d = xb->d; - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint16_t * qh = xb->qh; - const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1); - const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA); - const uint16_t h = qh[ib32] >> 6*il; - constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700))); - constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * (grid1[i] & 0xf) + ml; - reg[1][i] = dl * (grid1[i] >> 4) + ml; - reg[2][i] = dl * (grid2[i] & 0xf) + ml; - reg[3][i] = dl * (grid2[i] >> 4) + ml; - } -} - -template -void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - device const uint16_t * sc = (device const uint16_t *)xb->scales; - - iq1m_scale_t scale; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - const float d = scale.f16; - - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint8_t * qh = xb->qh + 2*ib32 + il; - - const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); - const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); - const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); - constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); - constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * (grid1[i] & 0xf) + ml1; - reg[1][i] = dl * (grid1[i] >> 4) + ml1; - reg[2][i] = dl * (grid2[i] & 0xf) + ml2; - reg[3][i] = dl * (grid2[i] >> 4) + ml2; - } -} - -template -void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { - device const uint16_t * q4 = (device const uint16_t *)xb->qs; - const float d = xb->d; - uint32_t aux32; - thread const uint8_t * q8 = (thread const uint8_t *)&aux32; - for (int i = 0; i < 4; ++i) { - aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; - reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; - reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; - reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; - reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; - } -} - -template -void dequantize_iq4_nl_t4(device const block_iq4_nl * xb, short il, thread type4 & reg) { - device const uint16_t * q4 = (device const uint16_t *)xb->qs; - const float d = xb->d; - uint32_t aux32; - thread const uint8_t * q8 = (thread const uint8_t *)&aux32; - aux32 = ((q4[2*(il%4)] | (q4[2*(il%4)+1] << 16)) >> 4*(il/4)) & 0x0f0f0f0f; - reg[0] = d * kvalues_iq4nl_f[q8[0]]; - reg[1] = d * kvalues_iq4nl_f[q8[1]]; - reg[2] = d * kvalues_iq4nl_f[q8[2]]; - reg[3] = d * kvalues_iq4nl_f[q8[3]]; -} - -template -void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32; - const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4); - const float d = (float)xb->d * (ls - 32); - uint32_t aux32; - thread const uint8_t * q8 = (thread const uint8_t *)&aux32; - for (int i = 0; i < 4; ++i) { - aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f; - reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; - reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; - reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; - reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; - } -} - -enum ggml_sort_order { - GGML_SORT_ORDER_ASC, - GGML_SORT_ORDER_DESC, -}; - -constant float GELU_COEF_A = 0.044715f; -constant float GELU_QUICK_COEF = -1.702f; -constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; -constant float SQRT_2_INV = 0.70710678118654752440084436210484f; - -// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation -// ref: https://www.johndcook.com/blog/python_erf/ -constant float p_erf = 0.3275911f; -constant float a1_erf = 0.254829592f; -constant float a2_erf = -0.284496736f; -constant float a3_erf = 1.421413741f; -constant float a4_erf = -1.453152027f; -constant float a5_erf = 1.061405429f; - -template -inline T erf_approx(T x) { - T sign_x = sign(x); - x = fabs(x); - T t = 1.0f / (1.0f + p_erf * x); - T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); - return sign_x * y; -} - -template T elu_approx(T x); - -template<> inline float elu_approx(float x) { - return (x > 0.f) ? x : (exp(x) - 1); -} - -template<> inline float4 elu_approx(float4 x) { - float4 res; - - res[0] = (x[0] > 0.0f) ? x[0] : (exp(x[0]) - 1.0f); - res[1] = (x[1] > 0.0f) ? x[1] : (exp(x[1]) - 1.0f); - res[2] = (x[2] > 0.0f) ? x[2] : (exp(x[2]) - 1.0f); - res[3] = (x[3] > 0.0f) ? x[3] : (exp(x[3]) - 1.0f); - - return res; -} - -constant short FC_unary_op [[function_constant(FC_UNARY + 0)]]; -constant bool FC_unary_cnt[[function_constant(FC_UNARY + 1)]]; - -template -kernel void kernel_unary_impl( - constant ggml_metal_kargs_unary & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { -#define FC_OP FC_unary_op -#define FC_CNT FC_unary_cnt - - device const T0 * src0_ptr; - device T * dst_ptr; - - int i0; - - if (FC_CNT) { - i0 = tgpig.x; - - src0_ptr = (device const T0 *) (src0); - dst_ptr = (device T *) (dst); - } else { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int k0 = tgpig.x/args.ne01; - const int i01 = tgpig.x - k0*args.ne01; - - i0 = k0*ntg.x + tpitg.x; - - src0_ptr = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 ); - } - - { - //threadgroup_barrier(mem_flags::mem_none); - - if (!FC_CNT) { - if (i0 >= args.ne0) { - return; - } - } - - const TC x = (TC) src0_ptr[i0]; - - if (FC_OP == OP_UNARY_NUM_SCALE) { - dst_ptr[i0] = (T) (args.scale * x + args.bias); - } - - if (FC_OP == OP_UNARY_NUM_FILL) { - dst_ptr[i0] = (T) args.val; - } - - if (FC_OP == OP_UNARY_NUM_CLAMP) { - dst_ptr[i0] = (T) clamp(x, args.min, args.max); - } - - if (FC_OP == OP_UNARY_NUM_SQR) { - dst_ptr[i0] = (T) (x * x); - } - - if (FC_OP == OP_UNARY_NUM_SQRT) { - dst_ptr[i0] = (T) sqrt(x); - } - - if (FC_OP == OP_UNARY_NUM_SIN) { - dst_ptr[i0] = (T) sin(x); - } - - if (FC_OP == OP_UNARY_NUM_COS) { - dst_ptr[i0] = (T) cos(x); - } - - if (FC_OP == OP_UNARY_NUM_LOG) { - dst_ptr[i0] = (T) log(x); - } - - if (FC_OP == OP_UNARY_NUM_LEAKY_RELU) { - dst_ptr[i0] = (T) (TC(x > 0)*x + TC(x <= 0)*(x * args.slope)); - } - - if (FC_OP == OP_UNARY_NUM_TANH) { - dst_ptr[i0] = (T) precise::tanh(x); - } - - if (FC_OP == OP_UNARY_NUM_RELU) { - dst_ptr[i0] = (T) fmax(0, x); - } - - if (FC_OP == OP_UNARY_NUM_SIGMOID) { - dst_ptr[i0] = (T) (1 / (1 + exp(-x))); - } - - if (FC_OP == OP_UNARY_NUM_GELU) { - dst_ptr[i0] = (T) (0.5*x*(1 + precise::tanh(SQRT_2_OVER_PI*x*(1 + GELU_COEF_A*x*x)))); - } - - if (FC_OP == OP_UNARY_NUM_GELU_ERF) { - dst_ptr[i0] = (T) (0.5*x*(1 + erf_approx(SQRT_2_INV*x))); - } - - if (FC_OP == OP_UNARY_NUM_GELU_QUICK) { - dst_ptr[i0] = (T) (x * (1/(1 + exp(GELU_QUICK_COEF*x)))); - } - - if (FC_OP == OP_UNARY_NUM_SILU) { - dst_ptr[i0] = (T) (x / (1 + exp(-x))); - } - - if (FC_OP == OP_UNARY_NUM_ELU) { - dst_ptr[i0] = (T) elu_approx(x); - } - - if (FC_OP == OP_UNARY_NUM_NEG) { - dst_ptr[i0] = (T) -x; - } - - if (FC_OP == OP_UNARY_NUM_ABS) { - dst_ptr[i0] = (T) fabs(x); - } - - if (FC_OP == OP_UNARY_NUM_SGN) { - dst_ptr[i0] = T(x > 0) - T(x < 0); - } - - if (FC_OP == OP_UNARY_NUM_STEP) { - dst_ptr[i0] = T(x > 0); - } - - if (FC_OP == OP_UNARY_NUM_HARDSWISH) { - dst_ptr[i0] = (T) (x * fmax(0, fmin(1, x/6 + 0.5))); - } - - if (FC_OP == OP_UNARY_NUM_HARDSIGMOID) { - dst_ptr[i0] = (T) fmax(0, fmin(1, x/6 + 0.5)); - } - - if (FC_OP == OP_UNARY_NUM_EXP) { - dst_ptr[i0] = (T) exp(x); - } - - if (FC_OP == OP_UNARY_NUM_SOFTPLUS) { - dst_ptr[i0] = (T) select(log(1 + exp(x)), x, x > 20); - } - - if (FC_OP == OP_UNARY_NUM_EXPM1) { - // TODO: precise implementation - dst_ptr[i0] = (T) (exp(x) - 1); - } - - if (FC_OP == OP_UNARY_NUM_FLOOR) { - dst_ptr[i0] = (T) floor(x); - } - - if (FC_OP == OP_UNARY_NUM_CEIL) { - dst_ptr[i0] = (T) ceil(x); - } - - if (FC_OP == OP_UNARY_NUM_ROUND) { - dst_ptr[i0] = (T) round(x); - } - - if (FC_OP == OP_UNARY_NUM_TRUNC) { - dst_ptr[i0] = (T) trunc(x); - } - - if (FC_OP == OP_UNARY_NUM_XIELU) { - const TC xi = x; - const TC gate = TC(xi > TC(0.0f)); - const TC clamped = fmin(xi, TC(args.val)); - const TC y_pos = TC(args.scale) * xi * xi + TC(args.bias) * xi; - const TC y_neg = (exp(clamped) - TC(1.0f) - xi) * TC(args.slope) + TC(args.bias) * xi; - dst_ptr[i0] = (T) (gate * y_pos + (TC(1.0f) - gate) * y_neg); - } - } - -#undef FC_OP -#undef FC_CNT -} - -typedef decltype(kernel_unary_impl) kernel_unary_t; - -template [[host_name("kernel_unary_f32_f32")]] kernel kernel_unary_t kernel_unary_impl; -template [[host_name("kernel_unary_f32_f32_4")]] kernel kernel_unary_t kernel_unary_impl; -template [[host_name("kernel_unary_f16_f16")]] kernel kernel_unary_t kernel_unary_impl; -template [[host_name("kernel_unary_f16_f16_4")]] kernel kernel_unary_t kernel_unary_impl; - -// OP: 0 - add, 1 - sub, 2 - mul, 3 - div -constant short FC_bin_op [[function_constant(FC_BIN + 0)]]; -constant short FC_bin_f [[function_constant(FC_BIN + 1)]]; -constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]]; -constant bool FC_bin_cb [[function_constant(FC_BIN + 3)]]; - -template -kernel void kernel_bin_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { -#define FC_OP FC_bin_op -#define FC_F FC_bin_f -#define FC_RB FC_bin_rb -#define FC_CB FC_bin_cb - - if (FC_RB) { - // row broadcast - const uint i0 = tgpig.y*args.ne00 + tgpig.x; - const uint i1 = FC_CB ? tgpig.x%args.ne10 : tgpig.x; - - device const T0 * src0_row = (device const T0 *) (src0); - device T * dst_row = (device T *) (dst); - - if (FC_F == 1) { - device const T1 * src1_row = (device const T1 *) (src1 + args.o1[0]); - - if (FC_OP == 0) { - dst_row[i0] = src0_row[i0] + src1_row[i1]; - } - - if (FC_OP == 1) { - dst_row[i0] = src0_row[i0] - src1_row[i1]; - } - - if (FC_OP == 2) { - dst_row[i0] = src0_row[i0] * src1_row[i1]; - } - - if (FC_OP == 3) { - dst_row[i0] = src0_row[i0] / src1_row[i1]; - } - } else { - T0 res = src0_row[i0]; - - if (FC_OP == 0) { - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - res += ((device const T1 *) (src1 + args.o1[j]))[i1]; - } - } - - if (FC_OP == 1) { - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - res -= ((device const T1 *) (src1 + args.o1[j]))[i1]; - } - } - - if (FC_OP == 2) { - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - res *= ((device const T1 *) (src1 + args.o1[j]))[i1]; - } - } - - if (FC_OP == 3) { - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - res /= ((device const T1 *) (src1 + args.o1[j]))[i1]; - } - } - - dst_row[i0] = res; - } - } else { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; - - if (i01 >= args.ne01) { - return; - } - - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; - - device const T0 * src0_ptr = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs); - device T * dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs); - - if (FC_F == 1) { - device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = FC_CB ? i0%args.ne10 : i0; - - if (FC_OP == 0) { - dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10]; - } - - if (FC_OP == 1) { - dst_ptr[i0] = src0_ptr[i0] - src1_ptr[i10]; - } - - if (FC_OP == 2) { - dst_ptr[i0] = src0_ptr[i0] * src1_ptr[i10]; - } - - if (FC_OP == 3) { - dst_ptr[i0] = src0_ptr[i0] / src1_ptr[i10]; - } - } - } else { - device const T1 * src1_ptr[8]; - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - src1_ptr[j] = (device const T1 *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); - } - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = FC_CB ? i0%args.ne10 : i0; - - T res = src0_ptr[i0]; - - if (FC_OP == 0) { - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - res += src1_ptr[j][i10]; - } - } - - if (FC_OP == 1) { - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - res -= src1_ptr[j][i10]; - } - } - - if (FC_OP == 2) { - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - res *= src1_ptr[j][i10]; - } - } - - if (FC_OP == 3) { - FOR_UNROLL (short j = 0; j < FC_F; ++j) { - res /= src1_ptr[j][i10]; - } - } - - dst_ptr[i0] = res; - } - } - } - -#undef FC_OP -#undef FC_F -#undef FC_RB -#undef FC_CB -} - -typedef decltype(kernel_bin_fuse_impl) kernel_bin_fuse_t; - -template [[host_name("kernel_bin_fuse_f32_f32_f32")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl; -template [[host_name("kernel_bin_fuse_f32_f32_f32_4")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl; - -kernel void kernel_add_id( - constant ggml_metal_kargs_add_id & args, - device const char * src0, - device const char * src1, - device const char * src2, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i1 = tgpig.x; - const int i2 = tgpig.y; - - const int i11 = *((device const int32_t *) (src2 + i1*sizeof(int32_t) + i2*args.nb21)); - - const size_t nb1 = args.ne0 * sizeof(float); - const size_t nb2 = args.ne1 * nb1; - - device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2); - device const float * src0_row = (device const float *)((device char *)src0 + i1*args.nb01 + i2*args.nb02); - device const float * src1_row = (device const float *)((device char *)src1 + i11*args.nb11); - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - dst_row[i0] = src0_row[i0] + src1_row[i0]; - } -} - -template -kernel void kernel_repeat( - constant ggml_metal_kargs_repeat & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i3 = tgpig.z; - const int i2 = tgpig.y; - const int i1 = tgpig.x; - - const int i03 = i3%args.ne03; - const int i02 = i2%args.ne02; - const int i01 = i1%args.ne01; - - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01; - device char * dst_ptr = dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1; - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i00 = i0%args.ne00; - *((device T *)(dst_ptr + i0*args.nb0)) = *((device T *)(src0_ptr + i00*args.nb00)); - } -} - -typedef decltype(kernel_repeat) kernel_repeat_t; - -template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat; -template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_repeat_bf16")]] kernel kernel_repeat_t kernel_repeat; -#endif -template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; -template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; - -template -kernel void kernel_reglu( - constant ggml_metal_kargs_glu & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint ntg[[threads_per_threadgroup]]) { - device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); - - for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { - const float x0 = src0_row[i0]; - const float x1 = src1_row[i0]; - - dst_row[i0] = (T)(x0*x1*(x0 > 0.0f)); - } -} - -typedef decltype(kernel_reglu) kernel_reglu_t; - -template [[host_name("kernel_reglu_f32")]] kernel kernel_reglu_t kernel_reglu; -template [[host_name("kernel_reglu_f16")]] kernel kernel_reglu_t kernel_reglu; - -template -kernel void kernel_geglu( - constant ggml_metal_kargs_glu & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint ntg[[threads_per_threadgroup]]) { - device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); - - for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { - const float x0 = src0_row[i0]; - const float x1 = src1_row[i0]; - - const float gelu = 0.5f*x0*(1.0f + precise::tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); - - dst_row[i0] = (T)(gelu*x1); - } -} - -typedef decltype(kernel_geglu) kernel_geglu_t; - -template [[host_name("kernel_geglu_f32")]] kernel kernel_geglu_t kernel_geglu; -template [[host_name("kernel_geglu_f16")]] kernel kernel_geglu_t kernel_geglu; - -template -kernel void kernel_swiglu( - constant ggml_metal_kargs_glu & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint ntg[[threads_per_threadgroup]]) { - device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); - - for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { - const float x0 = src0_row[i0]; - const float x1 = src1_row[i0]; - - const float silu = x0 / (1.0f + exp(-x0)); - - dst_row[i0] = (T)(silu*x1); - } -} - -typedef decltype(kernel_swiglu) kernel_swiglu_t; - -template [[host_name("kernel_swiglu_f32")]] kernel kernel_swiglu_t kernel_swiglu; -template [[host_name("kernel_swiglu_f16")]] kernel kernel_swiglu_t kernel_swiglu; - -template -kernel void kernel_swiglu_oai( - constant ggml_metal_kargs_glu & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint ntg[[threads_per_threadgroup]]) { - device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); - - for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { - float x0 = src0_row[i0]; - float x1 = src1_row[i0]; - - x0 = min(x0, args.limit); - x1 = max(min(x1, args.limit), -args.limit); - - float out_glu = x0 / (1.0f + exp(-x0 * args.alpha)); - out_glu = out_glu * (1.0f + x1); - - dst_row[i0] = (T)out_glu; - } -} - -typedef decltype(kernel_swiglu_oai) kernel_swiglu_oai_t; - -template [[host_name("kernel_swiglu_oai_f32")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai; -template [[host_name("kernel_swiglu_oai_f16")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai; - -template -kernel void kernel_geglu_erf( - constant ggml_metal_kargs_glu & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint ntg[[threads_per_threadgroup]]) { - device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); - - for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { - const float x0 = src0_row[i0]; - const float x1 = src1_row[i0]; - - const float gelu_erf = 0.5f*x0*(1.0f+erf_approx(x0*SQRT_2_INV)); - - dst_row[i0] = (T)(gelu_erf*x1); - } -} - -typedef decltype(kernel_geglu_erf) kernel_geglu_erf_t; - -template [[host_name("kernel_geglu_erf_f32")]] kernel kernel_geglu_erf_t kernel_geglu_erf; -template [[host_name("kernel_geglu_erf_f16")]] kernel kernel_geglu_erf_t kernel_geglu_erf; - -template -kernel void kernel_geglu_quick( - constant ggml_metal_kargs_glu & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint ntg[[threads_per_threadgroup]]) { - device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); - - for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { - const float x0 = src0_row[i0]; - const float x1 = src1_row[i0]; - - const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0))); - - dst_row[i0] = (T)(gelu_quick*x1); - } -} - -typedef decltype(kernel_geglu_quick) kernel_geglu_quick_t; - -template [[host_name("kernel_geglu_quick_f32")]] kernel kernel_geglu_quick_t kernel_geglu_quick; -template [[host_name("kernel_geglu_quick_f16")]] kernel kernel_geglu_quick_t kernel_geglu_quick; - -kernel void kernel_op_sum_f32( - constant ggml_metal_kargs_sum & args, - device const float * src0, - device float * dst, - threadgroup float * shmem_f32 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - - if (args.np == 0) { - return; - } - - // TODO: become function constant - const uint nsg = (ntg.x + 31) / 32; - - float sumf = 0; - - for (uint64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) { - sumf += src0[i0]; - } - - sumf = simd_sum(sumf); - - if (tiisg == 0) { - shmem_f32[sgitg] = sumf; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - float total = 0; - - if (sgitg == 0) { - float v = 0; - - if (tpitg.x < nsg) { - v = shmem_f32[tpitg.x]; - } - - total = simd_sum(v); - - if (tpitg.x == 0) { - dst[0] = total; - } - } -} - -constant short FC_sum_rows_op [[function_constant(FC_SUM_ROWS + 0)]]; - -template -kernel void kernel_sum_rows_impl( - constant ggml_metal_kargs_sum_rows & args, - device const char * src0, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { -#define FC_OP FC_sum_rows_op - - const int i3 = tgpig.z; - const int i2 = tgpig.y; - const int i1 = tgpig.x; - - threadgroup T0 * shmem_t = (threadgroup T0 *) shmem; - - if (sgitg == 0) { - shmem_t[tiisg] = 0.0f; - } - - device const T0 * src_row = (device const T0 *) (src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03); - device T * dst_row = (device T *) (dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3); - - T0 sumf = T0(0.0f); - - for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { - sumf += src_row[i0]; - } - - sumf = simd_sum(sumf); - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - shmem_t[sgitg] = sumf; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - sumf = shmem_t[tiisg]; - sumf = simd_sum(sumf); - - if (tpitg.x == 0) { - if (FC_OP == OP_SUM_ROWS_NUM_MEAN) { - if (is_same::value) { - dst_row[0] = sum(sumf) / (4*args.ne00); - } else { - dst_row[0] = sum(sumf) / args.ne00; - } - } else { - dst_row[0] = sum(sumf); - } - } - -#undef FC_OP -} - -typedef decltype(kernel_sum_rows_impl) kernel_sum_rows_t; - -template [[host_name("kernel_sum_rows_f32_f32")]] kernel kernel_sum_rows_t kernel_sum_rows_impl; -template [[host_name("kernel_sum_rows_f32_f32_4")]] kernel kernel_sum_rows_t kernel_sum_rows_impl; - -template -kernel void kernel_cumsum_blk( - constant ggml_metal_kargs_cumsum_blk & args, - device const char * src0, - device char * tmp, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int ib = tgpig[0]/args.ne01; - - const int i00 = ib*ntg.x; - const int i01 = tgpig[0]%args.ne01; - const int i02 = tgpig[1]; - const int i03 = tgpig[2]; - - device const float * src0_row = (device const float *) (src0 + - args.nb01*i01 + - args.nb02*i02 + - args.nb03*i03); - - threadgroup float * shmem_f32 = (threadgroup float *) shmem; - - float v = 0.0f; - - if (i00 + tpitg.x < args.ne00) { - v = src0_row[i00 + tpitg.x]; - } - - float s = simd_prefix_inclusive_sum(v); - - if (tiisg == N_SIMDWIDTH - 1) { - shmem_f32[sgitg] = s; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (sgitg == 0) { - shmem_f32[tiisg] = simd_prefix_exclusive_sum(shmem_f32[tiisg]); - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - s += shmem_f32[sgitg]; - - device float * dst_row = (device float *) dst + - args.ne00*i01 + - args.ne00*args.ne01*i02 + - args.ne00*args.ne01*args.ne02*i03; - - if (i00 + tpitg.x < args.ne00) { - dst_row[i00 + tpitg.x] = s; - } - - if (args.outb && tpitg.x == ntg.x - 1) { - device float * tmp_row = (device float *) tmp + - args.net0*i01 + - args.net0*args.net1*i02 + - args.net0*args.net1*args.net2*i03; - - tmp_row[ib] = s; - } -} - -typedef decltype(kernel_cumsum_blk) kernel_cumsum_blk_t; - -template [[host_name("kernel_cumsum_blk_f32")]] kernel kernel_cumsum_blk_t kernel_cumsum_blk; - -template -kernel void kernel_cumsum_add( - constant ggml_metal_kargs_cumsum_add & args, - device const char * tmp, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int ib = tgpig[0]/args.ne01; - - if (ib == 0) { - return; - } - - const int i00 = ib*ntg.x; - const int i01 = tgpig[0]%args.ne01; - const int i02 = tgpig[1]; - const int i03 = tgpig[2]; - - device const float * tmp_row = (device const float *) (tmp + - args.nbt1*i01 + - args.nbt2*i02 + - args.nbt3*i03); - - device float * dst_row = (device float *) dst + - args.ne00*i01 + - args.ne00*args.ne01*i02 + - args.ne00*args.ne01*args.ne02*i03; - - if (i00 + tpitg.x < args.ne00) { - dst_row[i00 + tpitg.x] += tmp_row[ib - 1]; - } -} - -typedef decltype(kernel_cumsum_add) kernel_cumsum_add_t; - -template [[host_name("kernel_cumsum_add_f32")]] kernel kernel_cumsum_add_t kernel_cumsum_add; - - -template -bool _ggml_vec_tri_cmp(const int i, const int r); - -template<> -bool _ggml_vec_tri_cmp(const int i, const int r) { - return i < r; -} - -template<> -bool _ggml_vec_tri_cmp(const int i, const int r) { - return i <= r; -} - -template<> -bool _ggml_vec_tri_cmp(const int i, const int r) { - return i > r; -} - -template<> -bool _ggml_vec_tri_cmp(const int i, const int r) { - return i >= r; -} - -template -kernel void kernel_tri( - constant ggml_metal_kargs_tri & args, - device const char * src0, - device const char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i3 = tgpig.z; - const int i2 = tgpig.y; - const int i1 = tgpig.x; - - if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { - return; - } - - device const T * src_row = (device const T *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03); - device T * dst_row = (device T *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3); - - // Each thread is a single element of the row if ne00 < max threads per - // threadgroup, so this will loop once for each index that this thread is - // responsible for - for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { - // Use the comparison as a mask for branchless - dst_row[i0] = static_cast(_ggml_vec_tri_cmp(i0, i1)) * src_row[i0]; - } -} - -typedef decltype(kernel_tri) kernel_tri_t; - -template [[host_name("kernel_tri_f32_0")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_f32_1")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_f32_2")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_f32_3")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_f16_0")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_f16_1")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_f16_2")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_f16_3")]] kernel kernel_tri_t kernel_tri; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_tri_bf16_0")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_bf16_1")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_bf16_2")]] kernel kernel_tri_t kernel_tri; -template [[host_name("kernel_tri_bf16_3")]] kernel kernel_tri_t kernel_tri; -#endif - -template -kernel void kernel_soft_max( - constant ggml_metal_kargs_soft_max & args, - device const char * src0, - device const char * src1, - device const char * src2, - device char * dst, - threadgroup float * buf [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint3 tptg[[threads_per_threadgroup]]) { - const int32_t i03 = tgpig.z; - const int32_t i02 = tgpig.y; - const int32_t i01 = tgpig.x; - - const int32_t i13 = i03%args.ne13; - const int32_t i12 = i02%args.ne12; - const int32_t i11 = i01; - - device const float * psrc0 = (device const float *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); - device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr; - device const float * psrc2 = src2 != src0 ? (device const float *) (src2) : nullptr; - device float * pdst = (device float *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3); - - float slope = 1.0f; - - // ALiBi - if (args.max_bias > 0.0f) { - const int32_t h = i02; - - const float base = h < args.n_head_log2 ? args.m0 : args.m1; - const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; - - slope = pow(base, exp); - } - - // parallel max - float lmax = psrc2 ? psrc2[i02] : -INFINITY; - - for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { - lmax = MAX(lmax, psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)); - } - - // find the max value in the block - float max_val = simd_max(lmax); - if (tptg.x > N_SIMDWIDTH) { - if (sgitg == 0) { - buf[tiisg] = -INFINITY; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - buf[sgitg] = max_val; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - max_val = buf[tiisg]; - max_val = simd_max(max_val); - } - - // parallel sum - float lsum = 0.0f; - for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { - const float exp_psrc0 = exp((psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val); - lsum += exp_psrc0; - pdst[i00] = exp_psrc0; - } - - // This barrier fixes a failing test - // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335 - threadgroup_barrier(mem_flags::mem_none); - - float sum = simd_sum(lsum); - - if (tptg.x > N_SIMDWIDTH) { - if (sgitg == 0) { - buf[tiisg] = 0.0f; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - buf[sgitg] = sum; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - sum = buf[tiisg]; - sum = simd_sum(sum); - } - - if (psrc2) { - sum += exp(psrc2[i02] - max_val); - } - - const float inv_sum = 1.0f/sum; - - for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { - pdst[i00] *= inv_sum; - } -} - -template -kernel void kernel_soft_max_4( - constant ggml_metal_kargs_soft_max & args, - device const char * src0, - device const char * src1, - device const char * src2, - device char * dst, - threadgroup float * buf [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint3 tptg[[threads_per_threadgroup]]) { - const int32_t i03 = tgpig.z; - const int32_t i02 = tgpig.y; - const int32_t i01 = tgpig.x; - - const int32_t i13 = i03%args.ne13; - const int32_t i12 = i02%args.ne12; - const int32_t i11 = i01; - - device const float4 * psrc4 = (device const float4 *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); - device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr; - device const float * psrc2 = src2 != src0 ? (device const float * ) (src2) : nullptr; - device float4 * pdst4 = (device float4 *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3); - - float slope = 1.0f; - - if (args.max_bias > 0.0f) { - const int32_t h = i02; - - const float base = h < args.n_head_log2 ? args.m0 : args.m1; - const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; - - slope = pow(base, exp); - } - - // parallel max - float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY; - - for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { - lmax4 = fmax(lmax4, psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))); - } - - const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); - - float max_val = simd_max(lmax); - if (tptg.x > N_SIMDWIDTH) { - if (sgitg == 0) { - buf[tiisg] = -INFINITY; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - buf[sgitg] = max_val; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - max_val = buf[tiisg]; - max_val = simd_max(max_val); - } - - // parallel sum - float4 lsum4 = 0.0f; - for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { - const float4 exp_psrc4 = exp((psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val); - lsum4 += exp_psrc4; - pdst4[i00] = exp_psrc4; - } - - const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; - - // This barrier fixes a failing test - // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335 - threadgroup_barrier(mem_flags::mem_none); - - float sum = simd_sum(lsum); - - if (tptg.x > N_SIMDWIDTH) { - if (sgitg == 0) { - buf[tiisg] = 0.0f; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - buf[sgitg] = sum; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - sum = buf[tiisg]; - sum = simd_sum(sum); - } - - if (psrc2) { - sum += exp(psrc2[i02] - max_val); - } - - const float inv_sum = 1.0f/sum; - - for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { - pdst4[i00] *= inv_sum; - } -} - -typedef decltype(kernel_soft_max) kernel_soft_max_t; -typedef decltype(kernel_soft_max_4) kernel_soft_max_4_t; - -template [[host_name("kernel_soft_max_f16")]] kernel kernel_soft_max_t kernel_soft_max; -template [[host_name("kernel_soft_max_f32")]] kernel kernel_soft_max_t kernel_soft_max; -template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4; -template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4; - -// ref: ggml.c:ggml_compute_forward_ssm_conv_f32 -kernel void kernel_ssm_conv_f32_f32( - constant ggml_metal_kargs_ssm_conv & args, - device const void * src0, - device const void * src1, - device float * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t ir = tgpig.x; - const int64_t i2 = tgpig.y; - const int64_t i3 = tgpig.z; - - const int64_t nc = args.ne10; - //const int64_t ncs = args.ne00; - //const int64_t nr = args.ne01; - //const int64_t n_t = args.ne1; - //const int64_t n_s = args.ne2; - - device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02); - device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11); - device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2); - - float sumf = 0.0f; - - for (int64_t i0 = 0; i0 < nc; ++i0) { - sumf += s[i0] * c[i0]; - } - - x[0] = sumf; -} - -kernel void kernel_ssm_conv_f32_f32_4( - constant ggml_metal_kargs_ssm_conv & args, - device const void * src0, - device const void * src1, - device float * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t ir = tgpig.x; - const int64_t i2 = tgpig.y; - const int64_t i3 = tgpig.z; - - const int64_t nc = args.ne10; - //const int64_t ncs = args.ne00; - //const int64_t nr = args.ne01; - //const int64_t n_t = args.ne1; - //const int64_t n_s = args.ne2; - - device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02); - device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11); - device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2); - - float sumf = 0.0f; - - for (int64_t i0 = 0; i0 < nc/4; ++i0) { - sumf += dot(s[i0], c[i0]); - } - - x[0] = sumf; -} - -constant short FC_ssm_conv_bs [[function_constant(FC_SSM_CONV + 0)]]; - -// Batched version: each threadgroup processes multiple tokens for better efficiency -// Thread layout: each thread handles one token, threadgroup covers BATCH_SIZE tokens -kernel void kernel_ssm_conv_f32_f32_batched( - constant ggml_metal_kargs_ssm_conv & args, - device const void * src0, - device const void * src1, - device float * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - // tgpig.x = row index (ir) - // tgpig.y = batch of tokens (i2_base / BATCH_SIZE) - // tgpig.z = sequence index (i3) - // tpitg.x = thread within batch (0..BATCH_SIZE-1) - const short BATCH_SIZE = FC_ssm_conv_bs; - - const int64_t ir = tgpig.x; - const int64_t i2_base = tgpig.y * BATCH_SIZE; - const int64_t i3 = tgpig.z; - const int64_t i2_off = tpitg.x; - const int64_t i2 = i2_base + i2_off; - - const int64_t nc = args.ne10; // conv kernel size (typically 4) - const int64_t n_t = args.ne1; // number of tokens - - // Bounds check for partial batches at the end - if (i2 >= n_t) { - return; - } - - // Load conv weights (shared across all tokens for this row) - device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11); - - // Load source for this specific token - device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02); - - // Output location for this token - device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2); - - float sumf = 0.0f; - for (int64_t i0 = 0; i0 < nc; ++i0) { - sumf += s[i0] * c[i0]; - } - - x[0] = sumf; -} - -kernel void kernel_ssm_conv_f32_f32_batched_4( - constant ggml_metal_kargs_ssm_conv & args, - device const void * src0, - device const void * src1, - device float * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - // tgpig.x = row index (ir) - // tgpig.y = batch of tokens (i2_base / BATCH_SIZE) - // tgpig.z = sequence index (i3) - // tpitg.x = thread within batch (0..BATCH_SIZE-1) - const short BATCH_SIZE = FC_ssm_conv_bs; - - const int64_t ir = tgpig.x; - const int64_t i2_base = tgpig.y * BATCH_SIZE; - const int64_t i3 = tgpig.z; - const int64_t i2_off = tpitg.x; - const int64_t i2 = i2_base + i2_off; - - const int64_t nc = args.ne10; // conv kernel size (typically 4) - const int64_t n_t = args.ne1; // number of tokens - - // Bounds check for partial batches at the end - if (i2 >= n_t) { - return; - } - - // Load conv weights (shared across all tokens for this row) - device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11); - - // Load source for this specific token - device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02); - - // Output location for this token - device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2); - - float sumf = 0.0f; - for (int64_t i0 = 0; i0 < nc/4; ++i0) { - sumf += dot(s[i0], c[i0]); - } - - x[0] = sumf; -} - -// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part -// Optimized version: reduces redundant memory loads by having one thread load shared values -kernel void kernel_ssm_scan_f32( - constant ggml_metal_kargs_ssm_scan & args, - device const void * src0, - device const void * src1, - device const void * src2, - device const void * src3, - device const void * src4, - device const void * src5, - device const void * src6, - device float * dst, - threadgroup float * shared [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgptg[[simdgroups_per_threadgroup]], - uint3 tgpg[[threadgroups_per_grid]]) { - constexpr short NW = N_SIMDWIDTH; - - // Shared memory layout: - // [0..sgptg*NW-1]: partial sums for reduction (existing) - // [sgptg*NW..sgptg*NW+sgptg-1]: pre-computed x_dt values for each token in batch - // [sgptg*NW+sgptg..sgptg*NW+2*sgptg-1]: pre-computed dA values for each token in batch - threadgroup float * shared_sums = shared; - threadgroup float * shared_x_dt = shared + sgptg * NW; - threadgroup float * shared_dA = shared + sgptg * NW + sgptg; - - shared_sums[tpitg.x] = 0.0f; - - const int32_t i0 = tpitg.x; - const int32_t i1 = tgpig.x; - const int32_t ir = tgpig.y; // current head - const int32_t i3 = tgpig.z; // current seq - - const int32_t nc = args.d_state; - const int32_t nr = args.d_inner; - const int32_t nh = args.n_head; - const int32_t ng = args.n_group; - const int32_t n_t = args.n_seq_tokens; - - const int32_t s_off = args.s_off; - - device const int32_t * ids = (device const int32_t *) src6; - - device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03); - device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off); - - const int32_t i = i0 + i1*nc; - const int32_t g = ir / (nh / ng); // repeat_interleave - - float s0 = s0_buff[i]; - float s = 0.0f; - - device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {ne30, nh} - - const float A0 = A[i0%args.ne30]; - - device const float * x = (device const float *)((device const char *) src1 + i1*args.nb10 + ir*args.nb11 + i3*args.nb13); // {dim, nh, nt, ns} - device const float * dt = (device const float *)((device const char *) src2 + ir*args.nb20 + i3*args.nb22); // {nh, nt, ns} - device const float * B = (device const float *)((device const char *) src4 + g*args.nb41 + i3*args.nb43); // {d_state, ng, nt, ns} - device const float * C = (device const float *)((device const char *) src5 + g*args.nb51 + i3*args.nb53); // {d_state, ng, nt, ns} - - device float * y = dst + (i1 + ir*(nr) + i3*(n_t*nh*nr)); // {dim, nh, nt, ns} - - for (int i2 = 0; i2 < n_t; i2 += sgptg) { - threadgroup_barrier(mem_flags::mem_threadgroup); - - // Pre-compute x_dt and dA for this batch of tokens - // Only first sgptg threads do the loads and expensive math - if (i0 < sgptg && i2 + i0 < n_t) { - // ns12 and ns21 are element strides (nb12/nb10, nb21/nb20) - device const float * x_t = x + i0 * args.ns12; - device const float * dt_t = dt + i0 * args.ns21; - - const float dt0 = dt_t[0]; - const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0; - shared_x_dt[i0] = x_t[0] * dtsp; - shared_dA[i0] = dtsp; // Store dtsp, compute exp(dtsp * A0) per-thread since A0 varies - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - for (int t = 0; t < sgptg && i2 + t < n_t; t++) { - const float x_dt = shared_x_dt[t]; - const float dA = exp(shared_dA[t] * A0); - - s = (s0 * dA) + (B[i0] * x_dt); - - const float sumf = simd_sum(s * C[i0]); - - if (tiisg == 0) { - shared_sums[t*NW + sgitg] = sumf; - } - - // recurse - s0 = s; - - B += args.ns42; - C += args.ns52; - } - - // Advance pointers for next batch - x += sgptg * args.ns12; - dt += sgptg * args.ns21; - - threadgroup_barrier(mem_flags::mem_threadgroup); - - const float sumf = simd_sum(shared_sums[sgitg*NW + tiisg]); - - if (tiisg == 0 && i2 + sgitg < n_t) { - y[sgitg*nh*nr] = sumf; - } - - y += sgptg*nh*nr; - } - - s_buff[i] = s; -} - -kernel void kernel_rwkv_wkv6_f32( - device const float * k, - device const float * v, - device const float * r, - device const float * tf, - device const float * td, - device const float * state_in, - device float * dst, - constant uint & B, - constant uint & T, - constant uint & C, - constant uint & H, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const uint head_size = 64; // TODO: support head_size = 128 - const uint batch_id = tgpig.x / H; - const uint head_id = tgpig.x % H; - const uint tid = tpitg.x; - - if (batch_id >= B || head_id >= H) { - return; - } - - const uint state_size = C * head_size; - const uint n_seq_tokens = T / B; - - threadgroup float _k[head_size]; - threadgroup float _r[head_size]; - threadgroup float _tf[head_size]; - threadgroup float _td[head_size]; - - float state[head_size]; - - for (uint i = 0; i < head_size; i++) { - state[i] = state_in[batch_id * state_size + head_id * head_size * head_size - + i * head_size + tid]; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - _tf[tid] = tf[head_id * head_size + tid]; - threadgroup_barrier(mem_flags::mem_threadgroup); - - const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid; - const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid; - - for (uint t = start_t; t < end_t; t += C) { - threadgroup_barrier(mem_flags::mem_threadgroup); - _k[tid] = k[t]; - _r[tid] = r[t]; - _td[tid] = td[t]; - threadgroup_barrier(mem_flags::mem_threadgroup); - - const float v_val = v[t]; - float y = 0.0; - - for (uint j = 0; j < head_size; j += 4) { - float4 k_vec = float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); - float4 r_vec = float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); - float4 tf_vec = float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]); - float4 td_vec = float4(_td[j], _td[j+1], _td[j+2], _td[j+3]); - float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]); - - float4 kv = k_vec * v_val; - - float4 temp = tf_vec * kv + s_vec; - y += dot(r_vec, temp); - - s_vec = s_vec * td_vec + kv; - state[j] = s_vec[0]; - state[j+1] = s_vec[1]; - state[j+2] = s_vec[2]; - state[j+3] = s_vec[3]; - } - - dst[t] = y; - } - - for (uint i = 0; i < head_size; i++) { - dst[T * C + batch_id * state_size + head_id * head_size * head_size - + i * head_size + tid] = state[i]; - } -} - -kernel void kernel_rwkv_wkv7_f32( - device const float * r, - device const float * w, - device const float * k, - device const float * v, - device const float * a, - device const float * b, - device const float * state_in, - device float * dst, - constant uint & B, - constant uint & T, - constant uint & C, - constant uint & H, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const uint head_size = 64; // TODO: support head_size = 128 - const uint batch_id = tgpig.x / H; - const uint head_id = tgpig.x % H; - const uint tid = tpitg.x; - - if (batch_id >= B || head_id >= H) { - return; - } - - const uint state_size = C * head_size; - const uint n_seq_tokens = T / B; - - threadgroup float _r[head_size]; - threadgroup float _w[head_size]; - threadgroup float _k[head_size]; - threadgroup float _a[head_size]; - threadgroup float _b[head_size]; - - float state[head_size]; - - for (uint i = 0; i < head_size; i++) { - state[i] = state_in[batch_id * state_size + head_id * head_size * head_size - + tid * head_size + i]; - } - - const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid; - const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid; - - for (uint t = start_t; t < end_t; t += C) { - threadgroup_barrier(mem_flags::mem_threadgroup); - _r[tid] = r[t]; - _w[tid] = w[t]; - _k[tid] = k[t]; - _a[tid] = a[t]; - _b[tid] = b[t]; - threadgroup_barrier(mem_flags::mem_threadgroup); - - const float v_val = v[t]; - float y = 0.0, sa = 0.0; - - float4 sa_vec(0.0); - - for (uint j = 0; j < head_size; j += 4) { - float4 a_vec = float4(_a[j], _a[j+1], _a[j+2], _a[j+3]); - float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]); - sa_vec += a_vec * s_vec; - } - sa = sa_vec[0] + sa_vec[1] + sa_vec[2] + sa_vec[3]; - - for (uint j = 0; j < head_size; j += 4) { - float4 r_vec = float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); - float4 w_vec = float4(_w[j], _w[j+1], _w[j+2], _w[j+3]); - float4 k_vec = float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); - float4 b_vec = float4(_b[j], _b[j+1], _b[j+2], _b[j+3]); - float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]); - - float4 kv = k_vec * v_val; - - s_vec = s_vec * w_vec + kv + sa * b_vec; - y += dot(s_vec, r_vec); - - state[j] = s_vec[0]; - state[j+1] = s_vec[1]; - state[j+2] = s_vec[2]; - state[j+3] = s_vec[3]; - } - - dst[t] = y; - } - - for (uint i = 0; i < head_size; i++) { - dst[T * C + batch_id * state_size + head_id * head_size * head_size - + tid * head_size + i] = state[i]; - } -} - -constant short FC_gated_delta_net_ne20 [[function_constant(FC_GATED_DELTA_NET + 0)]]; -constant short FC_gated_delta_net_ne30 [[function_constant(FC_GATED_DELTA_NET + 1)]]; -constant short FC_gated_delta_net_K [[function_constant(FC_GATED_DELTA_NET + 2)]]; - -#if 1 -template -kernel void kernel_gated_delta_net_impl( - constant ggml_metal_kargs_gated_delta_net & args, - device const char * q, - device const char * k, - device const char * v, - device const char * g, - device const char * b, - device const char * s, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { -#define S_v FC_gated_delta_net_ne20 -#define G FC_gated_delta_net_ne30 -#define K FC_gated_delta_net_K - - const uint tx = tpitg.x; - const uint ty = tpitg.y; - - const uint i23 = tgpig.z; // B (n_seqs) - const uint i21 = tgpig.y; // H (head) - const uint i20 = tgpig.x*NSG + ty; // row within S_v - - const uint i01 = i21 % args.ne01; - const uint i11 = i21 % args.ne11; - - const float scale = 1.0f / sqrt((float)S_v); - - // input state layout [S_v, S_v, H, n_seqs] (s0 only): per-seq stride is H*D. - // state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous - const uint state_in_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v; - device const float * s_ptr = (device const float *) (s) + state_in_base; - - float ls[NSG]; - - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - ls[j] = s_ptr[is]; - } - - device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20; - - device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01); - device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11); - device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21); - - device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21); - device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G; - - // snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back. - // When n_tokens < K, only slots 0..n_tokens-1 are written; older slots are caller-owned. - - // output state base offset: after attention scores - const uint attn_size = args.ne22 * args.ne21 * S_v * args.ne23; - // output state per-slot size: S_v * S_v * H * n_seqs - const uint state_size_per_snap = S_v * S_v * args.ne21 * args.ne23; - // per-(seq,head) offset within a slot - const uint state_out_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v; - - for (short t = 0; t < args.ne22; t++) { - float s_k = 0.0f; - - if (G == 1) { - const float g_exp = exp(g_ptr[0]); - - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - ls[j] *= g_exp; - - s_k += ls[j]*k_ptr[is]; - } - } else { - // KDA - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - ls[j] *= exp(g_ptr[is]); - - s_k += ls[j]*k_ptr[is]; - } - } - - s_k = simd_sum(s_k); - - const float d = (v_ptr[i20] - s_k)*b_ptr[0]; - - float y = 0.0f; - - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - ls[j] += k_ptr[is]*d; - - y += ls[j]*q_ptr[is]; - } - - y = simd_sum(y); - - if (tx == 0) { - dst_attn[t*args.ne21*S_v] = y*scale; - } - - q_ptr += args.ns02; - k_ptr += args.ns12; - v_ptr += args.ns22; - - b_ptr += args.ne21; - g_ptr += args.ne21*G; - - if (K > 1) { - const int target_slot = (int)args.ne22 - 1 - (int)t; - if (target_slot >= 0 && target_slot < (int)K) { - device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base; - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - dst_state[is] = ls[j]; - } - } - } - } - - if (K == 1) { - device float * dst_state = (device float *) (dst) + attn_size + state_out_base; - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - dst_state[is] = ls[j]; - } - } - -#undef S_v -#undef G -#undef K -} - -typedef decltype(kernel_gated_delta_net_impl<4>) kernel_gated_delta_net_t; - -template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<1>; -template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<2>; -template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<4>; - -#else -// a simplified version of the above -// no performance improvement, so keep the above version for now - -template -kernel void kernel_gated_delta_net_impl( - constant ggml_metal_kargs_gated_delta_net & args, - device const char * q, - device const char * k, - device const char * v, - device const char * g, - device const char * b, - device const char * s, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { -#define S_v FC_gated_delta_net_ne20 -#define G FC_gated_delta_net_ne30 - - const uint tx = tpitg.x; - const uint ty = tpitg.y; - - const uint i23 = tgpig.z; // B - const uint i21 = tgpig.y; // H - const uint i20 = tgpig.x*NSG + ty; - - const uint i01 = i21 % args.ne01; - const uint i11 = i21 % args.ne11; - - const float scale = 1.0f / sqrt((float)S_v); - - device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20; - - float lsf[NSG]; - - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - lsf[j] = s_ptr[is*S_v]; - } - - thread T * ls = (thread T *) (lsf); - - device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20; - - device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01); - device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11); - device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21); - - device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21); - device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G; - - for (short t = 0; t < args.ne22; t++) { - device const T * qt_ptr = (device const T *) (q_ptr); - device const T * kt_ptr = (device const T *) (k_ptr); - device const T * gt_ptr = (device const T *) (g_ptr); - - if (G == 1) { - *ls *= exp(g_ptr[0]); - } else { - // KDA - *ls *= exp(gt_ptr[tx]); - } - - const float s_k = simd_sum(dot(*ls, kt_ptr[tx])); - - const float d = (v_ptr[i20] - s_k)*b_ptr[0]; - - *ls += kt_ptr[tx]*d; - - const float y = simd_sum(dot(*ls, qt_ptr[tx])); - - if (tx == 0) { - *dst_attn = y*scale; - } - - q_ptr += args.ns02; - k_ptr += args.ns12; - v_ptr += args.ns22; - - b_ptr += args.ne21; - g_ptr += args.ne21*G; - - dst_attn += args.ne21*S_v; - } - - device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20; - device T * dstt_state = (device T *) (dst_state); - - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - dst_state[is*S_v] = lsf[j]; - } - -#undef S_v -#undef G -} - -typedef decltype(kernel_gated_delta_net_impl) kernel_gated_delta_net_t; - -template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl; -template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl; -template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl; -#endif - -constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]]; -constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]]; -constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]]; - -kernel void kernel_solve_tri_f32( - constant ggml_metal_kargs_solve_tri & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - ushort3 tgpig[[threadgroup_position_in_grid]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - constexpr short NW = N_SIMDWIDTH; - - const short NSG = FC_solve_tri_nsg; - const short N = FC_solve_tri_n; - const short K = FC_solve_tri_k; - const short NP = PAD2(N, NW); - - const int32_t i03 = tgpig.z; - const int32_t i02 = tgpig.y; - const int32_t i01 = tgpig.x*NSG + sgitg; - - threadgroup float * sh0 = (threadgroup float *) shmem; - - device const float * src0_ptr = (device const float *)(src0 + i02 * args.nb02 + i03 * args.nb03) + sgitg*N; - device const float * src1_ptr = (device const float *)(src1 + i02 * args.nb12 + i03 * args.nb13) + i01; - device float * dst_ptr = (device float *)(dst + i02 * args.nb2 + i03 * args.nb3) + i01; - - for (short rr = 0; rr < N; rr += NSG) { - threadgroup_barrier(mem_flags::mem_threadgroup); - - { - threadgroup float * sh0_cur = sh0 + sgitg*NP; - - for (short t = 0; t*NW < N; ++t) { - const short idx = t*NW + tiisg; - sh0_cur[idx] = src0_ptr[idx]; - } - - src0_ptr += NSG*N; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (i01 >= args.ne10) { - continue; - } - - for (short ir = 0; ir < NSG && rr + ir < N; ++ir) { - const short r = rr + ir; - - threadgroup float * sh0_cur = sh0 + ir*NP; - - float sum = 0.0f; - - for (short t = 0; t*NW < r; ++t) { - const short idx = t*NW + tiisg; - sum += sh0_cur[idx] * dst_ptr[idx*K] * (idx < r); - } - - sum = simd_sum(sum); - - if (tiisg == 0) { - const float diag = sh0_cur[r]; - - dst_ptr[r*K] = (src1_ptr[r*K] - sum) / diag; - } - } - } -} - -kernel void kernel_argmax_f32( - constant ggml_metal_kargs_argmax & args, - device const char * src0, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint ntg[[threads_per_threadgroup]]) { - device const float * x_row = (device const float *) ((device const char *) src0 + tgpig * args.nb01); - - float lmax = -INFINITY; - int32_t larg = -1; - - for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) { - if (x_row[i00] > lmax) { - lmax = x_row[i00]; - larg = i00; - } - } - - // find the argmax value in the block - float max_val = simd_max(lmax); - int32_t arg_val = simd_max(select(-1, larg, lmax == max_val)); - - device int32_t * dst_i32 = (device int32_t *) dst; - - threadgroup float * shared_maxval = (threadgroup float *) shmem; - threadgroup int32_t * shared_argmax = (threadgroup int32_t *) shmem + N_SIMDWIDTH; - - if (ntg > N_SIMDWIDTH) { - if (sgitg == 0) { - shared_maxval[tiisg] = -INFINITY; - shared_argmax[tiisg] = -1; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - shared_maxval[sgitg] = max_val; - shared_argmax[sgitg] = arg_val; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - max_val = shared_maxval[tiisg]; - arg_val = shared_argmax[tiisg]; - - float max_val_reduced = simd_max(max_val); - int32_t arg_val_reduced = simd_max(select(-1, arg_val, max_val == max_val_reduced)); - - dst_i32[tgpig] = arg_val_reduced; - - return; - } - - dst_i32[tgpig] = arg_val; -} - -// F == 1 : norm (no fuse) -// F == 2 : norm + mul -// F == 3 : norm + mul + add -template -kernel void kernel_norm_fuse_impl( - constant ggml_metal_kargs_norm & args, - device const char * src0, - device const char * src1_0, - device const char * src1_1, - device char * dst, - threadgroup float * shmem_f32 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - if (sgitg == 0) { - shmem_f32[tiisg] = 0.0f; - } - - const int i01 = tgpig.x; - const int i02 = tgpig.y; - const int i03 = tgpig.z; - - device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]); - - device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]); - device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]); - - T sumft(0.0f); - - float sumf = 0.0f; - - for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { - sumft += x[i00]; - } - sumf = dot(sumft, T(1.0f)); - sumf = simd_sum(sumf); - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - shmem_f32[sgitg] = sumf; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - sumf = shmem_f32[tiisg]; - sumf = simd_sum(sumf); - - const float mean = sumf/args.ne00; - - device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1); - - sumf = 0.0f; - for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { - y[i00] = x[i00] - mean; - sumf += dot(y[i00], y[i00]); - } - sumf = simd_sum(sumf); - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - shmem_f32[sgitg] = sumf; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - sumf = shmem_f32[tiisg]; - sumf = simd_sum(sumf); - - const float variance = sumf/args.ne00; - - const float scale = 1.0f/sqrt(variance + args.eps); - for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { - if (F == 1) { - y[i00] = (y[i00]*scale); - } - if (F == 2) { - y[i00] = (y[i00]*scale)*f0[i00]; - } - if (F == 3) { - y[i00] = (y[i00]*scale)*f0[i00] + f1[i00]; - } - } -} - -typedef decltype(kernel_norm_fuse_impl) kernel_norm_fuse_t; - -template [[host_name("kernel_norm_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; -template [[host_name("kernel_norm_mul_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; -template [[host_name("kernel_norm_mul_add_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; - -template [[host_name("kernel_norm_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; -template [[host_name("kernel_norm_mul_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; -template [[host_name("kernel_norm_mul_add_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; - -// F == 1 : rms_norm (no fuse) -// F == 2 : rms_norm + mul -// F == 3 : rms_norm + mul + add -template -kernel void kernel_rms_norm_fuse_impl( - constant ggml_metal_kargs_norm & args, - device const char * src0, - device const char * src1_0, - device const char * src1_1, - device char * dst, - threadgroup float * shmem_f32 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - if (sgitg == 0) { - shmem_f32[tiisg] = 0.0f; - } - - const int i01 = tgpig.x; - const int i02 = tgpig.y; - const int i03 = tgpig.z; - - device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]); - - device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]); - device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]); - - float sumf = 0.0f; - - // parallel sum - for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { - sumf += dot(x[i00], x[i00]); - } - sumf = simd_sum(sumf); - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - shmem_f32[sgitg] = sumf; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - sumf = shmem_f32[tiisg]; - sumf = simd_sum(sumf); - - const float mean = sumf/args.ne00; - const float scale = 1.0f/sqrt(mean + args.eps); - - device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1); - for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { - if (F == 1) { - y[i00] = (x[i00]*scale); - } - if (F == 2) { - y[i00] = (x[i00]*scale)*f0[i00]; - } - if (F == 3) { - y[i00] = (x[i00]*scale)*f0[i00] + f1[i00]; - } - } -} - -typedef decltype(kernel_rms_norm_fuse_impl) kernel_rms_norm_fuse_t; - -template [[host_name("kernel_rms_norm_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; -template [[host_name("kernel_rms_norm_mul_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; -template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; - -template [[host_name("kernel_rms_norm_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; -template [[host_name("kernel_rms_norm_mul_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; -template [[host_name("kernel_rms_norm_mul_add_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; - -template -kernel void kernel_l2_norm_impl( - constant ggml_metal_kargs_l2_norm & args, - device const char * src0, - device char * dst, - threadgroup float * shmem_f32 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; - - if (sgitg == 0) { - shmem_f32[tiisg] = 0.0f; - } - - device const T0 * x = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1); - - float sumf = 0.0f; - - // parallel sum - for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) { - sumf += dot(x[i00], x[i00]); - } - sumf = simd_sum(sumf); - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - shmem_f32[sgitg] = sumf; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - sumf = shmem_f32[tiisg]; - sumf = simd_sum(sumf); - - const float scale = 1.0f/max(sqrt(sumf), args.eps); - - for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) { - y[i00] = x[i00] * scale; - } -} - -typedef decltype(kernel_l2_norm_impl) kernel_l2_norm_t; - -template [[host_name("kernel_l2_norm_f32_f32")]] kernel kernel_l2_norm_t kernel_l2_norm_impl; -template [[host_name("kernel_l2_norm_f32_f32_4")]] kernel kernel_l2_norm_t kernel_l2_norm_impl; - -kernel void kernel_group_norm_f32( - constant ggml_metal_kargs_group_norm & args, - device const float * src0, - device float * dst, - threadgroup float * buf [[threadgroup(0)]], - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint ntg[[threads_per_threadgroup]]) { - const int64_t ne = args.ne00*args.ne01*args.ne02; - const int64_t gs = args.ne00*args.ne01*((args.ne02 + args.ngrp - 1) / args.ngrp); - - int start = tgpig * gs; - int end = start + gs; - - start += tpitg; - - if (end >= ne) { - end = ne; - } - - float tmp = 0.0f; // partial sum for thread in warp - - for (int j = start; j < end; j += ntg) { - tmp += src0[j]; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - tmp = simd_sum(tmp); - if (ntg > N_SIMDWIDTH) { - if (sgitg == 0) { - buf[tiisg] = 0.0f; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - buf[sgitg] = tmp; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - tmp = buf[tiisg]; - tmp = simd_sum(tmp); - } - - const float mean = tmp / gs; - tmp = 0.0f; - - for (int j = start; j < end; j += ntg) { - float xi = src0[j] - mean; - dst[j] = xi; - tmp += xi * xi; - } - - tmp = simd_sum(tmp); - if (ntg > N_SIMDWIDTH) { - if (sgitg == 0) { - buf[tiisg] = 0.0f; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tiisg == 0) { - buf[sgitg] = tmp; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - tmp = buf[tiisg]; - tmp = simd_sum(tmp); - } - - const float variance = tmp / gs; - const float scale = 1.0f/sqrt(variance + args.eps); - for (int j = start; j < end; j += ntg) { - dst[j] *= scale; - } -} - -// Q1_0 dot product: dot = d * (2 * Σ(yl[i] where bit=1) - sumy) -inline float block_q_n_dot_y(device const block_q1_0 * qb_curr, float sumy, thread float * yl, int il) { - device const uint8_t * qs = qb_curr->qs + il / 8; - const uint8_t b0 = qs[0]; - const uint8_t b1 = qs[1]; - - float acc = 0.0f; - - acc += select(0.0f, yl[ 0], bool(b0 & 0x01)); - acc += select(0.0f, yl[ 1], bool(b0 & 0x02)); - acc += select(0.0f, yl[ 2], bool(b0 & 0x04)); - acc += select(0.0f, yl[ 3], bool(b0 & 0x08)); - acc += select(0.0f, yl[ 4], bool(b0 & 0x10)); - acc += select(0.0f, yl[ 5], bool(b0 & 0x20)); - acc += select(0.0f, yl[ 6], bool(b0 & 0x40)); - acc += select(0.0f, yl[ 7], bool(b0 & 0x80)); - - acc += select(0.0f, yl[ 8], bool(b1 & 0x01)); - acc += select(0.0f, yl[ 9], bool(b1 & 0x02)); - acc += select(0.0f, yl[10], bool(b1 & 0x04)); - acc += select(0.0f, yl[11], bool(b1 & 0x08)); - acc += select(0.0f, yl[12], bool(b1 & 0x10)); - acc += select(0.0f, yl[13], bool(b1 & 0x20)); - acc += select(0.0f, yl[14], bool(b1 & 0x40)); - acc += select(0.0f, yl[15], bool(b1 & 0x80)); - - return qb_curr->d * (2.0f * acc - sumy); -} - -// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i]) -// il indicates where the q4 quants begin (0 or QK4_0/4) -// we assume that the yl's have been multiplied with the appropriate scale factor -// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) -inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) { - float d = qb_curr->d; - - float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; - - device const uint16_t * qs = ((device const uint16_t *) qb_curr + 1 + il/2); - - for (int i = 0; i < 8; i += 2) { - acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F); - acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00); - acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0); - acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000); - } - - return d * (sumy * -8.f + acc[0] + acc[1] + acc[2] + acc[3]); -} - -// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i]) -// il indicates where the q4 quants begin (0 or QK4_0/4) -// we assume that the yl's have been multiplied with the appropriate scale factor -// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) -inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) { - float d = qb_curr->d; - float m = qb_curr->m; - - float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; - - device const uint16_t * qs = ((device const uint16_t *) qb_curr + 2 + il/2); - - for (int i = 0; i < 8; i+=2) { - acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F); - acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00); - acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0); - acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000); - } - - return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m; -} - -// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i]) -// il indicates where the q5 quants begin (0 or QK5_0/4) -// we assume that the yl's have been multiplied with the appropriate scale factor -// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) -inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) { - float d = qb_curr->d; - - float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; - - device const uint16_t * qs = ((device const uint16_t *)qb_curr + 3 + il/2); - const uint32_t qh = *((device const uint32_t *)qb_curr->qh); - - for (int i = 0; i < 8; i+=2) { - acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)); - acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); - acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)); - acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); - } - - return d * (sumy * -16.f + acc[0] + acc[1] + acc[2] + acc[3]); -} - -// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i]) -// il indicates where the q5 quants begin (0 or QK5_1/4) -// we assume that the yl's have been multiplied with the appropriate scale factor -// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) -inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) { - float d = qb_curr->d; - float m = qb_curr->m; - - float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; - - device const uint16_t * qs = ((device const uint16_t *)qb_curr + 4 + il/2); - const uint32_t qh = *((device const uint32_t *)qb_curr->qh); - - for (int i = 0; i < 8; i+=2) { - acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)); - acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); - acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)); - acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); - } - - return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m; -} - -template -static inline void helper_mv_reduce_and_write( - device float * dst_f32, - float sumf[NR0], - const int r0, - const int ne01, - ushort tiisg, - ushort sgitg, - threadgroup char * shmem) { - constexpr short NW = N_SIMDWIDTH; - - threadgroup float * shmem_f32[NR0]; - - for (short row = 0; row < NR0; ++row) { - shmem_f32[row] = (threadgroup float *) shmem + NW*row; - - if (sgitg == 0) { - shmem_f32[row][tiisg] = 0.0f; - } - - sumf[row] = simd_sum(sumf[row]); - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - for (short row = 0; row < NR0; ++row) { - if (tiisg == 0) { - shmem_f32[row][sgitg] = sumf[row]; - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - for (short row = 0; row < NR0 && r0 + row < ne01; ++row) { - float tot = simd_sum(shmem_f32[row][tiisg]); - - if (tiisg == 0 && sgitg == 0) { - dst_f32[r0 + row] = tot; - } - } -} - -constant short FC_mul_mv_nsg [[function_constant(FC_MUL_MV + 0)]]; -constant short FC_mul_mv_nxpsg [[function_constant(FC_MUL_MV + 1)]]; -constant short FC_mul_mv_ne12 [[function_constant(FC_MUL_MV + 2)]]; -constant short FC_mul_mv_r2 [[function_constant(FC_MUL_MV + 3)]]; -constant short FC_mul_mv_r3 [[function_constant(FC_MUL_MV + 4)]]; - -template -void mul_vec_q_n_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - constexpr short NW = N_SIMDWIDTH; - constexpr short NQ = 16; - - const int nb = args.ne00/QK4_0; - - const int r0 = (tgpig.x*NSG + sgitg)*NR0; - //const int r0 = tgpig.x*NR0; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - //device const block_q_type * x = (device const block_q_type *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - // pointers to src0 rows - device const block_q_type * ax[NR0]; - FOR_UNROLL (int row = 0; row < NR0; ++row) { - const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - - ax[row] = (device const block_q_type *) ((device char *) src0 + offset0); - } - - float sumf[NR0] = {0.f}; - - const short ix = (tiisg/(NW/NQ)); - const short il = (tiisg%(NW/NQ))*8; - - //const int ib0 = sgitg*NQ + ix; - const int ib0 = ix; - - float yl[16]; // src1 vector cache - - //device const float * yb = y + ix*QK4_0 + il; - device const float * yb = y + ib0*QK4_0 + il; - - // each thread in a SIMD group deals with half a block. - //for (int ib = ib0; ib < nb; ib += NSG*NQ) { - for (int ib = ib0; ib < nb; ib += NQ) { - float sumy[2] = { 0.f, 0.f }; - - FOR_UNROLL (short i = 0; i < 8; i += 2) { - sumy[0] += yb[i + 0] + yb[i + 1]; - yl[i + 0] = yb[i + 0]; - yl[i + 1] = yb[i + 1]/256.f; - - sumy[1] += yb[i + 16] + yb[i + 17]; - yl[i + 8] = yb[i + 16]/16.f; - yl[i + 9] = yb[i + 17]/4096.f; - } - - FOR_UNROLL (short row = 0; row < NR0; row++) { - sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy[0] + sumy[1], yl, il); - } - - yb += QK4_0 * 16; - //yb += NSG*NQ*QK4_0; - } - - device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0; - - //helper_mv_reduce_and_write(dst_f32, sumf, r0, args.ne01, tiisg, sgitg, shmem); - - for (int row = 0; row < NR0; ++row) { - const float tot = simd_sum(sumf[row]); - - if (tiisg == 0 && r0 + row < args.ne01) { - dst_f32[r0 + row] = tot; - } - } -} - -template -void kernel_mul_mv_q1_0_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK1_0; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset1 = r1*args.nb11 + (i12)*args.nb12 + (i13)*args.nb13; - - device const float * y = (device const float *) (src1 + offset1); - - device const block_q1_0 * ax[nr0]; - for (int row = 0; row < nr0; ++row) { - const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - ax[row] = (device const block_q1_0 *) ((device char *) src0 + offset0); - } - - float yl[16]; - float sumf[nr0] = {0.f}; - - const short ix = (tiisg/8); - const short il = (tiisg%8)*16; - - device const float * yb = y + ix*QK1_0 + il; - - for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/8) { - float sumy = 0.f; - - FOR_UNROLL (short i = 0; i < 16; i++) { - yl[i] = yb[i]; - sumy += yb[i]; - } - - FOR_UNROLL (short row = 0; row < nr0; row++) { - sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy, yl, il); - } - - yb += QK1_0 * (N_SIMDWIDTH/8); - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0; ++row) { - const float tot = simd_sum(sumf[row]); - - if (tiisg == 0 && first_row + row < args.ne01) { - dst_f32[first_row + row] = tot; - } - } -} - -[[host_name("kernel_mul_mv_q1_0_f32")]] -kernel void kernel_mul_mv_q1_0_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q1_0_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); -} - -kernel void kernel_mul_mv_q4_0_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -kernel void kernel_mul_mv_q4_1_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -kernel void kernel_mul_mv_q5_0_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -kernel void kernel_mul_mv_q5_1_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_q8_0_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - constexpr short NW = N_SIMDWIDTH; - constexpr short NQ = 8; - - const int nb = args.ne00/QK8_0; - - const int r0 = tgpig.x*NR0; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - //device const block_q8_0 * x = (device const block_q8_0 *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - // pointers to src0 rows - device const block_q8_0 * ax[NR0]; - FOR_UNROLL (short row = 0; row < NR0; ++row) { - const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - - ax[row] = (device const block_q8_0 *) ((device char *) src0 + offset0); - } - - float sumf[NR0] = { 0.f }; - - const short ix = tiisg/(NW/NQ); - const short il = tiisg%(NW/NQ); - - const int ib0 = sgitg*NQ + ix; - - float yl[NQ]; - - device const float * yb = y + ib0*QK8_0 + il*NQ; - - // each thread in a SIMD group deals with NQ quants at a time - for (int ib = ib0; ib < nb; ib += NSG*NQ) { - for (short i = 0; i < NQ; ++i) { - yl[i] = yb[i]; - } - - for (short row = 0; row < NR0; row++) { - device const int8_t * qs = ax[row][ib].qs + il*NQ; - - float sumq = 0.f; - FOR_UNROLL (short i = 0; i < NQ; ++i) { - sumq += qs[i] * yl[i]; - } - - sumf[row] += sumq*ax[row][ib].d; - } - - yb += NSG*NQ*QK8_0; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - helper_mv_reduce_and_write(dst_f32, sumf, r0, args.ne01, tiisg, sgitg, shmem); -} - -[[host_name("kernel_mul_mv_q8_0_f32")]] -kernel void kernel_mul_mv_q8_0_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q8_0_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -// mat-vec kernel processing in chunks of float4 -// chpb - chunks per quantization block -template -void kernel_mul_mv_ext_q4_f32_impl( - constant ggml_metal_kargs_mul_mv_ext & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - const short NSG = FC_mul_mv_nsg; - const short nxpsg = FC_mul_mv_nxpsg; - - const short chpt = 4; // chunks per thread - - //const short nxpsg = (32); - const short nypsg = (32/nxpsg); - - const short tx = tiisg%nxpsg; - const short ty = tiisg/nxpsg; - - const int i01 = tgpig.x*(nypsg*NSG) + nypsg*sgitg + ty; - const int i11 = tgpig.y*r1ptg; - const int i1m = tgpig.z; - - const int i12 = i1m%FC_mul_mv_ne12; - const int i13 = i1m/FC_mul_mv_ne12; - - const uint64_t offset0 = i01*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = i11*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0; - - device const float4 * y4[r1ptg]; - - for (int ir1 = 0; ir1 < r1ptg; ++ir1) { - y4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4 *) src1; - } - - float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f }; - - short cch = tx%chpb; // current chunk index - - for (int ich = tx; 4*ich < args.ne00; ich += chpt*nxpsg) { - float4 lx[chpt]; - -#pragma unroll(chpt) - for (short ch = 0; ch < chpt; ++ch) { - deq_t4(xq, cch, lx[ch]); - - cch += nxpsg; - if (cch >= chpb) { - xq += cch/chpb; - cch %= chpb; - } - } - -#pragma unroll(chpt) - for (short ch = 0; ch < chpt; ++ch) { -#pragma unroll(r1ptg) - for (short ir1 = 0; ir1 < r1ptg; ++ir1) { - sumf[ir1] += dot(lx[ch], y4[ir1][ch*nxpsg]); - } - } - -#pragma unroll(r1ptg) - for (short ir1 = 0; ir1 < r1ptg; ++ir1) { - y4[ir1] += chpt*nxpsg; - } - } - - // reduce only the threads in each row - for (short ir1 = 0; ir1 < r1ptg; ++ir1) { - if (nxpsg >= 32) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 16); - } - if (nxpsg >= 16) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 8); - } - if (nxpsg >= 8) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 4); - } - if (nxpsg >= 4) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 2); - } - if (nxpsg >= 2) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 1); - } - - //sumf[ir1] = simd_sum(sumf[ir1]); - } - - if (tx == 0) { - for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) { - device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0; - - if (i01 < args.ne01) { - dst_f32[i01] = sumf[ir1]; - } - } - } -} - -// mat-vec kernel processing in chunks of float4x4 -template -void kernel_mul_mv_ext_q4x4_f32_impl( - constant ggml_metal_kargs_mul_mv_ext & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - const short NSG = FC_mul_mv_nsg; - const short nxpsg = FC_mul_mv_nxpsg; - - const short chpt = 1; - - //const short nxpsg = (32); - const short nypsg = (32/nxpsg); - - const short tx = tiisg%nxpsg; - const short ty = tiisg/nxpsg; - - const int i01 = tgpig.x*(nypsg*NSG) + nypsg*sgitg + ty; - const int i11 = tgpig.y*r1ptg; - const int i1m = tgpig.z; - - const int i12 = i1m%FC_mul_mv_ne12; - const int i13 = i1m/FC_mul_mv_ne12; - - const uint64_t offset0 = i01*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = i11*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0; - - device const float4x4 * y4x4[r1ptg]; - - for (int ir1 = 0; ir1 < r1ptg; ++ir1) { - y4x4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4x4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4x4 *) src1; - } - - float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f }; - - short cch = tx%chpb; - - for (int ich = tx; 16*ich < args.ne00; ich += chpt*nxpsg) { - float4x4 lx[chpt]; - -#pragma unroll(chpt) - for (short ch = 0; ch < chpt; ++ch) { - deq_t4x4(xq, cch, lx[ch]); - - cch += nxpsg; - if (cch >= chpb) { - xq += cch/chpb; - cch %= chpb; - } - } - -#pragma unroll(chpt) - for (short ch = 0; ch < chpt; ++ch) { -#pragma unroll(r1ptg) - for (short ir1 = 0; ir1 < r1ptg; ++ir1) { - sumf[ir1] += - dot(lx[ch][0], y4x4[ir1][ch*nxpsg][0]) + - dot(lx[ch][1], y4x4[ir1][ch*nxpsg][1]) + - dot(lx[ch][2], y4x4[ir1][ch*nxpsg][2]) + - dot(lx[ch][3], y4x4[ir1][ch*nxpsg][3]); - - } - } - -#pragma unroll(r1ptg) - for (short ir1 = 0; ir1 < r1ptg; ++ir1) { - y4x4[ir1] += chpt*nxpsg; - } - } - - for (short ir1 = 0; ir1 < r1ptg; ++ir1) { - if (nxpsg >= 32) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 16); - } - if (nxpsg >= 16) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 8); - } - if (nxpsg >= 8) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 4); - } - if (nxpsg >= 4) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 2); - } - if (nxpsg >= 2) { - sumf[ir1] += simd_shuffle_down(sumf[ir1], 1); - } - - //sumf[ir1] = simd_sum(sumf[ir1]); - } - - if (tx == 0) { - for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) { - device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0; - - if (i01 < args.ne01) { - dst_f32[i01] = sumf[ir1]; - } - } - } -} - -// dispatchers needed for compile-time nxpsg -// epb - elements per quantization block -template -kernel void kernel_mul_mv_ext_q4_f32_disp( - constant ggml_metal_kargs_mul_mv_ext & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_ext_q4_f32_impl(args, src0, src1, dst, tgpig, tiisg, sgitg); -} - -template -kernel void kernel_mul_mv_ext_q4x4_f32_disp( - constant ggml_metal_kargs_mul_mv_ext & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_ext_q4x4_f32_impl(args, src0, src1, dst, tgpig, tiisg, sgitg); -} - -typedef decltype(kernel_mul_mv_ext_q4_f32_disp <2, block_q8_0, 32, dequantize_q8_0_t4>) mul_mv_ext_q4_f32_t; -typedef decltype(kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>) mul_mv_ext_q4x4_f32_t; - -template [[host_name("kernel_mul_mv_ext_f32_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, float4, 4, dequantize_f32_t4>; -template [[host_name("kernel_mul_mv_ext_f32_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, float4, 4, dequantize_f32_t4>; -template [[host_name("kernel_mul_mv_ext_f32_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, float4, 4, dequantize_f32_t4>; -template [[host_name("kernel_mul_mv_ext_f32_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, float4, 4, dequantize_f32_t4>; - -template [[host_name("kernel_mul_mv_ext_f16_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, half4, 4, dequantize_f16_t4>; -template [[host_name("kernel_mul_mv_ext_f16_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, half4, 4, dequantize_f16_t4>; -template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4, 4, dequantize_f16_t4>; -template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4, 4, dequantize_f16_t4>; - -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, bfloat4, 4, dequantize_bf16_t4>; -template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, bfloat4, 4, dequantize_bf16_t4>; -template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, bfloat4, 4, dequantize_bf16_t4>; -template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, bfloat4, 4, dequantize_bf16_t4>; -#endif - -template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q1_0, 128, dequantize_q1_0_t4>; -template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q1_0, 128, dequantize_q1_0_t4>; -template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q1_0, 128, dequantize_q1_0_t4>; -template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q1_0, 128, dequantize_q1_0_t4>; - -template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>; -template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>; -template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>; -template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_0, 32, dequantize_q4_0_t4>; - -template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_1, 32, dequantize_q4_1_t4>; -template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_1, 32, dequantize_q4_1_t4>; -template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_1, 32, dequantize_q4_1_t4>; -template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_1, 32, dequantize_q4_1_t4>; - -template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_0, 32, dequantize_q5_0_t4>; -template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_0, 32, dequantize_q5_0_t4>; -template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_0, 32, dequantize_q5_0_t4>; -template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_0, 32, dequantize_q5_0_t4>; - -template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_1, 32, dequantize_q5_1_t4>; -template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_1, 32, dequantize_q5_1_t4>; -template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_1, 32, dequantize_q5_1_t4>; -template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_1, 32, dequantize_q5_1_t4>; - -template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q8_0, 32, dequantize_q8_0_t4>; -template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q8_0, 32, dequantize_q8_0_t4>; -template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q8_0, 32, dequantize_q8_0_t4>; -template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q8_0, 32, dequantize_q8_0_t4>; - -template [[host_name("kernel_mul_mv_ext_mxfp4_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_mxfp4, 32, dequantize_mxfp4_t4>; -template [[host_name("kernel_mul_mv_ext_mxfp4_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_mxfp4, 32, dequantize_mxfp4_t4>; -template [[host_name("kernel_mul_mv_ext_mxfp4_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_mxfp4, 32, dequantize_mxfp4_t4>; -template [[host_name("kernel_mul_mv_ext_mxfp4_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_mxfp4, 32, dequantize_mxfp4_t4>; - -template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_iq4_nl, 32, dequantize_iq4_nl_t4>; -template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_iq4_nl, 32, dequantize_iq4_nl_t4>; -template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_iq4_nl, 32, dequantize_iq4_nl_t4>; -template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_iq4_nl, 32, dequantize_iq4_nl_t4>; - -template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>; -template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q4_K, 256, dequantize_q4_K>; -template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q4_K, 256, dequantize_q4_K>; -template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q4_K, 256, dequantize_q4_K>; - -template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q5_K, 256, dequantize_q5_K>; -template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q5_K, 256, dequantize_q5_K>; -template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q5_K, 256, dequantize_q5_K>; -template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q5_K, 256, dequantize_q5_K>; - -template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q6_K, 256, dequantize_q6_K>; -template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q6_K, 256, dequantize_q6_K>; -template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>; -template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>; - -template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q2_K, 256, dequantize_q2_K>; -template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q2_K, 256, dequantize_q2_K>; -template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q2_K, 256, dequantize_q2_K>; -template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q2_K, 256, dequantize_q2_K>; - -template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q3_K, 256, dequantize_q3_K>; -template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q3_K, 256, dequantize_q3_K>; -template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q3_K, 256, dequantize_q3_K>; -template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q3_K, 256, dequantize_q3_K>; - -template -void kernel_mul_mv_t_t_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - constexpr short NW = N_SIMDWIDTH; - constexpr short NB = 32; - constexpr short NF = 8; - - const int nb = args.ne00/NB; - - const int r0 = tgpig.x*NR0; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - //device const T0 * x = (device const T0 *) (src0 + offset0); - device const T1 * y = (device const T1 *) (src1 + offset1); - - // pointers to src0 rows - device const T0 * ax [NR0]; - FOR_UNROLL (short row = 0; row < NR0; ++row) { - const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - - ax[row] = (device const T0 *) ((device char *) src0 + offset0); - } - - float sumf[NR0] = { 0.f }; - - const short ix = tiisg/(NW/NF); - const short il = tiisg%(NW/NF); - - const int ib0 = sgitg*NF + ix; - - T1 yl[NF]; - - device const T1 * yb = y + (ib0*NB + il*NF); - - for (int ib = ib0; ib < nb; ib += NSG*NF) { - for (short i = 0; i < NF; ++i) { - yl[i] = yb[i]; - } - - for (short row = 0; row < NR0; row++) { - device const T0 * xb = ax[row] + (ib*NB + il*NF); - - float sumq = 0.f; - FOR_UNROLL (short i = 0; i < NF; ++i) { - sumq += xb[i] * yl[i]; - } - - sumf[row] += sumq; - } - - yb += NSG*NF*NW; - } - - for (int i = nb*NB + sgitg*NW + tiisg; i < args.ne00; i += NW*NSG) { - for (short row = 0; row < NR0; row++) { - sumf[row] += ax[row][i] * y[i]; - } - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - helper_mv_reduce_and_write(dst_f32, sumf, r0, args.ne01, tiisg, sgitg, shmem); -} - -template -void kernel_mul_mv_t_t_disp( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - switch (args.nr0) { - //case 1: kernel_mul_mv_t_t_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; - case 2: kernel_mul_mv_t_t_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; - //case 3: kernel_mul_mv_t_t_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; - //case 4: kernel_mul_mv_t_t_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; - } -} - -template -kernel void kernel_mul_mv_t_t( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_t_t_disp(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -typedef decltype(kernel_mul_mv_t_t) mul_mv_t_t; - -template [[host_name("kernel_mul_mv_f32_f32")]] kernel mul_mv_t_t kernel_mul_mv_t_t; -template [[host_name("kernel_mul_mv_f16_f32")]] kernel mul_mv_t_t kernel_mul_mv_t_t; -template [[host_name("kernel_mul_mv_f16_f16")]] kernel mul_mv_t_t kernel_mul_mv_t_t; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t_t kernel_mul_mv_t_t; -template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t_t kernel_mul_mv_t_t; -#endif - -template -void kernel_mul_mv_t_t_4_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - constexpr short NW = N_SIMDWIDTH; - constexpr short NB = 32; - constexpr short NF = 16; - constexpr short NF4 = NF/4; - - const int nb = args.ne00/NB; - - const int r0 = tgpig.x*NR0; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const T1 * y = (device const T1 *) (src1 + offset1); - device const T14 * y4 = (device const T14 *) (src1 + offset1); - - // pointers to src0 rows - device const T0 * ax [NR0]; - device const T04 * ax4[NR0]; - FOR_UNROLL (short row = 0; row < NR0; ++row) { - const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - - ax [row] = (device const T0 *) ((device char *) src0 + offset0); - ax4[row] = (device const T04 *) ((device char *) src0 + offset0); - } - - float sumf[NR0] = { 0.f }; - - const short ix = tiisg/(NW/NF); - const short il = tiisg%(NW/NF); - - const int ib0 = sgitg*NF + ix; - - T14 yl4[NF4]; - - device const T14 * yb4 = y4 + (ib0*NB + il*NF)/4; - - for (int ib = ib0; ib < nb; ib += NSG*NF) { - for (short i = 0; i < NF4; ++i) { - yl4[i] = yb4[i]; - } - - for (short row = 0; row < NR0; row++) { - device const T04 * xb4 = ax4[row] + (ib*NB + il*NF)/4; - - float sumq = 0.f; - FOR_UNROLL (short i = 0; i < NF4; ++i) { - sumq += dot(float4(xb4[i]), float4(yl4[i])); - } - - sumf[row] += sumq; - } - - yb4 += NSG*NF*NW/4; - } - - for (int i = nb*NB + sgitg*NW + tiisg; i < args.ne00; i += NW*NSG) { - for (short row = 0; row < NR0; row++) { - sumf[row] += ax[row][i] * y[i]; - } - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - helper_mv_reduce_and_write(dst_f32, sumf, r0, args.ne01, tiisg, sgitg, shmem); -} - -template -void kernel_mul_mv_t_t_4_disp( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - switch (args.nr0) { - //case 1: kernel_mul_mv_t_t_4_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; - case 2: kernel_mul_mv_t_t_4_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; - //case 3: kernel_mul_mv_t_t_4_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; - //case 4: kernel_mul_mv_t_t_4_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; - }; -} - -template -kernel void kernel_mul_mv_t_t_4( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_t_t_4_disp(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -typedef decltype(kernel_mul_mv_t_t_4) mul_mv_t_t_4; - -template [[host_name("kernel_mul_mv_f32_f32_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; -template [[host_name("kernel_mul_mv_f16_f32_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; -template [[host_name("kernel_mul_mv_f16_f16_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mv_bf16_f32_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; -template [[host_name("kernel_mul_mv_bf16_bf16_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; -#endif - -template -void kernel_mul_mv_t_t_short_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig, - ushort tiisg) { - const int r0 = tgpig.x*32 + tiisg; - const int r1 = tgpig.y; - const int im = tgpig.z; - - if (r0 >= args.ne01) { - return; - } - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - - device const T0 * x = (device const T0 *) (src0 + offset0); - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1; - - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const T1 * y = (device const T1 *) (src1 + offset1); - - float res = 0.0f; - - for (int i = 0; i < args.ne00; ++i) { - res += (float) x[i] * (float) y[i]; - } - - dst_f32[(uint64_t)r1*args.ne0 + r0] = res; -} - -template -kernel void kernel_mul_mv_t_t_short( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]]) { - kernel_mul_mv_t_t_short_impl( - args, - src0, - src1, - dst, - tgpig, - tiisg); -} - -typedef decltype(kernel_mul_mv_t_t_short) mul_mv_t_t_short_t; - -template [[host_name("kernel_mul_mv_f32_f32_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; -template [[host_name("kernel_mul_mv_f16_f32_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; -template [[host_name("kernel_mul_mv_f16_f16_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mv_bf16_f32_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; -template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; -#endif - -constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]]; -constant bool FC_rope_is_back [[function_constant(FC_ROPE + 1)]]; - -static float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / max(0.001f, high - low); - return 1.0f - min(1.0f, max(0.0f, y)); -} - -// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn -// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static void rope_yarn( - float theta_extrap, float freq_scale, float corr_dims[2], int i0, float ext_factor, float mscale, - thread float * cos_theta, thread float * sin_theta) { - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta = theta_interp; - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); - } - *cos_theta = cos(theta) * mscale; - *sin_theta = sin(theta) * mscale; - if (FC_rope_is_back) { - *sin_theta *= -1.0f; - } -} - -// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) { - return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base)); -} - -static void rope_yarn_corr_dims( - int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] -) { - // start and end correction dims - dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))); - dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base))); -} - -template -kernel void kernel_rope_norm( - constant ggml_metal_kargs_rope & args, - device const char * src0, - device const char * src1, - device const char * src2, - device char * dst, - ushort tiitg[[thread_index_in_threadgroup]], - ushort3 tptg [[threads_per_threadgroup]], - uint3 tgpig[[threadgroup_position_in_grid]]) { - const int i3 = tgpig[2]; - const int i2 = tgpig[1]; - const int i1 = tgpig[0]; - - float corr_dims[2]; - rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); - - device const int32_t * pos = (device const int32_t *) src1; - - const float theta_base = (float) pos[i2]; - const float inv_ndims = -1.f/args.n_dims; - - float cos_theta; - float sin_theta; - - for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { - if (i0 < args.n_dims) { - const int ic = i0/2; - - const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); - - const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; - - rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); - - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - const float x0 = src[0]; - const float x1 = src[1]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[1] = x0*sin_theta + x1*cos_theta; - } else { - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } - } -} - -template -kernel void kernel_rope_neox( - constant ggml_metal_kargs_rope & args, - device const char * src0, - device const char * src1, - device const char * src2, - device char * dst, - ushort tiitg[[thread_index_in_threadgroup]], - ushort3 tptg [[threads_per_threadgroup]], - uint3 tgpig[[threadgroup_position_in_grid]]) { - const int i3 = tgpig[2]; - const int i2 = tgpig[1]; - const int i1 = tgpig[0]; - - float corr_dims[2]; - rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); - - device const int32_t * pos = (device const int32_t *) src1; - - const float theta_base = (float) pos[i2]; - const float inv_ndims = -1.f/args.n_dims; - - float cos_theta; - float sin_theta; - - for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { - if (i0 < args.n_dims) { - const int ic = i0/2; - - const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); - - const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; - - rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); - - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); - - const float x0 = src[0]; - const float x1 = src[args.n_dims/2]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; - } else { - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } - } -} - -template -kernel void kernel_rope_multi( - constant ggml_metal_kargs_rope & args, - device const char * src0, - device const char * src1, - device const char * src2, - device char * dst, - ushort tiitg[[thread_index_in_threadgroup]], - ushort3 tptg [[threads_per_threadgroup]], - uint3 tgpig[[threadgroup_position_in_grid]]) { - const int i3 = tgpig[2]; - const int i2 = tgpig[1]; - const int i1 = tgpig[0]; - - float corr_dims[2]; - rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); - - device const int32_t * pos = (device const int32_t *) src1; - - const float inv_ndims = -1.f/args.n_dims; - - float cos_theta; - float sin_theta; - - for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { - if (i0 < args.n_dims) { - const int ic = i0/2; - - // mrope theta calculations - // note: the rest is the same as kernel_rope_neox - const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3; - const int sec_w01 = args.sect_0 + args.sect_1; // end of section 1 - const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 - const int sector = ic % sect_dims; - - float theta_base; - if (FC_rope_is_imrope) { - if (sector % 3 == 1 && sector < 3 * args.sect_1) { // h - theta_base = (float) pos[i2 + args.ne02 * 1]; - } else if (sector % 3 == 2 && sector < 3 * args.sect_2) { // w - theta_base = (float) pos[i2 + args.ne02 * 2]; - } else if (sector % 3 == 0 && sector < 3 * args.sect_0) { // t - theta_base = (float) pos[i2 + args.ne02 * 0]; - } else { // e - theta_base = (float) pos[i2 + args.ne02 * 3]; - } - } else { - if (sector < args.sect_0) { - theta_base = (float) pos[i2]; - } else if (sector < sec_w01) { - theta_base = (float) pos[i2 + args.ne02 * 1]; - } else if (sector < sec_w012) { - theta_base = (float) pos[i2 + args.ne02 * 2]; - } else { - theta_base = (float) pos[i2 + args.ne02 * 3]; - } - } - // end of mrope - - const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); - - const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; - - rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); - - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); - - const float x0 = src[0]; - const float x1 = src[args.n_dims/2]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; - } else { - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } - } -} - -template -kernel void kernel_rope_vision( - constant ggml_metal_kargs_rope & args, - device const char * src0, - device const char * src1, - device const char * src2, - device char * dst, - ushort tiitg[[thread_index_in_threadgroup]], - ushort3 tptg [[threads_per_threadgroup]], - uint3 tgpig[[threadgroup_position_in_grid]]) { - const int i3 = tgpig[2]; - const int i2 = tgpig[1]; - const int i1 = tgpig[0]; - - float corr_dims[2]; - rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); - - device const int32_t * pos = (device const int32_t *) src1; - - const float inv_ndims = -1.f/args.n_dims; - - float cos_theta; - float sin_theta; - - for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { - if (i0 < 2*args.n_dims) { // different from kernel_rope_multi - const int ic = i0/2; - - // mrope theta calculations (only support 2 dimensions) - const int sect_dims = args.sect_0 + args.sect_1; - const int sector = ic % sect_dims; - - float p; - float theta_base; - if (sector < args.sect_1) { - p = (float) sector; - theta_base = (float) pos[i2]; - } else { - p = (float) sector - args.sect_0; - theta_base = (float) pos[i2 + args.ne02]; - } - - const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p); - // end of mrope - - const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; - - rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); - - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); - - const float x0 = src[0]; - const float x1 = src[args.n_dims]; // different from kernel_rope_multi - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi - } else { - device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); - device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } - } -} - -typedef decltype(kernel_rope_norm) kernel_rope_norm_t; -typedef decltype(kernel_rope_neox) kernel_rope_neox_t; -typedef decltype(kernel_rope_multi) kernel_rope_multi_t; -typedef decltype(kernel_rope_vision) kernel_rope_vision_t; - -template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm; -template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm; - -template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox; -template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox; - -template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi; -template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi; - -template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision; -template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision; - -typedef void (im2col_t)( - constant ggml_metal_kargs_im2col & args, - device const float * x, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]); - -template -kernel void kernel_im2col( - constant ggml_metal_kargs_im2col & args, - device const float * x, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { -// const int64_t IC = tgpg[0]; - const int64_t OH = tgpg[1]; - const int64_t OW = tgpg[2]; - - const int64_t KH = ntg[1]; - const int64_t KW = ntg[2]; - - int64_t in = tpitg[0]; - const int64_t ikh = tpitg[1]; - const int64_t ikw = tpitg[2]; - - const int64_t iic = tgpig[0]; - const int64_t ioh = tgpig[1]; - const int64_t iow = tgpig[2]; - - const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0; - const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1; - - int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw); - - device T * pdst = (device T *) (dst); - - if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) { - while (in < args.N) { - pdst[offset_dst] = 0.0f; - offset_dst += ntg[0]*args.CHW*OH*OW; - - in += ntg[0]; - } - } else { - int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw; - - while (in < args.N) { - pdst[offset_dst] = x[offset_src]; - - offset_dst += ntg[0]*args.CHW*OH*OW; - offset_src += ntg[0]*args.ofs0; - - in += ntg[0]; - } - } -} - -template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col; -template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col; - -// TODO: optimize -typedef void (im2col_ext_t)( - constant ggml_metal_kargs_im2col & args, - device const float * x, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]); - -template -kernel void kernel_im2col_ext( - constant ggml_metal_kargs_im2col & args, - device const float * x, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1] - const int64_t KHW = (int64_t)args.KHW; - - const int64_t d = tgpig[0] / args.CHW; - const int64_t chw = tgpig[0] % args.CHW; - const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1) - const int64_t HW = tgpig[0] % KHW; - - const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0]; - if (tpitg_0 >= args.N) { - return; - } - - const int64_t tpitg_1 = HW / args.KW; - const int64_t tpitg_2 = HW % args.KW; - - const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0; - const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1; - - const int64_t offset_dst = - (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW + - (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2); - - device T * pdst = (device T *) (dst); - - if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) { - pdst[offset_dst] = 0.0f; - } else { - const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1; - pdst[offset_dst] = x[offset_src + iih * args.IW + iiw]; - } -} - -template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext; -template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext; - -template -kernel void kernel_conv_2d( - constant ggml_metal_kargs_conv_2d & args, - device const char * weights, - device const char * src, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const uint threads_per_tg = ntg.x * ntg.y * ntg.z; - const uint tg_index = (tgpig.z * tgpg.y + tgpig.y) * tgpg.x + tgpig.x; - const uint local_thread = tpitg.z * (ntg.x * ntg.y) + tpitg.y * ntg.x + tpitg.x; - const uint thread_index = tg_index * threads_per_tg + local_thread; - const uint64_t total_threads = (uint64_t) threads_per_tg * tgpg.x * tgpg.y * tgpg.z; - const uint64_t total_outputs = (uint64_t) args.N * args.OC * args.OH * args.OW; - - for (uint64_t index = thread_index; index < total_outputs; index += total_threads) { - uint64_t tmp = index; - - const int32_t ow = tmp % args.OW; tmp /= args.OW; - const int32_t oh = tmp % args.OH; tmp /= args.OH; - const int32_t oc = tmp % args.OC; tmp /= args.OC; - const int32_t n = tmp; - - float acc = 0.0f; - - const int32_t base_x = ow*args.s0 - args.p0; - const int32_t base_y = oh*args.s1 - args.p1; - - int32_t ky_start = 0; - if (base_y < 0) { - ky_start = (-base_y + args.d1 - 1)/args.d1; - } - int32_t ky_end = args.KH; - const int32_t y_max = args.IH - 1 - base_y; - if (y_max < 0) { - ky_end = ky_start; - } else if (base_y + (args.KH - 1)*args.d1 >= args.IH) { - ky_end = min(ky_end, y_max/args.d1 + 1); - } - - int32_t kx_start = 0; - if (base_x < 0) { - kx_start = (-base_x + args.d0 - 1)/args.d0; - } - int32_t kx_end = args.KW; - const int32_t x_max = args.IW - 1 - base_x; - if (x_max < 0) { - kx_end = kx_start; - } else if (base_x + (args.KW - 1)*args.d0 >= args.IW) { - kx_end = min(kx_end, x_max/args.d0 + 1); - } - - if (ky_start < ky_end && kx_start < kx_end) { - const uint64_t src_base_n = (uint64_t) n * args.nb13; - const uint64_t w_base_oc = (uint64_t) oc * args.nb03; - - for (int32_t ic = 0; ic < args.IC; ++ic) { - const uint64_t src_base_nc = src_base_n + (uint64_t) ic * args.nb12; - const uint64_t w_base_ocic = w_base_oc + (uint64_t) ic * args.nb02; - - for (int32_t ky = ky_start; ky < ky_end; ++ky) { - const int32_t iy = base_y + ky*args.d1; - const uint64_t src_base_row = src_base_nc + (uint64_t) iy * args.nb11; - const uint64_t w_base_row = w_base_ocic + (uint64_t) ky * args.nb01; - - for (int32_t kx = kx_start; kx < kx_end; ++kx) { - const int32_t ix = base_x + kx*args.d0; - const uint64_t src_offs = src_base_row + (uint64_t) ix * args.nb10; - const uint64_t w_offs = w_base_row + (uint64_t) kx * args.nb00; - - const float x = *(device const float *)(src + src_offs); - const float w = (float) (*(device const TK *)(weights + w_offs)); - - acc += x * w; - } - } - } - } - - const uint64_t dst_offs = - (uint64_t) n * args.nb3 + - (uint64_t) oc * args.nb2 + - (uint64_t) oh * args.nb1 + - (uint64_t) ow * args.nb0; - - *(device float *)(dst + dst_offs) = acc; - } -} - -template [[host_name("kernel_conv_2d_f32_f32")]] -kernel void kernel_conv_2d( - constant ggml_metal_kargs_conv_2d & args, - device const char * weights, - device const char * src, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]); - -template [[host_name("kernel_conv_2d_f16_f32")]] -kernel void kernel_conv_2d( - constant ggml_metal_kargs_conv_2d & args, - device const char * weights, - device const char * src, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]); - -typedef void (conv_transpose_1d_t)( - constant ggml_metal_kargs_conv_transpose_1d & args, - device const float * src0, - device const float * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]]); - -template -kernel void kernel_conv_transpose_1d( - constant ggml_metal_kargs_conv_transpose_1d & args, - device const T * src0, - device const float * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]]) { - - // For output position j on the time axis, only input positions - // i such that i*s0 <= j < i*s0 + K - // contribute -- i.e. i in [ceil((j - K + 1)/s0), floor(j/s0)] - // intersected with [0, IL-1]. That's at most ceil(K/s0) values - // (typically 2 for stride==K/2 transposed convs). - const int32_t j = tgpig[0]; - const int32_t s0 = args.s0; - const int32_t K = args.K; - const int32_t IL = args.IL; - - int32_t i_min; - { - int32_t a = j - K + 1; - i_min = a <= 0 ? 0 : (a + s0 - 1) / s0; // ceil(a/s0) for a>0 - } - int32_t i_max = j / s0; - if (i_max > IL - 1) i_max = IL - 1; - - float v = 0.0f; - if (i_min <= i_max) { - for (int64_t c = 0; c < args.IC; c++) { - const int32_t kernel_offset = c * tgpg[1] * K + K * tgpig[1]; - const int32_t input_offset = c * IL; - - for (int32_t i = i_min; i <= i_max; i++) { - v += float(src0[kernel_offset + j - i * s0]) * src1[input_offset + i]; - } - } - } - - device float * dst_ptr = (device float *) (dst + tgpig[0] * args.nb0 + tgpig[1] * args.nb1); - - dst_ptr[0] = v; -} - -template [[host_name("kernel_conv_transpose_1d_f32_f32")]] -kernel void kernel_conv_transpose_1d( - constant ggml_metal_kargs_conv_transpose_1d & args, - device const float * src0, - device const float * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]]); - -template [[host_name("kernel_conv_transpose_1d_f16_f32")]] -kernel void kernel_conv_transpose_1d( - constant ggml_metal_kargs_conv_transpose_1d & args, - device const half * src0, - device const float * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]]); - - -typedef void (conv_transpose_2d_t)( - constant ggml_metal_kargs_conv_transpose_2d & args, - device const float * src0, - device const float * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]]); - -template -kernel void kernel_conv_transpose_2d( - constant ggml_metal_kargs_conv_transpose_2d & args, - device const T * src0, - device const float * src1, - device char * dst, - threadgroup float * shared_sum [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t out_x = tgpig[0]; - const int64_t out_y = tgpig[1]; - const int64_t out_c = tgpig[2]; - - const int64_t kw = tpitg[0]; - const int64_t kh = tpitg[1]; - - float v = 0.0f; - - for (int64_t in_c = 0; in_c < args.IC; in_c++) { - int64_t in_y = out_y - kh; - - if (in_y < 0 || in_y % args.s0) continue; - - in_y /= args.s0; - - if (in_y >= args.IH) continue; - - int64_t in_x = out_x - kw; - - if (in_x < 0 || in_x % args.s0) continue; - - in_x /= args.s0; - - if (in_x >= args.IW) continue; - - const int64_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x; - const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw; - - v += (float)src0[kernel_idx] * src1[input_idx]; - } - - const uint tid = tpitg.y * ntg.x + tpitg.x; - shared_sum[tid] = v; - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (tid == 0) { - float total = 0.0f; - const uint num_threads = ntg.x * ntg.y; - for (uint i = 0; i < num_threads; i++) { - total += shared_sum[i]; - } - - device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y * args.nb1 + out_c*args.nb2); - dst_ptr[0] = total; - } -} - -template [[host_name("kernel_conv_transpose_2d_f32_f32")]] -kernel void kernel_conv_transpose_2d( - constant ggml_metal_kargs_conv_transpose_2d & args, - device const float * src0, - device const float * src1, - device char * dst, - threadgroup float * shared_sum [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]); - -template [[host_name("kernel_conv_transpose_2d_f16_f32")]] -kernel void kernel_conv_transpose_2d( - constant ggml_metal_kargs_conv_transpose_2d & args, - device const half * src0, - device const float * src1, - device char * dst, - threadgroup float * shared_sum [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]); - -constant bool FC_upscale_aa [[function_constant(FC_UPSCALE + 0)]]; - -kernel void kernel_upscale_nearest_f32( - constant ggml_metal_kargs_upscale & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3/args.sf3; - const int64_t i02 = i2/args.sf2; - const int64_t i01 = i1/args.sf1; - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int64_t i00 = i0/args.sf0; - - device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - dst_ptr[0] = src0_ptr[0]; - } -} - -static inline float bilinear_tri(float x) { - return MAX(0.0f, 1.0f - fabs(x)); -} - -kernel void kernel_upscale_bilinear_f32( - constant ggml_metal_kargs_upscale & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3 / args.sf3; - const int64_t i02 = i2 / args.sf2; - - const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs; - const int64_t i01 = MAX(0, MIN(args.ne01 - 1, (int64_t)floor(f01))); - const int64_t i01p = MAX(0, MIN(args.ne01 - 1, i01 + 1)); - const float fd1 = MAX(0.0f, MIN(1.0f, f01 - (float)i01)); - - src0 += i03*args.nb03 + i02*args.nb02; - - device float * dst_ptr = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); - - if (FC_upscale_aa) { - const float support0 = MAX(1.0f, 1.0f / args.sf0); - const float invscale0 = 1.0f / support0; - const float support1 = MAX(1.0f, 1.0f / args.sf1); - const float invscale1 = 1.0f / support1; - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs; - - int64_t x_min = MAX((int64_t)0, (int64_t)floor(f00 - support0 + args.poffs)); - int64_t x_max = MIN(args.ne00, (int64_t)ceil (f00 + support0 + args.poffs)); - - int64_t y_min = MAX((int64_t)0, (int64_t)floor(f01 - support1 + args.poffs)); - int64_t y_max = MIN(args.ne01, (int64_t)ceil (f01 + support1 + args.poffs)); - - float sum = 0.0f; - float wsum = 0.0f; - - for (int64_t sy = y_min; sy < y_max; ++sy) { - const float wy = MAX(0.0f, 1.0f - fabs((float)sy - f01) * invscale1); - for (int64_t sx = x_min; sx < x_max; ++sx) { - const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0); - const float w = wx * wy; - device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); - sum += (*src_ptr) * w; - wsum += w; - } - } - - const float v = (wsum > 0.0f) ? (sum / wsum) : 0.0f; - dst_ptr[i0] = v; - } - } else { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs; - const int64_t i00 = MAX(0, MIN(args.ne00 - 1, (int64_t)floor(f00))); - const int64_t i00p = MAX(0, MIN(args.ne00 - 1, i00 + 1)); - const float fd0 = MAX(0.0f, MIN(1.0f, f00 - (float)i00)); - - device const float * src00 = (device const float *)(src0 + i01*args.nb01 + i00*args.nb00); - device const float * src10 = (device const float *)(src0 + i01*args.nb01 + i00p*args.nb00); - device const float * src01 = (device const float *)(src0 + i01p*args.nb01 + i00*args.nb00); - device const float * src11 = (device const float *)(src0 + i01p*args.nb01 + i00p*args.nb00); - - const float v = - (*src00) * (1.0f - fd0) * (1.0f - fd1) + - (*src10) * fd0 * (1.0f - fd1) + - (*src01) * (1.0f - fd0) * fd1 + - (*src11) * fd0 * fd1; - - dst_ptr[i0] = v; - } - } -} - -template -kernel void kernel_conv_3d( - constant ggml_metal_kargs_conv_3d & args, - device const char * src0, // Weights [IC * OC, KD, KH, KW] - device const char * src1, // Inputs [IC * N, ID, IH, IW] - device char * dst, // Outputs [OC * N, OD, OH, OW] - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]]) { - - // 1. Un-flatten the spatial dimension from Grid X - int64_t spatial_idx = tgpig.x * 32 + tpitg.x; - - if (spatial_idx >= args.OW * args.OH * args.OD) { - return; // Thread falls outside the spatial volume - } - - int64_t od = spatial_idx / (args.OW * args.OH); - int64_t oh = (spatial_idx / args.OW) % args.OH; - int64_t ow = spatial_idx % args.OW; - - // 2. Map Y to Channels, Z to Batch - int64_t oc = tgpig.y; - int64_t batch_idx = tgpig.z; - - // 3. Calculate anchor coordinates in the Input volume - int64_t i_w_base = ow * args.s0 - args.p0; - int64_t i_h_base = oh * args.s1 - args.p1; - int64_t i_d_base = od * args.s2 - args.p2; - - float sum = 0.0f; - - // 4. Gather Loop (Iterate over Input Channels -> Depth -> Height -> Width) - for (int64_t ic = 0; ic < args.IC; ++ic) { - - // ggml packs batch and channel together in the 4th dimension - int64_t src_cn_idx = batch_idx * args.IC + ic; - int64_t w_cn_idx = oc * args.IC + ic; - - for (int64_t kz = 0; kz < args.KD; ++kz) { - int64_t id = i_d_base + kz * args.d2; - if (id < 0 || id >= args.ID) continue; // Boundary check (Padding) - - for (int64_t ky = 0; ky < args.KH; ++ky) { - int64_t ih = i_h_base + ky * args.d1; - if (ih < 0 || ih >= args.IH) continue; - - for (int64_t kx = 0; kx < args.KW; ++kx) { - int64_t iw = i_w_base + kx * args.d0; - if (iw < 0 || iw >= args.IW) continue; - - // Convert multi-dimensional coordinates to flat byte offsets - int64_t w_idx = kx*args.nb00 + ky*args.nb01 + kz*args.nb02 + w_cn_idx*args.nb03; - int64_t i_idx = iw*args.nb10 + ih*args.nb11 + id*args.nb12 + src_cn_idx*args.nb13; - - // Dereference memory and cast weights to f32 if they were f16 - float w_val = (float)*(device const T*)((device const char*)src0 + w_idx); - float i_val = *(device const float*)((device const char*)src1 + i_idx); - - sum += w_val * i_val; - } - } - } - } - - // 5. Write the accumulated value out to RAM - int64_t dst_cn_idx = batch_idx * args.OC + oc; - int64_t d_idx = ow*args.nb0 + oh*args.nb1 + od*args.nb2 + dst_cn_idx*args.nb3; - - *(device float*)(dst + d_idx) = sum; -} - -// Explicit instantiations so the JIT compiler can find them by name -template [[host_name("kernel_conv_3d_f32_f32")]] -kernel void kernel_conv_3d( - constant ggml_metal_kargs_conv_3d & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]]); - -// Explicit instantiation for f16 weights -template [[host_name("kernel_conv_3d_f16_f32")]] -kernel void kernel_conv_3d( - constant ggml_metal_kargs_conv_3d & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]]); - - -static inline float bicubic_weight1(float x) { - const float a = -0.75f; - return ((a + 2) * x - (a + 3)) * x * x + 1; -} - -static inline float bicubic_weight2(float x) { - const float a = -0.75f; - return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; -} - -kernel void kernel_upscale_bicubic_f32( - constant ggml_metal_kargs_upscale & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3 / args.sf3; - const int64_t i02 = i2 / args.sf2; - - const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs; - const int64_t i01 = (int64_t)floor(f01); - const float fd1 = f01 - (float)i01; - - const float w_y0 = bicubic_weight2(fd1 + 1.0f); - const float w_y1 = bicubic_weight1(fd1); - const float w_y2 = bicubic_weight1(1.0f - fd1); - const float w_y3 = bicubic_weight2(2.0f - fd1); - - const device const char * src_slice = src0 + i03 * args.nb03 + i02 * args.nb02; - - device float * dst_ptr = (device float *)(dst + i3 * args.nb3 + i2 * args.nb2 + i1 * args.nb1); - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs; - const int64_t i00 = (int64_t)floor(f00); - const float fd0 = f00 - (float)i00; - - const float w_x0 = bicubic_weight2(fd0 + 1.0f); - const float w_x1 = bicubic_weight1(fd0); - const float w_x2 = bicubic_weight1(1.0f - fd0); - const float w_x3 = bicubic_weight2(2.0f - fd0); - - float sum = 0.0f; - - for (int dy = -1; dy <= 2; ++dy) { - const int64_t iy = MAX(0, MIN(args.ne01 - 1, i01 + dy)); - const float wy = (dy == -1) ? w_y0 : (dy == 0) ? w_y1 : (dy == 1) ? w_y2 : w_y3; - - for (int dx = -1; dx <= 2; ++dx) { - const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx)); - const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3; - - device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); - sum += (*src_ptr) * wx * wy; - } - } - - dst_ptr[i0] = sum; - } -} - -kernel void kernel_roll_f32( - constant ggml_metal_kargs_roll & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - device const float * src0_ptr = (device const float *) src0; - device float * dst_ptr = (device float *) dst; - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - // apply shifts and wrap around - int64_t i00 = i0 - args.s0; - int64_t i01 = i1 - args.s1; - int64_t i02 = i2 - args.s2; - int64_t i03 = i3 - args.s3; - - if (i00 < 0) { i00 += args.ne00; } else if (i00 >= args.ne00) { i00 -= args.ne00; } - if (i01 < 0) { i01 += args.ne01; } else if (i01 >= args.ne01) { i01 -= args.ne01; } - if (i02 < 0) { i02 += args.ne02; } else if (i02 >= args.ne02) { i02 -= args.ne02; } - if (i03 < 0) { i03 += args.ne03; } else if (i03 >= args.ne03) { i03 -= args.ne03; } - - int64_t src_idx = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00 + i00; - int64_t dst_idx = i3 *args.ne2 *args.ne1 *args.ne0 + i2 *args.ne1 *args.ne0 + i1 *args.ne0 + i0; - - dst_ptr[dst_idx] = src0_ptr[src_idx]; - } -} - -template -kernel void kernel_pad_impl( - constant ggml_metal_kargs_pad & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int32_t i3 = tgpig.z; - const int32_t i2 = tgpig.y; - const int32_t k0 = tgpig.x/args.ne1; - const int32_t i1 = tgpig.x - k0*args.ne1; - - const int32_t i03 = i3; - const int32_t i02 = i2; - const int32_t i01 = i1; - - device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - device T * dst_ptr = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); - - for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) { - const int32_t i0 = k0*1024 + tpitg.x + l0; - if (i0 >= args.ne0) { - break; - } - - if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { - dst_ptr[i0] = src0_ptr[i0]; - } else { - dst_ptr[i0] = 0.0f; - } - } -} - -typedef decltype(kernel_pad_impl) kernel_pad_t; - -template [[host_name("kernel_pad_f32")]] kernel kernel_pad_t kernel_pad_impl; -template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl; - -// TODO: this is slow - optimize -kernel void kernel_pad_reflect_1d_f32( - constant ggml_metal_kargs_pad_reflect_1d & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tgpg[[threadgroups_per_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); - - if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - if (i0 < args.p0) { - dst_ptr[i0] = src0_ptr[args.p0 - i0]; - } else if (i0 < args.ne0 - args.p1) { - dst_ptr[i0] = src0_ptr[i0 - args.p0]; - } else { - dst_ptr[i0] = src0_ptr[(args.ne0 - args.p1 - args.p0) - (args.p1 + 1 - (args.ne0 - i0)) - 1]; - } - } - } -} - -kernel void kernel_arange_f32( - constant ggml_metal_kargs_arange & args, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - device float * dst_ptr = (device float *) dst; - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - dst_ptr[i0] = args.start + args.step * i0; - } -} - -kernel void kernel_timestep_embedding_f32( - constant ggml_metal_kargs_timestep_embedding & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - int i = tgpig.x; - device float * embed_data = (device float *)(dst + i*args.nb1); - - int half_ = args.dim / 2; - for (int j = tpitg.x; j < half_; j += ntg.x) { - float timestep = ((device float *)src0)[i]; - float freq = (float)exp(-log((float)args.max_period) * j / half_); - float arg = timestep * freq; - embed_data[j ] = cos(arg); - embed_data[j + half_] = sin(arg); - } - - if (args.dim % 2 != 0 && tpitg.x == 0) { - embed_data[2 * half_] = 0.f; - } -} - -// bitonic sort implementation following the CUDA kernels as reference -typedef void (argsort_t)( - constant ggml_metal_kargs_argsort & args, - device const char * src0, - device int32_t * dst, - threadgroup int32_t * shmem_i32 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]); - -template -kernel void kernel_argsort_f32_i32( - constant ggml_metal_kargs_argsort & args, - device const char * src0, - device int32_t * dst, - threadgroup int32_t * shmem_i32 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - // bitonic sort - const int col = tpitg[0]; - const int ib = tgpig[0] / args.ne01; - - const int i00 = ib*ntg.x; - const int i01 = tgpig[0] % args.ne01; - const int i02 = tgpig[1]; - const int i03 = tgpig[2]; - - device const float * src0_row = (device const float *) (src0 + args.nb01*i01 + args.nb02*i02 + args.nb03*i03); - - // initialize indices - shmem_i32[col] = i00 + col; - - threadgroup_barrier(mem_flags::mem_threadgroup); - - for (int k = 2; k <= ntg.x; k *= 2) { - for (int j = k / 2; j > 0; j /= 2) { - int ixj = col ^ j; - if (ixj > col) { - if ((col & k) == 0) { - if (shmem_i32[col] >= args.ne00 || - (shmem_i32[ixj] < args.ne00 && (order == GGML_SORT_ORDER_ASC ? - src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]] : - src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]])) - ) { - SWAP(shmem_i32[col], shmem_i32[ixj]); - } - } else { - if (shmem_i32[ixj] >= args.ne00 || - (shmem_i32[col] < args.ne00 && (order == GGML_SORT_ORDER_ASC ? - src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]] : - src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]])) - ) { - SWAP(shmem_i32[col], shmem_i32[ixj]); - } - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - } - } - - const int64_t i0 = ib*args.top_k; - - // copy the result to dst without the padding - if (i0 + col < args.ne0 && col < args.top_k) { - dst += i0 + args.ne0*i01 + args.ne0*args.ne1*i02 + args.ne0*args.ne1*args.ne2*i03; - - dst[col] = shmem_i32[col]; - } -} - -template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32; -template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32; - -typedef void (argsort_merge_t)( - constant ggml_metal_kargs_argsort_merge & args, - device const char * src0, - device const int32_t * tmp, - device int32_t * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]); - -template -kernel void kernel_argsort_merge_f32_i32( - constant ggml_metal_kargs_argsort_merge & args, - device const char * src0, - device const int32_t * tmp, - device int32_t * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - - const int im = tgpig[0] / args.ne01; - const int i01 = tgpig[0] % args.ne01; - const int i02 = tgpig[1]; - const int i03 = tgpig[2]; - - const int start = im * (2 * args.len); - - const int len0 = MIN(args.len, MAX(0, args.ne0 - (int)(start))); - const int len1 = MIN(args.len, MAX(0, args.ne0 - (int)(start + args.len))); - - const int total = len0 + len1; - - device const int32_t * tmp0 = tmp + start - + i01*args.ne0 - + i02*args.ne0*args.ne01 - + i03*args.ne0*args.ne01*args.ne02; - - device const int32_t * tmp1 = tmp0 + args.len; - - dst += start - + i01*args.top_k - + i02*args.top_k*args.ne01 - + i03*args.top_k*args.ne01*args.ne02; - - device const float * src0_row = (device const float *)(src0 - + args.nb01*i01 - + args.nb02*i02 - + args.nb03*i03); - - if (total == 0) { - return; - } - - const int chunk = (total + ntg.x - 1) / ntg.x; - - const int k0 = tpitg.x * chunk; - const int k1 = MIN(MIN(k0 + chunk, total), args.top_k); - - if (k0 >= args.top_k) { - return; - } - - if (k0 >= total) { - return; - } - - int low = k0 > len1 ? k0 - len1 : 0; - int high = MIN(k0, len0); - - // binary-search partition (i, j) such that i + j = k - while (low < high) { - const int mid = (low + high) >> 1; - - const int32_t idx0 = tmp0[mid]; - const int32_t idx1 = tmp1[k0 - mid - 1]; - - const float val0 = src0_row[idx0]; - const float val1 = src0_row[idx1]; - - bool take_left; - if (order == GGML_SORT_ORDER_ASC) { - take_left = (val0 <= val1); - } else { - take_left = (val0 >= val1); - } - - if (take_left) { - low = mid + 1; - } else { - high = mid; - } - } - - int i = low; - int j = k0 - i; - - // keep the merge fronts into registers - int32_t idx0 = 0; - float val0 = 0.0f; - if (i < len0) { - idx0 = tmp0[i]; - val0 = src0_row[idx0]; - } - - int32_t idx1 = 0; - float val1 = 0.0f; - if (j < len1) { - idx1 = tmp1[j]; - val1 = src0_row[idx1]; - } - - for (int k = k0; k < k1; ++k) { - int32_t out_idx; - - if (i >= len0) { - while (k < k1) { - dst[k++] = tmp1[j++]; - } - break; - } else if (j >= len1) { - while (k < k1) { - dst[k++] = tmp0[i++]; - } - break; - } else { - bool take_left; - - if (order == GGML_SORT_ORDER_ASC) { - take_left = (val0 <= val1); - } else { - take_left = (val0 >= val1); - } - - if (take_left) { - out_idx = idx0; - ++i; - if (i < len0) { - idx0 = tmp0[i]; - val0 = src0_row[idx0]; - } - } else { - out_idx = idx1; - ++j; - if (j < len1) { - idx1 = tmp1[j]; - val1 = src0_row[idx1]; - } - } - } - - dst[k] = out_idx; - } -} - -template [[host_name("kernel_argsort_merge_f32_i32_asc")]] kernel argsort_merge_t kernel_argsort_merge_f32_i32; -template [[host_name("kernel_argsort_merge_f32_i32_desc")]] kernel argsort_merge_t kernel_argsort_merge_f32_i32; - -constant bool FC_flash_attn_ext_pad_has_mask [[function_constant(FC_FLASH_ATTN_EXT_PAD + 0)]]; - -constant int32_t FC_flash_attn_ext_pad_ncpsg [[function_constant(FC_FLASH_ATTN_EXT_PAD + 25)]]; - -// pad the last chunk of C elements of k and v into a an extra pad buffer -kernel void kernel_flash_attn_ext_pad( - constant ggml_metal_kargs_flash_attn_ext_pad & args, - device const char * k, - device const char * v, - device const char * mask, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int32_t C = FC_flash_attn_ext_pad_ncpsg; - - device char * k_pad = dst; - device char * v_pad = k_pad + args.nb11*C*args.ne_12_2*args.ne_12_3; - device char * mask_pad = v_pad + args.nb21*C*args.ne_12_2*args.ne_12_3; - - const int32_t icp = args.ne11 % C; - const int32_t ic0 = args.ne11 - icp; - - const int32_t i1 = tgpig[0]; - const int32_t i2 = tgpig[1]; - const int32_t i3 = tgpig[2]; - - if (i2 < args.ne_12_2 && i3 < args.ne_12_3) { - device const char * k_src = k + args.nb11*(ic0 + i1) + args.nb12*i2 + args.nb13*i3; - device const char * v_src = v + args.nb21*(ic0 + i1) + args.nb22*i2 + args.nb23*i3; - - device char * k_dst = k_pad + args.nb11*i1 + args.nb11*C*i2 + args.nb11*C*args.ne_12_2*i3; - device char * v_dst = v_pad + args.nb21*i1 + args.nb21*C*i2 + args.nb21*C*args.ne_12_2*i3; - - if (i1 >= icp) { - // here it is not important the exact value that will be used as we rely on masking out the scores in the attention - for (uint64_t i = tiitg; i < args.nb11; i += ntg.x) { - k_dst[i] = 0; - } - for (uint64_t i = tiitg; i < args.nb21; i += ntg.x) { - v_dst[i] = 0; - } - } else { - for (uint64_t i = tiitg; i < args.nb11; i += ntg.x) { - k_dst[i] = k_src[i]; - } - for (uint64_t i = tiitg; i < args.nb21; i += ntg.x) { - v_dst[i] = v_src[i]; - } - } - } - - if (FC_flash_attn_ext_pad_has_mask) { - if (i2 < args.ne32 && i3 < args.ne33) { - for (int ib = i1; ib < args.ne31; ib += C) { - device const half * mask_src = (device const half *)(mask + args.nb31*ib + args.nb32*i2 + args.nb33*i3) + ic0; - device half * mask_dst = (device half *)(mask_pad) + C*ib + C*args.ne31*i2 + C*args.ne31*args.ne32*i3; - - for (int i = tiitg; i < C; i += ntg.x) { - if (i >= icp) { - mask_dst[i] = -MAXHALF; - } else { - mask_dst[i] = mask_src[i]; - } - } - } - } - } -} - -constant int32_t FC_flash_attn_ext_blk_nqptg [[function_constant(FC_FLASH_ATTN_EXT_BLK + 24)]]; -constant int32_t FC_flash_attn_ext_blk_ncpsg [[function_constant(FC_FLASH_ATTN_EXT_BLK + 25)]]; - -// scan the blocks of the mask that are not masked -// 0 - masked (i.e. full of -INF, skip) -// 1 - not masked (i.e. at least one element of the mask is not -INF) -// 2 - all zero -kernel void kernel_flash_attn_ext_blk( - constant ggml_metal_kargs_flash_attn_ext_blk & args, - device const char * mask, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]]) { - // block size C x Q - const int32_t Q = FC_flash_attn_ext_blk_nqptg; - const int32_t C = FC_flash_attn_ext_blk_ncpsg; - - constexpr short NW = N_SIMDWIDTH; - - const int32_t i3 = tgpig[2]/args.ne32; - const int32_t i2 = tgpig[2]%args.ne32; - const int32_t i1 = tgpig[1]; - const int32_t i0 = tgpig[0]; - - char res = i0*C + C > args.ne30 ? 1 : 0; - - device const half * mask_src = (device const half *) (mask + (i1*Q)*args.nb31 + i2*args.nb32 + i3*args.nb33) + i0*C + tiisg; - - // detailed check of the elements of the block - if ((C > NW || Q > 1) && res == 0) { - half mmin = MAXHALF; - half mmax = -MAXHALF; - - FOR_UNROLL (short j = 0; j < Q; ++j) { - FOR_UNROLL (short ii = 0; ii < C/NW; ++ii) { - mmin = min(mmin, mask_src[ii*NW]); - mmax = max(mmax, mask_src[ii*NW]); - } - - mask_src += args.nb31/2; - } - - mmin = simd_min(mmin); - mmax = simd_max(mmax); - - if (mmax > -MAXHALF) { - if (mmin == 0.0 && mmax == 0.0) { - res = 2; - } else { - res = 1; - } - } - } - - const int32_t nblk1 = ((args.ne01 + Q - 1)/Q); - const int32_t nblk0 = ((args.ne30 + C - 1)/C); - - if (tiisg == 0) { - dst[((i3*args.ne32 + i2)*nblk1 + i1)*nblk0 + i0] = res; - } -} - -constant bool FC_flash_attn_ext_has_mask [[function_constant(FC_FLASH_ATTN_EXT + 0)]]; -constant bool FC_flash_attn_ext_has_sinks [[function_constant(FC_FLASH_ATTN_EXT + 1)]]; -constant bool FC_flash_attn_ext_has_bias [[function_constant(FC_FLASH_ATTN_EXT + 2)]]; -constant bool FC_flash_attn_ext_has_scap [[function_constant(FC_FLASH_ATTN_EXT + 3)]]; -constant bool FC_flash_attn_ext_has_kvpad [[function_constant(FC_FLASH_ATTN_EXT + 4)]]; - -constant bool FC_flash_attn_ext_bc_mask [[function_constant(FC_FLASH_ATTN_EXT + 10)]]; - -//constant float FC_flash_attn_ext_scale [[function_constant(FC_FLASH_ATTN_EXT + 10)]]; -//constant float FC_flash_attn_ext_max_bias [[function_constant(FC_FLASH_ATTN_EXT + 11)]]; -//constant float FC_flash_attn_ext_logit_softcap [[function_constant(FC_FLASH_ATTN_EXT + 12)]]; - -constant int32_t FC_flash_attn_ext_ns10 [[function_constant(FC_FLASH_ATTN_EXT + 20)]]; -constant int32_t FC_flash_attn_ext_ns20 [[function_constant(FC_FLASH_ATTN_EXT + 21)]]; -constant int32_t FC_flash_attn_ext_nsg [[function_constant(FC_FLASH_ATTN_EXT + 22)]]; - -// ref: https://arxiv.org/pdf/2307.08691.pdf -template< - typename q_t, // query types in shared memory - typename q4_t, - typename q8x8_t, - typename k_t, // key types in shared memory - typename k4x4_t, - typename k8x8_t, - typename v_t, // value types in shared memory - typename v4x4_t, - typename v8x8_t, - typename qk_t, // Q*K types - typename qk8x8_t, - typename s_t, // soft-max types - typename s2_t, - typename s8x8_t, - typename o_t, // attention accumulation types - typename o4_t, - typename o8x8_t, - typename kd4x4_t, // key type in device memory - short nl_k, - void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &), - typename vd4x4_t, // value type in device memory - short nl_v, - void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &), - short DK, // K head size - short DV, // V head size - short Q, // queries per threadgroup - short C, // cache items per threadgroup - short NSG> // number of simd groups -void kernel_flash_attn_ext_impl( - constant ggml_metal_kargs_flash_attn_ext & args, - device const char * q, - device const char * k, - device const char * v, - device const char * mask, - device const char * sinks, - device const char * pad, - device const char * blk, - device char * dst, - threadgroup half * shmem_f16, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const ushort iq3 = tgpig[2]; - const ushort iq2 = tgpig[1]; - const ushort iq1 = tgpig[0]*Q; - -#define NS10 (FC_flash_attn_ext_ns10) -#define NS20 (FC_flash_attn_ext_ns20) - - // note: I had some concerns that using this instead of the ugly macros above was affecting performance - // need to re-check carefully and if no regressions are observerd - remove the macros - // the concerns is that maybe using const variables requires extra registers? but not sure if the compiler - // is clever enough to avoid this. unfortunately, using constexpr is not possible with FC - //const short NS10 = FC_flash_attn_ext_ns10; - //const short NS20 = FC_flash_attn_ext_ns20; - - constexpr short KV = 8; - - constexpr short DK4 = DK/4; - constexpr short DK8 = DK/8; - constexpr short DK16 = DK/16; - constexpr short DV4 = DV/4; - //constexpr short DV8 = DV/8; - constexpr short DV16 = DV/16; - - constexpr short PV = PAD2(DV, 64); - constexpr short PV4 = PV/4; - constexpr short PV8 = PV/8; - //constexpr short PV16 = PV/16; - - constexpr short NW = N_SIMDWIDTH; - constexpr short NQ = Q/NSG; - constexpr short SH = 2*C; // shared memory per simdgroup (s_t == float) - - constexpr short TS = 2*SH; - constexpr short T = DK + 2*PV; // shared memory size per query in (half) - - threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*T); // holds the query data - threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*T); // same as above but in q4_t - threadgroup o_t * so = (threadgroup o_t *) (shmem_f16 + 0*T + Q*DK); // the result for all queries in 8x8 matrices (the O matrix from the paper) - threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*T + Q*DK); - threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + Q*T); // scratch buffer for attention, mask and diagonal matrix - threadgroup s2_t * ss2 = (threadgroup s2_t *) (shmem_f16 + Q*T); // same as above but in s2_t - - threadgroup k_t * sk = (threadgroup k_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T + Q*TS); // scratch buffer to load K in shared memory - threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T + Q*TS); // same as above but in k4x4_t - - threadgroup v_t * sv = (threadgroup v_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T + Q*TS); // scratch buffer to load V in shared memory - threadgroup v4x4_t * sv4x4 = (threadgroup v4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T + Q*TS); // same as above but in v4x4_t - - // mask storage in shared mem - threadgroup half2 * sm2 = (threadgroup half2 *) (shmem_f16 + Q*T + 2*C); - - // per-query mask pointers - device const half2 * pm2[NQ]; - - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - - pm2[jj] = (device const half2 *) ((device const char *) mask + (iq1 + j)*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33); - } - - { - const int32_t nblk1 = ((args.ne01 + Q - 1)/Q); - const int32_t nblk0 = ((args.ne11 + C - 1)/C); - - blk += (((iq3%args.ne33)*args.ne32 + (iq2%args.ne32))*nblk1 + iq1/Q)*nblk0; - } - - { - q += iq1*args.nb01 + iq2*args.nb02 + iq3*args.nb03; - - const short ikv2 = iq2/(args.ne02/args.ne_12_2); - const short ikv3 = iq3/(args.ne03/args.ne_12_3); - - k += ikv2*args.nb12 + ikv3*args.nb13; - v += ikv2*args.nb22 + ikv3*args.nb23; - } - - // load heads from Q to shared memory - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - - device const float4 * q4 = (device const float4 *) ((device const char *) q + j*args.nb01); - - for (short i = tiisg; i < DK4; i += NW) { - if (iq1 + j < args.ne01) { - sq4[j*DK4 + i] = (q4_t) q4[i]; - } else { - sq4[j*DK4 + i] = 0; - } - } - } - - // zero out - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - - for (short i = tiisg; i < DV4; i += NW) { - so4[j*PV4 + i] = 0; - } - - for (short i = tiisg; i < SH; i += NW) { - ss[j*SH + i] = 0.0f; - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - float S[NQ] = { [0 ... NQ-1] = 0.0f }; - - { - float M[NQ] = { [0 ... NQ-1] = -FLT_MAX/2 }; - - float slope = 1.0f; - - // ALiBi - if (FC_flash_attn_ext_has_bias) { - const short h = iq2; - - const float base = h < args.n_head_log2 ? args.m0 : args.m1; - const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; - - slope = pow(base, exph); - } - - // loop over the KV cache - // each simdgroup handles blocks of Q rows and C columns - for (int ic0 = 0; ; ++ic0) { - int ic = ic0*C; - if (ic >= args.ne11) { - break; - } - - // the last partial chunk uses the pad buffer as source - if (FC_flash_attn_ext_has_kvpad && ic + C > args.ne11) { - k = pad; - v = k + args.nb11*C*args.ne_12_2*args.ne_12_3; - mask = v + args.nb21*C*args.ne_12_2*args.ne_12_3; - - const short ikv2 = iq2/(args.ne02/args.ne_12_2); - const short ikv3 = iq3/(args.ne03/args.ne_12_3); - - k += (ikv2 + ikv3*args.ne_12_2)*args.nb11*C; - v += (ikv2 + ikv3*args.ne_12_2)*args.nb21*C; - - if (!FC_flash_attn_ext_has_mask) { - threadgroup half * sm = (threadgroup half *) (sm2); - - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - - for (short i = tiisg; i < C; i += NW) { - if (ic + i >= args.ne11) { - sm[2*j*SH + i] = -MAXHALF; - } - } - } - } else { - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - - pm2[jj] = (device const half2 *) ((device const half *) mask + - (iq1 + j)*C + - (iq2%args.ne32)*(C*args.ne31) + - (iq3%args.ne33)*(C*args.ne31*args.ne32)); - } - } - - ic = 0; - } - - char blk_cur = 1; - - // read the mask into shared mem - if (FC_flash_attn_ext_has_mask) { - blk_cur = blk[ic0]; - - if (blk_cur == 0) { - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - pm2[jj] += NW; - } - - continue; - } - - if (blk_cur == 1) { - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - - if (FC_flash_attn_ext_bc_mask) { - sm2[j*SH + tiisg] = (iq1 + j) < args.ne31 ? pm2[jj][tiisg] : half2(-MAXHALF, -MAXHALF); - } else { - sm2[j*SH + tiisg] = pm2[jj][tiisg]; - } - - pm2[jj] += NW; - } - } else if (blk_cur == 2) { - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - pm2[jj] += NW; - } - } - -#if 0 - // note: old -INF block optimization - obsoleted by pre-computing non-masked blocks - - threadgroup_barrier(mem_flags::mem_threadgroup); - - // used to detect blocks full of -INF - // skip only when the entire threadgroup is masked - half2 smax2(-MAXHALF/2, -MAXHALF/2); - - FOR_UNROLL (short j = 0; j < Q; ++j) { - smax2 = max(smax2, sm2[j*SH + tiisg]); - } - - smax2 = simd_max(smax2); - - if (max(smax2[0], smax2[1]) <= -MAXHALF/2) { - // this barrier is important - threadgroup_barrier(mem_flags::mem_threadgroup); - - continue; - } -#endif - } - - // Q*K^T - // this is compile-time check, so it does not have runtime overhead - if (is_same::value) { - // we can read directly from global memory - device const k_t * pk = (device const k_t *) (k + ic*args.nb11); - threadgroup const q_t * pq = sq; - threadgroup s_t * ps = ss; - - pk += sgitg*(8*NS10); - ps += sgitg*(8*1); - - static_assert((C/8) % NSG == 0, ""); - - constexpr short NC = (C/8)/NSG; - - FOR_UNROLL (short cc = 0; cc < NC; ++cc) { - qk8x8_t mqk = make_filled_simdgroup_matrix((qk_t) 0.0f); - - if (DK % 16 != 0) { - k8x8_t mk; - q8x8_t mq; - - FOR_UNROLL (short i = 0; i < DK8; ++i) { - simdgroup_barrier(mem_flags::mem_none); - - simdgroup_load(mk, pk + 8*i, NS10, 0, true); - simdgroup_load(mq, pq + 8*i, DK); - - simdgroup_barrier(mem_flags::mem_none); - - simdgroup_multiply_accumulate(mqk, mq, mk, mqk); - } - } else { - k8x8_t mk[2]; - q8x8_t mq[2]; - - // note: too much unroll can tank the performance for large heads - #pragma unroll (MIN(DK8/2, 4*NSG)) - for (short i = 0; i < DK8/2; ++i) { - simdgroup_barrier(mem_flags::mem_none); - - simdgroup_load(mq[0], pq + 0*8 + 16*i, DK); - simdgroup_load(mq[1], pq + 1*8 + 16*i, DK); - - simdgroup_load(mk[0], pk + 0*8 + 16*i, NS10, 0, true); - simdgroup_load(mk[1], pk + 1*8 + 16*i, NS10, 0, true); - - simdgroup_barrier(mem_flags::mem_none); - - simdgroup_multiply_accumulate(mqk, mq[0], mk[0], mqk); - simdgroup_multiply_accumulate(mqk, mq[1], mk[1], mqk); - } - } - - simdgroup_store(mqk, ps, SH, 0, false); - - pk += 8*(NSG*NS10); - ps += 8*(NSG); - } - } else { - // TODO: this is the quantized K cache branch - not optimized yet - for (short ccc = 0; ccc < (C/8)/NSG; ++ccc) { - const short cc = ccc*NSG + sgitg; - - const short tx = tiisg%4; - const short ty = tiisg/4; - - qk8x8_t mqk = make_filled_simdgroup_matrix((qk_t) 0.0f); - - for (short ii = 0; ii < DK16; ii += 4) { - device const kd4x4_t * pk4x4 = (device const kd4x4_t *) (k + ((ic + 8*cc + ty)*args.nb11)); - - if (DK16%4 == 0) { - // the head is evenly divisible by 4*16 = 64, so no need for bound checks - { - k4x4_t tmp; - deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp); - sk4x4[4*ty + tx] = tmp; - } - - simdgroup_barrier(mem_flags::mem_threadgroup); - - FOR_UNROLL (short k = 0; k < 4; ++k) { - k8x8_t mk; - q8x8_t mq; - - simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose - simdgroup_load(mq, sq + (2*(ii + k) + 0)*8, DK); - simdgroup_multiply_accumulate(mqk, mq, mk, mqk); - - simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose - simdgroup_load(mq, sq + (2*(ii + k) + 1)*8, DK); - simdgroup_multiply_accumulate(mqk, mq, mk, mqk); - } - } else { - if (ii + tx < DK16) { - k4x4_t tmp; - deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp); - sk4x4[4*ty + tx] = tmp; - } - - simdgroup_barrier(mem_flags::mem_threadgroup); - - for (short k = 0; k < 4 && ii + k < DK16; ++k) { - k8x8_t mk; - q8x8_t mq; - - simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose - simdgroup_load(mq, sq + (2*(ii + k) + 0)*8, DK); - simdgroup_multiply_accumulate(mqk, mq, mk, mqk); - - simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose - simdgroup_load(mq, sq + (2*(ii + k) + 1)*8, DK); - simdgroup_multiply_accumulate(mqk, mq, mk, mqk); - } - } - } - - simdgroup_store(mqk, ss + 8*cc, SH, 0, false); - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - // online softmax - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - - const float m = M[jj]; - - // scale and apply the logitcap / mask - float2 s2 = ss2[j*SH/2 + tiisg]*args.scale; - - if (FC_flash_attn_ext_has_scap) { - s2 = args.logit_softcap*precise::tanh(s2); - } - - // mqk = mqk + slope*mask - if (blk_cur != 2) { - if (FC_flash_attn_ext_has_bias) { - s2 += s2_t(sm2[j*SH + tiisg])*slope; - } else { - s2 += s2_t(sm2[j*SH + tiisg]); - } - } - - M[jj] = simd_max(max(M[jj], max(s2[0], s2[1]))); - - const float ms = exp(m - M[jj]); - const float2 vs2 = exp(s2 - M[jj]); - - S[jj] = S[jj]*ms + simd_sum(vs2[0] + vs2[1]); - - // the P matrix from the paper (Q rows, C columns) - ss2[j*SH/2 + tiisg] = vs2; - - if (DV4 % NW == 0) { - FOR_UNROLL (short ii = 0; ii < DV4/NW; ++ii) { - const short i = ii*NW + tiisg; - - so4[j*PV4 + i] *= ms; - } - } else { - for (short i = tiisg; i < DV4; i += NW) { - so4[j*PV4 + i] *= ms; - } - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - // O = O + (Q*K^T)*V - { - // we can read directly from global memory - if (is_same::value) { - static_assert(PV8 % NSG == 0, ""); - - constexpr short NO = PV8/NSG; - - o8x8_t lo[NO]; - - { - auto sot = so + 8*sgitg; - - FOR_UNROLL (short ii = 0; ii < NO; ++ii) { - simdgroup_load(lo[ii], sot, PV, 0, false); - - sot += 8*NSG; - } - } - - { - device const v_t * pv = (device const v_t *) (v + ic*args.nb21); - - pv += 8*sgitg; - - if (DV <= 64) { - FOR_UNROLL (short cc = 0; cc < C/8; ++cc) { - s8x8_t vs; - simdgroup_load(vs, ss + 8*cc, SH, 0, false); - - FOR_UNROLL (short ii = 0; ii < NO/2; ++ii) { - v8x8_t mv[2]; - - simdgroup_load(mv[0], pv + 0*NSG + 16*ii*NSG, NS20, 0, false); - simdgroup_load(mv[1], pv + 8*NSG + 16*ii*NSG, NS20, 0, false); - - simdgroup_multiply_accumulate(lo[2*ii + 0], vs, mv[0], lo[2*ii + 0]); - simdgroup_multiply_accumulate(lo[2*ii + 1], vs, mv[1], lo[2*ii + 1]); - } - - pv += 8*NS20; - } - } else { - constexpr short NC = (C/8)/2; - - FOR_UNROLL (short cc = 0; cc < NC; ++cc) { - s8x8_t vs[2]; - - simdgroup_load(vs[0], ss + 16*cc + 0, SH, 0, false); - simdgroup_load(vs[1], ss + 16*cc + 8, SH, 0, false); - - FOR_UNROLL (short ii = 0; ii < NO/2; ++ii) { - v8x8_t mv[4]; - - simdgroup_load(mv[0], pv + 0*NSG + 16*ii*NSG + 0*8*NS20, NS20, 0, false); - simdgroup_load(mv[1], pv + 8*NSG + 16*ii*NSG + 0*8*NS20, NS20, 0, false); - simdgroup_load(mv[2], pv + 0*NSG + 16*ii*NSG + 1*8*NS20, NS20, 0, false); - simdgroup_load(mv[3], pv + 8*NSG + 16*ii*NSG + 1*8*NS20, NS20, 0, false); - - simdgroup_multiply_accumulate(lo[2*ii + 0], vs[0], mv[0], lo[2*ii + 0]); - simdgroup_multiply_accumulate(lo[2*ii + 1], vs[0], mv[1], lo[2*ii + 1]); - simdgroup_multiply_accumulate(lo[2*ii + 0], vs[1], mv[2], lo[2*ii + 0]); - simdgroup_multiply_accumulate(lo[2*ii + 1], vs[1], mv[3], lo[2*ii + 1]); - } - - pv += 2*8*NS20; - } - } - } - - { - auto sot = so + 8*sgitg; - - FOR_UNROLL (short ii = 0; ii < NO; ++ii) { - simdgroup_store(lo[ii], sot, PV, 0, false); - - sot += 8*NSG; - } - } - } else { - // TODO: this is the quantized V cache branch - not optimized yet - - const short tx = tiisg%4; - const short ty = tiisg/4; - - for (short cc = 0; cc < C/8; ++cc) { - s8x8_t vs; - simdgroup_load(vs, ss + 8*cc, SH, 0, false); - - for (short ii = 4*sgitg; ii < DV16; ii += 4*NSG) { - device const vd4x4_t * pv4x4 = (device const vd4x4_t *) (v + ((ic + 8*cc + ty)*args.nb21)); - - if (DV16%4 == 0) { - // no need for bound checks - { - v4x4_t tmp; - deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp); - sv4x4[4*ty + tx] = tmp; - } - - simdgroup_barrier(mem_flags::mem_threadgroup); - - FOR_UNROLL (short k = 0; k < 4; ++k) { - v8x8_t mv[2]; - o8x8_t lo[2]; - - simdgroup_load(mv[0], sv + 16*k + 0*8, 4*16, 0, false); - simdgroup_load(mv[1], sv + 16*k + 1*8, 4*16, 0, false); - simdgroup_load(lo[0], so + 8*(2*(ii + k) + 0), PV, 0, false); - simdgroup_load(lo[1], so + 8*(2*(ii + k) + 1), PV, 0, false); - - simdgroup_multiply_accumulate(lo[0], vs, mv[0], lo[0]); - simdgroup_multiply_accumulate(lo[1], vs, mv[1], lo[1]); - - simdgroup_store(lo[0], so + 8*(2*(ii + k) + 0), PV, 0, false); - simdgroup_store(lo[1], so + 8*(2*(ii + k) + 1), PV, 0, false); - } - } else { - if (ii + tx < DV16) { - v4x4_t tmp; - deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp); - sv4x4[4*ty + tx] = tmp; - } - - simdgroup_barrier(mem_flags::mem_threadgroup); - - for (short k = 0; k < 4 && ii + k < DV16; ++k) { - v8x8_t mv[2]; - o8x8_t lo[2]; - - simdgroup_load(mv[0], sv + 16*k + 0*8, 4*16, 0, false); - simdgroup_load(mv[1], sv + 16*k + 1*8, 4*16, 0, false); - simdgroup_load(lo[0], so + 8*(2*(ii + k) + 0), PV, 0, false); - simdgroup_load(lo[1], so + 8*(2*(ii + k) + 1), PV, 0, false); - - simdgroup_multiply_accumulate(lo[0], vs, mv[0], lo[0]); - simdgroup_multiply_accumulate(lo[1], vs, mv[1], lo[1]); - - simdgroup_store(lo[0], so + 8*(2*(ii + k) + 0), PV, 0, false); - simdgroup_store(lo[1], so + 8*(2*(ii + k) + 1), PV, 0, false); - } - } - } - } - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - if (FC_flash_attn_ext_has_sinks) { - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - - const float m = M[jj]; - const float s = tiisg == 0 ? ((device const float *) sinks)[iq2] : -FLT_MAX/2; - - M[jj] = simd_max(max(M[jj], s)); - - const float ms = exp(m - M[jj]); - const float vs = exp(s - M[jj]); - - S[jj] = S[jj]*ms + simd_sum(vs); - - for (short i = tiisg; i < DV4; i += NW) { - so4[j*PV4 + i] *= ms; - } - } - } - } - - // store to global memory - for (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; - if (iq1 + j >= args.ne01) { - break; - } - - device float4 * dst4 = (device float4 *) dst + ((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4; - - const float scale = S[jj] == 0.0 ? 0.0f : 1.0f/S[jj]; - - if (DV4 % NW == 0) { - FOR_UNROLL (short ii = 0; ii < DV4/NW; ++ii) { - const short i = ii*NW + tiisg; - - dst4[i] = (float4) so4[j*PV4 + i]*scale; - } - } else { - for (short i = tiisg; i < DV4; i += NW) { - dst4[i] = (float4) so4[j*PV4 + i]*scale; - } - } - } - -#undef NS10 -#undef NS20 -} - -template< - typename q_t, // query types in shared memory - typename q4_t, - typename q8x8_t, - typename k_t, // key types in shared memory - typename k4x4_t, - typename k8x8_t, - typename v_t, // value types in shared memory - typename v4x4_t, - typename v8x8_t, - typename qk_t, // Q*K types - typename qk8x8_t, - typename s_t, // soft-max types - typename s2_t, - typename s8x8_t, - typename o_t, // attention accumulation types - typename o4_t, - typename o8x8_t, - typename kd4x4_t, // key type in device memory - short nl_k, - void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &), - typename vd4x4_t, // value type in device memory - short nl_v, - void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &), - short DK, // K head size - short DV, // V head size - short Q = OP_FLASH_ATTN_EXT_NQPSG, // queries per threadgroup - short C = OP_FLASH_ATTN_EXT_NCPSG> // cache items per threadgroup -kernel void kernel_flash_attn_ext( - constant ggml_metal_kargs_flash_attn_ext & args, - device const char * q, - device const char * k, - device const char * v, - device const char * mask, - device const char * sinks, - device const char * pad, - device const char * blk, - device char * dst, - threadgroup half * shmem_f16 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { -#define FWD_TMPL q_t, q4_t, q8x8_t, k_t, k4x4_t, k8x8_t, v_t, v4x4_t, v8x8_t, qk_t, qk8x8_t, s_t, s2_t, s8x8_t, o_t, o4_t, o8x8_t, kd4x4_t, nl_k, deq_k, vd4x4_t, nl_v, deq_v, DK, DV, Q, C -#define FWD_ARGS args, q, k, v, mask, sinks, pad, blk, dst, shmem_f16, tgpig, tiisg, sgitg - switch (FC_flash_attn_ext_nsg) { - // note: disabled cases to reduce library load time - //case 1: kernel_flash_attn_ext_impl(FWD_ARGS); break; - //case 2: kernel_flash_attn_ext_impl(FWD_ARGS); break; - case 4: kernel_flash_attn_ext_impl(FWD_ARGS); break; - case 8: kernel_flash_attn_ext_impl(FWD_ARGS); break; - } -#undef FWD_TMPL -#undef FWD_ARGS -} - -// TODO: this is quite ugly. in the future these types will be hardcoded in the kernel, but for now keep them as -// template to be able to explore different combinations -// -#define FA_TYPES \ - half, half4, simdgroup_half8x8, \ - half, half4x4, simdgroup_half8x8, \ - half, half4x4, simdgroup_half8x8, \ - float, simdgroup_float8x8, \ - float, float2, simdgroup_float8x8, \ - float, float4, simdgroup_float8x8 - //half, half4, simdgroup_half8x8 - -#define FA_TYPES_BF \ - bfloat, bfloat4, simdgroup_bfloat8x8, \ - bfloat, bfloat4x4, simdgroup_bfloat8x8, \ - bfloat, bfloat4x4, simdgroup_bfloat8x8, \ - float, simdgroup_float8x8, \ - float, float2, simdgroup_float8x8, \ - half, half4, simdgroup_half8x8 - //float, float4, simdgroup_float8x8 - -#define FA_TYPES_F32 \ - half, half4, simdgroup_half8x8, \ - float, float4x4, simdgroup_float8x8, \ - float, float4x4, simdgroup_float8x8, \ - float, simdgroup_float8x8, \ - float, float2, simdgroup_float8x8, \ - float, float4, simdgroup_float8x8 - //half, half4, simdgroup_half8x8 - -typedef decltype(kernel_flash_attn_ext) flash_attn_ext_t; - -template [[host_name("kernel_flash_attn_ext_f32_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; - -template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; - -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_bf16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -#endif - -template [[host_name("kernel_flash_attn_ext_q4_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; - -template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; - -template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; - -template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; - -template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; - -#undef FA_TYPES -#undef FA_TYPES_BF -#undef FA_TYPES_F32 - -constant bool FC_flash_attn_ext_vec_has_mask [[function_constant(FC_FLASH_ATTN_EXT_VEC + 0)]]; -constant bool FC_flash_attn_ext_vec_has_sinks [[function_constant(FC_FLASH_ATTN_EXT_VEC + 1)]]; -constant bool FC_flash_attn_ext_vec_has_bias [[function_constant(FC_FLASH_ATTN_EXT_VEC + 2)]]; -constant bool FC_flash_attn_ext_vec_has_scap [[function_constant(FC_FLASH_ATTN_EXT_VEC + 3)]]; -constant bool FC_flash_attn_ext_vec_has_kvpad [[function_constant(FC_FLASH_ATTN_EXT_VEC + 4)]]; - -//constant float FC_flash_attn_ext_vec_scale [[function_constant(FC_FLASH_ATTN_EXT_VEC + 10)]]; -//constant float FC_flash_attn_ext_vec_max_bias [[function_constant(FC_FLASH_ATTN_EXT_VEC + 11)]]; -//constant float FC_flash_attn_ext_vec_logit_softcap [[function_constant(FC_FLASH_ATTN_EXT_VEC + 12)]]; - -constant int32_t FC_flash_attn_ext_vec_ns10 [[function_constant(FC_FLASH_ATTN_EXT_VEC + 20)]]; -constant int32_t FC_flash_attn_ext_vec_ns20 [[function_constant(FC_FLASH_ATTN_EXT_VEC + 21)]]; -constant int32_t FC_flash_attn_ext_vec_nsg [[function_constant(FC_FLASH_ATTN_EXT_VEC + 22)]]; -constant int32_t FC_flash_attn_ext_vec_nwg [[function_constant(FC_FLASH_ATTN_EXT_VEC + 23)]]; - -template< - typename q4_t, // query types in shared memory - typename k4_t, // key types in shared memory - typename v4_t, // value types in shared memory - typename qk_t, // Q*K types - typename s_t, // soft-max types - typename s4_t, - typename o4_t, // attention accumulation types - typename kd4_t, // key type in device memory - short nl_k, - void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &), - typename vd4_t, // value type in device memory - short nl_v, - void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &), - short DK, // K head size - short DV, // V head size - short NE = 4, // head elements per thread - short Q = OP_FLASH_ATTN_EXT_VEC_NQPSG, // queries per threadgroup - short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup -kernel void kernel_flash_attn_ext_vec( - constant ggml_metal_kargs_flash_attn_ext_vec & args, - device const char * q, - device const char * k, - device const char * v, - device const char * mask, - device const char * sinks, - device const char * pad, - device char * dst, - threadgroup half * shmem_f16 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - static_assert(DK % 32 == 0, "DK must be divisible by 32"); - static_assert(DV % 32 == 0, "DV must be divisible by 32"); - -#define NWG (FC_flash_attn_ext_vec_nwg) -#define NSG (FC_flash_attn_ext_vec_nsg) - -#define NS10 (FC_flash_attn_ext_vec_ns10) -#define NS20 (FC_flash_attn_ext_vec_ns20) - - const short iwg = tgpig[2]%NWG; - - const ushort iq3 = tgpig[2]/NWG; - const ushort iq2 = tgpig[1]; - const ushort iq1 = tgpig[0]; - - constexpr short DK4 = DK/4; - constexpr short DV4 = DV/4; - - constexpr short PK = PAD2(DK, 128); - constexpr short PK4 = PK/4; - - constexpr short PV = PAD2(DV, 128); - constexpr short PV4 = PV/4; - - constexpr short NW = N_SIMDWIDTH; - constexpr short NL = NW/NE; // note: this can be adjusted to support different head sizes and simdgroup work loads - constexpr short SH = 4*C; // shared memory per simdgroup - - static_assert(DK4 % NL == 0, "DK4 must be divisible by NL"); - static_assert(DV4 % NL == 0, "DV4 must be divisible by NL"); - - //const short T = PK + NSG*SH; // shared memory size per query in (half) - - //threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data - threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t - threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + NSG*PK); // scratch buffer for attention - threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + NSG*PK); // same as above but in s4_t - threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + NSG*PK); // scratch buffer for mask - threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + NSG*PK + NSG*SH); // scratch buffer for the results - - // store the result for all queries in shared memory (the O matrix from the paper) - so4 += tiisg; - - { - q += iq1*args.nb01 + iq2*args.nb02 + iq3*args.nb03; - - const short ikv2 = iq2/(args.ne02/args.ne_12_2); - const short ikv3 = iq3/(args.ne03/args.ne_12_3); - - k += ikv2*args.nb12 + ikv3*args.nb13; - v += ikv2*args.nb22 + ikv3*args.nb23; - } - - // load heads from Q to shared memory - device const float4 * q4 = (device const float4 *) ((device const char *) q); - - if (iq1 < args.ne01) { - for (short i = tiisg; i < PK4; i += NW) { - if (i < DK4) { - sq4[i] = (q4_t) q4[i]; - } else { - sq4[i] = (q4_t) 0.0f; - } - } - } - - // zero out so - for (short i = 0; i < DV4/NL; ++i) { - so4[i*NL] = (o4_t) 0.0f; - } - - // zero out shared memory SH - for (short i = tiisg; i < SH/4; i += NW) { - ss4[i] = (s4_t) 0.0f; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - { - float S = 0.0f; - float M = -FLT_MAX/2; - - // thread indices inside the simdgroup - const short tx = tiisg%NL; - const short ty = tiisg/NL; - - // pointer to the mask - device const half * pm = (device const half *) (mask + iq1*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33); - - float slope = 1.0f; - - // ALiBi - if (FC_flash_attn_ext_vec_has_bias) { - const short h = iq2; - - const float base = h < args.n_head_log2 ? args.m0 : args.m1; - const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; - - slope = pow(base, exph); - } - - // loop over the KV cache - // each simdgroup handles blocks of Q rows and C columns - for (int ic0 = iwg*NSG + sgitg; ; ic0 += NWG*NSG) { - int ic = ic0*C; - if (ic >= args.ne11) { - break; - } - - // the last partial chunk uses the pad buffer as source - if (FC_flash_attn_ext_vec_has_kvpad && ic + C > args.ne11) { - k = pad; - v = k + args.nb11*C*args.ne_12_2*args.ne_12_3; - mask = v + args.nb21*C*args.ne_12_2*args.ne_12_3; - - const short ikv2 = iq2/(args.ne02/args.ne_12_2); - const short ikv3 = iq3/(args.ne03/args.ne_12_3); - - k += (ikv2 + ikv3*args.ne_12_2)*args.nb11*C; - v += (ikv2 + ikv3*args.ne_12_2)*args.nb21*C; - - if (!FC_flash_attn_ext_vec_has_mask) { - if (ic + tiisg >= args.ne11) { - sm[tiisg] = -MAXHALF; - } - } else { - pm = (device const half *) (mask) + - iq1*C + - (iq2%args.ne32)*(C*args.ne31) + - (iq3%args.ne33)*(C*args.ne31*args.ne32); - } - - ic = 0; - } - - if (FC_flash_attn_ext_vec_has_mask) { - sm[tiisg] = pm[ic + tiisg]; - } - - // skip -INF blocks - if (simd_max(sm[tiisg]) <= -MAXHALF) { - continue; - } - - // Q*K^T - { - device const k4_t * pk4 = (device const k4_t *) (k + ic*args.nb11); - threadgroup const q4_t * pq4 = sq4; - - pk4 += ty*NS10/4 + tx; - pq4 += tx; - - qk_t mqk[C/NE] = { [ 0 ... C/NE - 1] = 0.0f }; - - // each simdgroup processes 1 query and NE (NW/NL) cache elements - FOR_UNROLL (short cc = 0; cc < C/NE; ++cc) { - if (is_same::value) { - FOR_UNROLL (short ii = 0; ii < DK4/NL; ++ii) { - mqk[cc] += dot((float4) pk4[cc*NE*NS10/4 + ii*NL], (float4) pq4[ii*NL]); - } - } else { - device const kd4_t * pk = (device const kd4_t *) (k + ((ic + NE*cc + ty)*args.nb11)); - - k4_t mk; - - FOR_UNROLL (short ii = 0; ii < DK4/NL; ++ii) { - const short i = ii*NL + tx; - - deq_k_t4(pk + i/nl_k, i%nl_k, mk); - - mqk[cc] += dot((float4) mk, (float4) sq4[i]); - } - } - - if (NE == 1) { - mqk[cc] = simd_sum(mqk[cc]); - } else { - // simdgroup reduce (NE = 4) - // [ 0 .. 7] -> [ 0] - // [ 8 .. 15] -> [ 8] - // [16 .. 23] -> [16] - // [24 .. 31] -> [24] - if (NE <= 1) { - mqk[cc] += simd_shuffle_down(mqk[cc], 16); - } - if (NE <= 2) { - mqk[cc] += simd_shuffle_down(mqk[cc], 8); - } - if (NE <= 4) { - mqk[cc] += simd_shuffle_down(mqk[cc], 4); - } - if (NE <= 8) { - mqk[cc] += simd_shuffle_down(mqk[cc], 2); - } - if (NE <= 16) { - mqk[cc] += simd_shuffle_down(mqk[cc], 1); - } - - // broadcast - mqk[cc] = simd_shuffle(mqk[cc], NL*ty); - } - } - - if (FC_flash_attn_ext_vec_has_mask && - !FC_flash_attn_ext_vec_has_scap && - !FC_flash_attn_ext_vec_has_bias) { - ss[NE*tx + ty] = fma(mqk[tx], args.scale, (qk_t) sm[NE*tx + ty]); - } else { - mqk[tx] *= args.scale; - - if (FC_flash_attn_ext_vec_has_scap) { - mqk[tx] = args.logit_softcap*precise::tanh(mqk[tx]); - } - - if (FC_flash_attn_ext_vec_has_bias) { - mqk[tx] += (qk_t) sm[NE*tx + ty]*slope; - } else { - mqk[tx] += (qk_t) sm[NE*tx + ty]; - } - - ss[NE*tx + ty] = mqk[tx]; - } - } - - simdgroup_barrier(mem_flags::mem_threadgroup); - - // online softmax - { - const float m = M; - const float s = ss[tiisg]; - - M = simd_max(max(M, s)); - - const float ms = exp(m - M); - const float vs = exp(s - M); - - S = S*ms + simd_sum(vs); - - // the P matrix from the paper (Q rows, C columns) - ss[tiisg] = vs; - - // O = diag(ms)*O - if ((DV4/NL % NW == 0) || ty == 0) { - FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { - so4[ii*NL] *= ms; - } - } - } - - simdgroup_barrier(mem_flags::mem_threadgroup); - - // O = O + (Q*K^T)*V - { - o4_t lo[DV4/NL]; - FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { - lo[ii] = 0.0f; - } - - if (is_same::value) { - device const v4_t * pv4 = (device const v4_t *) (v + ic*args.nb21); - - pv4 += ty*NS20/4 + tx; - - const auto sst = ss + ty; - - FOR_UNROLL (short cc = 0; cc < C/NE; ++cc) { - FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { - lo[ii] += o4_t(float4(pv4[cc*NE*NS20/4 + ii*NL])*float4(sst[cc*NE])); - } - } - } else { - FOR_UNROLL (short cc = 0; cc < C/NE; ++cc) { - device const vd4_t * pv4 = (device const vd4_t *) (v + ((ic + NE*cc + ty)*args.nb21)); - - FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { - const short i = ii*NL + tx; - - v4_t mv; - deq_v_t4(pv4 + i/nl_v, i%nl_v, mv); - - lo[ii] += o4_t(float4(mv)*float4(ss[NE*cc + ty])); - } - } - } - - FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { - if (NE > 1) { - lo[ii][0] += simd_shuffle_down(lo[ii][0], 16); - lo[ii][1] += simd_shuffle_down(lo[ii][1], 16); - lo[ii][2] += simd_shuffle_down(lo[ii][2], 16); - lo[ii][3] += simd_shuffle_down(lo[ii][3], 16); - } - - if (NE > 2) { - lo[ii][0] += simd_shuffle_down(lo[ii][0], 8); - lo[ii][1] += simd_shuffle_down(lo[ii][1], 8); - lo[ii][2] += simd_shuffle_down(lo[ii][2], 8); - lo[ii][3] += simd_shuffle_down(lo[ii][3], 8); - } - - if (NE > 4) { - lo[ii][0] += simd_shuffle_down(lo[ii][0], 4); - lo[ii][1] += simd_shuffle_down(lo[ii][1], 4); - lo[ii][2] += simd_shuffle_down(lo[ii][2], 4); - lo[ii][3] += simd_shuffle_down(lo[ii][3], 4); - } - - if (NE > 8) { - lo[ii][0] += simd_shuffle_down(lo[ii][0], 2); - lo[ii][1] += simd_shuffle_down(lo[ii][1], 2); - lo[ii][2] += simd_shuffle_down(lo[ii][2], 2); - lo[ii][3] += simd_shuffle_down(lo[ii][3], 2); - } - - if (NE > 16) { - lo[ii][0] += simd_shuffle_down(lo[ii][0], 1); - lo[ii][1] += simd_shuffle_down(lo[ii][1], 1); - lo[ii][2] += simd_shuffle_down(lo[ii][2], 1); - lo[ii][3] += simd_shuffle_down(lo[ii][3], 1); - } - } - - if ((DV4/NL % NW == 0) || ty == 0) { - FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { - so4[ii*NL] += lo[ii]; - } - } - } - } - - if (FC_flash_attn_ext_vec_has_sinks && sgitg == 0 && iwg == 0) { - const float m = M; - const float s = tiisg == 0 ? ((device const float *) sinks)[iq2] : -FLT_MAX/2; - - M = simd_max(max(M, s)); - - const float ms = exp(m - M); - const float vs = exp(s - M); - - S = S*ms + simd_sum(vs); - - if ((DV4/NL % NW == 0) || ty == 0) { - FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { - so4[ii*NL] *= ms; - } - } - } - - // these are needed for reducing the results from the simdgroups (reuse the ss buffer) - if (tiisg == 0) { - ss[0] = (s_t) S; - ss[1] = (s_t) M; - } - } - - so4 -= tiisg; - - threadgroup_barrier(mem_flags::mem_threadgroup); - - // parallel reduce - for (short r = NSG/2; r > 0; r >>= 1) { - if (sgitg < r) { - const float S0 = ss[ 0]; - const float S1 = ss[r*(SH/2) + 0]; - - const float M0 = ss[ 1]; - const float M1 = ss[r*(SH/2) + 1]; - - const float M = max(M0, M1); - - const float ms0 = exp(M0 - M); - const float ms1 = exp(M1 - M); - - const float S = S0*ms0 + S1*ms1; - - if (tiisg == 0) { - ss[0] = S; - ss[1] = M; - } - - // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 - for (short i = tiisg; i < DV4; i += NW) { - so4[i] = so4[i]*ms0 + so4[i + r*PV4]*ms1; - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - // final rescale with 1/S and store to global memory - if (sgitg == 0) { - const int64_t nrows = args.ne3*args.ne2*args.ne1; - const int64_t rid = iq3*args.ne2*args.ne1 + iq2 + iq1*args.ne1; - - device float4 * dst4 = (device float4 *) dst; - device float * dst1 = (device float *) dst + nrows*DV*NWG; // the S and M are stored after the results - - const float S = NWG == 1 ? (ss[0] == 0.0f ? 0.0f : 1.0f/ss[0]) : 1.0f; - - // interleave the workgroup data - for (short i = tiisg; i < DV4; i += NW) { - dst4[rid*DV4*NWG + NWG*i + iwg] = (float4) so4[i]*S; - } - - // store S and M - if (NWG > 1) { - if (tiisg == 0) { - dst1[rid*(2*NWG) + 2*iwg + 0] = ss[0]; - dst1[rid*(2*NWG) + 2*iwg + 1] = ss[1]; - } - } - } - -#undef NWG -#undef NSG -#undef NS10 -#undef NS20 -} - -// note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem -// in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max -// -#define FA_TYPES \ - half4, \ - half4, \ - half4, \ - float, \ - float, float4, \ - float4 - -#define FA_TYPES_F32 \ - half4, \ - float4, \ - float4, \ - float, \ - float, float4, \ - float4 - -typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -template [[host_name("kernel_flash_attn_ext_vec_f32_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_f16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_flash_attn_ext_vec_bf16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#endif -template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; - -#undef FA_TYPES -#undef FA_TYPES_F32 - -constant int32_t FC_flash_attn_ext_vec_reduce_DV [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 0)]]; -constant int32_t FC_flash_attn_ext_vec_reduce_NWG [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 1)]]; - -kernel void kernel_flash_attn_ext_vec_reduce( - constant ggml_metal_kargs_flash_attn_ext_vec_reduce & args, - device const char * htmp, - device char * dst, - uint tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { -#define NWG (FC_flash_attn_ext_vec_reduce_NWG) -#define DV (FC_flash_attn_ext_vec_reduce_DV) - - const uint64_t rid = tgpig; - - const short iwg = tiisg; - - device const float * ss = (device const float *) htmp + (uint64_t)args.nrows*DV*NWG; - - float S = ss[rid*(2*NWG) + 2*iwg + 0]; - float M = ss[rid*(2*NWG) + 2*iwg + 1]; - - const float m = simd_max(M); - const float ms = exp(M - m); - - S = simd_sum(S*ms); - S = S == 0.0f ? 0.0f : 1.0f/S; - - const short DV4 = DV/4; - - device const float4 * htmp4 = (device const float4 *) htmp + rid*DV4*NWG; - device float4 * dst4 = (device float4 *) dst + rid*DV4; - - for (short i = sgitg; i < DV4; i += NWG) { - const float4 v = simd_sum(htmp4[i*NWG + iwg]*ms); - - if (iwg == 0) { - dst4[i] = v*S; - } - } - -#undef NWG -#undef DV -} - -template -kernel void kernel_cpy_t_t( - constant ggml_metal_kargs_cpy & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int32_t i03 = tgpig[2]; - const int32_t i02 = tgpig[1]; - const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; - const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; - - if (i01 >= args.ne01) { - return; - } - - const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - - const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); - - device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.ne00;) { - device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - dst_data[i00] = (T1) src[0]; - break; - } -} - -typedef decltype(kernel_cpy_t_t) kernel_cpy_t; - -template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy_t_t; -template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy_t_t; -template [[host_name("kernel_cpy_f32_i32")]] kernel kernel_cpy_t kernel_cpy_t_t; -template [[host_name("kernel_cpy_i32_f32")]] kernel kernel_cpy_t kernel_cpy_t_t; -template [[host_name("kernel_cpy_i32_i32")]] kernel kernel_cpy_t kernel_cpy_t_t; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_cpy_f32_bf16")]] kernel kernel_cpy_t kernel_cpy_t_t; -#endif -template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy_t_t; -template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy_t_t; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_cpy_bf16_f32")]] kernel kernel_cpy_t kernel_cpy_t_t; -template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy_t_t; -#endif - -template -kernel void kernel_cpy_f32_q( - constant ggml_metal_kargs_cpy & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int32_t i03 = tgpig[2]; - const int32_t i02 = tgpig[1]; - const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; - const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; - - if (i01 >= args.ne01) { - return; - } - - const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - - const int32_t i3 = n / (args.ne2*args.ne1*args.ne0); - const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); - const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; - const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; - - device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { - device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00); - - quantize_func(src, dst_data[i00]); - - break; - } -} - -typedef decltype(kernel_cpy_f32_q) cpy_f_q_t; - -template [[host_name("kernel_cpy_f32_q8_0")]] kernel cpy_f_q_t kernel_cpy_f32_q; -template [[host_name("kernel_cpy_f32_q1_0")]] kernel cpy_f_q_t kernel_cpy_f32_q; -template [[host_name("kernel_cpy_f32_q4_0")]] kernel cpy_f_q_t kernel_cpy_f32_q; -template [[host_name("kernel_cpy_f32_q4_1")]] kernel cpy_f_q_t kernel_cpy_f32_q; -template [[host_name("kernel_cpy_f32_q5_0")]] kernel cpy_f_q_t kernel_cpy_f32_q; -template [[host_name("kernel_cpy_f32_q5_1")]] kernel cpy_f_q_t kernel_cpy_f32_q; -template [[host_name("kernel_cpy_f32_iq4_nl")]] kernel cpy_f_q_t kernel_cpy_f32_q; - -template -kernel void kernel_cpy_q_f32( - constant ggml_metal_kargs_cpy & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int32_t i03 = tgpig[2]; - const int32_t i02 = tgpig[1]; - const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; - const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; - - if (i01 >= args.ne01) { - return; - } - - const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - - const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); - - device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { - T4x4 temp; - dequantize_func(src_data + i00/nl, i00%nl, temp); - dst_data[i00] = temp; - - break; - } -} - -typedef decltype(kernel_cpy_q_f32) cpy_q_f_t; - -template [[host_name("kernel_cpy_q1_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; - -template [[host_name("kernel_cpy_q1_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; -template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; - -template -kernel void kernel_concat( - constant ggml_metal_kargs_concat & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - - const int i3 = tgpig.z; - const int i2 = tgpig.y; - const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y; - - if (i1 >= args.ne1) { - return; - } - - int o[4] = {0, 0, 0, 0}; - o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03)); - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - device const T * x; - - if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { - x = (device const T *)(src0 + (i3 )*args.nb03 + (i2 )*args.nb02 + (i1 )*args.nb01 + (i0 )*args.nb00); - } else { - x = (device const T *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10); - } - - device T * y = (device T *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - - *y = *x; - } -} - -typedef decltype(kernel_concat) kernel_concat_t; - -template [[host_name("kernel_concat_f32")]] kernel kernel_concat_t kernel_concat; -template [[host_name("kernel_concat_f16")]] kernel kernel_concat_t kernel_concat; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat; -#endif -template [[host_name("kernel_concat_i8")]] kernel kernel_concat_t kernel_concat; -template [[host_name("kernel_concat_i16")]] kernel kernel_concat_t kernel_concat; -template [[host_name("kernel_concat_i32")]] kernel kernel_concat_t kernel_concat; -template [[host_name("kernel_concat_i64")]] kernel kernel_concat_t kernel_concat; - -template -void kernel_mul_mv_q2_K_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_q2_K * x = (device const block_q2_K *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[32]; - float sumf[nr0]={0.f}; - - const short ix = tiisg/8; // 0...3 - const short it = tiisg%8; // 0...7 - const short iq = it/4; // 0 or 1 - const short ir = it%4; // 0...3 - const short is = (8*ir)/16;// 0 or 1 - - device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir; - - for (int ib = ix; ib < nb; ib += 4) { - float4 sumy = {0.f, 0.f, 0.f, 0.f}; - for (short i = 0; i < 8; ++i) { - yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; - yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8]; - yl[i+16] = y4[i+64]; sumy[2] += yl[i+16]; - yl[i+24] = y4[i+96]; sumy[3] += yl[i+24]; - } - - device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*iq + is; - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; - device const half * dh = &x[ib].d; - - for (short row = 0; row < nr0; row++) { - float4 acc1 = {0.f, 0.f, 0.f, 0.f}; - float4 acc2 = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; i += 2) { - acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); - acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); - acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); - acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); - acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); - acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); - acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); - acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); - } - float dall = dh[0]; - float dmin = dh[1] * 1.f/16.f; - sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + - (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f + - (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f + - (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) - - dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0)); - - qs += args.nb01/2; - sc += args.nb01; - dh += args.nb01/2; - } - - y4 += 4 * QK_K; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_q2_K_f32")]] -kernel void kernel_mul_mv_q2_K_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_q2_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_q3_K_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_q3_K * x = (device const block_q3_K *) (src0 + offset0); - device const float * yy = (device const float *) (src1 + offset1); - - float yl[32]; - - //const uint16_t kmask1 = 0x3030; - //const uint16_t kmask2 = 0x0f0f; - - const short tid = tiisg/4; - const short ix = tiisg%4; - const short ip = tid/4; // 0 or 1 - const short il = 2*((tid%4)/2); // 0 or 2 - const short ir = tid%2; - const short l0 = 8*ir; - - // One would think that the Metal compiler would figure out that ip and il can only have - // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it - // with these two tales. - // - // Possible masks for the high bit - const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200}, // ip = 0, il = 0 - {0x0004, 0x0400, 0x0008, 0x0800}, // ip = 0, il = 2 - {0x0010, 0x1000, 0x0020, 0x2000}, // ip = 1, il = 0 - {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2 - - // Possible masks for the low 2 bits - const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}}; - - const ushort4 hm = mm[2*ip + il/2]; - - const short shift = 2*il; - - const float v1 = il == 0 ? 4.f : 64.f; - const float v2 = 4.f * v1; - - const uint16_t s_shift1 = 4*ip; - const uint16_t s_shift2 = s_shift1 + il; - - const short q_offset = 32*ip + l0; - const short y_offset = 128*ip + 32*il + l0; - - device const float * y1 = yy + ix*QK_K + y_offset; - - uint32_t scales32, aux32; - thread uint16_t * scales16 = (thread uint16_t *)&scales32; - thread const int8_t * scales = (thread const int8_t *)&scales32; - - float sumf1[nr0] = {0.f}; - float sumf2[nr0] = {0.f}; - - for (int i = ix; i < nb; i += 4) { - for (short l = 0; l < 8; ++l) { - yl[l+ 0] = y1[l+ 0]; - yl[l+ 8] = y1[l+16]; - yl[l+16] = y1[l+32]; - yl[l+24] = y1[l+48]; - } - - device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset); - device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0); - device const uint16_t * a = (device const uint16_t *)(x[i].scales); - device const half * dh = &x[i].d; - - for (short row = 0; row < nr0; ++row) { - const float d_all = (float)dh[0]; - - scales16[0] = a[4]; - scales16[1] = a[5]; - aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030; - scales16[0] = a[il+0]; - scales16[1] = a[il+1]; - scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32; - - float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0; - for (short l = 0; l < 8; l += 2) { - const int32_t qs = q[l/2]; - s1 += yl[l+0] * (qs & qm[il/2][0]); - s2 += yl[l+1] * (qs & qm[il/2][1]); - s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]); - s4 += yl[l+16] * (qs & qm[il/2][2]); - s5 += yl[l+17] * (qs & qm[il/2][3]); - s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]); - } - float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); - float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); - sumf1[row] += d1 * (scales[0] - 32); - sumf2[row] += d2 * (scales[2] - 32); - - s1 = s2 = s3 = s4 = s5 = s6 = 0; - for (short l = 0; l < 8; l += 2) { - const int32_t qs = q[l/2+8]; - s1 += yl[l+8] * (qs & qm[il/2][0]); - s2 += yl[l+9] * (qs & qm[il/2][1]); - s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]); - s4 += yl[l+24] * (qs & qm[il/2][2]); - s5 += yl[l+25] * (qs & qm[il/2][3]); - s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]); - } - d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); - d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); - sumf1[row] += d1 * (scales[1] - 32); - sumf2[row] += d2 * (scales[3] - 32); - - q += args.nb01/2; - h += args.nb01/2; - a += args.nb01/2; - dh += args.nb01/2; - } - - y1 += 4 * QK_K; - } - - for (int row = 0; row < nr0; ++row) { - const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift); - sumf1[row] = simd_sum(sumf); - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - if (tiisg == 0) { - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - dst_f32[first_row + row] = sumf1[row]; - } - } -} - -[[host_name("kernel_mul_mv_q3_K_f32")]] -kernel void kernel_mul_mv_q3_K_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_q3_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_q4_K_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - constexpr uint16_t kmask1 = 0x3f3f; - constexpr uint16_t kmask2 = 0x0f0f; - constexpr uint16_t kmask3 = 0xc0c0; - - const short ix = tiisg/8; // 0...3 - const short it = tiisg%8; // 0...7 - const short iq = it/4; // 0 or 1 - const short ir = it%4; // 0...3 - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_q4_K * x = (device const block_q4_K *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[16]; - float yh[16]; - - float sumf[nr0]={0.f}; - - device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir; - - uint16_t sc16[4]; - thread const uint8_t * sc8 = (thread const uint8_t *)sc16; - - for (int ib = ix; ib < nb; ib += 4) { - float4 sumy = {0.f, 0.f, 0.f, 0.f}; - - for (short i = 0; i < 8; ++i) { - yl[i+0] = y4[i+ 0]; sumy[0] += yl[i+0]; - yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8]; - yh[i+0] = y4[i+128]; sumy[2] += yh[i+0]; - yh[i+8] = y4[i+160]; sumy[3] += yh[i+8]; - } - - device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq; - device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; - device const half * dh = &x[ib].d; - - for (short row = 0; row < nr0; row++) { - sc16[0] = sc[0] & kmask1; - sc16[1] = sc[2] & kmask1; - sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2); - sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2); - - device const uint16_t * q2 = q1 + 32; - - float4 acc1 = {0.f, 0.f, 0.f, 0.f}; - float4 acc2 = {0.f, 0.f, 0.f, 0.f}; - - FOR_UNROLL (short i = 0; i < 4; ++i) { - acc1[0] += yl[2*i + 0] * (q1[i] & 0x000F); - acc1[1] += yl[2*i + 1] * (q1[i] & 0x0F00); - acc1[2] += yl[2*i + 8] * (q1[i] & 0x00F0); - acc1[3] += yl[2*i + 9] * (q1[i] & 0xF000); - acc2[0] += yh[2*i + 0] * (q2[i] & 0x000F); - acc2[1] += yh[2*i + 1] * (q2[i] & 0x0F00); - acc2[2] += yh[2*i + 8] * (q2[i] & 0x00F0); - acc2[3] += yh[2*i + 9] * (q2[i] & 0xF000); - } - - sumf[row] += dh[0] * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] + - (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f + - (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] + - (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) - - dh[1] * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); - - q1 += args.nb01/2; - sc += args.nb01/2; - dh += args.nb01/2; - } - - y4 += 4 * QK_K; - } - - device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_q4_K_f32")]] -kernel void kernel_mul_mv_q4_K_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_q4_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_q5_K_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_q5_K * x = (device const block_q5_K *) (src0 + offset0); - device const float * yy = (device const float *) (src1 + offset1); - - float sumf[nr0]={0.f}; - - float yl[16], yh[16]; - - constexpr uint16_t kmask1 = 0x3f3f; - constexpr uint16_t kmask2 = 0x0f0f; - constexpr uint16_t kmask3 = 0xc0c0; - - const short tid = tiisg/4; - const short ix = tiisg%4; - const short iq = tid/4; - const short ir = tid%4; - - const short l0 = 8*ir; - const short q_offset = 32*iq + l0; - const short y_offset = 64*iq + l0; - - const uint8_t hm1 = 1u << (2*iq); - const uint8_t hm2 = hm1 << 1; - const uint8_t hm3 = hm1 << 4; - const uint8_t hm4 = hm2 << 4; - - uint16_t sc16[4]; - thread const uint8_t * sc8 = (thread const uint8_t *)sc16; - - device const float * y1 = yy + ix*QK_K + y_offset; - - for (int i = ix; i < nb; i += 4) { - device const uint8_t * q1 = x[i].qs + q_offset; - device const uint8_t * qh = x[i].qh + l0; - device const half * dh = &x[i].d; - device const uint16_t * a = (device const uint16_t *)x[i].scales + iq; - - device const float * y2 = y1 + 128; - float4 sumy = {0.f, 0.f, 0.f, 0.f}; - for (short l = 0; l < 8; ++l) { - yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0]; - yl[l+8] = y1[l+32]; sumy[1] += yl[l+8]; - yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0]; - yh[l+8] = y2[l+32]; sumy[3] += yh[l+8]; - } - - for (short row = 0; row < nr0; ++row) { - device const uint8_t * q2 = q1 + 64; - - sc16[0] = a[0] & kmask1; - sc16[1] = a[2] & kmask1; - sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); - sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); - - float4 acc1 = {0.f}; - float4 acc2 = {0.f}; - FOR_UNROLL (short l = 0; l < 8; ++l) { - uint8_t h = qh[l]; - acc1[0] += yl[l+0] * (q1[l] & 0x0F); - acc1[1] += yl[l+8] * (q1[l] & 0xF0); - acc1[2] += yh[l+0] * (q2[l] & 0x0F); - acc1[3] += yh[l+8] * (q2[l] & 0xF0); - acc2[0] += h & hm1 ? yl[l+0] : 0.f; - acc2[1] += h & hm2 ? yl[l+8] : 0.f; - acc2[2] += h & hm3 ? yh[l+0] : 0.f; - acc2[3] += h & hm4 ? yh[l+8] : 0.f; - } - - sumf[row] += dh[0] * (sc8[0] * (acc1[0] + 16.f*acc2[0]) + - sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) + - sc8[4] * (acc1[2] + 16.f*acc2[2]) + - sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) - - dh[1] * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); - - q1 += args.nb01; - qh += args.nb01; - dh += args.nb01/2; - a += args.nb01/2; - } - - y1 += 4 * QK_K; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - const float tot = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = tot; - } - } -} - -[[host_name("kernel_mul_mv_q5_K_f32")]] -kernel void kernel_mul_mv_q5_K_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_q5_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_q6_K_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - constexpr uint8_t kmask1 = 0x03; - constexpr uint8_t kmask2 = 0x0C; - constexpr uint8_t kmask3 = 0x30; - constexpr uint8_t kmask4 = 0xC0; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_q6_K * x = (device const block_q6_K *) (src0 + offset0); - device const float * yy = (device const float *) (src1 + offset1); - - float sumf[nr0] = { 0.f }; - - float yl[16]; - - const short tid = tiisg/2; - const short ix = tiisg%2; - const short ip = tid/8; // 0 or 1 - const short il = tid%8; - const short l0 = 4*il; - const short is = 8*ip + l0/16; - - const short y_offset = 128*ip + l0; - const short q_offset_l = 64*ip + l0; - const short q_offset_h = 32*ip + l0; - - for (int i = ix; i < nb; i += 2) { - device const uint8_t * q1 = x[i].ql + q_offset_l; - device const uint8_t * q2 = q1 + 32; - device const uint8_t * qh = x[i].qh + q_offset_h; - device const int8_t * sc = x[i].scales + is; - device const half * dh = &x[i].d; - - device const float * y = yy + i * QK_K + y_offset; - - for (short l = 0; l < 4; ++l) { - yl[4*l + 0] = y[l + 0]; - yl[4*l + 1] = y[l + 32]; - yl[4*l + 2] = y[l + 64]; - yl[4*l + 3] = y[l + 96]; - } - - for (short row = 0; row < nr0; ++row) { - float4 sums = {0.f, 0.f, 0.f, 0.f}; - - FOR_UNROLL (short l = 0; l < 4; ++l) { - sums[0] += yl[4*l + 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); - sums[1] += yl[4*l + 1] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); - sums[2] += yl[4*l + 2] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32); - sums[3] += yl[4*l + 3] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); - } - - sumf[row] += dh[0] * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]); - - q1 += args.nb01; - q2 += args.nb01; - qh += args.nb01; - sc += args.nb01; - dh += args.nb01/2; - } - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_q6_K_f32")]] -kernel void kernel_mul_mv_q6_K_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_q6_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); -} - -// ======================= "True" 2-bit - -template -void kernel_mul_mv_iq2_xxs_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq2_xxs * x = (device const block_iq2_xxs *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[32]; - float sumf[nr0]={0.f}; - - const int nb32 = nb * (QK_K / 32); - - threadgroup uint64_t * svalues = (threadgroup uint64_t *)(shmem); - threadgroup uint8_t * ssigns = (threadgroup uint8_t *)(svalues + 256); - { - int nval = 4; - int pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2xxs_grid[pos + i]; - nval = 2; - pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i]; - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - const int ix = tiisg; - - device const float * y4 = y + 32 * ix; - - for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - for (short i = 0; i < 32; ++i) { - yl[i] = y4[i]; - } - - const int ibl = ib32 / (QK_K / 32); - const int ib = ib32 % (QK_K / 32); - - device const block_iq2_xxs * xr = x + ibl; - device const uint16_t * q2 = xr->qs + 4 * ib; - device const half * dh = &xr->d; - - for (short row = 0; row < nr0; row++) { - const float db = dh[0]; - device const uint8_t * aux8 = (device const uint8_t *)q2; - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = db * (0.5f + (aux32 >> 28)); - - float sum = 0; - for (short l = 0; l < 4; ++l) { - const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + aux8[l]); - const uint8_t signs = ssigns[(aux32 >> 7*l) & 127]; - for (short j = 0; j < 8; ++j) { - sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); - } - } - sumf[row] += d * sum; - - dh += args.nb01/2; - q2 += args.nb01/2; - } - - y4 += 32 * 32; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all * 0.25f; - } - } -} - -[[host_name("kernel_mul_mv_iq2_xxs_f32")]] -kernel void kernel_mul_mv_iq2_xxs_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq2_xxs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_iq2_xs_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq2_xs * x = (device const block_iq2_xs *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[32]; - float sumf[nr0]={0.f}; - - const int nb32 = nb * (QK_K / 32); - - threadgroup uint64_t * svalues = (threadgroup uint64_t *)(shmem); - threadgroup uint8_t * ssigns = (threadgroup uint8_t *)(svalues + 512); - { - int nval = 8; - int pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2xs_grid[pos + i]; - nval = 2; - pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i]; - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - const int ix = tiisg; - - device const float * y4 = y + 32 * ix; - - for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - for (short i = 0; i < 32; ++i) { - yl[i] = y4[i]; - } - - const int ibl = ib32 / (QK_K / 32); - const int ib = ib32 % (QK_K / 32); - - device const block_iq2_xs * xr = x + ibl; - device const uint16_t * q2 = xr->qs + 4 * ib; - device const uint8_t * sc = xr->scales + ib; - device const half * dh = &xr->d; - - for (short row = 0; row < nr0; row++) { - const float db = dh[0]; - const uint8_t ls1 = sc[0] & 0xf; - const uint8_t ls2 = sc[0] >> 4; - const float d1 = db * (0.5f + ls1); - const float d2 = db * (0.5f + ls2); - - float sum1 = 0, sum2 = 0; - for (short l = 0; l < 2; ++l) { - const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + (q2[l] & 511)); - const uint8_t signs = ssigns[(q2[l] >> 9)]; - for (short j = 0; j < 8; ++j) { - sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); - } - } - for (short l = 2; l < 4; ++l) { - const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + (q2[l] & 511)); - const uint8_t signs = ssigns[(q2[l] >> 9)]; - for (short j = 0; j < 8; ++j) { - sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); - } - } - sumf[row] += d1 * sum1 + d2 * sum2; - - dh += args.nb01/2; - q2 += args.nb01/2; - sc += args.nb01; - } - - y4 += 32 * 32; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all * 0.25f; - } - } -} - -[[host_name("kernel_mul_mv_iq2_xs_f32")]] -kernel void kernel_mul_mv_iq2_xs_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq2_xs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_iq3_xxs_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq3_xxs * x = (device const block_iq3_xxs *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[32]; - float sumf[nr0]={0.f}; - - const int nb32 = nb * (QK_K / 32); - - threadgroup uint32_t * svalues = (threadgroup uint32_t *)(shmem); - threadgroup uint8_t * ssigns = (threadgroup uint8_t *)(svalues + 256); - { - int nval = 4; - int pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) svalues[pos + i] = iq3xxs_grid[pos + i]; - nval = 2; - pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i]; - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - const int ix = tiisg; - - device const float * y4 = y + 32 * ix; - - for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - for (short i = 0; i < 32; ++i) { - yl[i] = y4[i]; - } - - const int ibl = ib32 / (QK_K / 32); - const int ib = ib32 % (QK_K / 32); - - device const block_iq3_xxs * xr = x + ibl; - device const uint8_t * q3 = xr->qs + 8 * ib; - device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib; - device const half * dh = &xr->d; - - for (short row = 0; row < nr0; row++) { - const float db = dh[0]; - const uint32_t aux32 = gas[0] | (gas[1] << 16); - const float d = db * (0.5f + (aux32 >> 28)); - - float2 sum = {0}; - for (short l = 0; l < 4; ++l) { - const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(svalues + q3[2*l+0]); - const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(svalues + q3[2*l+1]); - const uint8_t signs = ssigns[(aux32 >> 7*l) & 127]; - for (short j = 0; j < 4; ++j) { - sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); - } - } - sumf[row] += d * (sum[0] + sum[1]); - - dh += args.nb01/2; - q3 += args.nb01; - gas += args.nb01/2; - } - - y4 += 32 * 32; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all * 0.5f; - } - } -} - -[[host_name("kernel_mul_mv_iq3_xxs_f32")]] -kernel void kernel_mul_mv_iq3_xxs_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq3_xxs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_iq3_s_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq3_s * x = (device const block_iq3_s *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[32]; - float sumf[nr0]={0.f}; - - const int nb32 = nb * (QK_K / 32); - - threadgroup uint32_t * svalues = (threadgroup uint32_t *) shmem; - { - int nval = 8; - int pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) svalues[pos + i] = iq3s_grid[pos + i]; - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - const int ix = tiisg; - - device const float * y4 = y + 32 * ix; - - for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - for (short i = 0; i < 32; ++i) { - yl[i] = y4[i]; - } - - const int ibl = ib32 / (QK_K / 32); - const int ib = ib32 % (QK_K / 32); - - device const block_iq3_s * xr = x + ibl; - device const uint8_t * qs = xr->qs + 8 * ib; - device const uint8_t * qh = xr->qh + ib; - device const uint8_t * sc = xr->scales + (ib/2); - device const uint8_t * signs = xr->signs + 4 * ib; - device const half * dh = &xr->d; - - for (short row = 0; row < nr0; row++) { - const float db = dh[0]; - const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf)); - - float2 sum = {0}; - for (short l = 0; l < 4; ++l) { - const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? svalues + 256 : svalues; - const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? svalues + 256 : svalues; - const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]); - const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]); - for (short j = 0; j < 4; ++j) { - sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]); - sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]); - } - } - sumf[row] += d * (sum[0] + sum[1]); - - dh += args.nb01/2; - qs += args.nb01; - qh += args.nb01; - sc += args.nb01; - signs += args.nb01; - } - - y4 += 32 * 32; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_iq3_s_f32")]] -kernel void kernel_mul_mv_iq3_s_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq3_s_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_iq2_s_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq2_s * x = (device const block_iq2_s *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[32]; - float sumf[nr0]={0.f}; - - const int nb32 = nb * (QK_K / 32); - - //threadgroup uint64_t * svalues = (threadgroup uint64_t *) shmem; - //{ - // int nval = 32; - // int pos = (32*sgitg + tiisg)*nval; - // for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2s_grid[pos + i]; - // threadgroup_barrier(mem_flags::mem_threadgroup); - //} - - const short ix = tiisg; - - device const float * y4 = y + 32 * ix; - - for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - for (short i = 0; i < 32; ++i) { - yl[i] = y4[i]; - } - - const int ibl = ib32 / (QK_K / 32); - const int ib = ib32 % (QK_K / 32); - - device const block_iq2_s * xr = x + ibl; - device const uint8_t * qs = xr->qs + 4 * ib; - device const uint8_t * qh = xr->qh + ib; - device const uint8_t * sc = xr->scales + ib; - device const uint8_t * signs = qs + QK_K/8; - device const half * dh = &xr->d; - - for (short row = 0; row < nr0; row++) { - const float db = dh[0]; - const float d1 = db * (0.5f + (sc[0] & 0xf)); - const float d2 = db * (0.5f + (sc[0] >> 4)); - - float2 sum = {0}; - for (short l = 0; l < 2; ++l) { - //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(svalues + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300))); - //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(svalues + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300))); - constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300))); - constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300))); - for (short j = 0; j < 8; ++j) { - sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]); - sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]); - } - } - sumf[row] += d1 * sum[0] + d2 * sum[1]; - - dh += args.nb01/2; - qs += args.nb01; - qh += args.nb01; - sc += args.nb01; - signs += args.nb01; - } - - y4 += 32 * 32; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all * 0.25f; - } - } -} - -[[host_name("kernel_mul_mv_iq2_s_f32")]] -kernel void kernel_mul_mv_iq2_s_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq2_s_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_iq1_s_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq1_s * x = (device const block_iq1_s *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[32]; - float sumf[nr0]={0.f}; - - const int nb32 = nb * (QK_K / 32); - - const short ix = tiisg; - - device const float * y4 = y + 32 * ix; - - for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - float sumy = 0; - for (short i = 0; i < 32; ++i) { - yl[i] = y4[i]; - sumy += yl[i]; - } - - const int ibl = ib32 / (QK_K / 32); - const int ib = ib32 % (QK_K / 32); - - device const block_iq1_s * xr = x + ibl; - device const uint8_t * qs = xr->qs + 4 * ib; - device const uint16_t * qh = xr->qh + ib; - device const half * dh = &xr->d; - - for (short row = 0; row < nr0; row++) { - constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); - constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 5) & 0x700))); - constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[0] << 2) & 0x700))); - constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[0] >> 1) & 0x700))); - - float sum = 0; - for (short j = 0; j < 4; ++j) { - sum += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4) - + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4) - + yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4) - + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4); - } - sumf[row] += (float)dh[0] * (sum + sumy * (qh[0] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA)) * (2*((qh[0] >> 12) & 7) + 1); - - dh += args.nb01/2; - qs += args.nb01; - qh += args.nb01/2; - } - - y4 += 32 * 32; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_iq1_s_f32")]] -kernel void kernel_mul_mv_iq1_s_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq1_s_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_iq1_m_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - const int nb = args.ne00/QK_K; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * nr0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq1_m * x = (device const block_iq1_m *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - float yl[32]; - float sumf[nr0]={0.f}; - - const int nb32 = nb * (QK_K / 32); - - const short ix = tiisg; - - device const float * y4 = y + 32 * ix; - - iq1m_scale_t scale; - - for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - float4 sumy = {0.f}; - for (short i = 0; i < 8; ++i) { - yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; - yl[i+ 8] = y4[i+ 8]; sumy[1] += yl[i+ 8]; - yl[i+16] = y4[i+16]; sumy[2] += yl[i+16]; - yl[i+24] = y4[i+24]; sumy[3] += yl[i+24]; - } - - const int ibl = ib32 / (QK_K / 32); - const int ib = ib32 % (QK_K / 32); - - device const block_iq1_m * xr = x + ibl; - device const uint8_t * qs = xr->qs + 4 * ib; - device const uint8_t * qh = xr->qh + 2 * ib; - device const uint16_t * sc = (device const uint16_t *)xr->scales; - - for (short row = 0; row < nr0; row++) { - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - - constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); - constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); - constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[1] << 8) & 0x700))); - constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[1] << 4) & 0x700))); - - float2 sum = {0.f}; - for (short j = 0; j < 4; ++j) { - sum[0] += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4) - + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4); - sum[1] += yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4) - + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4); - } - const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); - const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); - - sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) + - (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1)); - - sc += args.nb01/2; - qs += args.nb01; - qh += args.nb01; - } - - y4 += 32 * 32; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_iq1_m_f32")]] -kernel void kernel_mul_mv_iq1_m_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq1_m_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_iq4_nl_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - threadgroup float * shmem_f32 = (threadgroup float *) shmem; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * NR0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - const int nb = args.ne00/QK4_NL; - const int ns01 = args.nb01/args.nb00; - - const short ix = tiisg/2; // 0...15 - const short it = tiisg%2; // 0 or 1 - - shmem_f32[tiisg] = kvalues_iq4nl_f[tiisg%16]; - threadgroup_barrier(mem_flags::mem_threadgroup); - - float4 yl[4]; - float sumf[NR0]={0.f}; - - device const float * yb = y + ix*QK4_NL + it*8; - - uint32_t aux32[2]; - thread const uint8_t * q8 = (thread const uint8_t *)aux32; - - float4 qf1, qf2; - - // [TAG_MUL_MV_WEIRD] - for (int ib = ix; ib < nb && ib < ns01; ib += 16) { - device const float4 * y4 = (device const float4 *)yb; - yl[0] = y4[0]; - yl[1] = y4[4]; - yl[2] = y4[1]; - yl[3] = y4[5]; - - for (short row = 0; row < NR0; row++) { - device const block_iq4_nl & xb = x[row*ns01 + ib]; - device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it); - - float4 acc1 = {0.f}, acc2 = {0.f}; - - aux32[0] = q4[0] | (q4[1] << 16); - aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; - aux32[0] &= 0x0f0f0f0f; - qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]}; - qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]}; - acc1 += yl[0] * qf1; - acc2 += yl[1] * qf2; - - aux32[0] = q4[2] | (q4[3] << 16); - aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; - aux32[0] &= 0x0f0f0f0f; - qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]}; - qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]}; - acc1 += yl[2] * qf1; - acc2 += yl[3] * qf2; - - acc1 += acc2; - - sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); - } - - yb += 16 * QK4_NL; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_iq4_nl_f32")]] -kernel void kernel_mul_mv_iq4_nl_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq4_nl_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_iq4_xs_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - threadgroup float * shmem_f32 = (threadgroup float *) shmem; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - const int first_row = (r0 * NSG + sgitg) * NR0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - const int nb = args.ne00/QK_K; - const int ns01 = args.nb01/args.nb00; - - const short ix = tiisg/16; // 0 or 1 - const short it = tiisg%16; // 0...15 - const short ib = it/2; - const short il = it%2; - - shmem_f32[tiisg] = kvalues_iq4nl_f[tiisg%16]; - threadgroup_barrier(mem_flags::mem_threadgroup); - - float4 yl[4]; - float sumf[NR0]={0.f}; - - device const float * yb = y + ix * QK_K + ib * 32 + il * 8; - - uint32_t aux32[2]; - thread const uint8_t * q8 = (thread const uint8_t *)aux32; - - float4 qf1, qf2; - - // [TAG_MUL_MV_WEIRD] - for (int ibl = ix; ibl < nb && ibl < ns01; ibl += 2) { - device const float4 * y4 = (device const float4 *)yb; - yl[0] = y4[0]; - yl[1] = y4[4]; - yl[2] = y4[1]; - yl[3] = y4[5]; - - for (short row = 0; row < NR0; ++row) { - device const block_iq4_xs & xb = x[row*ns01 + ibl]; - device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il); - - float4 acc1 = {0.f}, acc2 = {0.f}; - - aux32[0] = (q4[0] ) & 0x0f0f0f0f; - aux32[1] = (q4[0] >> 4) & 0x0f0f0f0f; - qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]}; - qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]}; - acc1 += yl[0] * qf1; - acc2 += yl[1] * qf2; - - aux32[0] = (q4[1] ) & 0x0f0f0f0f; - aux32[1] = (q4[1] >> 4) & 0x0f0f0f0f; - qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]}; - qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]}; - acc1 += yl[2] * qf1; - acc2 += yl[3] * qf2; - - acc1 += acc2; - - const int ls = (((xb.scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((xb.scales_h >> 2*ib) & 3) << 4)) - 32; - sumf[row] += (float)xb.d * ls * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); - } - - yb += 2 * QK_K; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_iq4_xs_f32")]] -kernel void kernel_mul_mv_iq4_xs_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq4_xs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -void kernel_mul_mv_mxfp4_f32_impl( - args_t args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg) { - const short NSG = FC_mul_mv_nsg; - - threadgroup float * shmem_f32 = (threadgroup float *) shmem; - - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int first_row = (r0 * NSG + sgitg) * NR0; - - const uint i12 = im%FC_mul_mv_ne12; - const uint i13 = im/FC_mul_mv_ne12; - - const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; - const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; - - device const block_mxfp4 * x = (device const block_mxfp4 *) (src0 + offset0); - device const float * y = (device const float *) (src1 + offset1); - - const int nb = args.ne00/QK_MXFP4; - const int ns01 = args.nb01/args.nb00; // this can be larger than nb for permuted src0 tensors - - const short ix = tiisg/2; // 0...15 - const short it = tiisg%2; // 0 or 1 - - shmem_f32[tiisg] = kvalues_mxfp4_f[tiisg%16]; - threadgroup_barrier(mem_flags::mem_threadgroup); - - float4 yl[4]; - float sumf[NR0]={0.f}; - - device const float * yb = y + ix*QK_MXFP4 + it*8; - - // note: just the check `ib < nb` is enough, but adding the redundant `&& ib < ns01` check makes the kernel a bit faster - // no idea why that is - needs some deeper investigation [TAG_MUL_MV_WEIRD] - for (int ib = ix; ib < nb && ib < ns01; ib += 16) { - device const float4 * y4 = (device const float4 *) yb; - - yl[0] = y4[0]; - yl[1] = y4[4]; - yl[2] = y4[1]; - yl[3] = y4[5]; - - FOR_UNROLL (short row = 0; row < NR0; row++) { - device const block_mxfp4 & xb = x[row*ns01 + ib]; - device const uint8_t * q2 = (device const uint8_t *)(xb.qs + 8*it); - - float4 acc1 = yl[0]*float4(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]); - float4 acc2 = yl[1]*float4(shmem_f32[q2[0] >> 4 ], shmem_f32[q2[1] >> 4 ], shmem_f32[q2[2] >> 4 ], shmem_f32[q2[3] >> 4 ]); - float4 acc3 = yl[2]*float4(shmem_f32[q2[4] & 0x0F], shmem_f32[q2[5] & 0x0F], shmem_f32[q2[6] & 0x0F], shmem_f32[q2[7] & 0x0F]); - float4 acc4 = yl[3]*float4(shmem_f32[q2[4] >> 4 ], shmem_f32[q2[5] >> 4 ], shmem_f32[q2[6] >> 4 ], shmem_f32[q2[7] >> 4 ]); - - acc1 = (acc1 + acc3) + (acc2 + acc4); - - sumf[row] += e8m0_to_fp32(xb.e) * ((acc1[0] + acc1[1]) + (acc1[2] + acc1[3])); - } - - yb += 16 * QK_MXFP4; - } - - device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - - for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { - float sum_all = simd_sum(sumf[row]); - if (tiisg == 0) { - dst_f32[first_row + row] = sum_all; - } - } -} - -[[host_name("kernel_mul_mv_mxfp4_f32")]] -kernel void kernel_mul_mv_mxfp4_f32( - constant ggml_metal_kargs_mul_mv & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -template -kernel void kernel_get_rows_q( - constant ggml_metal_kargs_get_rows & args, - device const void * src0, - device const void * src1, - device void * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort3 ntg [[threads_per_threadgroup]]) { - const int32_t iw0 = tgpig.x/args.ne10; - const int32_t i10 = tgpig.x%args.ne10; - const int32_t i11 = tgpig.y; - const int32_t i12 = tgpig.z; - - const int32_t r = ((const device int32_t *) ((const device char *) src1 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10))[0]; - - const int32_t i02 = i11; - const int32_t i03 = i12; - - auto psrc = (device const block_q *) ((const device char *) src0 + i03*args.nb03 + i02*args.nb02 + r*args.nb01); - auto pdst = (device float4x4 *) (( device char *) dst + i12*args.nb3 + i11*args.nb2 + i10*args.nb1); - - for (int ind = iw0*ntg.x + tiitg; ind < args.ne00t;) { - float4x4 temp; - dequantize_func(psrc + ind/nl, ind%nl, temp); - pdst[ind] = temp; - - break; - } -} - -template -kernel void kernel_get_rows_f( - constant ggml_metal_kargs_get_rows & args, - device const void * src0, - device const void * src1, - device void * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort3 ntg [[threads_per_threadgroup]]) { - const int32_t iw0 = tgpig.x/args.ne10; - const int32_t i10 = tgpig.x%args.ne10; - const int32_t i11 = tgpig.y; - const int32_t i12 = tgpig.z; - - const int32_t r = ((const device int32_t *) ((const device char *) src1 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10))[0]; - - const int32_t i02 = i11; - const int32_t i03 = i12; - - auto psrc = (const device T0 *) ((const device char *) src0 + i03*args.nb03 + i02*args.nb02 + r*args.nb01); - auto pdst = ( device T *) (( device char *) dst + i12*args.nb3 + i11*args.nb2 + i10*args.nb1); - - for (int ind = iw0*ntg.x + tiitg; ind < args.ne00t;) { - pdst[ind] = psrc[ind]; - - break; - } -} - -template -kernel void kernel_set_rows_q32( - constant ggml_metal_kargs_set_rows & args, - device const void * src0, - device const void * src1, - device float * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint3 tptg [[threads_per_threadgroup]]) { - const int32_t i03 = tgpig.z; - const int32_t i02 = tgpig.y; - - const int32_t i12 = i03%args.ne12; - const int32_t i11 = i02%args.ne11; - - const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; - if (i01 >= args.ne01) { - return; - } - - const int32_t i10 = i01; - const TI i1 = ((const device TI *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; - - device block_q * dst_row = ( device block_q *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3); - const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); - - for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) { - quantize_func(src_row + 32*ind, dst_row[ind]); - } -} - -template -kernel void kernel_set_rows_f( - constant ggml_metal_kargs_set_rows & args, - device const void * src0, - device const void * src1, - device float * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint3 tptg [[threads_per_threadgroup]]) { - const int32_t i03 = tgpig.z; - const int32_t i02 = tgpig.y; - - const int32_t i12 = i03%args.ne12; - const int32_t i11 = i02%args.ne11; - - const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; - if (i01 >= args.ne01) { - return; - } - - const int32_t i10 = i01; - const TI i1 = ((const device TI *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; - - device T * dst_row = ( device T *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3); - const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); - - for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) { - dst_row[ind] = (T) src_row[ind]; - } -} - -kernel void kernel_diag_f32( - constant ggml_metal_kargs_diag & args, - device const char * src0, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]]) { - constexpr short NW = N_SIMDWIDTH; - - const int32_t i3 = tgpig.z; - const int32_t i2 = tgpig.y; - const int32_t i1 = tgpig.x; - - device const float * src0_ptr = (device const float *)(src0 + i2*args.nb02 + i3*args.nb03); - device float * dst_ptr = (device float *)(dst + i1*args.nb01 + i2*args.nb2 + i3*args.nb3); - - for (int i0 = tiitg; i0 < args.ne0; i0 += NW) { - dst_ptr[i0] = i0 == i1 ? src0_ptr[i0] : 0.0f; - } -} - -constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]]; -constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]]; -constant short FC_mul_mm_ne12 [[function_constant(FC_MUL_MM + 2)]]; -constant short FC_mul_mm_ne13 [[function_constant(FC_MUL_MM + 3)]]; -constant short FC_mul_mm_r2 [[function_constant(FC_MUL_MM + 4)]]; -constant short FC_mul_mm_r3 [[function_constant(FC_MUL_MM + 5)]]; - -// each block_q contains 16*nl weights -#ifdef GGML_METAL_HAS_TENSOR -template< - typename SA, typename SA_4x4, typename SA_8x8, - typename SB, typename SB_2x4, typename SB_8x8, - typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread SA_4x4 &), - typename T0, typename T0_4x4, typename T1, typename T1_2x4> -kernel void kernel_mul_mm( - constant ggml_metal_kargs_mul_mm & args, - device const char * srcA, - device const char * srcB, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig [[threadgroup_position_in_grid]], - ushort tiitg [[thread_index_in_threadgroup]], - ushort sgitg [[simdgroup_index_in_threadgroup]]) { - (void) sgitg; - - // Matrix dimensions: A(M,K) x B(K,N) -> C(M,N) - const int K = args.ne00; - const int M = args.ne0; - const int N = args.ne1; - - // Batch dimension handling - const int im = tgpig.z; - const int i12 = im % FC_mul_mm_ne12; - const int i13 = im / FC_mul_mm_ne12; - - // Batch offsets for srcA and srcB - const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03; - - // Tile dimensions - constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X; - constexpr int NRA = SZ_SIMDGROUP * N_MM_BLOCK_Y * N_MM_SIMD_GROUP_Y; - - // Tile offsets in output matrix - const int ra = tgpig.y * NRA; - const int rb = tgpig.x * NRB; - - // Threadgroup memory for dequantized A tile only - threadgroup SA * sa = (threadgroup SA *)(shmem); - - // Work-item count for A loading - constexpr int A_WORK_ITEMS = NRA * N_MM_NK; - constexpr int NUM_THREADS = N_SIMDWIDTH * N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y; - - // tA wraps threadgroup memory - auto tA = tensor(sa, dextents(N_MM_NK_TOTAL, NRA)); - - // tB wraps device memory directly - device T1 * ptrB = (device T1 *)(srcB + args.nb12*i12 + args.nb13*i13); - const int strideB = args.nb11 / sizeof(T1); - auto tB = tensor(ptrB, dextents(K, N), array({1, strideB})); - - // Configure matmul operation - mpp::tensor_ops::matmul2d< - mpp::tensor_ops::matmul2d_descriptor( - NRB, NRA, N_MM_NK_TOTAL, false, true, true, - mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate), - execution_simdgroups> mm; - - auto cT = mm.get_destination_cooperative_tensor(); - - // Accumulate partial results over K dimension - for (int loop_k = 0; loop_k < K; loop_k += N_MM_NK_TOTAL) { - // === PHASE 1: Dequantization of A into threadgroup memory === - for (int work = tiitg; work < A_WORK_ITEMS; work += NUM_THREADS) { - const int row = work / N_MM_NK; - const int k_chunk = work % N_MM_NK; - const int k_pos = loop_k + k_chunk * 16; - const short k_base = k_chunk * 16; - - // Bounds check: skip device read if row is out of matrix bounds - if (ra + row < M) { - if (is_same::value && FC_mul_mm_bc_inp) { - // Element-wise reads when K is not aligned (nb01 not aligned for half4x4/float4x4). - // MSL spec Table 2.5: half4x4 requires 8-byte alignment. When K is odd, - // nb01 = K*2 is not 8-byte aligned, so odd-row pointers are misaligned. - // Mirrors the legacy kernel's existing guard. - device const T0 * row_ptr = (device const T0 *)(srcA + args.nb01 * (ra + row) + offset0); - - FOR_UNROLL (short i = 0; i < 16; i++) { - sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? (SA) row_ptr[k_pos + i] : (SA)0; - } - } else { - const int block_idx = k_pos / (16 * nl); - const short il = (k_pos / 16) % nl; - - device const block_q * row_ptr = (device const block_q *)(srcA + args.nb01 * (ra + row) + offset0); - - SA_4x4 temp_a; - dequantize_func(row_ptr + block_idx, il, temp_a); - - FOR_UNROLL (short i = 0; i < 16; i++) { - // Zero-pad A for K positions beyond valid range (handles partial K iterations) - sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? temp_a[i/4][i%4] : (SA)0; - } - } - } else { - // Zero-pad rows beyond matrix bounds - FOR_UNROLL (short i = 0; i < 16; i++) { - sa[row * N_MM_NK_TOTAL + (k_base + i)] = (SA)0; - } - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - // === PHASE 2: Tensor matmul === - auto mA = tA.slice(0, 0); - auto mB = tB.slice(loop_k, rb); - - mm.run(mB, mA, cT); - - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - // Store result tile to output matrix (with batch offset) - // cT.store handles bounds checking via tD's extents (M, N) - device float * dstBatch = (device float *)dst + im * N * M; - - auto tD = tensor(dstBatch, dextents(M, N), array({1, M})); - cT.store(tD.slice(ra, rb)); -} - -#else - -template< - typename S0, typename S0_4x4, typename S0_8x8, - typename S1, typename S1_2x4, typename S1_8x8, - typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), - typename T0, typename T0_4x4, typename T1, typename T1_2x4> -kernel void kernel_mul_mm( - constant ggml_metal_kargs_mul_mm & args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - threadgroup S0 * sa = (threadgroup S0 *)(shmem); - threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096); - - constexpr int NR0 = 64; - constexpr int NR1 = 32; - - constexpr int NK = 32; - constexpr int NL0 = NK/16; - constexpr int NL1 = NK/8; - - const int im = tgpig.z; - const int r0 = tgpig.y*NR0; - const int r1 = tgpig.x*NR1; - - // if this block is of 64x32 shape or smaller - const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0; - const short nr1 = (args.ne1 - r1 < NR1) ? (args.ne1 - r1) : NR1; - - // a thread shouldn't load data outside of the matrix - const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63 - const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31 - - const short il0 = (tiitg % NL0); - - short il = il0; - - const int i12 = im % FC_mul_mm_ne12; - const int i13 = im / FC_mul_mm_ne12; - - const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03; - const short offset1 = il0/nl; - - device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1; - - const short iy = 8*(tiitg % NL1); - - device const T1 * y = (device const T1 *)(src1 - + args.nb13*i13 - + args.nb12*i12 - + args.nb11*(r1 + lr1) - + args.nb10*iy); - - S0_8x8 ma[4]; - S1_8x8 mb[2]; - - simdgroup_float8x8 mc[8]; - - for (short i = 0; i < 8; i++){ - mc[i] = make_filled_simdgroup_matrix(0.f); - } - - for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) { - // load data and store to threadgroup memory - if (is_same::value && FC_mul_mm_bc_inp) { - threadgroup_barrier(mem_flags::mem_threadgroup); - - // no need for dequantization - for (short i = 0; i < 16; i++) { - const short sx = 2*il0 + i/8; - const short sy = (tiitg/NL0)/8; - - //const short lx = i%8; - //const short ly = (tiitg/NL0)%8; - const short lx = (tiitg/NL0)%8; - const short ly = i%8; - - const short ib = 8*sx + sy; - - *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0; - } - } else { - S0_4x4 temp_a; - dequantize_func(x, il, temp_a); - - threadgroup_barrier(mem_flags::mem_threadgroup); - - FOR_UNROLL (short i = 0; i < 16; i++) { - const short sx = 2*il0 + i/8; - const short sy = (tiitg/NL0)/8; - - //const short lx = i%8; - //const short ly = (tiitg/NL0)%8; - const short lx = (tiitg/NL0)%8; - const short ly = i%8; - - const short ib = 8*sx + sy; - - // NOTE: this is massively slower.. WTF? - //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4]; - - *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4]; - } - } - - if (FC_mul_mm_bc_inp) { - for (short i = 0; i < 8; ++i) { - const short sx = (tiitg%NL1); - const short sy = (tiitg/NL1)/8; - - const short lx = i; - const short ly = (tiitg/NL1)%8; - //const short lx = (tiitg/NL1)%8; - //const short ly = i; - - const short ib = 4*sx + sy; - - *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; - } - } else { - const short sx = (tiitg%NL1); - const short sy = (tiitg/NL1)/8; - - //const short dx = sx; - //const short dy = sy; - - const short ly = (tiitg/NL1)%8; - - const short ib = 4*sx + sy; - - *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y)); - } - - il = (il + 2 < nl) ? il + 2 : il % 2; - x = (il < 2) ? x + (2 + nl - 1)/nl : x; - - y += NK; - - threadgroup_barrier(mem_flags::mem_threadgroup); - - // load matrices from threadgroup memory and conduct outer products - threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2)); - threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2)); - - FOR_UNROLL (short ik = 0; ik < NK/8; ik++) { - simdgroup_barrier(mem_flags::mem_none); - - FOR_UNROLL (short i = 0; i < 4; i++) { - simdgroup_load(ma[i], lsma + 64*i, 8, 0, false); - } - - simdgroup_barrier(mem_flags::mem_none); - - FOR_UNROLL (short i = 0; i < 2; i++) { - simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false); - } - - simdgroup_barrier(mem_flags::mem_none); - - FOR_UNROLL (short i = 0; i < 8; i++){ - simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); - } - - lsma += 8*64; - lsmb += 4*64; - } - } - - if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) { - // if no bounds checks on the output are needed, we can directly write to device memory - device float * C = (device float *) dst + - (r0 + 32*(sgitg & 1)) + \ - (r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; - - for (short i = 0; i < 8; i++) { - simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false); - } - } else { - // block is smaller than 64x32, we should avoid writing data outside of the matrix - threadgroup_barrier(mem_flags::mem_threadgroup); - - threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0; - - for (short i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false); - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (sgitg == 0) { - for (int j = tiitg; j < nr1; j += NR1) { - device float * D = (device float *) dst + r0 + (r1 + j)*args.ne0 + im*args.ne1*args.ne0; - device float4 * D4 = (device float4 *) D; - - threadgroup float * C = temp_str + (j*NR0); - threadgroup float4 * C4 = (threadgroup float4 *) C; - - int i = 0; - for (; i < nr0/4; i++) { - *(D4 + i) = *(C4 + i); - } - - i *= 4; - for (; i < nr0; i++) { - *(D + i) = *(C + i); - } - } - } - } -} - -#endif // GGML_METAL_HAS_TENSOR - -template // n_expert_used -kernel void kernel_mul_mm_id_map0( - constant ggml_metal_kargs_mul_mm_id_map0 & args, - device const char * src2, - device char * htpe, - device char * hids, - threadgroup char * shmem [[threadgroup(0)]], - ushort tpitg[[thread_position_in_threadgroup]], - ushort ntg[[threads_per_threadgroup]]) { - const short ide = tpitg; // expert id - - uint32_t n_all = 0; - - device int32_t * ids_i32 = (device int32_t *) hids + ide*args.ne21; - - for (int i21 = 0; i21 < args.ne21; i21 += ntg) { // n_tokens - if (i21 + tpitg < args.ne21) { - device const int32_t * src2_i32 = (device const int32_t *) (src2 + (i21 + tpitg)*args.nb21); - - threadgroup uint16_t * sids = (threadgroup uint16_t *) shmem + tpitg*ne20; - - #pragma unroll(ne20) - for (short i20 = 0; i20 < ne20; i20++) { - sids[i20] = src2_i32[i20]; - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - for (short t = 0; t < ntg; t++) { - if (i21 + t >= args.ne21) { - break; - } - - threadgroup const uint16_t * sids = (threadgroup const uint16_t *) shmem + t*ne20; - - short sel = 0; - #pragma unroll(ne20) - for (short i20 = 0; i20 < ne20; i20++) { - sel += (sids[i20] == ide)*(i20 + 1); - } - - ids_i32[n_all] = (i21 + t)*ne20 + sel - 1; - - n_all += sel > 0; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - device uint32_t * tpe_u32 = (device uint32_t *) (htpe); - tpe_u32[ide] = n_all; -} - -typedef decltype(kernel_mul_mm_id_map0<1>) kernel_mul_mm_id_map0_t; - -template [[host_name("kernel_mul_mm_id_map0_ne20_1" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<1>; -template [[host_name("kernel_mul_mm_id_map0_ne20_2" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<2>; -template [[host_name("kernel_mul_mm_id_map0_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>; -template [[host_name("kernel_mul_mm_id_map0_ne20_5" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<5>; -template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>; -template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>; -template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>; -template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>; -template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>; - -template -kernel void kernel_mul_mm_id( - constant ggml_metal_kargs_mul_mm_id & args, - device const char * src0, - device const char * src1, - device const char * htpe, - device const char * hids, - device char * dst, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - threadgroup S0 * sa = (threadgroup S0 *)(shmem); - threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096); - -#ifdef GGML_METAL_HAS_TENSOR - threadgroup float * sc = (threadgroup float *)(shmem); -#endif - - constexpr int NR0 = 64; - constexpr int NR1 = 32; - - constexpr int NK = 32; - constexpr int NL0 = NK/16; - constexpr int NL1 = NK/8; - - const int im = tgpig.z; // expert - const int r0 = tgpig.y*NR0; - const int r1 = tgpig.x*NR1; - - device const uint32_t * tpe_u32 = (device const uint32_t *) (htpe); - device const int32_t * ids_i32 = (device const int32_t *) (hids); - - const int32_t neh1 = tpe_u32[im]; - - if (r1 >= neh1) { - return; - } - - // if this block is of 64x32 shape or smaller - const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0; - const short nr1 = ( neh1 - r1 < NR1) ? ( neh1 - r1) : NR1; - - // a thread shouldn't load data outside of the matrix - const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63 - const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31 - - const short il0 = (tiitg % NL0); - - short il = il0; - - const int id = ids_i32[im*args.ne21 + r1 + lr1]; - - const short i11 = (id % args.ne20) % args.ne11; - const short i12 = (id / args.ne20); - const short i13 = 0; - - const uint64_t offset0 = im*args.nb02 + i13*args.nb03; - const short offset1 = il0/nl; - - device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1; - - const short iy = 8*(tiitg % NL1); - - device const T1 * y = (device const T1 *)(src1 - + args.nb13*i13 - + args.nb12*i12 - + args.nb11*i11 - + args.nb10*iy); - -#ifndef GGML_METAL_HAS_TENSOR - S0_8x8 ma[4]; - S1_8x8 mb[2]; - - simdgroup_float8x8 mc[8]; - - for (short i = 0; i < 8; i++){ - mc[i] = make_filled_simdgroup_matrix(0.f); - } -#else - auto tA = tensor, tensor_inline>(sa, dextents(NK, NR0)); - auto tB = tensor, tensor_inline>(sb, dextents(NR1, NK )); - - mpp::tensor_ops::matmul2d< - mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate), - execution_simdgroups<4>> mm; - - auto cT = mm.get_destination_cooperative_tensor(); -#endif - - for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) { -#ifndef GGML_METAL_HAS_TENSOR - // load data and store to threadgroup memory - if (is_same::value && FC_mul_mm_bc_inp) { - threadgroup_barrier(mem_flags::mem_threadgroup); - - // no need for dequantization - for (short i = 0; i < 16; i++) { - const short sx = 2*il0 + i/8; - const short sy = (tiitg/NL0)/8; - - //const short lx = i%8; - //const short ly = (tiitg/NL0)%8; - const short lx = (tiitg/NL0)%8; - const short ly = i%8; - - const short ib = 8*sx + sy; - - *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? (S0) *((device T0 *) x + i) : (S0) 0; - } - } else { - S0_4x4 temp_a; - dequantize_func(x, il, temp_a); - - threadgroup_barrier(mem_flags::mem_threadgroup); - - FOR_UNROLL (short i = 0; i < 16; i++) { - const short sx = 2*il0 + i/8; - const short sy = (tiitg/NL0)/8; - - //const short lx = i%8; - //const short ly = (tiitg/NL0)%8; - const short lx = (tiitg/NL0)%8; - const short ly = i%8; - - const short ib = 8*sx + sy; - - // NOTE: this is massively slower.. WTF? - //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4]; - - *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4]; - } - } - - if (FC_mul_mm_bc_inp) { - for (short i = 0; i < 8; ++i) { - const short sx = (tiitg%NL1); - const short sy = (tiitg/NL1)/8; - - const short lx = i; - const short ly = (tiitg/NL1)%8; - //const short lx = (tiitg/NL1)%8; - //const short ly = i; - - const short ib = 4*sx + sy; - - *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; - } - } else { - const short sx = (tiitg%NL1); - const short sy = (tiitg/NL1)/8; - - //const short dx = sx; - //const short dy = sy; - - const short ly = (tiitg/NL1)%8; - - const short ib = 4*sx + sy; - - *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y)); - } -#else - // load data and store to threadgroup memory - if (is_same::value && FC_mul_mm_bc_inp) { - threadgroup_barrier(mem_flags::mem_threadgroup); - - // no need for dequantization - for (short i = 0; i < 16; i++) { - const short sx = 2*il0 + i/8; - const short sy = (tiitg/NL0)/8; - - const short lx = i%8; - const short ly = (tiitg/NL0)%8; - //const short lx = (tiitg/NL0)%8; - //const short ly = i%8; - - *(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0; - } - } else { - S0_4x4 temp_a; - dequantize_func(x, il, temp_a); - - threadgroup_barrier(mem_flags::mem_threadgroup); - - FOR_UNROLL (short i = 0; i < 16; i++) { - const short sx = 2*il0 + i/8; - const short sy = (tiitg/NL0)/8; - - const short lx = i%8; - const short ly = (tiitg/NL0)%8; - //const short lx = (tiitg/NL0)%8; - //const short ly = i%8; - - *(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4]; - } - } - - if (FC_mul_mm_bc_inp) { - for (short i = 0; i < 8; ++i) { - const short sx = (tiitg%NL1); - const short sy = (tiitg/NL1)/8; - - const short lx = i; - const short ly = (tiitg/NL1)%8; - //const short lx = (tiitg/NL1)%8; - //const short ly = i; - - *(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; - } - } else { - const short sx = (tiitg%NL1); - const short sy = (tiitg/NL1)/8; - - //const short lx = i; - const short ly = (tiitg/NL1)%8; - //const short lx = (tiitg/NL1)%8; - //const short ly = i; - - *(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y)); - } -#endif - - il = (il + 2 < nl) ? il + 2 : il % 2; - x = (il < 2) ? x + (2 + nl - 1)/nl : x; - - y += NK; - - threadgroup_barrier(mem_flags::mem_threadgroup); - -#ifndef GGML_METAL_HAS_TENSOR - // load matrices from threadgroup memory and conduct outer products - threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2)); - threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2)); - - FOR_UNROLL (short ik = 0; ik < NK/8; ik++) { - simdgroup_barrier(mem_flags::mem_none); - - FOR_UNROLL (short i = 0; i < 4; i++) { - simdgroup_load(ma[i], lsma + 64*i, 8, 0, false); - } - - simdgroup_barrier(mem_flags::mem_none); - - FOR_UNROLL (short i = 0; i < 2; i++) { - simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false); - } - - simdgroup_barrier(mem_flags::mem_none); - - FOR_UNROLL (short i = 0; i < 8; i++){ - simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); - } - - lsma += 8*64; - lsmb += 4*64; - } -#else - auto sA = tA.slice(0, 0); - auto sB = tB.slice(0, 0); - - mm.run(sB, sA, cT); -#endif - } - - // block is smaller than 64x32, we should avoid writing data outside of the matrix - threadgroup_barrier(mem_flags::mem_threadgroup); - -#ifdef GGML_METAL_HAS_TENSOR - auto tC = tensor, tensor_inline>(sc, dextents(NR0, NR1)); - cT.store(tC); -#else - threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0; - - for (short i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false); - } -#endif - - threadgroup_barrier(mem_flags::mem_threadgroup); - - for (short j = sgitg; j < nr1; j += 4) { - const int id = ids_i32[im*args.ne21 + r1 + j]; - - const short ide = id % args.ne20; - const short idt = id / args.ne20; - - device float * D = (device float *) dst + r0 + ide*args.ne0 + idt*args.ne1*args.ne0; - device float4 * D4 = (device float4 *) D; - - threadgroup float * C = (threadgroup float *) shmem + j*NR0; - threadgroup float4 * C4 = (threadgroup float4 *) C; - - int i = tiisg; - for (; i < nr0/4; i += 32) { - *(D4 + i) = *(C4 + i); - } - - i = (4*(nr0/4)) + tiisg; - for (; i < nr0; i += 32) { - *(D + i) = *(C + i); - } - } -} - -#define QK_NL 16 - -// -// get rows -// - -typedef decltype(kernel_get_rows_f) get_rows_f_t; - -template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f; -template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f; -template [[host_name("kernel_get_rows_i32")]] kernel get_rows_f_t kernel_get_rows_f; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f; -#endif - -typedef decltype(kernel_get_rows_q) get_rows_q_t; - -template [[host_name("kernel_get_rows_q1_0")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_mxfp4")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get_rows_q; - -// -// set rows -// - -typedef decltype(kernel_set_rows_f) set_rows_f_t; - -template [[host_name("kernel_set_rows_f32_i64")]] kernel set_rows_f_t kernel_set_rows_f; -template [[host_name("kernel_set_rows_f32_i32")]] kernel set_rows_f_t kernel_set_rows_f; -template [[host_name("kernel_set_rows_f16_i64")]] kernel set_rows_f_t kernel_set_rows_f; -template [[host_name("kernel_set_rows_f16_i32")]] kernel set_rows_f_t kernel_set_rows_f; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_set_rows_bf16_i64")]] kernel set_rows_f_t kernel_set_rows_f; -template [[host_name("kernel_set_rows_bf16_i32")]] kernel set_rows_f_t kernel_set_rows_f; -#endif - -typedef decltype(kernel_set_rows_q32) set_rows_q32_t; - -template [[host_name("kernel_set_rows_q8_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q8_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q4_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q4_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q4_1_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q4_1_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q5_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q5_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q5_1_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_q5_1_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_iq4_nl_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; -template [[host_name("kernel_set_rows_iq4_nl_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; - -// -// matrix-matrix multiplication -// - -typedef decltype(kernel_mul_mm) mul_mm_t; - -template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm; -#endif -template [[host_name("kernel_mul_mm_q1_0_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_mxfp4_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm; - -template [[host_name("kernel_mul_mm_f32_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_f16_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q1_0_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_0_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_1_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_0_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_1_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q8_0_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_mxfp4_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q2_K_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q3_K_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_K_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_K_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q6_K_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xxs_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xs_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_xxs_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_s_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_s_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_s_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_m_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_nl_f16")]] kernel mul_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_xs_f16")]] kernel mul_mm_t kernel_mul_mm; - -// -// indirect matrix-matrix multiplication -// - -typedef decltype(kernel_mul_mm_id) mul_mm_id; - -template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mul_mm_id kernel_mul_mm_id; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mul_mm_id kernel_mul_mm_id; -#endif -template [[host_name("kernel_mul_mm_id_q1_0_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_mxfp4_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mul_mm_id kernel_mul_mm_id; - -template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q1_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_mxfp4_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; - -// -// matrix-vector multiplication -// - -typedef void (kernel_mul_mv_disp_t)( - ggml_metal_kargs_mul_mv args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig, - ushort tiisg); - -typedef void (kernel_mul_mv2_disp_t)( - ggml_metal_kargs_mul_mv args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiisg, - ushort sgitg); - -template -void mmv_fn( - ggml_metal_kargs_mul_mv args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiitg, - ushort tiisg, - ushort sgitg) { - disp_fn(args, src0, src1, dst, tgpig, tiisg); -} - -template -void mmv_fn( - ggml_metal_kargs_mul_mv args, - device const char * src0, - device const char * src1, - device char * dst, - threadgroup char * shmem, - uint3 tgpig, - ushort tiitg, - ushort tiisg, - ushort sgitg) { - disp_fn(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); -} - -typedef decltype(mmv_fn>) mul_mv_disp_fn_t; - -template -kernel void kernel_mul_mv_id( - constant ggml_metal_kargs_mul_mv_id & args, - device const char * src0s, - device const char * src1, - device char * dst, - device const char * ids, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - const int iid1 = tgpig.z/args.nei0; - const int idx = tgpig.z%args.nei0; - - tgpig.z = 0; - - const int32_t i02 = ((device const int32_t *) (ids + iid1*args.nbi1))[idx]; - - const int64_t i11 = idx % args.ne11; - const int64_t i12 = iid1; - - const int64_t i1 = idx; - const int64_t i2 = i12; - - device const char * src0_cur = src0s + i02*args.nb02; - device const char * src1_cur = src1 + i11*args.nb11 + i12*args.nb12; - - device char * dst_cur = dst + (i1*args.ne0 + i2*args.ne1*args.ne0)*sizeof(float); - - ggml_metal_kargs_mul_mv args0 = { - /*.ne00 =*/ args.ne00, - /*.ne01 =*/ args.ne01, - /*.ne02 =*/ 1, // args.ne02, - /*.nb00 =*/ args.nb00, - /*.nb01 =*/ args.nb01, - /*.nb02 =*/ args.nb02, - /*.nb03 =*/ args.nb02, // args.ne02 == 1 - /*.ne10 =*/ args.ne10, - /*.ne11 =*/ 1, // args.ne11, - /*.ne12 =*/ 1, // args.ne12, - /*.nb10 =*/ args.nb10, - /*.nb11 =*/ args.nb11, - /*.nb12 =*/ args.nb12, - /*.nb13 =*/ args.nb12, // ne12 == 1 - /*.ne0 =*/ args.ne0, - /*.ne1 =*/ 1, // args.ne1, - /*.nr0 =*/ args.nr0, - /*.r2 =*/ 1, - /*.r3 =*/ 1, - }; - - disp_fn( - args0, - /* src0 */ src0_cur, - /* src1 */ src1_cur, - /* dst */ dst_cur, - shmem, - tgpig, - tiitg, - tiisg, - sgitg); -} - -typedef decltype(kernel_mul_mv_id>>) kernel_mul_mv_id_t; - -typedef decltype(kernel_mul_mv_id>>) kernel_mul_mv_id_4_t; - -template [[host_name("kernel_mul_mv_id_f32_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_f16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mv_id_bf16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -#endif -template [[host_name("kernel_mul_mv_id_f32_f32_4")]] kernel kernel_mul_mv_id_4_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_f16_f32_4")]] kernel kernel_mul_mv_id_4_t kernel_mul_mv_id>>; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mv_id_bf16_f32_4")]] kernel kernel_mul_mv_id_4_t kernel_mul_mv_id>>; -#endif - -template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; - -template [[host_name("kernel_mul_mv_id_q1_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_q5_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_q5_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; - -template [[host_name("kernel_mul_mv_id_mxfp4_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; - -template [[host_name("kernel_mul_mv_id_q2_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_q3_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_q4_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_q5_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq1_m_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq2_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; - -kernel void kernel_pool_2d_max_f32( - constant ggml_metal_kargs_pool_2d & args, - device const float * src0, - device float * dst, - uint gid[[thread_position_in_grid]]) { - - if (gid >= args.np) { - return; - } - - const int idx = gid; - const int I_HW = args.IH * args.IW; - const int O_HW = args.OH * args.OW; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / args.OW; - const int cur_ow = idx % O_HW % args.OW; - - device const float * i_ptr = src0 + nc * I_HW; - device float * o_ptr = dst + nc * O_HW; - - const int start_h = cur_oh * args.s1 - args.p1; - const int bh = MAX(0, start_h); - const int eh = MIN(args.IH, start_h + args.k1); - const int start_w = cur_ow * args.s0 - args.p0; - const int bw = MAX(0, start_w); - const int ew = MIN(args.IW, start_w + args.k0); - - float res = -INFINITY; - - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { - res = MAX(res, i_ptr[i * args.IW + j]); - } - } - - o_ptr[cur_oh * args.OW + cur_ow] = res; -} - -kernel void kernel_pool_2d_avg_f32( - constant ggml_metal_kargs_pool_2d & args, - device const float * src0, - device float * dst, - uint gid[[thread_position_in_grid]]) { - - if (gid >= args.np) { - return; - } - - const int idx = gid; - const int I_HW = args.IH * args.IW; - const int O_HW = args.OH * args.OW; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / args.OW; - const int cur_ow = idx % O_HW % args.OW; - - device const float * i_ptr = src0 + nc * I_HW; - device float * o_ptr = dst + nc * O_HW; - - const int start_h = cur_oh * args.s1 - args.p1; - const int bh = MAX(0, start_h); - const int eh = MIN(args.IH, start_h + args.k1); - const int start_w = cur_ow * args.s0 - args.p0; - const int bw = MAX(0, start_w); - const int ew = MIN(args.IW, start_w + args.k0); - // const float scale = 1. / ((eh - bh) * (ew - bw)); - const float scale = 1. / (args.k0 * args.k1); - - float res = 0; - - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { - float cur = i_ptr[i * args.IW + j]; - res += cur * scale; - } - } - - o_ptr[cur_oh * args.OW + cur_ow] = res; -} - - -kernel void kernel_pool_1d_max_f32( - constant ggml_metal_kargs_pool_1d & args, - device const float * src, - device float * dst, - uint gid [[thread_position_in_grid]] -) { - - if (gid >= args.np) { - return; - } - - const int ow = (int)gid % args.OW; - const int row = (int)gid / args.OW; - - const int base = ow * args.s0 - args.p0; - - float acc = -INFINITY; - - const int src_off = row * args.IW; - const int dst_off = row * args.OW; - - for (int ki = 0; ki < args.k0; ++ki) { - int j = base + ki; - if (j < 0 || j >= args.IW){ - continue; - } - float v = src[src_off + j]; - acc = max(acc, v); - } - - dst[dst_off + ow] = acc; -} - -kernel void kernel_pool_1d_avg_f32( - constant ggml_metal_kargs_pool_1d & args, - device const float * src, - device float * dst, - uint gid [[thread_position_in_grid]] -) { - - if (gid >= args.np) { - return; - } - - const int ow = (int)gid % args.OW; - const int row = (int)gid / args.OW; - - const int base = ow * args.s0 - args.p0; - - float acc = 0.0f; - int cnt = 0; - - const int src_off = row * args.IW; - const int dst_off = row * args.OW; - - for (int ki = 0; ki < args.k0; ++ki) { - const int j = base + ki; - if (j < 0 || j >= args.IW) { - continue; - } - acc += src[src_off + j]; - cnt += 1; - } - - dst[dst_off + ow] = (cnt > 0) ? (acc / (float)cnt) : 0.0f; -} - -kernel void kernel_opt_step_adamw_f32( - constant ggml_metal_kargs_opt_step_adamw & args, - device float * x, - device const float * g, - device float * g_m, - device float * g_v, - device const float * pars, - uint gid[[thread_position_in_grid]]) { - - if (gid >= args.np) { - return; - } - - const float alpha = pars[0]; - const float beta1 = pars[1]; - const float beta2 = pars[2]; - const float eps = pars[3]; - const float wd = pars[4]; - const float beta1h = pars[5]; - const float beta2h = pars[6]; - - const float gi = g[gid]; - const float gmi = g_m[gid] * beta1 + gi * (1.0f - beta1); - const float gvi = g_v[gid] * beta2 + gi * gi * (1.0f - beta2); - - g_m[gid] = gmi; - g_v[gid] = gvi; - - const float mh = gmi * beta1h; - const float vh = sqrt(gvi * beta2h) + eps; - - x[gid] = x[gid] * (1.0f - alpha * wd) - alpha * mh / vh; -} - -kernel void kernel_opt_step_sgd_f32( - constant ggml_metal_kargs_opt_step_sgd & args, - device float * x, - device const float * g, - device const float * pars, - uint gid[[thread_position_in_grid]]) { - - if (gid >= args.np) { - return; - } - - x[gid] = x[gid] * (1.0f - pars[0] * pars[1]) - pars[0] * g[gid]; -} - -template -kernel void kernel_memset( - constant ggml_metal_kargs_memset & args, - device T * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = args.val; -} - -typedef decltype(kernel_memset) kernel_memset_t; - -template [[host_name("kernel_memset_i64")]] kernel kernel_memset_t kernel_memset; - -constant short FC_count_equal_nsg [[function_constant(FC_COUNT_EQUAL + 0)]]; - -template -kernel void kernel_count_equal( - constant ggml_metal_kargs_count_equal & args, - device const char * src0, - device const char * src1, - device atomic_int * dst, - threadgroup int32_t * shmem_i32 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const short NSG = FC_count_equal_nsg; - - const int i3 = tgpig.z; - const int i2 = tgpig.y; - const int i1 = tgpig.x; - - if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { - return; - } - - int sum = 0; - - device const char * base0 = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03; - device const char * base1 = src1 + i1*args.nb11 + i2*args.nb12 + i3*args.nb13; - - for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { - const T v0 = *(device const T *)(base0 + i0*args.nb00); - const T v1 = *(device const T *)(base1 + i0*args.nb10); - sum += (v0 == v1); - } - - sum = simd_sum(sum); - - if (tiisg == 0) { - shmem_i32[sgitg] = sum; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (sgitg == 0) { - float v = 0.0f; - if (tpitg.x < NSG) { - v = shmem_i32[tpitg.x]; - } - - float total = simd_sum(v); - if (tpitg.x == 0) { - atomic_fetch_add_explicit(dst, (int32_t) total, memory_order_relaxed); - } - } -} - -typedef decltype(kernel_count_equal) kernel_count_equal_t; - -template [[host_name("kernel_count_equal_i32")]] kernel kernel_count_equal_t kernel_count_equal; diff --git a/ggml/src/ggml-metal/kernels/argsort.metal b/ggml/src/ggml-metal/kernels/argsort.metal new file mode 100644 index 0000000000..7d144fbd75 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/argsort.metal @@ -0,0 +1,232 @@ +#include "common.h" + +// bitonic sort implementation following the CUDA kernels as reference +typedef void (argsort_t)( + constant ggml_metal_kargs_argsort & args, + device const char * src0, + device int32_t * dst, + threadgroup int32_t * shmem_i32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]); + +template +kernel void kernel_argsort_f32_i32( + constant ggml_metal_kargs_argsort & args, + device const char * src0, + device int32_t * dst, + threadgroup int32_t * shmem_i32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + // bitonic sort + const int col = tpitg[0]; + const int ib = tgpig[0] / args.ne01; + + const int i00 = ib*ntg.x; + const int i01 = tgpig[0] % args.ne01; + const int i02 = tgpig[1]; + const int i03 = tgpig[2]; + + device const float * src0_row = (device const float *) (src0 + args.nb01*i01 + args.nb02*i02 + args.nb03*i03); + + // initialize indices + shmem_i32[col] = i00 + col; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (int k = 2; k <= ntg.x; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (shmem_i32[col] >= args.ne00 || + (shmem_i32[ixj] < args.ne00 && (order == GGML_SORT_ORDER_ASC ? + src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]] : + src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]])) + ) { + SWAP(shmem_i32[col], shmem_i32[ixj]); + } + } else { + if (shmem_i32[ixj] >= args.ne00 || + (shmem_i32[col] < args.ne00 && (order == GGML_SORT_ORDER_ASC ? + src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]] : + src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]])) + ) { + SWAP(shmem_i32[col], shmem_i32[ixj]); + } + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + } + } + + const int64_t i0 = ib*args.top_k; + + // copy the result to dst without the padding + if (i0 + col < args.ne0 && col < args.top_k) { + dst += i0 + args.ne0*i01 + args.ne0*args.ne1*i02 + args.ne0*args.ne1*args.ne2*i03; + + dst[col] = shmem_i32[col]; + } +} + +template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32; +template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32; + +typedef void (argsort_merge_t)( + constant ggml_metal_kargs_argsort_merge & args, + device const char * src0, + device const int32_t * tmp, + device int32_t * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]); + +template +kernel void kernel_argsort_merge_f32_i32( + constant ggml_metal_kargs_argsort_merge & args, + device const char * src0, + device const int32_t * tmp, + device int32_t * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + + const int im = tgpig[0] / args.ne01; + const int i01 = tgpig[0] % args.ne01; + const int i02 = tgpig[1]; + const int i03 = tgpig[2]; + + const int start = im * (2 * args.len); + + const int len0 = MIN(args.len, MAX(0, args.ne0 - (int)(start))); + const int len1 = MIN(args.len, MAX(0, args.ne0 - (int)(start + args.len))); + + const int total = len0 + len1; + + device const int32_t * tmp0 = tmp + start + + i01*args.ne0 + + i02*args.ne0*args.ne01 + + i03*args.ne0*args.ne01*args.ne02; + + device const int32_t * tmp1 = tmp0 + args.len; + + dst += start + + i01*args.top_k + + i02*args.top_k*args.ne01 + + i03*args.top_k*args.ne01*args.ne02; + + device const float * src0_row = (device const float *)(src0 + + args.nb01*i01 + + args.nb02*i02 + + args.nb03*i03); + + if (total == 0) { + return; + } + + const int chunk = (total + ntg.x - 1) / ntg.x; + + const int k0 = tpitg.x * chunk; + const int k1 = MIN(MIN(k0 + chunk, total), args.top_k); + + if (k0 >= args.top_k) { + return; + } + + if (k0 >= total) { + return; + } + + int low = k0 > len1 ? k0 - len1 : 0; + int high = MIN(k0, len0); + + // binary-search partition (i, j) such that i + j = k + while (low < high) { + const int mid = (low + high) >> 1; + + const int32_t idx0 = tmp0[mid]; + const int32_t idx1 = tmp1[k0 - mid - 1]; + + const float val0 = src0_row[idx0]; + const float val1 = src0_row[idx1]; + + bool take_left; + if (order == GGML_SORT_ORDER_ASC) { + take_left = (val0 <= val1); + } else { + take_left = (val0 >= val1); + } + + if (take_left) { + low = mid + 1; + } else { + high = mid; + } + } + + int i = low; + int j = k0 - i; + + // keep the merge fronts into registers + int32_t idx0 = 0; + float val0 = 0.0f; + if (i < len0) { + idx0 = tmp0[i]; + val0 = src0_row[idx0]; + } + + int32_t idx1 = 0; + float val1 = 0.0f; + if (j < len1) { + idx1 = tmp1[j]; + val1 = src0_row[idx1]; + } + + for (int k = k0; k < k1; ++k) { + int32_t out_idx; + + if (i >= len0) { + while (k < k1) { + dst[k++] = tmp1[j++]; + } + break; + } else if (j >= len1) { + while (k < k1) { + dst[k++] = tmp0[i++]; + } + break; + } else { + bool take_left; + + if (order == GGML_SORT_ORDER_ASC) { + take_left = (val0 <= val1); + } else { + take_left = (val0 >= val1); + } + + if (take_left) { + out_idx = idx0; + ++i; + if (i < len0) { + idx0 = tmp0[i]; + val0 = src0_row[idx0]; + } + } else { + out_idx = idx1; + ++j; + if (j < len1) { + idx1 = tmp1[j]; + val1 = src0_row[idx1]; + } + } + } + + dst[k] = out_idx; + } +} + +template [[host_name("kernel_argsort_merge_f32_i32_asc")]] kernel argsort_merge_t kernel_argsort_merge_f32_i32; +template [[host_name("kernel_argsort_merge_f32_i32_desc")]] kernel argsort_merge_t kernel_argsort_merge_f32_i32; diff --git a/ggml/src/ggml-metal/kernels/binbcast.metal b/ggml/src/ggml-metal/kernels/binbcast.metal new file mode 100644 index 0000000000..abbca9a64d --- /dev/null +++ b/ggml/src/ggml-metal/kernels/binbcast.metal @@ -0,0 +1,226 @@ +#include "common.h" + +// OP: 0 - add, 1 - sub, 2 - mul, 3 - div +constant short FC_bin_op [[function_constant(FC_BIN + 0)]]; +constant short FC_bin_f [[function_constant(FC_BIN + 1)]]; +constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]]; +constant bool FC_bin_cb [[function_constant(FC_BIN + 3)]]; + +template +kernel void kernel_bin_fuse_impl( + constant ggml_metal_kargs_bin & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { +#define FC_OP FC_bin_op +#define FC_F FC_bin_f +#define FC_RB FC_bin_rb +#define FC_CB FC_bin_cb + + if (FC_RB) { + // row broadcast + const uint i0 = tgpig.y*args.ne00 + tgpig.x; + const uint i1 = FC_CB ? tgpig.x%args.ne10 : tgpig.x; + + device const T0 * src0_row = (device const T0 *) (src0); + device T * dst_row = (device T *) (dst); + + if (FC_F == 1) { + device const T1 * src1_row = (device const T1 *) (src1 + args.o1[0]); + + if (FC_OP == 0) { + dst_row[i0] = src0_row[i0] + src1_row[i1]; + } + + if (FC_OP == 1) { + dst_row[i0] = src0_row[i0] - src1_row[i1]; + } + + if (FC_OP == 2) { + dst_row[i0] = src0_row[i0] * src1_row[i1]; + } + + if (FC_OP == 3) { + dst_row[i0] = src0_row[i0] / src1_row[i1]; + } + } else { + T0 res = src0_row[i0]; + + if (FC_OP == 0) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res += ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } + + if (FC_OP == 1) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res -= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } + + if (FC_OP == 2) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res *= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } + + if (FC_OP == 3) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res /= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } + + dst_row[i0] = res; + } + } else { + const int i03 = tgpig.z; + const int i02 = tgpig.y; + const int i01 = tgpig.x; + + if (i01 >= args.ne01) { + return; + } + + const int i13 = i03%args.ne13; + const int i12 = i02%args.ne12; + const int i11 = i01%args.ne11; + + device const T0 * src0_ptr = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs); + device T * dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs); + + if (FC_F == 1) { + device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i10 = FC_CB ? i0%args.ne10 : i0; + + if (FC_OP == 0) { + dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10]; + } + + if (FC_OP == 1) { + dst_ptr[i0] = src0_ptr[i0] - src1_ptr[i10]; + } + + if (FC_OP == 2) { + dst_ptr[i0] = src0_ptr[i0] * src1_ptr[i10]; + } + + if (FC_OP == 3) { + dst_ptr[i0] = src0_ptr[i0] / src1_ptr[i10]; + } + } + } else { + device const T1 * src1_ptr[8]; + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + src1_ptr[j] = (device const T1 *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); + } + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i10 = FC_CB ? i0%args.ne10 : i0; + + T res = src0_ptr[i0]; + + if (FC_OP == 0) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res += src1_ptr[j][i10]; + } + } + + if (FC_OP == 1) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res -= src1_ptr[j][i10]; + } + } + + if (FC_OP == 2) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res *= src1_ptr[j][i10]; + } + } + + if (FC_OP == 3) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res /= src1_ptr[j][i10]; + } + } + + dst_ptr[i0] = res; + } + } + } + +#undef FC_OP +#undef FC_F +#undef FC_RB +#undef FC_CB +} + +typedef decltype(kernel_bin_fuse_impl) kernel_bin_fuse_t; + +template [[host_name("kernel_bin_fuse_f32_f32_f32")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl; +template [[host_name("kernel_bin_fuse_f32_f32_f32_4")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl; + +kernel void kernel_add_id( + constant ggml_metal_kargs_add_id & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i1 = tgpig.x; + const int i2 = tgpig.y; + + const int i11 = *((device const int32_t *) (src2 + i1*sizeof(int32_t) + i2*args.nb21)); + + const size_t nb1 = args.ne0 * sizeof(float); + const size_t nb2 = args.ne1 * nb1; + + device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2); + device const float * src0_row = (device const float *)((device char *)src0 + i1*args.nb01 + i2*args.nb02); + device const float * src1_row = (device const float *)((device char *)src1 + i11*args.nb11); + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + dst_row[i0] = src0_row[i0] + src1_row[i0]; + } +} + +template +kernel void kernel_repeat( + constant ggml_metal_kargs_repeat & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i3 = tgpig.z; + const int i2 = tgpig.y; + const int i1 = tgpig.x; + + const int i03 = i3%args.ne03; + const int i02 = i2%args.ne02; + const int i01 = i1%args.ne01; + + device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01; + device char * dst_ptr = dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1; + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i00 = i0%args.ne00; + *((device T *)(dst_ptr + i0*args.nb0)) = *((device T *)(src0_ptr + i00*args.nb00)); + } +} + +typedef decltype(kernel_repeat) kernel_repeat_t; + +template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat; +template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_repeat_bf16")]] kernel kernel_repeat_t kernel_repeat; +#endif +template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; +template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; diff --git a/ggml/src/ggml-metal/kernels/common.h b/ggml/src/ggml-metal/kernels/common.h new file mode 100644 index 0000000000..c4d6743944 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/common.h @@ -0,0 +1,126 @@ +#pragma once + +#include "ggml-metal-impl.h" + +#include + +#ifdef GGML_METAL_HAS_TENSOR +#include + +#include +#endif + +using namespace metal; + +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; } + +#define PAD2(x, n) (((x) + (n) - 1) & ~((n) - 1)) + +#define FOR_UNROLL(x) _Pragma("clang loop unroll(full)") for (x) + +#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 + +// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf +// +// cmd: +// .../usr/bin/metal -dM -E -c ggml/src/ggml-metal/kernels/.metal +// .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal/kernels/.metal +// +#if __METAL_VERSION__ < 310 && defined(GGML_METAL_HAS_BF16) +#undef GGML_METAL_HAS_BF16 +#endif + +#if defined(GGML_METAL_HAS_BF16) +typedef matrix bfloat4x4; +typedef matrix bfloat2x4; +#endif + +constexpr constant static float kvalues_iq4nl_f[16] = { + -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f +}; + +constexpr constant static float kvalues_mxfp4_f[16] = { + 0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f +}; + +static inline int best_index_int8(int n, constant float * val, float x) { + if (x <= val[0]) return 0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? mu-1 : mu; +} + +static inline float e8m0_to_fp32(uint8_t x) { + uint32_t bits; + + if (x == 0) { + bits = 0x00400000; + } else { + bits = (uint32_t) x << 23; + } + + return as_type(bits); +} + +static inline float dot(float x, float y) { + return x*y; +} + +static inline float sum(float x) { + return x; +} + +static inline float sum(float4 x) { + return x[0] + x[1] + x[2] + x[3]; +} + +enum ggml_sort_order { + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, +}; + +constant float GELU_COEF_A = 0.044715f; +constant float GELU_QUICK_COEF = -1.702f; +constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; +constant float SQRT_2_INV = 0.70710678118654752440084436210484f; + +// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation +// ref: https://www.johndcook.com/blog/python_erf/ +constant float p_erf = 0.3275911f; +constant float a1_erf = 0.254829592f; +constant float a2_erf = -0.284496736f; +constant float a3_erf = 1.421413741f; +constant float a4_erf = -1.453152027f; +constant float a5_erf = 1.061405429f; + +template +inline T erf_approx(T x) { + T sign_x = sign(x); + x = fabs(x); + T t = 1.0f / (1.0f + p_erf * x); + T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + return sign_x * y; +} + +template T elu_approx(T x); + +template<> inline float elu_approx(float x) { + return (x > 0.f) ? x : (exp(x) - 1); +} + +template<> inline float4 elu_approx(float4 x) { + float4 res; + + res[0] = (x[0] > 0.0f) ? x[0] : (exp(x[0]) - 1.0f); + res[1] = (x[1] > 0.0f) ? x[1] : (exp(x[1]) - 1.0f); + res[2] = (x[2] > 0.0f) ? x[2] : (exp(x[2]) - 1.0f); + res[3] = (x[3] > 0.0f) ? x[3] : (exp(x[3]) - 1.0f); + + return res; +} diff --git a/ggml/src/ggml-metal/kernels/conv.metal b/ggml/src/ggml-metal/kernels/conv.metal new file mode 100644 index 0000000000..33014e0ed0 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/conv.metal @@ -0,0 +1,485 @@ +#include "common.h" + +typedef void (im2col_t)( + constant ggml_metal_kargs_im2col & args, + device const float * x, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template +kernel void kernel_im2col( + constant ggml_metal_kargs_im2col & args, + device const float * x, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { +// const int64_t IC = tgpg[0]; + const int64_t OH = tgpg[1]; + const int64_t OW = tgpg[2]; + + const int64_t KH = ntg[1]; + const int64_t KW = ntg[2]; + + int64_t in = tpitg[0]; + const int64_t ikh = tpitg[1]; + const int64_t ikw = tpitg[2]; + + const int64_t iic = tgpig[0]; + const int64_t ioh = tgpig[1]; + const int64_t iow = tgpig[2]; + + const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0; + const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1; + + int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw); + + device T * pdst = (device T *) (dst); + + if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) { + while (in < args.N) { + pdst[offset_dst] = 0.0f; + offset_dst += ntg[0]*args.CHW*OH*OW; + + in += ntg[0]; + } + } else { + int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw; + + while (in < args.N) { + pdst[offset_dst] = x[offset_src]; + + offset_dst += ntg[0]*args.CHW*OH*OW; + offset_src += ntg[0]*args.ofs0; + + in += ntg[0]; + } + } +} + +template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col; +template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col; + +// TODO: optimize +typedef void (im2col_ext_t)( + constant ggml_metal_kargs_im2col & args, + device const float * x, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template +kernel void kernel_im2col_ext( + constant ggml_metal_kargs_im2col & args, + device const float * x, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1] + const int64_t KHW = (int64_t)args.KHW; + + const int64_t d = tgpig[0] / args.CHW; + const int64_t chw = tgpig[0] % args.CHW; + const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1) + const int64_t HW = tgpig[0] % KHW; + + const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0]; + if (tpitg_0 >= args.N) { + return; + } + + const int64_t tpitg_1 = HW / args.KW; + const int64_t tpitg_2 = HW % args.KW; + + const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0; + const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1; + + const int64_t offset_dst = + (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW + + (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2); + + device T * pdst = (device T *) (dst); + + if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) { + pdst[offset_dst] = 0.0f; + } else { + const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1; + pdst[offset_dst] = x[offset_src + iih * args.IW + iiw]; + } +} + +template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext; +template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext; + +template +kernel void kernel_conv_2d( + constant ggml_metal_kargs_conv_2d & args, + device const char * weights, + device const char * src, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const uint threads_per_tg = ntg.x * ntg.y * ntg.z; + const uint tg_index = (tgpig.z * tgpg.y + tgpig.y) * tgpg.x + tgpig.x; + const uint local_thread = tpitg.z * (ntg.x * ntg.y) + tpitg.y * ntg.x + tpitg.x; + const uint thread_index = tg_index * threads_per_tg + local_thread; + const uint64_t total_threads = (uint64_t) threads_per_tg * tgpg.x * tgpg.y * tgpg.z; + const uint64_t total_outputs = (uint64_t) args.N * args.OC * args.OH * args.OW; + + for (uint64_t index = thread_index; index < total_outputs; index += total_threads) { + uint64_t tmp = index; + + const int32_t ow = tmp % args.OW; tmp /= args.OW; + const int32_t oh = tmp % args.OH; tmp /= args.OH; + const int32_t oc = tmp % args.OC; tmp /= args.OC; + const int32_t n = tmp; + + float acc = 0.0f; + + const int32_t base_x = ow*args.s0 - args.p0; + const int32_t base_y = oh*args.s1 - args.p1; + + int32_t ky_start = 0; + if (base_y < 0) { + ky_start = (-base_y + args.d1 - 1)/args.d1; + } + int32_t ky_end = args.KH; + const int32_t y_max = args.IH - 1 - base_y; + if (y_max < 0) { + ky_end = ky_start; + } else if (base_y + (args.KH - 1)*args.d1 >= args.IH) { + ky_end = min(ky_end, y_max/args.d1 + 1); + } + + int32_t kx_start = 0; + if (base_x < 0) { + kx_start = (-base_x + args.d0 - 1)/args.d0; + } + int32_t kx_end = args.KW; + const int32_t x_max = args.IW - 1 - base_x; + if (x_max < 0) { + kx_end = kx_start; + } else if (base_x + (args.KW - 1)*args.d0 >= args.IW) { + kx_end = min(kx_end, x_max/args.d0 + 1); + } + + if (ky_start < ky_end && kx_start < kx_end) { + const uint64_t src_base_n = (uint64_t) n * args.nb13; + const uint64_t w_base_oc = (uint64_t) oc * args.nb03; + + for (int32_t ic = 0; ic < args.IC; ++ic) { + const uint64_t src_base_nc = src_base_n + (uint64_t) ic * args.nb12; + const uint64_t w_base_ocic = w_base_oc + (uint64_t) ic * args.nb02; + + for (int32_t ky = ky_start; ky < ky_end; ++ky) { + const int32_t iy = base_y + ky*args.d1; + const uint64_t src_base_row = src_base_nc + (uint64_t) iy * args.nb11; + const uint64_t w_base_row = w_base_ocic + (uint64_t) ky * args.nb01; + + for (int32_t kx = kx_start; kx < kx_end; ++kx) { + const int32_t ix = base_x + kx*args.d0; + const uint64_t src_offs = src_base_row + (uint64_t) ix * args.nb10; + const uint64_t w_offs = w_base_row + (uint64_t) kx * args.nb00; + + const float x = *(device const float *)(src + src_offs); + const float w = (float) (*(device const TK *)(weights + w_offs)); + + acc += x * w; + } + } + } + } + + const uint64_t dst_offs = + (uint64_t) n * args.nb3 + + (uint64_t) oc * args.nb2 + + (uint64_t) oh * args.nb1 + + (uint64_t) ow * args.nb0; + + *(device float *)(dst + dst_offs) = acc; + } +} + +template [[host_name("kernel_conv_2d_f32_f32")]] +kernel void kernel_conv_2d( + constant ggml_metal_kargs_conv_2d & args, + device const char * weights, + device const char * src, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template [[host_name("kernel_conv_2d_f16_f32")]] +kernel void kernel_conv_2d( + constant ggml_metal_kargs_conv_2d & args, + device const char * weights, + device const char * src, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +typedef void (conv_transpose_1d_t)( + constant ggml_metal_kargs_conv_transpose_1d & args, + device const float * src0, + device const float * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]]); + +template +kernel void kernel_conv_transpose_1d( + constant ggml_metal_kargs_conv_transpose_1d & args, + device const T * src0, + device const float * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]]) { + + // For output position j on the time axis, only input positions + // i such that i*s0 <= j < i*s0 + K + // contribute -- i.e. i in [ceil((j - K + 1)/s0), floor(j/s0)] + // intersected with [0, IL-1]. That's at most ceil(K/s0) values + // (typically 2 for stride==K/2 transposed convs). + const int32_t j = tgpig[0]; + const int32_t s0 = args.s0; + const int32_t K = args.K; + const int32_t IL = args.IL; + + int32_t i_min; + { + int32_t a = j - K + 1; + i_min = a <= 0 ? 0 : (a + s0 - 1) / s0; // ceil(a/s0) for a>0 + } + int32_t i_max = j / s0; + if (i_max > IL - 1) i_max = IL - 1; + + float v = 0.0f; + if (i_min <= i_max) { + for (int64_t c = 0; c < args.IC; c++) { + const int32_t kernel_offset = c * tgpg[1] * K + K * tgpig[1]; + const int32_t input_offset = c * IL; + + for (int32_t i = i_min; i <= i_max; i++) { + v += float(src0[kernel_offset + j - i * s0]) * src1[input_offset + i]; + } + } + } + + device float * dst_ptr = (device float *) (dst + tgpig[0] * args.nb0 + tgpig[1] * args.nb1); + + dst_ptr[0] = v; +} + +template [[host_name("kernel_conv_transpose_1d_f32_f32")]] +kernel void kernel_conv_transpose_1d( + constant ggml_metal_kargs_conv_transpose_1d & args, + device const float * src0, + device const float * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]]); + +template [[host_name("kernel_conv_transpose_1d_f16_f32")]] +kernel void kernel_conv_transpose_1d( + constant ggml_metal_kargs_conv_transpose_1d & args, + device const half * src0, + device const float * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]]); + + +typedef void (conv_transpose_2d_t)( + constant ggml_metal_kargs_conv_transpose_2d & args, + device const float * src0, + device const float * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]]); + +template +kernel void kernel_conv_transpose_2d( + constant ggml_metal_kargs_conv_transpose_2d & args, + device const T * src0, + device const float * src1, + device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t out_x = tgpig[0]; + const int64_t out_y = tgpig[1]; + const int64_t out_c = tgpig[2]; + + const int64_t kw = tpitg[0]; + const int64_t kh = tpitg[1]; + + float v = 0.0f; + + for (int64_t in_c = 0; in_c < args.IC; in_c++) { + int64_t in_y = out_y - kh; + + if (in_y < 0 || in_y % args.s0) continue; + + in_y /= args.s0; + + if (in_y >= args.IH) continue; + + int64_t in_x = out_x - kw; + + if (in_x < 0 || in_x % args.s0) continue; + + in_x /= args.s0; + + if (in_x >= args.IW) continue; + + const int64_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x; + const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw; + + v += (float)src0[kernel_idx] * src1[input_idx]; + } + + const uint tid = tpitg.y * ntg.x + tpitg.x; + shared_sum[tid] = v; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tid == 0) { + float total = 0.0f; + const uint num_threads = ntg.x * ntg.y; + for (uint i = 0; i < num_threads; i++) { + total += shared_sum[i]; + } + + device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y * args.nb1 + out_c*args.nb2); + dst_ptr[0] = total; + } +} + +template [[host_name("kernel_conv_transpose_2d_f32_f32")]] +kernel void kernel_conv_transpose_2d( + constant ggml_metal_kargs_conv_transpose_2d & args, + device const float * src0, + device const float * src1, + device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template [[host_name("kernel_conv_transpose_2d_f16_f32")]] +kernel void kernel_conv_transpose_2d( + constant ggml_metal_kargs_conv_transpose_2d & args, + device const half * src0, + device const float * src1, + device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template +kernel void kernel_conv_3d( + constant ggml_metal_kargs_conv_3d & args, + device const char * src0, // Weights [IC * OC, KD, KH, KW] + device const char * src1, // Inputs [IC * N, ID, IH, IW] + device char * dst, // Outputs [OC * N, OD, OH, OW] + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]) { + + // 1. Un-flatten the spatial dimension from Grid X + int64_t spatial_idx = tgpig.x * 32 + tpitg.x; + + if (spatial_idx >= args.OW * args.OH * args.OD) { + return; // Thread falls outside the spatial volume + } + + int64_t od = spatial_idx / (args.OW * args.OH); + int64_t oh = (spatial_idx / args.OW) % args.OH; + int64_t ow = spatial_idx % args.OW; + + // 2. Map Y to Channels, Z to Batch + int64_t oc = tgpig.y; + int64_t batch_idx = tgpig.z; + + // 3. Calculate anchor coordinates in the Input volume + int64_t i_w_base = ow * args.s0 - args.p0; + int64_t i_h_base = oh * args.s1 - args.p1; + int64_t i_d_base = od * args.s2 - args.p2; + + float sum = 0.0f; + + // 4. Gather Loop (Iterate over Input Channels -> Depth -> Height -> Width) + for (int64_t ic = 0; ic < args.IC; ++ic) { + + // ggml packs batch and channel together in the 4th dimension + int64_t src_cn_idx = batch_idx * args.IC + ic; + int64_t w_cn_idx = oc * args.IC + ic; + + for (int64_t kz = 0; kz < args.KD; ++kz) { + int64_t id = i_d_base + kz * args.d2; + if (id < 0 || id >= args.ID) continue; // Boundary check (Padding) + + for (int64_t ky = 0; ky < args.KH; ++ky) { + int64_t ih = i_h_base + ky * args.d1; + if (ih < 0 || ih >= args.IH) continue; + + for (int64_t kx = 0; kx < args.KW; ++kx) { + int64_t iw = i_w_base + kx * args.d0; + if (iw < 0 || iw >= args.IW) continue; + + // Convert multi-dimensional coordinates to flat byte offsets + int64_t w_idx = kx*args.nb00 + ky*args.nb01 + kz*args.nb02 + w_cn_idx*args.nb03; + int64_t i_idx = iw*args.nb10 + ih*args.nb11 + id*args.nb12 + src_cn_idx*args.nb13; + + // Dereference memory and cast weights to f32 if they were f16 + float w_val = (float)*(device const T*)((device const char*)src0 + w_idx); + float i_val = *(device const float*)((device const char*)src1 + i_idx); + + sum += w_val * i_val; + } + } + } + } + + // 5. Write the accumulated value out to RAM + int64_t dst_cn_idx = batch_idx * args.OC + oc; + int64_t d_idx = ow*args.nb0 + oh*args.nb1 + od*args.nb2 + dst_cn_idx*args.nb3; + + *(device float*)(dst + d_idx) = sum; +} + +// Explicit instantiations so the JIT compiler can find them by name +template [[host_name("kernel_conv_3d_f32_f32")]] +kernel void kernel_conv_3d( + constant ggml_metal_kargs_conv_3d & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]); + +// Explicit instantiation for f16 weights +template [[host_name("kernel_conv_3d_f16_f32")]] +kernel void kernel_conv_3d( + constant ggml_metal_kargs_conv_3d & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]); diff --git a/ggml/src/ggml-metal/kernels/dequantize.h b/ggml/src/ggml-metal/kernels/dequantize.h new file mode 100644 index 0000000000..16a6cad561 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/dequantize.h @@ -0,0 +1,686 @@ +#pragma once + +#include "common.h" + +#define GGML_COMMON_DECL_METAL +#define GGML_COMMON_IMPL_METAL +#if defined(GGML_METAL_EMBED_LIBRARY) +__embed_ggml-common.h__ +#else +#include "ggml-common.h" +#endif + +#define QK_NL 16 // shared by mul_mm and get_rows_q instantiations + +// NOTE: this is not dequantizing - we are simply fitting the template +template +void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { + reg = (type4x4)(*src); +} + +template +void dequantize_f32_t4(device const float4 * src, short il, thread type4 & reg) { + reg = (type4)(*src); +} + +template +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + reg = (type4x4)(*src); +} + +template +void dequantize_f16_t4(device const half4 * src, short il, thread type4 & reg) { + reg = (type4)(*(src)); +} + +#if defined(GGML_METAL_HAS_BF16) +template +void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) { + reg = (type4x4)(*src); +} + +template +void dequantize_bf16_t4(device const bfloat4 * src, short il, thread type4 & reg) { + reg = (type4)(*(src)); +} +#endif + +template +void dequantize_q1_0(device const block_q1_0 * xb, short il, thread type4x4 & reg) { + device const uint8_t * qs = xb->qs; + const float d = xb->d; + const float neg_d = -d; + + const int byte_offset = il * 2; // il*16 bits = il*2 bytes + const uint8_t b0 = qs[byte_offset]; + const uint8_t b1 = qs[byte_offset + 1]; + + float4x4 reg_f; + + reg_f[0][0] = select(neg_d, d, bool(b0 & 0x01)); + reg_f[0][1] = select(neg_d, d, bool(b0 & 0x02)); + reg_f[0][2] = select(neg_d, d, bool(b0 & 0x04)); + reg_f[0][3] = select(neg_d, d, bool(b0 & 0x08)); + reg_f[1][0] = select(neg_d, d, bool(b0 & 0x10)); + reg_f[1][1] = select(neg_d, d, bool(b0 & 0x20)); + reg_f[1][2] = select(neg_d, d, bool(b0 & 0x40)); + reg_f[1][3] = select(neg_d, d, bool(b0 & 0x80)); + + reg_f[2][0] = select(neg_d, d, bool(b1 & 0x01)); + reg_f[2][1] = select(neg_d, d, bool(b1 & 0x02)); + reg_f[2][2] = select(neg_d, d, bool(b1 & 0x04)); + reg_f[2][3] = select(neg_d, d, bool(b1 & 0x08)); + reg_f[3][0] = select(neg_d, d, bool(b1 & 0x10)); + reg_f[3][1] = select(neg_d, d, bool(b1 & 0x20)); + reg_f[3][2] = select(neg_d, d, bool(b1 & 0x40)); + reg_f[3][3] = select(neg_d, d, bool(b1 & 0x80)); + + reg = (type4x4) reg_f; +} + +template +void dequantize_q1_0_t4(device const block_q1_0 * xb, short il, thread type4 & reg) { + const float d = xb->d; + const float neg_d = -d; + const int base = il * 4; + const uint8_t byte = xb->qs[base / 8]; + const int s = base % 8; + + float4 reg_f; + reg_f[0] = select(neg_d, d, bool((byte >> (s )) & 1)); + reg_f[1] = select(neg_d, d, bool((byte >> (s + 1)) & 1)); + reg_f[2] = select(neg_d, d, bool((byte >> (s + 2)) & 1)); + reg_f[3] = select(neg_d, d, bool((byte >> (s + 3)) & 1)); + + reg = (type4) reg_f; +} + +template +void dequantize_q4_0(device const block_q4_0 * xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + float4x4 reg_f; + + for (int i = 0; i < 8; i++) { + reg_f[i/2][2*(i%2) + 0] = d1 * (qs[i] & mask0) + md; + reg_f[i/2][2*(i%2) + 1] = d2 * (qs[i] & mask1) + md; + } + + reg = (type4x4) reg_f; +} + +template +void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const float d1 = (il/4) ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; + const ushort mask0 = (il/4) ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i = 0; i < 2; i++) { + reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + md; + reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + md; + } +} + + + +template +void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + float4x4 reg_f; + + for (int i = 0; i < 8; i++) { + reg_f[i/2][2*(i%2) + 0] = ((qs[i] & mask0) * d1) + m; + reg_f[i/2][2*(i%2) + 1] = ((qs[i] & mask1) * d2) + m; + } + + reg = (type4x4) reg_f; +} + +template +void dequantize_q4_1_t4(device const block_q4_1 * xb, short il, thread type4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const float d1 = (il/4) ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; + const ushort mask0 = (il/4) ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i = 0; i < 2; i++) { + reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + m; + reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + m; + } +} + +template +void dequantize_q5_0(device const block_q5_0 * xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 3); + const float d = xb->d; + const float md = -16.h * xb->d; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; + + float4x4 reg_f; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg_f[i/2][2*(i%2) + 0] = d * x0 + md; + reg_f[i/2][2*(i%2) + 1] = d * x1 + md; + } + + reg = (type4x4) reg_f; +} + +template +void dequantize_q5_0_t4(device const block_q5_0 * xb, short il, thread type4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 3); + const float d = xb->d; + const float md = -16.h * xb->d; + const ushort mask = (il/4) ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = (il/4) ? 4 : 0; + + const int gh_mv = (il/4) ? 12 : 0; + const int gh_bk = (il/4) ? 0 : 4; + + for (int ii = 0; ii < 2; ii++) { + int i = 2*(il%4) + ii; + + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[2*ii + 0] = d * x0 + md; + reg[2*ii + 1] = d * x1 + md; + } +} + +template +void dequantize_q5_1(device const block_q5_1 * xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 4); + const float d = xb->d; + const float m = xb->m; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; + + float4x4 reg_f; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg_f[i/2][2*(i%2) + 0] = d * x0 + m; + reg_f[i/2][2*(i%2) + 1] = d * x1 + m; + } + + reg = (type4x4) reg_f; +} + +template +void dequantize_q5_1_t4(device const block_q5_1 * xb, short il, thread type4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 4); + const float d = xb->d; + const float m = xb->m; + const ushort mask = (il/4) ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = (il/4) ? 4 : 0; + + const int gh_mv = (il/4) ? 12 : 0; + const int gh_bk = (il/4) ? 0 : 4; + + for (int ii = 0; ii < 2; ii++) { + int i = 2*(il%4) + ii; + + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[2*ii + 0] = d * x0 + m; + reg[2*ii + 1] = d * x1 + m; + } +} + +template +void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { + device const int8_t * qs = ((device const int8_t *)xb->qs); + const float d = xb->d; + + float4x4 reg_f; + + for (int i = 0; i < 16; i++) { + reg_f[i/4][i%4] = (qs[i + 16*il] * d); + } + + reg = (type4x4) reg_f; +} + +template +void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & reg) { + device const int8_t * qs = ((device const int8_t *)xb->qs); + const float d = xb->d; + + for (int i = 0; i < 4; i++) { + reg[i] = (qs[4*(il%4) + i + 16*(il/4)] * d); + } +} + +template +void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) { + device const uint8_t * q2 = (device const uint8_t *)xb->qs; + + const float d = e8m0_to_fp32(xb->e); + const uint8_t shr = il >= 1 ? 4 : 0; + + for (int i = 0; i < 4; ++i) { + reg[i][0] = d * kvalues_mxfp4_f[(q2[4*i + 0] >> shr) & 0x0F]; + reg[i][1] = d * kvalues_mxfp4_f[(q2[4*i + 1] >> shr) & 0x0F]; + reg[i][2] = d * kvalues_mxfp4_f[(q2[4*i + 2] >> shr) & 0x0F]; + reg[i][3] = d * kvalues_mxfp4_f[(q2[4*i + 3] >> shr) & 0x0F]; + } +} + +template +void dequantize_mxfp4_t4(device const block_mxfp4 * xb, short il, thread type4 & reg) { + device const uint8_t * q2 = (device const uint8_t *)xb->qs; + + const float d = e8m0_to_fp32(xb->e); + const short il4 = il%4; + + const uint8_t shr = il >= 4 ? 4 : 0; + + reg[0] = d * kvalues_mxfp4_f[(q2[4*il4 + 0] >> shr) & 0x0F]; + reg[1] = d * kvalues_mxfp4_f[(q2[4*il4 + 1] >> shr) & 0x0F]; + reg[2] = d * kvalues_mxfp4_f[(q2[4*il4 + 2] >> shr) & 0x0F]; + reg[3] = d * kvalues_mxfp4_f[(q2[4*il4 + 3] >> shr) & 0x0F]; +} + +template +void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { + const float d = xb->d; + const float min = xb->dmin; + device const uint8_t * q = (device const uint8_t *)xb->qs; + float dl, ml; + uint8_t sc = xb->scales[il]; + + q = q + 32*(il/8) + 16*(il&1); + il = (il/2)%4; + + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * q = (device const uint8_t *)xb->qs; + device const uint8_t * h = (device const uint8_t *)xb->hmask; + device const int8_t * scales = (device const int8_t *)xb->scales; + + q = q + 32 * (il/8) + 16 * (il&1); + h = h + 16 * (il&1); + uint8_t m = 1 << (il/2); + uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ + ((il/4)>0 ? 12 : 3); + uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; + uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) + : (scale_2&kmask2) | ((scale_1&kmask1) << 4); + float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); + const float ml = 4.f * dl; + + il = (il/2) & 3; + const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl *= coef; + + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); + } +} + +static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { + return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} + : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; +} + +template +void dequantize_q4_K(device const block_q4_K * xb, short il, thread type4x4 & reg) { + device const uchar * q = xb->qs; + + short is = (il/4) * 2; + q = q + (il/4) * 32 + 16 * (il&1); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const float d = il < 2 ? xb->d : xb->d / 16.h; + const float min = xb->dmin; + const float dl = d * sc[0]; + const float ml = min * sc[1]; + + const ushort mask = il < 2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + device const uint8_t * qh = xb->qh; + + short is = (il/4) * 2; + q = q + 32 * (il/4) + 16 * (il&1); + qh = qh + 16 * (il&1); + uint8_t ul = 1 << (il/2); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const float d = il < 2 ? xb->d : xb->d / 16.f; + const float min = xb->dmin; + const float dl = d * sc[0]; + const float ml = min * sc[1]; + + const ushort mask = il<2 ? 0x0F : 0xF0; + const float qh_val = il<2 ? 16.f : 256.f; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; + } +} + +template +void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint16_t * ql = (device const uint16_t *)xb->ql; + device const uint16_t * qh = (device const uint16_t *)xb->qh; + device const int8_t * scales = (device const int8_t *)xb->scales; + + ql = ql + 32*(il/8) + 16*((il/2)&1) + 8*(il&1); + qh = qh + 16*(il/8) + 8*(il&1); + float sc = scales[(il%2) + 2 * ((il/2))]; + il = (il/2) & 3; + + const uint32_t kmask1 = il>1 ? (il>2 ? 0xC0C0C0C0 : 0x30303030) : (il>0 ? 0x0C0C0C0C : 0x03030303); + const uint32_t kmask2 = il>1 ? 0xF0F0F0F0 : 0x0F0F0F0F; + const float ml = d_all * sc * 32.f; + const float dl0 = d_all * sc; + const float dl1 = dl0 / 256.f; + const float dl2 = dl0 / (256.f * 256.f); + const float dl3 = dl0 / (256.f * 256.f * 256.f); + const uint8_t shr_h = il>2 ? 2 : 0; + const uint8_t shl_h = il>1 ? 0 : (il>0 ? 2 : 4); + const uint8_t shr_l = il>1 ? 4 : 0; + for (int i = 0; i < 4; ++i) { + const uint32_t low = (ql[2*i] | (uint32_t)(ql[2*i+1] << 16)) & kmask2; + const uint32_t high = (qh[2*i] | (uint32_t)(qh[2*i+1] << 16)) & kmask1; + const uint32_t q = ((high << shl_h) >> shr_h) | (low >> shr_l); + reg[i][0] = dl0 * ((half)(q & 0xFF)) - ml; + reg[i][1] = dl1 * ((float)(q & 0xFF00)) - ml; + reg[i][2] = dl2 * ((float)(q & 0xFF0000)) - ml; + reg[i][3] = dl3 * ((float)(q & 0xFF000000)) - ml; + } +} + +template +void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's. + device const uint16_t * q2 = xb->qs + 4*ib32; + const uint32_t aux32_g = q2[0] | (q2[1] << 16); + const uint32_t aux32_s = q2[2] | (q2[3] << 16); + thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; + const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); + signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } +} + +template +void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint16_t * q2 = xb->qs + 4*ib32; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); + uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); + signs = ksigns_iq2xs[q2[2*il+1] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } +} + +template +void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * q3 = xb->qs + 8*ib32; + device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32; + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f; + constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]); + constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]); + uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127]; + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } + grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]); + grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]); + signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127]; + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } +} + +template +void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * qs = xb->qs + 8*ib32; + device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; + const uint8_t qh = xb->qh[ib32] >> 4*il; + const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)); + constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256))); + constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); + reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); + } + grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256))); + grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256))); + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); + reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); + } +} + +template +void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint8_t * signs = qs + QK_K/8; + const uint8_t qh = xb->qh[ib32] >> 4*il; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300))); + constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300))); + for (int i = 0; i < 8; ++i) { + reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]); + reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]); + } +} + +template +void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + const float d = xb->d; + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint16_t * qh = xb->qh; + const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1); + const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA); + const uint16_t h = qh[ib32] >> 6*il; + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) + ml; + reg[1][i] = dl * (grid1[i] >> 4) + ml; + reg[2][i] = dl * (grid2[i] & 0xf) + ml; + reg[3][i] = dl * (grid2[i] >> 4) + ml; + } +} + +template +void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + device const uint16_t * sc = (device const uint16_t *)xb->scales; + + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = scale.f16; + + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint8_t * qh = xb->qh + 2*ib32 + il; + + const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); + const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) + ml1; + reg[1][i] = dl * (grid1[i] >> 4) + ml1; + reg[2][i] = dl * (grid2[i] & 0xf) + ml2; + reg[3][i] = dl * (grid2[i] >> 4) + ml2; + } +} + +template +void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { + device const uint16_t * q4 = (device const uint16_t *)xb->qs; + const float d = xb->d; + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + +template +void dequantize_iq4_nl_t4(device const block_iq4_nl * xb, short il, thread type4 & reg) { + device const uint16_t * q4 = (device const uint16_t *)xb->qs; + const float d = xb->d; + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + aux32 = ((q4[2*(il%4)] | (q4[2*(il%4)+1] << 16)) >> 4*(il/4)) & 0x0f0f0f0f; + reg[0] = d * kvalues_iq4nl_f[q8[0]]; + reg[1] = d * kvalues_iq4nl_f[q8[1]]; + reg[2] = d * kvalues_iq4nl_f[q8[2]]; + reg[3] = d * kvalues_iq4nl_f[q8[3]]; +} + +template +void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32; + const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)xb->d * (ls - 32); + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + diff --git a/ggml/src/ggml-metal/kernels/fa.metal b/ggml/src/ggml-metal/kernels/fa.metal new file mode 100644 index 0000000000..e8edb8411d --- /dev/null +++ b/ggml/src/ggml-metal/kernels/fa.metal @@ -0,0 +1,1656 @@ +#include "common.h" +#include "dequantize.h" + +constant bool FC_flash_attn_ext_pad_has_mask [[function_constant(FC_FLASH_ATTN_EXT_PAD + 0)]]; + +constant int32_t FC_flash_attn_ext_pad_ncpsg [[function_constant(FC_FLASH_ATTN_EXT_PAD + 25)]]; + +// pad the last chunk of C elements of k and v into a an extra pad buffer +kernel void kernel_flash_attn_ext_pad( + constant ggml_metal_kargs_flash_attn_ext_pad & args, + device const char * k, + device const char * v, + device const char * mask, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int32_t C = FC_flash_attn_ext_pad_ncpsg; + + device char * k_pad = dst; + device char * v_pad = k_pad + args.nb11*C*args.ne_12_2*args.ne_12_3; + device char * mask_pad = v_pad + args.nb21*C*args.ne_12_2*args.ne_12_3; + + const int32_t icp = args.ne11 % C; + const int32_t ic0 = args.ne11 - icp; + + const int32_t i1 = tgpig[0]; + const int32_t i2 = tgpig[1]; + const int32_t i3 = tgpig[2]; + + if (i2 < args.ne_12_2 && i3 < args.ne_12_3) { + device const char * k_src = k + args.nb11*(ic0 + i1) + args.nb12*i2 + args.nb13*i3; + device const char * v_src = v + args.nb21*(ic0 + i1) + args.nb22*i2 + args.nb23*i3; + + device char * k_dst = k_pad + args.nb11*i1 + args.nb11*C*i2 + args.nb11*C*args.ne_12_2*i3; + device char * v_dst = v_pad + args.nb21*i1 + args.nb21*C*i2 + args.nb21*C*args.ne_12_2*i3; + + if (i1 >= icp) { + // here it is not important the exact value that will be used as we rely on masking out the scores in the attention + for (uint64_t i = tiitg; i < args.nb11; i += ntg.x) { + k_dst[i] = 0; + } + for (uint64_t i = tiitg; i < args.nb21; i += ntg.x) { + v_dst[i] = 0; + } + } else { + for (uint64_t i = tiitg; i < args.nb11; i += ntg.x) { + k_dst[i] = k_src[i]; + } + for (uint64_t i = tiitg; i < args.nb21; i += ntg.x) { + v_dst[i] = v_src[i]; + } + } + } + + if (FC_flash_attn_ext_pad_has_mask) { + if (i2 < args.ne32 && i3 < args.ne33) { + for (int ib = i1; ib < args.ne31; ib += C) { + device const half * mask_src = (device const half *)(mask + args.nb31*ib + args.nb32*i2 + args.nb33*i3) + ic0; + device half * mask_dst = (device half *)(mask_pad) + C*ib + C*args.ne31*i2 + C*args.ne31*args.ne32*i3; + + for (int i = tiitg; i < C; i += ntg.x) { + if (i >= icp) { + mask_dst[i] = -MAXHALF; + } else { + mask_dst[i] = mask_src[i]; + } + } + } + } + } +} + +constant int32_t FC_flash_attn_ext_blk_nqptg [[function_constant(FC_FLASH_ATTN_EXT_BLK + 24)]]; +constant int32_t FC_flash_attn_ext_blk_ncpsg [[function_constant(FC_FLASH_ATTN_EXT_BLK + 25)]]; + +// scan the blocks of the mask that are not masked +// 0 - masked (i.e. full of -INF, skip) +// 1 - not masked (i.e. at least one element of the mask is not -INF) +// 2 - all zero +kernel void kernel_flash_attn_ext_blk( + constant ggml_metal_kargs_flash_attn_ext_blk & args, + device const char * mask, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]]) { + // block size C x Q + const int32_t Q = FC_flash_attn_ext_blk_nqptg; + const int32_t C = FC_flash_attn_ext_blk_ncpsg; + + constexpr short NW = N_SIMDWIDTH; + + const int32_t i3 = tgpig[2]/args.ne32; + const int32_t i2 = tgpig[2]%args.ne32; + const int32_t i1 = tgpig[1]; + const int32_t i0 = tgpig[0]; + + char res = i0*C + C > args.ne30 ? 1 : 0; + + device const half * mask_src = (device const half *) (mask + (i1*Q)*args.nb31 + i2*args.nb32 + i3*args.nb33) + i0*C + tiisg; + + // detailed check of the elements of the block + if ((C > NW || Q > 1) && res == 0) { + half mmin = MAXHALF; + half mmax = -MAXHALF; + + FOR_UNROLL (short j = 0; j < Q; ++j) { + FOR_UNROLL (short ii = 0; ii < C/NW; ++ii) { + mmin = min(mmin, mask_src[ii*NW]); + mmax = max(mmax, mask_src[ii*NW]); + } + + mask_src += args.nb31/2; + } + + mmin = simd_min(mmin); + mmax = simd_max(mmax); + + if (mmax > -MAXHALF) { + if (mmin == 0.0 && mmax == 0.0) { + res = 2; + } else { + res = 1; + } + } + } + + const int32_t nblk1 = ((args.ne01 + Q - 1)/Q); + const int32_t nblk0 = ((args.ne30 + C - 1)/C); + + if (tiisg == 0) { + dst[((i3*args.ne32 + i2)*nblk1 + i1)*nblk0 + i0] = res; + } +} + +constant bool FC_flash_attn_ext_has_mask [[function_constant(FC_FLASH_ATTN_EXT + 0)]]; +constant bool FC_flash_attn_ext_has_sinks [[function_constant(FC_FLASH_ATTN_EXT + 1)]]; +constant bool FC_flash_attn_ext_has_bias [[function_constant(FC_FLASH_ATTN_EXT + 2)]]; +constant bool FC_flash_attn_ext_has_scap [[function_constant(FC_FLASH_ATTN_EXT + 3)]]; +constant bool FC_flash_attn_ext_has_kvpad [[function_constant(FC_FLASH_ATTN_EXT + 4)]]; + +constant bool FC_flash_attn_ext_bc_mask [[function_constant(FC_FLASH_ATTN_EXT + 10)]]; + +//constant float FC_flash_attn_ext_scale [[function_constant(FC_FLASH_ATTN_EXT + 10)]]; +//constant float FC_flash_attn_ext_max_bias [[function_constant(FC_FLASH_ATTN_EXT + 11)]]; +//constant float FC_flash_attn_ext_logit_softcap [[function_constant(FC_FLASH_ATTN_EXT + 12)]]; + +constant int32_t FC_flash_attn_ext_ns10 [[function_constant(FC_FLASH_ATTN_EXT + 20)]]; +constant int32_t FC_flash_attn_ext_ns20 [[function_constant(FC_FLASH_ATTN_EXT + 21)]]; +constant int32_t FC_flash_attn_ext_nsg [[function_constant(FC_FLASH_ATTN_EXT + 22)]]; + +// ref: https://arxiv.org/pdf/2307.08691.pdf +template< + typename q_t, // query types in shared memory + typename q4_t, + typename q8x8_t, + typename k_t, // key types in shared memory + typename k4x4_t, + typename k8x8_t, + typename v_t, // value types in shared memory + typename v4x4_t, + typename v8x8_t, + typename qk_t, // Q*K types + typename qk8x8_t, + typename s_t, // soft-max types + typename s2_t, + typename s8x8_t, + typename o_t, // attention accumulation types + typename o4_t, + typename o8x8_t, + typename kd4x4_t, // key type in device memory + short nl_k, + void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &), + typename vd4x4_t, // value type in device memory + short nl_v, + void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &), + short DK, // K head size + short DV, // V head size + short Q, // queries per threadgroup + short C, // cache items per threadgroup + short NSG> // number of simd groups +void kernel_flash_attn_ext_impl( + constant ggml_metal_kargs_flash_attn_ext & args, + device const char * q, + device const char * k, + device const char * v, + device const char * mask, + device const char * sinks, + device const char * pad, + device const char * blk, + device char * dst, + threadgroup half * shmem_f16, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const ushort iq3 = tgpig[2]; + const ushort iq2 = tgpig[1]; + const ushort iq1 = tgpig[0]*Q; + +#define NS10 (FC_flash_attn_ext_ns10) +#define NS20 (FC_flash_attn_ext_ns20) + + // note: I had some concerns that using this instead of the ugly macros above was affecting performance + // need to re-check carefully and if no regressions are observerd - remove the macros + // the concerns is that maybe using const variables requires extra registers? but not sure if the compiler + // is clever enough to avoid this. unfortunately, using constexpr is not possible with FC + //const short NS10 = FC_flash_attn_ext_ns10; + //const short NS20 = FC_flash_attn_ext_ns20; + + constexpr short KV = 8; + + constexpr short DK4 = DK/4; + constexpr short DK8 = DK/8; + constexpr short DK16 = DK/16; + constexpr short DV4 = DV/4; + //constexpr short DV8 = DV/8; + constexpr short DV16 = DV/16; + + constexpr short PV = PAD2(DV, 64); + constexpr short PV4 = PV/4; + constexpr short PV8 = PV/8; + //constexpr short PV16 = PV/16; + + constexpr short NW = N_SIMDWIDTH; + constexpr short NQ = Q/NSG; + constexpr short SH = 2*C; // shared memory per simdgroup (s_t == float) + + constexpr short TS = 2*SH; + constexpr short T = DK + 2*PV; // shared memory size per query in (half) + + threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*T); // holds the query data + threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*T); // same as above but in q4_t + threadgroup o_t * so = (threadgroup o_t *) (shmem_f16 + 0*T + Q*DK); // the result for all queries in 8x8 matrices (the O matrix from the paper) + threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*T + Q*DK); + threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + Q*T); // scratch buffer for attention, mask and diagonal matrix + threadgroup s2_t * ss2 = (threadgroup s2_t *) (shmem_f16 + Q*T); // same as above but in s2_t + + threadgroup k_t * sk = (threadgroup k_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T + Q*TS); // scratch buffer to load K in shared memory + threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T + Q*TS); // same as above but in k4x4_t + + threadgroup v_t * sv = (threadgroup v_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T + Q*TS); // scratch buffer to load V in shared memory + threadgroup v4x4_t * sv4x4 = (threadgroup v4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T + Q*TS); // same as above but in v4x4_t + + // mask storage in shared mem + threadgroup half2 * sm2 = (threadgroup half2 *) (shmem_f16 + Q*T + 2*C); + + // per-query mask pointers + device const half2 * pm2[NQ]; + + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + + pm2[jj] = (device const half2 *) ((device const char *) mask + (iq1 + j)*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33); + } + + { + const int32_t nblk1 = ((args.ne01 + Q - 1)/Q); + const int32_t nblk0 = ((args.ne11 + C - 1)/C); + + blk += (((iq3%args.ne33)*args.ne32 + (iq2%args.ne32))*nblk1 + iq1/Q)*nblk0; + } + + { + q += iq1*args.nb01 + iq2*args.nb02 + iq3*args.nb03; + + const short ikv2 = iq2/(args.ne02/args.ne_12_2); + const short ikv3 = iq3/(args.ne03/args.ne_12_3); + + k += ikv2*args.nb12 + ikv3*args.nb13; + v += ikv2*args.nb22 + ikv3*args.nb23; + } + + // load heads from Q to shared memory + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + + device const float4 * q4 = (device const float4 *) ((device const char *) q + j*args.nb01); + + for (short i = tiisg; i < DK4; i += NW) { + if (iq1 + j < args.ne01) { + sq4[j*DK4 + i] = (q4_t) q4[i]; + } else { + sq4[j*DK4 + i] = 0; + } + } + } + + // zero out + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + + for (short i = tiisg; i < DV4; i += NW) { + so4[j*PV4 + i] = 0; + } + + for (short i = tiisg; i < SH; i += NW) { + ss[j*SH + i] = 0.0f; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + float S[NQ] = { [0 ... NQ-1] = 0.0f }; + + { + float M[NQ] = { [0 ... NQ-1] = -FLT_MAX/2 }; + + float slope = 1.0f; + + // ALiBi + if (FC_flash_attn_ext_has_bias) { + const short h = iq2; + + const float base = h < args.n_head_log2 ? args.m0 : args.m1; + const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; + + slope = pow(base, exph); + } + + // loop over the KV cache + // each simdgroup handles blocks of Q rows and C columns + for (int ic0 = 0; ; ++ic0) { + int ic = ic0*C; + if (ic >= args.ne11) { + break; + } + + // the last partial chunk uses the pad buffer as source + if (FC_flash_attn_ext_has_kvpad && ic + C > args.ne11) { + k = pad; + v = k + args.nb11*C*args.ne_12_2*args.ne_12_3; + mask = v + args.nb21*C*args.ne_12_2*args.ne_12_3; + + const short ikv2 = iq2/(args.ne02/args.ne_12_2); + const short ikv3 = iq3/(args.ne03/args.ne_12_3); + + k += (ikv2 + ikv3*args.ne_12_2)*args.nb11*C; + v += (ikv2 + ikv3*args.ne_12_2)*args.nb21*C; + + if (!FC_flash_attn_ext_has_mask) { + threadgroup half * sm = (threadgroup half *) (sm2); + + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + + for (short i = tiisg; i < C; i += NW) { + if (ic + i >= args.ne11) { + sm[2*j*SH + i] = -MAXHALF; + } + } + } + } else { + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + + pm2[jj] = (device const half2 *) ((device const half *) mask + + (iq1 + j)*C + + (iq2%args.ne32)*(C*args.ne31) + + (iq3%args.ne33)*(C*args.ne31*args.ne32)); + } + } + + ic = 0; + } + + char blk_cur = 1; + + // read the mask into shared mem + if (FC_flash_attn_ext_has_mask) { + blk_cur = blk[ic0]; + + if (blk_cur == 0) { + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + pm2[jj] += NW; + } + + continue; + } + + if (blk_cur == 1) { + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + + if (FC_flash_attn_ext_bc_mask) { + sm2[j*SH + tiisg] = (iq1 + j) < args.ne31 ? pm2[jj][tiisg] : half2(-MAXHALF, -MAXHALF); + } else { + sm2[j*SH + tiisg] = pm2[jj][tiisg]; + } + + pm2[jj] += NW; + } + } else if (blk_cur == 2) { + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + pm2[jj] += NW; + } + } + +#if 0 + // note: old -INF block optimization - obsoleted by pre-computing non-masked blocks + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // used to detect blocks full of -INF + // skip only when the entire threadgroup is masked + half2 smax2(-MAXHALF/2, -MAXHALF/2); + + FOR_UNROLL (short j = 0; j < Q; ++j) { + smax2 = max(smax2, sm2[j*SH + tiisg]); + } + + smax2 = simd_max(smax2); + + if (max(smax2[0], smax2[1]) <= -MAXHALF/2) { + // this barrier is important + threadgroup_barrier(mem_flags::mem_threadgroup); + + continue; + } +#endif + } + + // Q*K^T + // this is compile-time check, so it does not have runtime overhead + if (is_same::value) { + // we can read directly from global memory + device const k_t * pk = (device const k_t *) (k + ic*args.nb11); + threadgroup const q_t * pq = sq; + threadgroup s_t * ps = ss; + + pk += sgitg*(8*NS10); + ps += sgitg*(8*1); + + static_assert((C/8) % NSG == 0, ""); + + constexpr short NC = (C/8)/NSG; + + FOR_UNROLL (short cc = 0; cc < NC; ++cc) { + qk8x8_t mqk = make_filled_simdgroup_matrix((qk_t) 0.0f); + + if (DK % 16 != 0) { + k8x8_t mk; + q8x8_t mq; + + FOR_UNROLL (short i = 0; i < DK8; ++i) { + simdgroup_barrier(mem_flags::mem_none); + + simdgroup_load(mk, pk + 8*i, NS10, 0, true); + simdgroup_load(mq, pq + 8*i, DK); + + simdgroup_barrier(mem_flags::mem_none); + + simdgroup_multiply_accumulate(mqk, mq, mk, mqk); + } + } else { + k8x8_t mk[2]; + q8x8_t mq[2]; + + // note: too much unroll can tank the performance for large heads + #pragma unroll (MIN(DK8/2, 4*NSG)) + for (short i = 0; i < DK8/2; ++i) { + simdgroup_barrier(mem_flags::mem_none); + + simdgroup_load(mq[0], pq + 0*8 + 16*i, DK); + simdgroup_load(mq[1], pq + 1*8 + 16*i, DK); + + simdgroup_load(mk[0], pk + 0*8 + 16*i, NS10, 0, true); + simdgroup_load(mk[1], pk + 1*8 + 16*i, NS10, 0, true); + + simdgroup_barrier(mem_flags::mem_none); + + simdgroup_multiply_accumulate(mqk, mq[0], mk[0], mqk); + simdgroup_multiply_accumulate(mqk, mq[1], mk[1], mqk); + } + } + + simdgroup_store(mqk, ps, SH, 0, false); + + pk += 8*(NSG*NS10); + ps += 8*(NSG); + } + } else { + // TODO: this is the quantized K cache branch - not optimized yet + for (short ccc = 0; ccc < (C/8)/NSG; ++ccc) { + const short cc = ccc*NSG + sgitg; + + const short tx = tiisg%4; + const short ty = tiisg/4; + + qk8x8_t mqk = make_filled_simdgroup_matrix((qk_t) 0.0f); + + for (short ii = 0; ii < DK16; ii += 4) { + device const kd4x4_t * pk4x4 = (device const kd4x4_t *) (k + ((ic + 8*cc + ty)*args.nb11)); + + if (DK16%4 == 0) { + // the head is evenly divisible by 4*16 = 64, so no need for bound checks + { + k4x4_t tmp; + deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp); + sk4x4[4*ty + tx] = tmp; + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + FOR_UNROLL (short k = 0; k < 4; ++k) { + k8x8_t mk; + q8x8_t mq; + + simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose + simdgroup_load(mq, sq + (2*(ii + k) + 0)*8, DK); + simdgroup_multiply_accumulate(mqk, mq, mk, mqk); + + simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose + simdgroup_load(mq, sq + (2*(ii + k) + 1)*8, DK); + simdgroup_multiply_accumulate(mqk, mq, mk, mqk); + } + } else { + if (ii + tx < DK16) { + k4x4_t tmp; + deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp); + sk4x4[4*ty + tx] = tmp; + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + for (short k = 0; k < 4 && ii + k < DK16; ++k) { + k8x8_t mk; + q8x8_t mq; + + simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose + simdgroup_load(mq, sq + (2*(ii + k) + 0)*8, DK); + simdgroup_multiply_accumulate(mqk, mq, mk, mqk); + + simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose + simdgroup_load(mq, sq + (2*(ii + k) + 1)*8, DK); + simdgroup_multiply_accumulate(mqk, mq, mk, mqk); + } + } + } + + simdgroup_store(mqk, ss + 8*cc, SH, 0, false); + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // online softmax + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + + const float m = M[jj]; + + // scale and apply the logitcap / mask + float2 s2 = ss2[j*SH/2 + tiisg]*args.scale; + + if (FC_flash_attn_ext_has_scap) { + s2 = args.logit_softcap*precise::tanh(s2); + } + + // mqk = mqk + slope*mask + if (blk_cur != 2) { + if (FC_flash_attn_ext_has_bias) { + s2 += s2_t(sm2[j*SH + tiisg])*slope; + } else { + s2 += s2_t(sm2[j*SH + tiisg]); + } + } + + M[jj] = simd_max(max(M[jj], max(s2[0], s2[1]))); + + const float ms = exp(m - M[jj]); + const float2 vs2 = exp(s2 - M[jj]); + + S[jj] = S[jj]*ms + simd_sum(vs2[0] + vs2[1]); + + // the P matrix from the paper (Q rows, C columns) + ss2[j*SH/2 + tiisg] = vs2; + + if (DV4 % NW == 0) { + FOR_UNROLL (short ii = 0; ii < DV4/NW; ++ii) { + const short i = ii*NW + tiisg; + + so4[j*PV4 + i] *= ms; + } + } else { + for (short i = tiisg; i < DV4; i += NW) { + so4[j*PV4 + i] *= ms; + } + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // O = O + (Q*K^T)*V + { + // we can read directly from global memory + if (is_same::value) { + static_assert(PV8 % NSG == 0, ""); + + constexpr short NO = PV8/NSG; + + o8x8_t lo[NO]; + + { + auto sot = so + 8*sgitg; + + FOR_UNROLL (short ii = 0; ii < NO; ++ii) { + simdgroup_load(lo[ii], sot, PV, 0, false); + + sot += 8*NSG; + } + } + + { + device const v_t * pv = (device const v_t *) (v + ic*args.nb21); + + pv += 8*sgitg; + + if (DV <= 64) { + FOR_UNROLL (short cc = 0; cc < C/8; ++cc) { + s8x8_t vs; + simdgroup_load(vs, ss + 8*cc, SH, 0, false); + + FOR_UNROLL (short ii = 0; ii < NO/2; ++ii) { + v8x8_t mv[2]; + + simdgroup_load(mv[0], pv + 0*NSG + 16*ii*NSG, NS20, 0, false); + simdgroup_load(mv[1], pv + 8*NSG + 16*ii*NSG, NS20, 0, false); + + simdgroup_multiply_accumulate(lo[2*ii + 0], vs, mv[0], lo[2*ii + 0]); + simdgroup_multiply_accumulate(lo[2*ii + 1], vs, mv[1], lo[2*ii + 1]); + } + + pv += 8*NS20; + } + } else { + constexpr short NC = (C/8)/2; + + FOR_UNROLL (short cc = 0; cc < NC; ++cc) { + s8x8_t vs[2]; + + simdgroup_load(vs[0], ss + 16*cc + 0, SH, 0, false); + simdgroup_load(vs[1], ss + 16*cc + 8, SH, 0, false); + + FOR_UNROLL (short ii = 0; ii < NO/2; ++ii) { + v8x8_t mv[4]; + + simdgroup_load(mv[0], pv + 0*NSG + 16*ii*NSG + 0*8*NS20, NS20, 0, false); + simdgroup_load(mv[1], pv + 8*NSG + 16*ii*NSG + 0*8*NS20, NS20, 0, false); + simdgroup_load(mv[2], pv + 0*NSG + 16*ii*NSG + 1*8*NS20, NS20, 0, false); + simdgroup_load(mv[3], pv + 8*NSG + 16*ii*NSG + 1*8*NS20, NS20, 0, false); + + simdgroup_multiply_accumulate(lo[2*ii + 0], vs[0], mv[0], lo[2*ii + 0]); + simdgroup_multiply_accumulate(lo[2*ii + 1], vs[0], mv[1], lo[2*ii + 1]); + simdgroup_multiply_accumulate(lo[2*ii + 0], vs[1], mv[2], lo[2*ii + 0]); + simdgroup_multiply_accumulate(lo[2*ii + 1], vs[1], mv[3], lo[2*ii + 1]); + } + + pv += 2*8*NS20; + } + } + } + + { + auto sot = so + 8*sgitg; + + FOR_UNROLL (short ii = 0; ii < NO; ++ii) { + simdgroup_store(lo[ii], sot, PV, 0, false); + + sot += 8*NSG; + } + } + } else { + // TODO: this is the quantized V cache branch - not optimized yet + + const short tx = tiisg%4; + const short ty = tiisg/4; + + for (short cc = 0; cc < C/8; ++cc) { + s8x8_t vs; + simdgroup_load(vs, ss + 8*cc, SH, 0, false); + + for (short ii = 4*sgitg; ii < DV16; ii += 4*NSG) { + device const vd4x4_t * pv4x4 = (device const vd4x4_t *) (v + ((ic + 8*cc + ty)*args.nb21)); + + if (DV16%4 == 0) { + // no need for bound checks + { + v4x4_t tmp; + deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp); + sv4x4[4*ty + tx] = tmp; + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + FOR_UNROLL (short k = 0; k < 4; ++k) { + v8x8_t mv[2]; + o8x8_t lo[2]; + + simdgroup_load(mv[0], sv + 16*k + 0*8, 4*16, 0, false); + simdgroup_load(mv[1], sv + 16*k + 1*8, 4*16, 0, false); + simdgroup_load(lo[0], so + 8*(2*(ii + k) + 0), PV, 0, false); + simdgroup_load(lo[1], so + 8*(2*(ii + k) + 1), PV, 0, false); + + simdgroup_multiply_accumulate(lo[0], vs, mv[0], lo[0]); + simdgroup_multiply_accumulate(lo[1], vs, mv[1], lo[1]); + + simdgroup_store(lo[0], so + 8*(2*(ii + k) + 0), PV, 0, false); + simdgroup_store(lo[1], so + 8*(2*(ii + k) + 1), PV, 0, false); + } + } else { + if (ii + tx < DV16) { + v4x4_t tmp; + deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp); + sv4x4[4*ty + tx] = tmp; + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + for (short k = 0; k < 4 && ii + k < DV16; ++k) { + v8x8_t mv[2]; + o8x8_t lo[2]; + + simdgroup_load(mv[0], sv + 16*k + 0*8, 4*16, 0, false); + simdgroup_load(mv[1], sv + 16*k + 1*8, 4*16, 0, false); + simdgroup_load(lo[0], so + 8*(2*(ii + k) + 0), PV, 0, false); + simdgroup_load(lo[1], so + 8*(2*(ii + k) + 1), PV, 0, false); + + simdgroup_multiply_accumulate(lo[0], vs, mv[0], lo[0]); + simdgroup_multiply_accumulate(lo[1], vs, mv[1], lo[1]); + + simdgroup_store(lo[0], so + 8*(2*(ii + k) + 0), PV, 0, false); + simdgroup_store(lo[1], so + 8*(2*(ii + k) + 1), PV, 0, false); + } + } + } + } + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (FC_flash_attn_ext_has_sinks) { + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + + const float m = M[jj]; + const float s = tiisg == 0 ? ((device const float *) sinks)[iq2] : -FLT_MAX/2; + + M[jj] = simd_max(max(M[jj], s)); + + const float ms = exp(m - M[jj]); + const float vs = exp(s - M[jj]); + + S[jj] = S[jj]*ms + simd_sum(vs); + + for (short i = tiisg; i < DV4; i += NW) { + so4[j*PV4 + i] *= ms; + } + } + } + } + + // store to global memory + for (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; + if (iq1 + j >= args.ne01) { + break; + } + + device float4 * dst4 = (device float4 *) dst + ((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4; + + const float scale = S[jj] == 0.0 ? 0.0f : 1.0f/S[jj]; + + if (DV4 % NW == 0) { + FOR_UNROLL (short ii = 0; ii < DV4/NW; ++ii) { + const short i = ii*NW + tiisg; + + dst4[i] = (float4) so4[j*PV4 + i]*scale; + } + } else { + for (short i = tiisg; i < DV4; i += NW) { + dst4[i] = (float4) so4[j*PV4 + i]*scale; + } + } + } + +#undef NS10 +#undef NS20 +} + +template< + typename q_t, // query types in shared memory + typename q4_t, + typename q8x8_t, + typename k_t, // key types in shared memory + typename k4x4_t, + typename k8x8_t, + typename v_t, // value types in shared memory + typename v4x4_t, + typename v8x8_t, + typename qk_t, // Q*K types + typename qk8x8_t, + typename s_t, // soft-max types + typename s2_t, + typename s8x8_t, + typename o_t, // attention accumulation types + typename o4_t, + typename o8x8_t, + typename kd4x4_t, // key type in device memory + short nl_k, + void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &), + typename vd4x4_t, // value type in device memory + short nl_v, + void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &), + short DK, // K head size + short DV, // V head size + short Q = OP_FLASH_ATTN_EXT_NQPSG, // queries per threadgroup + short C = OP_FLASH_ATTN_EXT_NCPSG> // cache items per threadgroup +kernel void kernel_flash_attn_ext( + constant ggml_metal_kargs_flash_attn_ext & args, + device const char * q, + device const char * k, + device const char * v, + device const char * mask, + device const char * sinks, + device const char * pad, + device const char * blk, + device char * dst, + threadgroup half * shmem_f16 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { +#define FWD_TMPL q_t, q4_t, q8x8_t, k_t, k4x4_t, k8x8_t, v_t, v4x4_t, v8x8_t, qk_t, qk8x8_t, s_t, s2_t, s8x8_t, o_t, o4_t, o8x8_t, kd4x4_t, nl_k, deq_k, vd4x4_t, nl_v, deq_v, DK, DV, Q, C +#define FWD_ARGS args, q, k, v, mask, sinks, pad, blk, dst, shmem_f16, tgpig, tiisg, sgitg + switch (FC_flash_attn_ext_nsg) { + // note: disabled cases to reduce library load time + //case 1: kernel_flash_attn_ext_impl(FWD_ARGS); break; + //case 2: kernel_flash_attn_ext_impl(FWD_ARGS); break; + case 4: kernel_flash_attn_ext_impl(FWD_ARGS); break; + case 8: kernel_flash_attn_ext_impl(FWD_ARGS); break; + } +#undef FWD_TMPL +#undef FWD_ARGS +} + +// TODO: this is quite ugly. in the future these types will be hardcoded in the kernel, but for now keep them as +// template to be able to explore different combinations +// +#define FA_TYPES \ + half, half4, simdgroup_half8x8, \ + half, half4x4, simdgroup_half8x8, \ + half, half4x4, simdgroup_half8x8, \ + float, simdgroup_float8x8, \ + float, float2, simdgroup_float8x8, \ + float, float4, simdgroup_float8x8 + //half, half4, simdgroup_half8x8 + +#define FA_TYPES_BF \ + bfloat, bfloat4, simdgroup_bfloat8x8, \ + bfloat, bfloat4x4, simdgroup_bfloat8x8, \ + bfloat, bfloat4x4, simdgroup_bfloat8x8, \ + float, simdgroup_float8x8, \ + float, float2, simdgroup_float8x8, \ + half, half4, simdgroup_half8x8 + //float, float4, simdgroup_float8x8 + +#define FA_TYPES_F32 \ + half, half4, simdgroup_half8x8, \ + float, float4x4, simdgroup_float8x8, \ + float, float4x4, simdgroup_float8x8, \ + float, simdgroup_float8x8, \ + float, float2, simdgroup_float8x8, \ + float, float4, simdgroup_float8x8 + //half, half4, simdgroup_half8x8 + +typedef decltype(kernel_flash_attn_ext) flash_attn_ext_t; + +template [[host_name("kernel_flash_attn_ext_f32_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_bf16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +#endif + +template [[host_name("kernel_flash_attn_ext_q4_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk128_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk512_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +#undef FA_TYPES +#undef FA_TYPES_BF +#undef FA_TYPES_F32 + +constant bool FC_flash_attn_ext_vec_has_mask [[function_constant(FC_FLASH_ATTN_EXT_VEC + 0)]]; +constant bool FC_flash_attn_ext_vec_has_sinks [[function_constant(FC_FLASH_ATTN_EXT_VEC + 1)]]; +constant bool FC_flash_attn_ext_vec_has_bias [[function_constant(FC_FLASH_ATTN_EXT_VEC + 2)]]; +constant bool FC_flash_attn_ext_vec_has_scap [[function_constant(FC_FLASH_ATTN_EXT_VEC + 3)]]; +constant bool FC_flash_attn_ext_vec_has_kvpad [[function_constant(FC_FLASH_ATTN_EXT_VEC + 4)]]; + +//constant float FC_flash_attn_ext_vec_scale [[function_constant(FC_FLASH_ATTN_EXT_VEC + 10)]]; +//constant float FC_flash_attn_ext_vec_max_bias [[function_constant(FC_FLASH_ATTN_EXT_VEC + 11)]]; +//constant float FC_flash_attn_ext_vec_logit_softcap [[function_constant(FC_FLASH_ATTN_EXT_VEC + 12)]]; + +constant int32_t FC_flash_attn_ext_vec_ns10 [[function_constant(FC_FLASH_ATTN_EXT_VEC + 20)]]; +constant int32_t FC_flash_attn_ext_vec_ns20 [[function_constant(FC_FLASH_ATTN_EXT_VEC + 21)]]; +constant int32_t FC_flash_attn_ext_vec_nsg [[function_constant(FC_FLASH_ATTN_EXT_VEC + 22)]]; +constant int32_t FC_flash_attn_ext_vec_nwg [[function_constant(FC_FLASH_ATTN_EXT_VEC + 23)]]; + +template< + typename q4_t, // query types in shared memory + typename k4_t, // key types in shared memory + typename v4_t, // value types in shared memory + typename qk_t, // Q*K types + typename s_t, // soft-max types + typename s4_t, + typename o4_t, // attention accumulation types + typename kd4_t, // key type in device memory + short nl_k, + void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &), + typename vd4_t, // value type in device memory + short nl_v, + void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &), + short DK, // K head size + short DV, // V head size + short NE = 4, // head elements per thread + short Q = OP_FLASH_ATTN_EXT_VEC_NQPSG, // queries per threadgroup + short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup +kernel void kernel_flash_attn_ext_vec( + constant ggml_metal_kargs_flash_attn_ext_vec & args, + device const char * q, + device const char * k, + device const char * v, + device const char * mask, + device const char * sinks, + device const char * pad, + device char * dst, + threadgroup half * shmem_f16 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + static_assert(DK % 32 == 0, "DK must be divisible by 32"); + static_assert(DV % 32 == 0, "DV must be divisible by 32"); + +#define NWG (FC_flash_attn_ext_vec_nwg) +#define NSG (FC_flash_attn_ext_vec_nsg) + +#define NS10 (FC_flash_attn_ext_vec_ns10) +#define NS20 (FC_flash_attn_ext_vec_ns20) + + const short iwg = tgpig[2]%NWG; + + const ushort iq3 = tgpig[2]/NWG; + const ushort iq2 = tgpig[1]; + const ushort iq1 = tgpig[0]; + + constexpr short DK4 = DK/4; + constexpr short DV4 = DV/4; + + constexpr short PK = PAD2(DK, 128); + constexpr short PK4 = PK/4; + + constexpr short PV = PAD2(DV, 128); + constexpr short PV4 = PV/4; + + constexpr short NW = N_SIMDWIDTH; + constexpr short NL = NW/NE; // note: this can be adjusted to support different head sizes and simdgroup work loads + constexpr short SH = 4*C; // shared memory per simdgroup + + static_assert(DK4 % NL == 0, "DK4 must be divisible by NL"); + static_assert(DV4 % NL == 0, "DV4 must be divisible by NL"); + + //const short T = PK + NSG*SH; // shared memory size per query in (half) + + //threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data + threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t + threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + NSG*PK); // scratch buffer for attention + threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + NSG*PK); // same as above but in s4_t + threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + NSG*PK); // scratch buffer for mask + threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + NSG*PK + NSG*SH); // scratch buffer for the results + + // store the result for all queries in shared memory (the O matrix from the paper) + so4 += tiisg; + + { + q += iq1*args.nb01 + iq2*args.nb02 + iq3*args.nb03; + + const short ikv2 = iq2/(args.ne02/args.ne_12_2); + const short ikv3 = iq3/(args.ne03/args.ne_12_3); + + k += ikv2*args.nb12 + ikv3*args.nb13; + v += ikv2*args.nb22 + ikv3*args.nb23; + } + + // load heads from Q to shared memory + device const float4 * q4 = (device const float4 *) ((device const char *) q); + + if (iq1 < args.ne01) { + for (short i = tiisg; i < PK4; i += NW) { + if (i < DK4) { + sq4[i] = (q4_t) q4[i]; + } else { + sq4[i] = (q4_t) 0.0f; + } + } + } + + // zero out so + for (short i = 0; i < DV4/NL; ++i) { + so4[i*NL] = (o4_t) 0.0f; + } + + // zero out shared memory SH + for (short i = tiisg; i < SH/4; i += NW) { + ss4[i] = (s4_t) 0.0f; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + { + float S = 0.0f; + float M = -FLT_MAX/2; + + // thread indices inside the simdgroup + const short tx = tiisg%NL; + const short ty = tiisg/NL; + + // pointer to the mask + device const half * pm = (device const half *) (mask + iq1*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33); + + float slope = 1.0f; + + // ALiBi + if (FC_flash_attn_ext_vec_has_bias) { + const short h = iq2; + + const float base = h < args.n_head_log2 ? args.m0 : args.m1; + const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; + + slope = pow(base, exph); + } + + // loop over the KV cache + // each simdgroup handles blocks of Q rows and C columns + for (int ic0 = iwg*NSG + sgitg; ; ic0 += NWG*NSG) { + int ic = ic0*C; + if (ic >= args.ne11) { + break; + } + + // the last partial chunk uses the pad buffer as source + if (FC_flash_attn_ext_vec_has_kvpad && ic + C > args.ne11) { + k = pad; + v = k + args.nb11*C*args.ne_12_2*args.ne_12_3; + mask = v + args.nb21*C*args.ne_12_2*args.ne_12_3; + + const short ikv2 = iq2/(args.ne02/args.ne_12_2); + const short ikv3 = iq3/(args.ne03/args.ne_12_3); + + k += (ikv2 + ikv3*args.ne_12_2)*args.nb11*C; + v += (ikv2 + ikv3*args.ne_12_2)*args.nb21*C; + + if (!FC_flash_attn_ext_vec_has_mask) { + if (ic + tiisg >= args.ne11) { + sm[tiisg] = -MAXHALF; + } + } else { + pm = (device const half *) (mask) + + iq1*C + + (iq2%args.ne32)*(C*args.ne31) + + (iq3%args.ne33)*(C*args.ne31*args.ne32); + } + + ic = 0; + } + + if (FC_flash_attn_ext_vec_has_mask) { + sm[tiisg] = pm[ic + tiisg]; + } + + // skip -INF blocks + if (simd_max(sm[tiisg]) <= -MAXHALF) { + continue; + } + + // Q*K^T + { + device const k4_t * pk4 = (device const k4_t *) (k + ic*args.nb11); + threadgroup const q4_t * pq4 = sq4; + + pk4 += ty*NS10/4 + tx; + pq4 += tx; + + qk_t mqk[C/NE] = { [ 0 ... C/NE - 1] = 0.0f }; + + // each simdgroup processes 1 query and NE (NW/NL) cache elements + FOR_UNROLL (short cc = 0; cc < C/NE; ++cc) { + if (is_same::value) { + FOR_UNROLL (short ii = 0; ii < DK4/NL; ++ii) { + mqk[cc] += dot((float4) pk4[cc*NE*NS10/4 + ii*NL], (float4) pq4[ii*NL]); + } + } else { + device const kd4_t * pk = (device const kd4_t *) (k + ((ic + NE*cc + ty)*args.nb11)); + + k4_t mk; + + FOR_UNROLL (short ii = 0; ii < DK4/NL; ++ii) { + const short i = ii*NL + tx; + + deq_k_t4(pk + i/nl_k, i%nl_k, mk); + + mqk[cc] += dot((float4) mk, (float4) sq4[i]); + } + } + + if (NE == 1) { + mqk[cc] = simd_sum(mqk[cc]); + } else { + // simdgroup reduce (NE = 4) + // [ 0 .. 7] -> [ 0] + // [ 8 .. 15] -> [ 8] + // [16 .. 23] -> [16] + // [24 .. 31] -> [24] + if (NE <= 1) { + mqk[cc] += simd_shuffle_down(mqk[cc], 16); + } + if (NE <= 2) { + mqk[cc] += simd_shuffle_down(mqk[cc], 8); + } + if (NE <= 4) { + mqk[cc] += simd_shuffle_down(mqk[cc], 4); + } + if (NE <= 8) { + mqk[cc] += simd_shuffle_down(mqk[cc], 2); + } + if (NE <= 16) { + mqk[cc] += simd_shuffle_down(mqk[cc], 1); + } + + // broadcast + mqk[cc] = simd_shuffle(mqk[cc], NL*ty); + } + } + + if (FC_flash_attn_ext_vec_has_mask && + !FC_flash_attn_ext_vec_has_scap && + !FC_flash_attn_ext_vec_has_bias) { + ss[NE*tx + ty] = fma(mqk[tx], args.scale, (qk_t) sm[NE*tx + ty]); + } else { + mqk[tx] *= args.scale; + + if (FC_flash_attn_ext_vec_has_scap) { + mqk[tx] = args.logit_softcap*precise::tanh(mqk[tx]); + } + + if (FC_flash_attn_ext_vec_has_bias) { + mqk[tx] += (qk_t) sm[NE*tx + ty]*slope; + } else { + mqk[tx] += (qk_t) sm[NE*tx + ty]; + } + + ss[NE*tx + ty] = mqk[tx]; + } + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + // online softmax + { + const float m = M; + const float s = ss[tiisg]; + + M = simd_max(max(M, s)); + + const float ms = exp(m - M); + const float vs = exp(s - M); + + S = S*ms + simd_sum(vs); + + // the P matrix from the paper (Q rows, C columns) + ss[tiisg] = vs; + + // O = diag(ms)*O + if ((DV4/NL % NW == 0) || ty == 0) { + FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { + so4[ii*NL] *= ms; + } + } + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + // O = O + (Q*K^T)*V + { + o4_t lo[DV4/NL]; + FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { + lo[ii] = 0.0f; + } + + if (is_same::value) { + device const v4_t * pv4 = (device const v4_t *) (v + ic*args.nb21); + + pv4 += ty*NS20/4 + tx; + + const auto sst = ss + ty; + + FOR_UNROLL (short cc = 0; cc < C/NE; ++cc) { + FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { + lo[ii] += o4_t(float4(pv4[cc*NE*NS20/4 + ii*NL])*float4(sst[cc*NE])); + } + } + } else { + FOR_UNROLL (short cc = 0; cc < C/NE; ++cc) { + device const vd4_t * pv4 = (device const vd4_t *) (v + ((ic + NE*cc + ty)*args.nb21)); + + FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { + const short i = ii*NL + tx; + + v4_t mv; + deq_v_t4(pv4 + i/nl_v, i%nl_v, mv); + + lo[ii] += o4_t(float4(mv)*float4(ss[NE*cc + ty])); + } + } + } + + FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { + if (NE > 1) { + lo[ii][0] += simd_shuffle_down(lo[ii][0], 16); + lo[ii][1] += simd_shuffle_down(lo[ii][1], 16); + lo[ii][2] += simd_shuffle_down(lo[ii][2], 16); + lo[ii][3] += simd_shuffle_down(lo[ii][3], 16); + } + + if (NE > 2) { + lo[ii][0] += simd_shuffle_down(lo[ii][0], 8); + lo[ii][1] += simd_shuffle_down(lo[ii][1], 8); + lo[ii][2] += simd_shuffle_down(lo[ii][2], 8); + lo[ii][3] += simd_shuffle_down(lo[ii][3], 8); + } + + if (NE > 4) { + lo[ii][0] += simd_shuffle_down(lo[ii][0], 4); + lo[ii][1] += simd_shuffle_down(lo[ii][1], 4); + lo[ii][2] += simd_shuffle_down(lo[ii][2], 4); + lo[ii][3] += simd_shuffle_down(lo[ii][3], 4); + } + + if (NE > 8) { + lo[ii][0] += simd_shuffle_down(lo[ii][0], 2); + lo[ii][1] += simd_shuffle_down(lo[ii][1], 2); + lo[ii][2] += simd_shuffle_down(lo[ii][2], 2); + lo[ii][3] += simd_shuffle_down(lo[ii][3], 2); + } + + if (NE > 16) { + lo[ii][0] += simd_shuffle_down(lo[ii][0], 1); + lo[ii][1] += simd_shuffle_down(lo[ii][1], 1); + lo[ii][2] += simd_shuffle_down(lo[ii][2], 1); + lo[ii][3] += simd_shuffle_down(lo[ii][3], 1); + } + } + + if ((DV4/NL % NW == 0) || ty == 0) { + FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { + so4[ii*NL] += lo[ii]; + } + } + } + } + + if (FC_flash_attn_ext_vec_has_sinks && sgitg == 0 && iwg == 0) { + const float m = M; + const float s = tiisg == 0 ? ((device const float *) sinks)[iq2] : -FLT_MAX/2; + + M = simd_max(max(M, s)); + + const float ms = exp(m - M); + const float vs = exp(s - M); + + S = S*ms + simd_sum(vs); + + if ((DV4/NL % NW == 0) || ty == 0) { + FOR_UNROLL (short ii = 0; ii < DV4/NL; ++ii) { + so4[ii*NL] *= ms; + } + } + } + + // these are needed for reducing the results from the simdgroups (reuse the ss buffer) + if (tiisg == 0) { + ss[0] = (s_t) S; + ss[1] = (s_t) M; + } + } + + so4 -= tiisg; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // parallel reduce + for (short r = NSG/2; r > 0; r >>= 1) { + if (sgitg < r) { + const float S0 = ss[ 0]; + const float S1 = ss[r*(SH/2) + 0]; + + const float M0 = ss[ 1]; + const float M1 = ss[r*(SH/2) + 1]; + + const float M = max(M0, M1); + + const float ms0 = exp(M0 - M); + const float ms1 = exp(M1 - M); + + const float S = S0*ms0 + S1*ms1; + + if (tiisg == 0) { + ss[0] = S; + ss[1] = M; + } + + // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 + for (short i = tiisg; i < DV4; i += NW) { + so4[i] = so4[i]*ms0 + so4[i + r*PV4]*ms1; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + // final rescale with 1/S and store to global memory + if (sgitg == 0) { + const int64_t nrows = args.ne3*args.ne2*args.ne1; + const int64_t rid = iq3*args.ne2*args.ne1 + iq2 + iq1*args.ne1; + + device float4 * dst4 = (device float4 *) dst; + device float * dst1 = (device float *) dst + nrows*DV*NWG; // the S and M are stored after the results + + const float S = NWG == 1 ? (ss[0] == 0.0f ? 0.0f : 1.0f/ss[0]) : 1.0f; + + // interleave the workgroup data + for (short i = tiisg; i < DV4; i += NW) { + dst4[rid*DV4*NWG + NWG*i + iwg] = (float4) so4[i]*S; + } + + // store S and M + if (NWG > 1) { + if (tiisg == 0) { + dst1[rid*(2*NWG) + 2*iwg + 0] = ss[0]; + dst1[rid*(2*NWG) + 2*iwg + 1] = ss[1]; + } + } + } + +#undef NWG +#undef NSG +#undef NS10 +#undef NS20 +} + +// note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem +// in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max +// +#define FA_TYPES \ + half4, \ + half4, \ + half4, \ + float, \ + float, float4, \ + float4 + +#define FA_TYPES_F32 \ + half4, \ + float4, \ + float4, \ + float, \ + float, float4, \ + float4 + +typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk32_dv32")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk96_dv96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk128_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk192_dv192")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk192_dv128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk512_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f32_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +#undef FA_TYPES +#undef FA_TYPES_F32 + +constant int32_t FC_flash_attn_ext_vec_reduce_DV [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 0)]]; +constant int32_t FC_flash_attn_ext_vec_reduce_NWG [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 1)]]; + +kernel void kernel_flash_attn_ext_vec_reduce( + constant ggml_metal_kargs_flash_attn_ext_vec_reduce & args, + device const char * htmp, + device char * dst, + uint tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { +#define NWG (FC_flash_attn_ext_vec_reduce_NWG) +#define DV (FC_flash_attn_ext_vec_reduce_DV) + + const uint64_t rid = tgpig; + + const short iwg = tiisg; + + device const float * ss = (device const float *) htmp + (uint64_t)args.nrows*DV*NWG; + + float S = ss[rid*(2*NWG) + 2*iwg + 0]; + float M = ss[rid*(2*NWG) + 2*iwg + 1]; + + const float m = simd_max(M); + const float ms = exp(M - m); + + S = simd_sum(S*ms); + S = S == 0.0f ? 0.0f : 1.0f/S; + + const short DV4 = DV/4; + + device const float4 * htmp4 = (device const float4 *) htmp + rid*DV4*NWG; + device float4 * dst4 = (device float4 *) dst + rid*DV4; + + for (short i = sgitg; i < DV4; i += NWG) { + const float4 v = simd_sum(htmp4[i*NWG + iwg]*ms); + + if (iwg == 0) { + dst4[i] = v*S; + } + } + +#undef NWG +#undef DV +} diff --git a/ggml/src/ggml-metal/kernels/gated_delta_net.metal b/ggml/src/ggml-metal/kernels/gated_delta_net.metal new file mode 100644 index 0000000000..8422d8e29f --- /dev/null +++ b/ggml/src/ggml-metal/kernels/gated_delta_net.metal @@ -0,0 +1,250 @@ +#include "common.h" + +constant short FC_gated_delta_net_ne20 [[function_constant(FC_GATED_DELTA_NET + 0)]]; +constant short FC_gated_delta_net_ne30 [[function_constant(FC_GATED_DELTA_NET + 1)]]; +constant short FC_gated_delta_net_K [[function_constant(FC_GATED_DELTA_NET + 2)]]; + +#if 1 +template +kernel void kernel_gated_delta_net_impl( + constant ggml_metal_kargs_gated_delta_net & args, + device const char * q, + device const char * k, + device const char * v, + device const char * g, + device const char * b, + device const char * s, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { +#define S_v FC_gated_delta_net_ne20 +#define G FC_gated_delta_net_ne30 +#define K FC_gated_delta_net_K + + const uint tx = tpitg.x; + const uint ty = tpitg.y; + + const uint i23 = tgpig.z; // B (n_seqs) + const uint i21 = tgpig.y; // H (head) + const uint i20 = tgpig.x*NSG + ty; // row within S_v + + const uint i01 = i21 % args.ne01; + const uint i11 = i21 % args.ne11; + + const float scale = 1.0f / sqrt((float)S_v); + + // input state layout [S_v, S_v, H, n_seqs] (s0 only): per-seq stride is H*D. + // state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous + const uint state_in_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v; + device const float * s_ptr = (device const float *) (s) + state_in_base; + + float ls[NSG]; + + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + ls[j] = s_ptr[is]; + } + + device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20; + + device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01); + device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11); + device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21); + + device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21); + device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G; + + // snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back. + // When n_tokens < K, only slots 0..n_tokens-1 are written; older slots are caller-owned. + + // output state base offset: after attention scores + const uint attn_size = args.ne22 * args.ne21 * S_v * args.ne23; + // output state per-slot size: S_v * S_v * H * n_seqs + const uint state_size_per_snap = S_v * S_v * args.ne21 * args.ne23; + // per-(seq,head) offset within a slot + const uint state_out_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v; + + for (short t = 0; t < args.ne22; t++) { + float s_k = 0.0f; + + if (G == 1) { + const float g_exp = exp(g_ptr[0]); + + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + ls[j] *= g_exp; + + s_k += ls[j]*k_ptr[is]; + } + } else { + // KDA + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + ls[j] *= exp(g_ptr[is]); + + s_k += ls[j]*k_ptr[is]; + } + } + + s_k = simd_sum(s_k); + + const float d = (v_ptr[i20] - s_k)*b_ptr[0]; + + float y = 0.0f; + + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + ls[j] += k_ptr[is]*d; + + y += ls[j]*q_ptr[is]; + } + + y = simd_sum(y); + + if (tx == 0) { + dst_attn[t*args.ne21*S_v] = y*scale; + } + + q_ptr += args.ns02; + k_ptr += args.ns12; + v_ptr += args.ns22; + + b_ptr += args.ne21; + g_ptr += args.ne21*G; + + if (K > 1) { + const int target_slot = (int)args.ne22 - 1 - (int)t; + if (target_slot >= 0 && target_slot < (int)K) { + device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base; + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + dst_state[is] = ls[j]; + } + } + } + } + + if (K == 1) { + device float * dst_state = (device float *) (dst) + attn_size + state_out_base; + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + dst_state[is] = ls[j]; + } + } + +#undef S_v +#undef G +#undef K +} + +typedef decltype(kernel_gated_delta_net_impl<4>) kernel_gated_delta_net_t; + +template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<1>; +template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<2>; +template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<4>; + +#else +// a simplified version of the above +// no performance improvement, so keep the above version for now + +template +kernel void kernel_gated_delta_net_impl( + constant ggml_metal_kargs_gated_delta_net & args, + device const char * q, + device const char * k, + device const char * v, + device const char * g, + device const char * b, + device const char * s, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { +#define S_v FC_gated_delta_net_ne20 +#define G FC_gated_delta_net_ne30 + + const uint tx = tpitg.x; + const uint ty = tpitg.y; + + const uint i23 = tgpig.z; // B + const uint i21 = tgpig.y; // H + const uint i20 = tgpig.x*NSG + ty; + + const uint i01 = i21 % args.ne01; + const uint i11 = i21 % args.ne11; + + const float scale = 1.0f / sqrt((float)S_v); + + device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20; + + float lsf[NSG]; + + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + lsf[j] = s_ptr[is*S_v]; + } + + thread T * ls = (thread T *) (lsf); + + device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20; + + device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01); + device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11); + device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21); + + device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21); + device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G; + + for (short t = 0; t < args.ne22; t++) { + device const T * qt_ptr = (device const T *) (q_ptr); + device const T * kt_ptr = (device const T *) (k_ptr); + device const T * gt_ptr = (device const T *) (g_ptr); + + if (G == 1) { + *ls *= exp(g_ptr[0]); + } else { + // KDA + *ls *= exp(gt_ptr[tx]); + } + + const float s_k = simd_sum(dot(*ls, kt_ptr[tx])); + + const float d = (v_ptr[i20] - s_k)*b_ptr[0]; + + *ls += kt_ptr[tx]*d; + + const float y = simd_sum(dot(*ls, qt_ptr[tx])); + + if (tx == 0) { + *dst_attn = y*scale; + } + + q_ptr += args.ns02; + k_ptr += args.ns12; + v_ptr += args.ns22; + + b_ptr += args.ne21; + g_ptr += args.ne21*G; + + dst_attn += args.ne21*S_v; + } + + device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20; + device T * dstt_state = (device T *) (dst_state); + + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + dst_state[is*S_v] = lsf[j]; + } + +#undef S_v +#undef G +} + +typedef decltype(kernel_gated_delta_net_impl) kernel_gated_delta_net_t; + +template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl; +template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl; +template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl; +#endif diff --git a/ggml/src/ggml-metal/kernels/misc.metal b/ggml/src/ggml-metal/kernels/misc.metal new file mode 100644 index 0000000000..fe5daa92a1 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/misc.metal @@ -0,0 +1,347 @@ +#include "common.h" + +kernel void kernel_argmax_f32( + constant ggml_metal_kargs_argmax & args, + device const char * src0, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * x_row = (device const float *) ((device const char *) src0 + tgpig * args.nb01); + + float lmax = -INFINITY; + int32_t larg = -1; + + for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) { + if (x_row[i00] > lmax) { + lmax = x_row[i00]; + larg = i00; + } + } + + // find the argmax value in the block + float max_val = simd_max(lmax); + int32_t arg_val = simd_max(select(-1, larg, lmax == max_val)); + + device int32_t * dst_i32 = (device int32_t *) dst; + + threadgroup float * shared_maxval = (threadgroup float *) shmem; + threadgroup int32_t * shared_argmax = (threadgroup int32_t *) shmem + N_SIMDWIDTH; + + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + shared_maxval[tiisg] = -INFINITY; + shared_argmax[tiisg] = -1; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shared_maxval[sgitg] = max_val; + shared_argmax[sgitg] = arg_val; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max_val = shared_maxval[tiisg]; + arg_val = shared_argmax[tiisg]; + + float max_val_reduced = simd_max(max_val); + int32_t arg_val_reduced = simd_max(select(-1, arg_val, max_val == max_val_reduced)); + + dst_i32[tgpig] = arg_val_reduced; + + return; + } + + dst_i32[tgpig] = arg_val; +} + +kernel void kernel_diag_f32( + constant ggml_metal_kargs_diag & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiitg[[thread_index_in_threadgroup]]) { + constexpr short NW = N_SIMDWIDTH; + + const int32_t i3 = tgpig.z; + const int32_t i2 = tgpig.y; + const int32_t i1 = tgpig.x; + + device const float * src0_ptr = (device const float *)(src0 + i2*args.nb02 + i3*args.nb03); + device float * dst_ptr = (device float *)(dst + i1*args.nb01 + i2*args.nb2 + i3*args.nb3); + + for (int i0 = tiitg; i0 < args.ne0; i0 += NW) { + dst_ptr[i0] = i0 == i1 ? src0_ptr[i0] : 0.0f; + } +} + +kernel void kernel_roll_f32( + constant ggml_metal_kargs_roll & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + device const float * src0_ptr = (device const float *) src0; + device float * dst_ptr = (device float *) dst; + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + // apply shifts and wrap around + int64_t i00 = i0 - args.s0; + int64_t i01 = i1 - args.s1; + int64_t i02 = i2 - args.s2; + int64_t i03 = i3 - args.s3; + + if (i00 < 0) { i00 += args.ne00; } else if (i00 >= args.ne00) { i00 -= args.ne00; } + if (i01 < 0) { i01 += args.ne01; } else if (i01 >= args.ne01) { i01 -= args.ne01; } + if (i02 < 0) { i02 += args.ne02; } else if (i02 >= args.ne02) { i02 -= args.ne02; } + if (i03 < 0) { i03 += args.ne03; } else if (i03 >= args.ne03) { i03 -= args.ne03; } + + int64_t src_idx = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00 + i00; + int64_t dst_idx = i3 *args.ne2 *args.ne1 *args.ne0 + i2 *args.ne1 *args.ne0 + i1 *args.ne0 + i0; + + dst_ptr[dst_idx] = src0_ptr[src_idx]; + } +} + +template +kernel void kernel_pad_impl( + constant ggml_metal_kargs_pad & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int32_t i3 = tgpig.z; + const int32_t i2 = tgpig.y; + const int32_t k0 = tgpig.x/args.ne1; + const int32_t i1 = tgpig.x - k0*args.ne1; + + const int32_t i03 = i3; + const int32_t i02 = i2; + const int32_t i01 = i1; + + device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + device T * dst_ptr = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); + + for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) { + const int32_t i0 = k0*1024 + tpitg.x + l0; + if (i0 >= args.ne0) { + break; + } + + if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { + dst_ptr[i0] = src0_ptr[i0]; + } else { + dst_ptr[i0] = 0.0f; + } + } +} + +typedef decltype(kernel_pad_impl) kernel_pad_t; + +template [[host_name("kernel_pad_f32")]] kernel kernel_pad_t kernel_pad_impl; +template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl; + +// TODO: this is slow - optimize +kernel void kernel_pad_reflect_1d_f32( + constant ggml_metal_kargs_pad_reflect_1d & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + const int64_t i03 = i3; + const int64_t i02 = i2; + const int64_t i01 = i1; + + device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); + + if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + if (i0 < args.p0) { + dst_ptr[i0] = src0_ptr[args.p0 - i0]; + } else if (i0 < args.ne0 - args.p1) { + dst_ptr[i0] = src0_ptr[i0 - args.p0]; + } else { + dst_ptr[i0] = src0_ptr[(args.ne0 - args.p1 - args.p0) - (args.p1 + 1 - (args.ne0 - i0)) - 1]; + } + } + } +} + +kernel void kernel_arange_f32( + constant ggml_metal_kargs_arange & args, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + device float * dst_ptr = (device float *) dst; + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + dst_ptr[i0] = args.start + args.step * i0; + } +} + +kernel void kernel_timestep_embedding_f32( + constant ggml_metal_kargs_timestep_embedding & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + int i = tgpig.x; + device float * embed_data = (device float *)(dst + i*args.nb1); + + int half_ = args.dim / 2; + for (int j = tpitg.x; j < half_; j += ntg.x) { + float timestep = ((device float *)src0)[i]; + float freq = (float)exp(-log((float)args.max_period) * j / half_); + float arg = timestep * freq; + embed_data[j ] = cos(arg); + embed_data[j + half_] = sin(arg); + } + + if (args.dim % 2 != 0 && tpitg.x == 0) { + embed_data[2 * half_] = 0.f; + } +} + +kernel void kernel_opt_step_adamw_f32( + constant ggml_metal_kargs_opt_step_adamw & args, + device float * x, + device const float * g, + device float * g_m, + device float * g_v, + device const float * pars, + uint gid[[thread_position_in_grid]]) { + + if (gid >= args.np) { + return; + } + + const float alpha = pars[0]; + const float beta1 = pars[1]; + const float beta2 = pars[2]; + const float eps = pars[3]; + const float wd = pars[4]; + const float beta1h = pars[5]; + const float beta2h = pars[6]; + + const float gi = g[gid]; + const float gmi = g_m[gid] * beta1 + gi * (1.0f - beta1); + const float gvi = g_v[gid] * beta2 + gi * gi * (1.0f - beta2); + + g_m[gid] = gmi; + g_v[gid] = gvi; + + const float mh = gmi * beta1h; + const float vh = sqrt(gvi * beta2h) + eps; + + x[gid] = x[gid] * (1.0f - alpha * wd) - alpha * mh / vh; +} + +kernel void kernel_opt_step_sgd_f32( + constant ggml_metal_kargs_opt_step_sgd & args, + device float * x, + device const float * g, + device const float * pars, + uint gid[[thread_position_in_grid]]) { + + if (gid >= args.np) { + return; + } + + x[gid] = x[gid] * (1.0f - pars[0] * pars[1]) - pars[0] * g[gid]; +} + +template +kernel void kernel_memset( + constant ggml_metal_kargs_memset & args, + device T * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = args.val; +} + +typedef decltype(kernel_memset) kernel_memset_t; + +template [[host_name("kernel_memset_i64")]] kernel kernel_memset_t kernel_memset; + +constant short FC_count_equal_nsg [[function_constant(FC_COUNT_EQUAL + 0)]]; + +template +kernel void kernel_count_equal( + constant ggml_metal_kargs_count_equal & args, + device const char * src0, + device const char * src1, + device atomic_int * dst, + threadgroup int32_t * shmem_i32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const short NSG = FC_count_equal_nsg; + + const int i3 = tgpig.z; + const int i2 = tgpig.y; + const int i1 = tgpig.x; + + if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { + return; + } + + int sum = 0; + + device const char * base0 = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03; + device const char * base1 = src1 + i1*args.nb11 + i2*args.nb12 + i3*args.nb13; + + for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { + const T v0 = *(device const T *)(base0 + i0*args.nb00); + const T v1 = *(device const T *)(base1 + i0*args.nb10); + sum += (v0 == v1); + } + + sum = simd_sum(sum); + + if (tiisg == 0) { + shmem_i32[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (sgitg == 0) { + float v = 0.0f; + if (tpitg.x < NSG) { + v = shmem_i32[tpitg.x]; + } + + float total = simd_sum(v); + if (tpitg.x == 0) { + atomic_fetch_add_explicit(dst, (int32_t) total, memory_order_relaxed); + } + } +} + +typedef decltype(kernel_count_equal) kernel_count_equal_t; + +template [[host_name("kernel_count_equal_i32")]] kernel kernel_count_equal_t kernel_count_equal; diff --git a/ggml/src/ggml-metal/kernels/mul_mm.metal b/ggml/src/ggml-metal/kernels/mul_mm.metal new file mode 100644 index 0000000000..66d709df0c --- /dev/null +++ b/ggml/src/ggml-metal/kernels/mul_mm.metal @@ -0,0 +1,838 @@ +#include "common.h" +#include "dequantize.h" + +constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]]; +constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]]; +constant short FC_mul_mm_ne12 [[function_constant(FC_MUL_MM + 2)]]; +constant short FC_mul_mm_ne13 [[function_constant(FC_MUL_MM + 3)]]; +constant short FC_mul_mm_r2 [[function_constant(FC_MUL_MM + 4)]]; +constant short FC_mul_mm_r3 [[function_constant(FC_MUL_MM + 5)]]; + +// each block_q contains 16*nl weights +#ifdef GGML_METAL_HAS_TENSOR +template< + typename SA, typename SA_4x4, typename SA_8x8, + typename SB, typename SB_2x4, typename SB_8x8, + typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread SA_4x4 &), + typename T0, typename T0_4x4, typename T1, typename T1_2x4> +kernel void kernel_mul_mm( + constant ggml_metal_kargs_mul_mm & args, + device const char * srcA, + device const char * srcB, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig [[threadgroup_position_in_grid]], + ushort tiitg [[thread_index_in_threadgroup]], + ushort sgitg [[simdgroup_index_in_threadgroup]]) { + (void) sgitg; + + // Matrix dimensions: A(M,K) x B(K,N) -> C(M,N) + const int K = args.ne00; + const int M = args.ne0; + const int N = args.ne1; + + // Batch dimension handling + const int im = tgpig.z; + const int i12 = im % FC_mul_mm_ne12; + const int i13 = im / FC_mul_mm_ne12; + + // Batch offsets for srcA and srcB + const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03; + + // Tile dimensions + constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X; + constexpr int NRA = SZ_SIMDGROUP * N_MM_BLOCK_Y * N_MM_SIMD_GROUP_Y; + + // Tile offsets in output matrix + const int ra = tgpig.y * NRA; + const int rb = tgpig.x * NRB; + + // Threadgroup memory for dequantized A tile only + threadgroup SA * sa = (threadgroup SA *)(shmem); + + // Work-item count for A loading + constexpr int A_WORK_ITEMS = NRA * N_MM_NK; + constexpr int NUM_THREADS = N_SIMDWIDTH * N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y; + + // tA wraps threadgroup memory + auto tA = tensor(sa, dextents(N_MM_NK_TOTAL, NRA)); + + // tB wraps device memory directly + device T1 * ptrB = (device T1 *)(srcB + args.nb12*i12 + args.nb13*i13); + const int strideB = args.nb11 / sizeof(T1); + auto tB = tensor(ptrB, dextents(K, N), array({1, strideB})); + + // Configure matmul operation + mpp::tensor_ops::matmul2d< + mpp::tensor_ops::matmul2d_descriptor( + NRB, NRA, N_MM_NK_TOTAL, false, true, true, + mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate), + execution_simdgroups> mm; + + auto cT = mm.get_destination_cooperative_tensor(); + + // Accumulate partial results over K dimension + for (int loop_k = 0; loop_k < K; loop_k += N_MM_NK_TOTAL) { + // === PHASE 1: Dequantization of A into threadgroup memory === + for (int work = tiitg; work < A_WORK_ITEMS; work += NUM_THREADS) { + const int row = work / N_MM_NK; + const int k_chunk = work % N_MM_NK; + const int k_pos = loop_k + k_chunk * 16; + const short k_base = k_chunk * 16; + + // Bounds check: skip device read if row is out of matrix bounds + if (ra + row < M) { + if (is_same::value && FC_mul_mm_bc_inp) { + // Element-wise reads when K is not aligned (nb01 not aligned for half4x4/float4x4). + // MSL spec Table 2.5: half4x4 requires 8-byte alignment. When K is odd, + // nb01 = K*2 is not 8-byte aligned, so odd-row pointers are misaligned. + // Mirrors the legacy kernel's existing guard. + device const T0 * row_ptr = (device const T0 *)(srcA + args.nb01 * (ra + row) + offset0); + + FOR_UNROLL (short i = 0; i < 16; i++) { + sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? (SA) row_ptr[k_pos + i] : (SA)0; + } + } else { + const int block_idx = k_pos / (16 * nl); + const short il = (k_pos / 16) % nl; + + device const block_q * row_ptr = (device const block_q *)(srcA + args.nb01 * (ra + row) + offset0); + + SA_4x4 temp_a; + dequantize_func(row_ptr + block_idx, il, temp_a); + + FOR_UNROLL (short i = 0; i < 16; i++) { + // Zero-pad A for K positions beyond valid range (handles partial K iterations) + sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? temp_a[i/4][i%4] : (SA)0; + } + } + } else { + // Zero-pad rows beyond matrix bounds + FOR_UNROLL (short i = 0; i < 16; i++) { + sa[row * N_MM_NK_TOTAL + (k_base + i)] = (SA)0; + } + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // === PHASE 2: Tensor matmul === + auto mA = tA.slice(0, 0); + auto mB = tB.slice(loop_k, rb); + + mm.run(mB, mA, cT); + + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + // Store result tile to output matrix (with batch offset) + // cT.store handles bounds checking via tD's extents (M, N) + device float * dstBatch = (device float *)dst + im * N * M; + + auto tD = tensor(dstBatch, dextents(M, N), array({1, M})); + cT.store(tD.slice(ra, rb)); +} + +#else + +template< + typename S0, typename S0_4x4, typename S0_8x8, + typename S1, typename S1_2x4, typename S1_8x8, + typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), + typename T0, typename T0_4x4, typename T1, typename T1_2x4> +kernel void kernel_mul_mm( + constant ggml_metal_kargs_mul_mm & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiitg[[thread_index_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + threadgroup S0 * sa = (threadgroup S0 *)(shmem); + threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096); + + constexpr int NR0 = 64; + constexpr int NR1 = 32; + + constexpr int NK = 32; + constexpr int NL0 = NK/16; + constexpr int NL1 = NK/8; + + const int im = tgpig.z; + const int r0 = tgpig.y*NR0; + const int r1 = tgpig.x*NR1; + + // if this block is of 64x32 shape or smaller + const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0; + const short nr1 = (args.ne1 - r1 < NR1) ? (args.ne1 - r1) : NR1; + + // a thread shouldn't load data outside of the matrix + const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63 + const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31 + + const short il0 = (tiitg % NL0); + + short il = il0; + + const int i12 = im % FC_mul_mm_ne12; + const int i13 = im / FC_mul_mm_ne12; + + const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03; + const short offset1 = il0/nl; + + device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1; + + const short iy = 8*(tiitg % NL1); + + device const T1 * y = (device const T1 *)(src1 + + args.nb13*i13 + + args.nb12*i12 + + args.nb11*(r1 + lr1) + + args.nb10*iy); + + S0_8x8 ma[4]; + S1_8x8 mb[2]; + + simdgroup_float8x8 mc[8]; + + for (short i = 0; i < 8; i++){ + mc[i] = make_filled_simdgroup_matrix(0.f); + } + + for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) { + // load data and store to threadgroup memory + if (is_same::value && FC_mul_mm_bc_inp) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // no need for dequantization + for (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + //const short lx = i%8; + //const short ly = (tiitg/NL0)%8; + const short lx = (tiitg/NL0)%8; + const short ly = i%8; + + const short ib = 8*sx + sy; + + *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0; + } + } else { + S0_4x4 temp_a; + dequantize_func(x, il, temp_a); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + FOR_UNROLL (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + //const short lx = i%8; + //const short ly = (tiitg/NL0)%8; + const short lx = (tiitg/NL0)%8; + const short ly = i%8; + + const short ib = 8*sx + sy; + + // NOTE: this is massively slower.. WTF? + //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4]; + + *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4]; + } + } + + if (FC_mul_mm_bc_inp) { + for (short i = 0; i < 8; ++i) { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + const short ib = 4*sx + sy; + + *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; + } + } else { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + //const short dx = sx; + //const short dy = sy; + + const short ly = (tiitg/NL1)%8; + + const short ib = 4*sx + sy; + + *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y)); + } + + il = (il + 2 < nl) ? il + 2 : il % 2; + x = (il < 2) ? x + (2 + nl - 1)/nl : x; + + y += NK; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // load matrices from threadgroup memory and conduct outer products + threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2)); + threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2)); + + FOR_UNROLL (short ik = 0; ik < NK/8; ik++) { + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 4; i++) { + simdgroup_load(ma[i], lsma + 64*i, 8, 0, false); + } + + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 2; i++) { + simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false); + } + + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); + } + + lsma += 8*64; + lsmb += 4*64; + } + } + + if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) { + // if no bounds checks on the output are needed, we can directly write to device memory + device float * C = (device float *) dst + + (r0 + 32*(sgitg & 1)) + \ + (r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; + + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix + threadgroup_barrier(mem_flags::mem_threadgroup); + + threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0; + + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (sgitg == 0) { + for (int j = tiitg; j < nr1; j += NR1) { + device float * D = (device float *) dst + r0 + (r1 + j)*args.ne0 + im*args.ne1*args.ne0; + device float4 * D4 = (device float4 *) D; + + threadgroup float * C = temp_str + (j*NR0); + threadgroup float4 * C4 = (threadgroup float4 *) C; + + int i = 0; + for (; i < nr0/4; i++) { + *(D4 + i) = *(C4 + i); + } + + i *= 4; + for (; i < nr0; i++) { + *(D + i) = *(C + i); + } + } + } + } +} + +#endif // GGML_METAL_HAS_TENSOR + +template // n_expert_used +kernel void kernel_mul_mm_id_map0( + constant ggml_metal_kargs_mul_mm_id_map0 & args, + device const char * src2, + device char * htpe, + device char * hids, + threadgroup char * shmem [[threadgroup(0)]], + ushort tpitg[[thread_position_in_threadgroup]], + ushort ntg[[threads_per_threadgroup]]) { + const short ide = tpitg; // expert id + + uint32_t n_all = 0; + + device int32_t * ids_i32 = (device int32_t *) hids + ide*args.ne21; + + for (int i21 = 0; i21 < args.ne21; i21 += ntg) { // n_tokens + if (i21 + tpitg < args.ne21) { + device const int32_t * src2_i32 = (device const int32_t *) (src2 + (i21 + tpitg)*args.nb21); + + threadgroup uint16_t * sids = (threadgroup uint16_t *) shmem + tpitg*ne20; + + #pragma unroll(ne20) + for (short i20 = 0; i20 < ne20; i20++) { + sids[i20] = src2_i32[i20]; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (short t = 0; t < ntg; t++) { + if (i21 + t >= args.ne21) { + break; + } + + threadgroup const uint16_t * sids = (threadgroup const uint16_t *) shmem + t*ne20; + + short sel = 0; + #pragma unroll(ne20) + for (short i20 = 0; i20 < ne20; i20++) { + sel += (sids[i20] == ide)*(i20 + 1); + } + + ids_i32[n_all] = (i21 + t)*ne20 + sel - 1; + + n_all += sel > 0; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + device uint32_t * tpe_u32 = (device uint32_t *) (htpe); + tpe_u32[ide] = n_all; +} + +typedef decltype(kernel_mul_mm_id_map0<1>) kernel_mul_mm_id_map0_t; + +template [[host_name("kernel_mul_mm_id_map0_ne20_1" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<1>; +template [[host_name("kernel_mul_mm_id_map0_ne20_2" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<2>; +template [[host_name("kernel_mul_mm_id_map0_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>; +template [[host_name("kernel_mul_mm_id_map0_ne20_5" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<5>; +template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>; +template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>; +template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>; +template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>; +template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>; + +template +kernel void kernel_mul_mm_id( + constant ggml_metal_kargs_mul_mm_id & args, + device const char * src0, + device const char * src1, + device const char * htpe, + device const char * hids, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiitg[[thread_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + threadgroup S0 * sa = (threadgroup S0 *)(shmem); + threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096); + +#ifdef GGML_METAL_HAS_TENSOR + threadgroup float * sc = (threadgroup float *)(shmem); +#endif + + constexpr int NR0 = 64; + constexpr int NR1 = 32; + + constexpr int NK = 32; + constexpr int NL0 = NK/16; + constexpr int NL1 = NK/8; + + const int im = tgpig.z; // expert + const int r0 = tgpig.y*NR0; + const int r1 = tgpig.x*NR1; + + device const uint32_t * tpe_u32 = (device const uint32_t *) (htpe); + device const int32_t * ids_i32 = (device const int32_t *) (hids); + + const int32_t neh1 = tpe_u32[im]; + + if (r1 >= neh1) { + return; + } + + // if this block is of 64x32 shape or smaller + const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0; + const short nr1 = ( neh1 - r1 < NR1) ? ( neh1 - r1) : NR1; + + // a thread shouldn't load data outside of the matrix + const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63 + const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31 + + const short il0 = (tiitg % NL0); + + short il = il0; + + const int id = ids_i32[im*args.ne21 + r1 + lr1]; + + const short i11 = (id % args.ne20) % args.ne11; + const short i12 = (id / args.ne20); + const short i13 = 0; + + const uint64_t offset0 = im*args.nb02 + i13*args.nb03; + const short offset1 = il0/nl; + + device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1; + + const short iy = 8*(tiitg % NL1); + + device const T1 * y = (device const T1 *)(src1 + + args.nb13*i13 + + args.nb12*i12 + + args.nb11*i11 + + args.nb10*iy); + +#ifndef GGML_METAL_HAS_TENSOR + S0_8x8 ma[4]; + S1_8x8 mb[2]; + + simdgroup_float8x8 mc[8]; + + for (short i = 0; i < 8; i++){ + mc[i] = make_filled_simdgroup_matrix(0.f); + } +#else + auto tA = tensor, tensor_inline>(sa, dextents(NK, NR0)); + auto tB = tensor, tensor_inline>(sb, dextents(NR1, NK )); + + mpp::tensor_ops::matmul2d< + mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate), + execution_simdgroups<4>> mm; + + auto cT = mm.get_destination_cooperative_tensor(); +#endif + + for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) { +#ifndef GGML_METAL_HAS_TENSOR + // load data and store to threadgroup memory + if (is_same::value && FC_mul_mm_bc_inp) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // no need for dequantization + for (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + //const short lx = i%8; + //const short ly = (tiitg/NL0)%8; + const short lx = (tiitg/NL0)%8; + const short ly = i%8; + + const short ib = 8*sx + sy; + + *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? (S0) *((device T0 *) x + i) : (S0) 0; + } + } else { + S0_4x4 temp_a; + dequantize_func(x, il, temp_a); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + FOR_UNROLL (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + //const short lx = i%8; + //const short ly = (tiitg/NL0)%8; + const short lx = (tiitg/NL0)%8; + const short ly = i%8; + + const short ib = 8*sx + sy; + + // NOTE: this is massively slower.. WTF? + //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4]; + + *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4]; + } + } + + if (FC_mul_mm_bc_inp) { + for (short i = 0; i < 8; ++i) { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + const short ib = 4*sx + sy; + + *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; + } + } else { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + //const short dx = sx; + //const short dy = sy; + + const short ly = (tiitg/NL1)%8; + + const short ib = 4*sx + sy; + + *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y)); + } +#else + // load data and store to threadgroup memory + if (is_same::value && FC_mul_mm_bc_inp) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // no need for dequantization + for (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + const short lx = i%8; + const short ly = (tiitg/NL0)%8; + //const short lx = (tiitg/NL0)%8; + //const short ly = i%8; + + *(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0; + } + } else { + S0_4x4 temp_a; + dequantize_func(x, il, temp_a); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + FOR_UNROLL (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + const short lx = i%8; + const short ly = (tiitg/NL0)%8; + //const short lx = (tiitg/NL0)%8; + //const short ly = i%8; + + *(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4]; + } + } + + if (FC_mul_mm_bc_inp) { + for (short i = 0; i < 8; ++i) { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + *(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; + } + } else { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + //const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + *(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y)); + } +#endif + + il = (il + 2 < nl) ? il + 2 : il % 2; + x = (il < 2) ? x + (2 + nl - 1)/nl : x; + + y += NK; + + threadgroup_barrier(mem_flags::mem_threadgroup); + +#ifndef GGML_METAL_HAS_TENSOR + // load matrices from threadgroup memory and conduct outer products + threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2)); + threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2)); + + FOR_UNROLL (short ik = 0; ik < NK/8; ik++) { + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 4; i++) { + simdgroup_load(ma[i], lsma + 64*i, 8, 0, false); + } + + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 2; i++) { + simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false); + } + + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); + } + + lsma += 8*64; + lsmb += 4*64; + } +#else + auto sA = tA.slice(0, 0); + auto sB = tB.slice(0, 0); + + mm.run(sB, sA, cT); +#endif + } + + // block is smaller than 64x32, we should avoid writing data outside of the matrix + threadgroup_barrier(mem_flags::mem_threadgroup); + +#ifdef GGML_METAL_HAS_TENSOR + auto tC = tensor, tensor_inline>(sc, dextents(NR0, NR1)); + cT.store(tC); +#else + threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0; + + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false); + } +#endif + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (short j = sgitg; j < nr1; j += 4) { + const int id = ids_i32[im*args.ne21 + r1 + j]; + + const short ide = id % args.ne20; + const short idt = id / args.ne20; + + device float * D = (device float *) dst + r0 + ide*args.ne0 + idt*args.ne1*args.ne0; + device float4 * D4 = (device float4 *) D; + + threadgroup float * C = (threadgroup float *) shmem + j*NR0; + threadgroup float4 * C4 = (threadgroup float4 *) C; + + int i = tiisg; + for (; i < nr0/4; i += 32) { + *(D4 + i) = *(C4 + i); + } + + i = (4*(nr0/4)) + tiisg; + for (; i < nr0; i += 32) { + *(D + i) = *(C + i); + } + } +} + +// +// matrix-matrix multiplication +// + +typedef decltype(kernel_mul_mm) mul_mm_t; + +template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm; +#endif +template [[host_name("kernel_mul_mm_q1_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_mxfp4_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm; + +template [[host_name("kernel_mul_mm_f32_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q1_0_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_mxfp4_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xxs_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_xxs_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_s_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_s_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_s_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_m_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_xs_f16")]] kernel mul_mm_t kernel_mul_mm; + +// +// indirect matrix-matrix multiplication +// + +typedef decltype(kernel_mul_mm_id) mul_mm_id; + +template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mul_mm_id kernel_mul_mm_id; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mul_mm_id kernel_mul_mm_id; +#endif +template [[host_name("kernel_mul_mm_id_q1_0_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_mxfp4_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mul_mm_id kernel_mul_mm_id; + +template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q1_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_mxfp4_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; diff --git a/ggml/src/ggml-metal/kernels/mul_mv.metal b/ggml/src/ggml-metal/kernels/mul_mv.metal new file mode 100644 index 0000000000..51fad35dec --- /dev/null +++ b/ggml/src/ggml-metal/kernels/mul_mv.metal @@ -0,0 +1,2974 @@ +#include "common.h" +#include "dequantize.h" +// Q1_0 dot product: dot = d * (2 * Σ(yl[i] where bit=1) - sumy) +inline float block_q_n_dot_y(device const block_q1_0 * qb_curr, float sumy, thread float * yl, int il) { + device const uint8_t * qs = qb_curr->qs + il / 8; + const uint8_t b0 = qs[0]; + const uint8_t b1 = qs[1]; + + float acc = 0.0f; + + acc += select(0.0f, yl[ 0], bool(b0 & 0x01)); + acc += select(0.0f, yl[ 1], bool(b0 & 0x02)); + acc += select(0.0f, yl[ 2], bool(b0 & 0x04)); + acc += select(0.0f, yl[ 3], bool(b0 & 0x08)); + acc += select(0.0f, yl[ 4], bool(b0 & 0x10)); + acc += select(0.0f, yl[ 5], bool(b0 & 0x20)); + acc += select(0.0f, yl[ 6], bool(b0 & 0x40)); + acc += select(0.0f, yl[ 7], bool(b0 & 0x80)); + + acc += select(0.0f, yl[ 8], bool(b1 & 0x01)); + acc += select(0.0f, yl[ 9], bool(b1 & 0x02)); + acc += select(0.0f, yl[10], bool(b1 & 0x04)); + acc += select(0.0f, yl[11], bool(b1 & 0x08)); + acc += select(0.0f, yl[12], bool(b1 & 0x10)); + acc += select(0.0f, yl[13], bool(b1 & 0x20)); + acc += select(0.0f, yl[14], bool(b1 & 0x40)); + acc += select(0.0f, yl[15], bool(b1 & 0x80)); + + return qb_curr->d * (2.0f * acc - sumy); +} + +// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q4 quants begin (0 or QK4_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + + float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; + + device const uint16_t * qs = ((device const uint16_t *) qb_curr + 1 + il/2); + + for (int i = 0; i < 8; i += 2) { + acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F); + acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0); + acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000); + } + + return d * (sumy * -8.f + acc[0] + acc[1] + acc[2] + acc[3]); +} + +// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q4 quants begin (0 or QK4_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + float m = qb_curr->m; + + float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; + + device const uint16_t * qs = ((device const uint16_t *) qb_curr + 2 + il/2); + + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F); + acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0); + acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000); + } + + return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m; +} + +// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q5 quants begin (0 or QK5_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + + float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; + + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 3 + il/2); + const uint32_t qh = *((device const uint32_t *)qb_curr->qh); + + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)); + acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); + acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)); + acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); + } + + return d * (sumy * -16.f + acc[0] + acc[1] + acc[2] + acc[3]); +} + +// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q5 quants begin (0 or QK5_1/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + float m = qb_curr->m; + + float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; + + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 4 + il/2); + const uint32_t qh = *((device const uint32_t *)qb_curr->qh); + + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)); + acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); + acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)); + acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); + } + + return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m; +} + +template +static inline void helper_mv_reduce_and_write( + device float * dst_f32, + float sumf[NR0], + const int r0, + const int ne01, + ushort tiisg, + ushort sgitg, + threadgroup char * shmem) { + constexpr short NW = N_SIMDWIDTH; + + threadgroup float * shmem_f32[NR0]; + + for (short row = 0; row < NR0; ++row) { + shmem_f32[row] = (threadgroup float *) shmem + NW*row; + + if (sgitg == 0) { + shmem_f32[row][tiisg] = 0.0f; + } + + sumf[row] = simd_sum(sumf[row]); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (short row = 0; row < NR0; ++row) { + if (tiisg == 0) { + shmem_f32[row][sgitg] = sumf[row]; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (short row = 0; row < NR0 && r0 + row < ne01; ++row) { + float tot = simd_sum(shmem_f32[row][tiisg]); + + if (tiisg == 0 && sgitg == 0) { + dst_f32[r0 + row] = tot; + } + } +} + +constant short FC_mul_mv_nsg [[function_constant(FC_MUL_MV + 0)]]; +constant short FC_mul_mv_nxpsg [[function_constant(FC_MUL_MV + 1)]]; +constant short FC_mul_mv_ne12 [[function_constant(FC_MUL_MV + 2)]]; +constant short FC_mul_mv_r2 [[function_constant(FC_MUL_MV + 3)]]; +constant short FC_mul_mv_r3 [[function_constant(FC_MUL_MV + 4)]]; + +template +void mul_vec_q_n_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + constexpr short NW = N_SIMDWIDTH; + constexpr short NQ = 16; + + const int nb = args.ne00/QK4_0; + + const int r0 = (tgpig.x*NSG + sgitg)*NR0; + //const int r0 = tgpig.x*NR0; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + //device const block_q_type * x = (device const block_q_type *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + // pointers to src0 rows + device const block_q_type * ax[NR0]; + FOR_UNROLL (int row = 0; row < NR0; ++row) { + const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + + ax[row] = (device const block_q_type *) ((device char *) src0 + offset0); + } + + float sumf[NR0] = {0.f}; + + const short ix = (tiisg/(NW/NQ)); + const short il = (tiisg%(NW/NQ))*8; + + //const int ib0 = sgitg*NQ + ix; + const int ib0 = ix; + + float yl[16]; // src1 vector cache + + //device const float * yb = y + ix*QK4_0 + il; + device const float * yb = y + ib0*QK4_0 + il; + + // each thread in a SIMD group deals with half a block. + //for (int ib = ib0; ib < nb; ib += NSG*NQ) { + for (int ib = ib0; ib < nb; ib += NQ) { + float sumy[2] = { 0.f, 0.f }; + + FOR_UNROLL (short i = 0; i < 8; i += 2) { + sumy[0] += yb[i + 0] + yb[i + 1]; + yl[i + 0] = yb[i + 0]; + yl[i + 1] = yb[i + 1]/256.f; + + sumy[1] += yb[i + 16] + yb[i + 17]; + yl[i + 8] = yb[i + 16]/16.f; + yl[i + 9] = yb[i + 17]/4096.f; + } + + FOR_UNROLL (short row = 0; row < NR0; row++) { + sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy[0] + sumy[1], yl, il); + } + + yb += QK4_0 * 16; + //yb += NSG*NQ*QK4_0; + } + + device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0; + + //helper_mv_reduce_and_write(dst_f32, sumf, r0, args.ne01, tiisg, sgitg, shmem); + + for (int row = 0; row < NR0; ++row) { + const float tot = simd_sum(sumf[row]); + + if (tiisg == 0 && r0 + row < args.ne01) { + dst_f32[r0 + row] = tot; + } + } +} + +template +void kernel_mul_mv_q1_0_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK1_0; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset1 = r1*args.nb11 + (i12)*args.nb12 + (i13)*args.nb13; + + device const float * y = (device const float *) (src1 + offset1); + + device const block_q1_0 * ax[nr0]; + for (int row = 0; row < nr0; ++row) { + const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + ax[row] = (device const block_q1_0 *) ((device char *) src0 + offset0); + } + + float yl[16]; + float sumf[nr0] = {0.f}; + + const short ix = (tiisg/8); + const short il = (tiisg%8)*16; + + device const float * yb = y + ix*QK1_0 + il; + + for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/8) { + float sumy = 0.f; + + FOR_UNROLL (short i = 0; i < 16; i++) { + yl[i] = yb[i]; + sumy += yb[i]; + } + + FOR_UNROLL (short row = 0; row < nr0; row++) { + sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy, yl, il); + } + + yb += QK1_0 * (N_SIMDWIDTH/8); + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0; ++row) { + const float tot = simd_sum(sumf[row]); + + if (tiisg == 0 && first_row + row < args.ne01) { + dst_f32[first_row + row] = tot; + } + } +} + +[[host_name("kernel_mul_mv_q1_0_f32")]] +kernel void kernel_mul_mv_q1_0_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_q1_0_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + +kernel void kernel_mul_mv_q4_0_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +kernel void kernel_mul_mv_q4_1_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +kernel void kernel_mul_mv_q5_0_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +kernel void kernel_mul_mv_q5_1_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_q8_0_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + constexpr short NW = N_SIMDWIDTH; + constexpr short NQ = 8; + + const int nb = args.ne00/QK8_0; + + const int r0 = tgpig.x*NR0; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + //device const block_q8_0 * x = (device const block_q8_0 *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + // pointers to src0 rows + device const block_q8_0 * ax[NR0]; + FOR_UNROLL (short row = 0; row < NR0; ++row) { + const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + + ax[row] = (device const block_q8_0 *) ((device char *) src0 + offset0); + } + + float sumf[NR0] = { 0.f }; + + const short ix = tiisg/(NW/NQ); + const short il = tiisg%(NW/NQ); + + const int ib0 = sgitg*NQ + ix; + + float yl[NQ]; + + device const float * yb = y + ib0*QK8_0 + il*NQ; + + // each thread in a SIMD group deals with NQ quants at a time + for (int ib = ib0; ib < nb; ib += NSG*NQ) { + for (short i = 0; i < NQ; ++i) { + yl[i] = yb[i]; + } + + for (short row = 0; row < NR0; row++) { + device const int8_t * qs = ax[row][ib].qs + il*NQ; + + float sumq = 0.f; + FOR_UNROLL (short i = 0; i < NQ; ++i) { + sumq += qs[i] * yl[i]; + } + + sumf[row] += sumq*ax[row][ib].d; + } + + yb += NSG*NQ*QK8_0; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + helper_mv_reduce_and_write(dst_f32, sumf, r0, args.ne01, tiisg, sgitg, shmem); +} + +[[host_name("kernel_mul_mv_q8_0_f32")]] +kernel void kernel_mul_mv_q8_0_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_q8_0_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +// mat-vec kernel processing in chunks of float4 +// chpb - chunks per quantization block +template +void kernel_mul_mv_ext_q4_f32_impl( + constant ggml_metal_kargs_mul_mv_ext & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + const short NSG = FC_mul_mv_nsg; + const short nxpsg = FC_mul_mv_nxpsg; + + const short chpt = 4; // chunks per thread + + //const short nxpsg = (32); + const short nypsg = (32/nxpsg); + + const short tx = tiisg%nxpsg; + const short ty = tiisg/nxpsg; + + const int i01 = tgpig.x*(nypsg*NSG) + nypsg*sgitg + ty; + const int i11 = tgpig.y*r1ptg; + const int i1m = tgpig.z; + + const int i12 = i1m%FC_mul_mv_ne12; + const int i13 = i1m/FC_mul_mv_ne12; + + const uint64_t offset0 = i01*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = i11*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0; + + device const float4 * y4[r1ptg]; + + for (int ir1 = 0; ir1 < r1ptg; ++ir1) { + y4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4 *) src1; + } + + float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f }; + + short cch = tx%chpb; // current chunk index + + for (int ich = tx; 4*ich < args.ne00; ich += chpt*nxpsg) { + float4 lx[chpt]; + +#pragma unroll(chpt) + for (short ch = 0; ch < chpt; ++ch) { + deq_t4(xq, cch, lx[ch]); + + cch += nxpsg; + if (cch >= chpb) { + xq += cch/chpb; + cch %= chpb; + } + } + +#pragma unroll(chpt) + for (short ch = 0; ch < chpt; ++ch) { +#pragma unroll(r1ptg) + for (short ir1 = 0; ir1 < r1ptg; ++ir1) { + sumf[ir1] += dot(lx[ch], y4[ir1][ch*nxpsg]); + } + } + +#pragma unroll(r1ptg) + for (short ir1 = 0; ir1 < r1ptg; ++ir1) { + y4[ir1] += chpt*nxpsg; + } + } + + // reduce only the threads in each row + for (short ir1 = 0; ir1 < r1ptg; ++ir1) { + if (nxpsg >= 32) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 16); + } + if (nxpsg >= 16) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 8); + } + if (nxpsg >= 8) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 4); + } + if (nxpsg >= 4) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 2); + } + if (nxpsg >= 2) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 1); + } + + //sumf[ir1] = simd_sum(sumf[ir1]); + } + + if (tx == 0) { + for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) { + device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0; + + if (i01 < args.ne01) { + dst_f32[i01] = sumf[ir1]; + } + } + } +} + +// mat-vec kernel processing in chunks of float4x4 +template +void kernel_mul_mv_ext_q4x4_f32_impl( + constant ggml_metal_kargs_mul_mv_ext & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + const short NSG = FC_mul_mv_nsg; + const short nxpsg = FC_mul_mv_nxpsg; + + const short chpt = 1; + + //const short nxpsg = (32); + const short nypsg = (32/nxpsg); + + const short tx = tiisg%nxpsg; + const short ty = tiisg/nxpsg; + + const int i01 = tgpig.x*(nypsg*NSG) + nypsg*sgitg + ty; + const int i11 = tgpig.y*r1ptg; + const int i1m = tgpig.z; + + const int i12 = i1m%FC_mul_mv_ne12; + const int i13 = i1m/FC_mul_mv_ne12; + + const uint64_t offset0 = i01*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = i11*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0; + + device const float4x4 * y4x4[r1ptg]; + + for (int ir1 = 0; ir1 < r1ptg; ++ir1) { + y4x4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4x4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4x4 *) src1; + } + + float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f }; + + short cch = tx%chpb; + + for (int ich = tx; 16*ich < args.ne00; ich += chpt*nxpsg) { + float4x4 lx[chpt]; + +#pragma unroll(chpt) + for (short ch = 0; ch < chpt; ++ch) { + deq_t4x4(xq, cch, lx[ch]); + + cch += nxpsg; + if (cch >= chpb) { + xq += cch/chpb; + cch %= chpb; + } + } + +#pragma unroll(chpt) + for (short ch = 0; ch < chpt; ++ch) { +#pragma unroll(r1ptg) + for (short ir1 = 0; ir1 < r1ptg; ++ir1) { + sumf[ir1] += + dot(lx[ch][0], y4x4[ir1][ch*nxpsg][0]) + + dot(lx[ch][1], y4x4[ir1][ch*nxpsg][1]) + + dot(lx[ch][2], y4x4[ir1][ch*nxpsg][2]) + + dot(lx[ch][3], y4x4[ir1][ch*nxpsg][3]); + + } + } + +#pragma unroll(r1ptg) + for (short ir1 = 0; ir1 < r1ptg; ++ir1) { + y4x4[ir1] += chpt*nxpsg; + } + } + + for (short ir1 = 0; ir1 < r1ptg; ++ir1) { + if (nxpsg >= 32) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 16); + } + if (nxpsg >= 16) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 8); + } + if (nxpsg >= 8) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 4); + } + if (nxpsg >= 4) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 2); + } + if (nxpsg >= 2) { + sumf[ir1] += simd_shuffle_down(sumf[ir1], 1); + } + + //sumf[ir1] = simd_sum(sumf[ir1]); + } + + if (tx == 0) { + for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) { + device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0; + + if (i01 < args.ne01) { + dst_f32[i01] = sumf[ir1]; + } + } + } +} + +// dispatchers needed for compile-time nxpsg +// epb - elements per quantization block +template +kernel void kernel_mul_mv_ext_q4_f32_disp( + constant ggml_metal_kargs_mul_mv_ext & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_ext_q4_f32_impl(args, src0, src1, dst, tgpig, tiisg, sgitg); +} + +template +kernel void kernel_mul_mv_ext_q4x4_f32_disp( + constant ggml_metal_kargs_mul_mv_ext & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_ext_q4x4_f32_impl(args, src0, src1, dst, tgpig, tiisg, sgitg); +} + +typedef decltype(kernel_mul_mv_ext_q4_f32_disp <2, block_q8_0, 32, dequantize_q8_0_t4>) mul_mv_ext_q4_f32_t; +typedef decltype(kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>) mul_mv_ext_q4x4_f32_t; + +template [[host_name("kernel_mul_mv_ext_f32_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, float4, 4, dequantize_f32_t4>; +template [[host_name("kernel_mul_mv_ext_f32_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, float4, 4, dequantize_f32_t4>; +template [[host_name("kernel_mul_mv_ext_f32_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, float4, 4, dequantize_f32_t4>; +template [[host_name("kernel_mul_mv_ext_f32_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, float4, 4, dequantize_f32_t4>; + +template [[host_name("kernel_mul_mv_ext_f16_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, half4, 4, dequantize_f16_t4>; +template [[host_name("kernel_mul_mv_ext_f16_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, half4, 4, dequantize_f16_t4>; +template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4, 4, dequantize_f16_t4>; +template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4, 4, dequantize_f16_t4>; + +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, bfloat4, 4, dequantize_bf16_t4>; +template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, bfloat4, 4, dequantize_bf16_t4>; +template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, bfloat4, 4, dequantize_bf16_t4>; +template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, bfloat4, 4, dequantize_bf16_t4>; +#endif + +template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q1_0, 128, dequantize_q1_0_t4>; +template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q1_0, 128, dequantize_q1_0_t4>; +template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q1_0, 128, dequantize_q1_0_t4>; +template [[host_name("kernel_mul_mv_ext_q1_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q1_0, 128, dequantize_q1_0_t4>; + +template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>; +template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>; +template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>; +template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_0, 32, dequantize_q4_0_t4>; + +template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_1, 32, dequantize_q4_1_t4>; +template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_1, 32, dequantize_q4_1_t4>; +template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_1, 32, dequantize_q4_1_t4>; +template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_1, 32, dequantize_q4_1_t4>; + +template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_0, 32, dequantize_q5_0_t4>; +template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_0, 32, dequantize_q5_0_t4>; +template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_0, 32, dequantize_q5_0_t4>; +template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_0, 32, dequantize_q5_0_t4>; + +template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_1, 32, dequantize_q5_1_t4>; +template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_1, 32, dequantize_q5_1_t4>; +template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_1, 32, dequantize_q5_1_t4>; +template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_1, 32, dequantize_q5_1_t4>; + +template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q8_0, 32, dequantize_q8_0_t4>; +template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q8_0, 32, dequantize_q8_0_t4>; +template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q8_0, 32, dequantize_q8_0_t4>; +template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q8_0, 32, dequantize_q8_0_t4>; + +template [[host_name("kernel_mul_mv_ext_mxfp4_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_mxfp4, 32, dequantize_mxfp4_t4>; +template [[host_name("kernel_mul_mv_ext_mxfp4_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_mxfp4, 32, dequantize_mxfp4_t4>; +template [[host_name("kernel_mul_mv_ext_mxfp4_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_mxfp4, 32, dequantize_mxfp4_t4>; +template [[host_name("kernel_mul_mv_ext_mxfp4_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_mxfp4, 32, dequantize_mxfp4_t4>; + +template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_iq4_nl, 32, dequantize_iq4_nl_t4>; +template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_iq4_nl, 32, dequantize_iq4_nl_t4>; +template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_iq4_nl, 32, dequantize_iq4_nl_t4>; +template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_iq4_nl, 32, dequantize_iq4_nl_t4>; + +template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>; +template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q4_K, 256, dequantize_q4_K>; +template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q4_K, 256, dequantize_q4_K>; +template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q4_K, 256, dequantize_q4_K>; + +template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q5_K, 256, dequantize_q5_K>; +template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q5_K, 256, dequantize_q5_K>; +template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q5_K, 256, dequantize_q5_K>; +template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q5_K, 256, dequantize_q5_K>; + +template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q6_K, 256, dequantize_q6_K>; +template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q6_K, 256, dequantize_q6_K>; +template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>; +template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>; + +template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q2_K, 256, dequantize_q2_K>; +template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q2_K, 256, dequantize_q2_K>; +template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q2_K, 256, dequantize_q2_K>; +template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q2_K, 256, dequantize_q2_K>; + +template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q3_K, 256, dequantize_q3_K>; +template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q3_K, 256, dequantize_q3_K>; +template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q3_K, 256, dequantize_q3_K>; +template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q3_K, 256, dequantize_q3_K>; + +template +void kernel_mul_mv_t_t_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + constexpr short NW = N_SIMDWIDTH; + constexpr short NB = 32; + constexpr short NF = 8; + + const int nb = args.ne00/NB; + + const int r0 = tgpig.x*NR0; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + //device const T0 * x = (device const T0 *) (src0 + offset0); + device const T1 * y = (device const T1 *) (src1 + offset1); + + // pointers to src0 rows + device const T0 * ax [NR0]; + FOR_UNROLL (short row = 0; row < NR0; ++row) { + const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + + ax[row] = (device const T0 *) ((device char *) src0 + offset0); + } + + float sumf[NR0] = { 0.f }; + + const short ix = tiisg/(NW/NF); + const short il = tiisg%(NW/NF); + + const int ib0 = sgitg*NF + ix; + + T1 yl[NF]; + + device const T1 * yb = y + (ib0*NB + il*NF); + + for (int ib = ib0; ib < nb; ib += NSG*NF) { + for (short i = 0; i < NF; ++i) { + yl[i] = yb[i]; + } + + for (short row = 0; row < NR0; row++) { + device const T0 * xb = ax[row] + (ib*NB + il*NF); + + float sumq = 0.f; + FOR_UNROLL (short i = 0; i < NF; ++i) { + sumq += xb[i] * yl[i]; + } + + sumf[row] += sumq; + } + + yb += NSG*NF*NW; + } + + for (int i = nb*NB + sgitg*NW + tiisg; i < args.ne00; i += NW*NSG) { + for (short row = 0; row < NR0; row++) { + sumf[row] += ax[row][i] * y[i]; + } + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + helper_mv_reduce_and_write(dst_f32, sumf, r0, args.ne01, tiisg, sgitg, shmem); +} + +template +void kernel_mul_mv_t_t_disp( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + switch (args.nr0) { + //case 1: kernel_mul_mv_t_t_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; + case 2: kernel_mul_mv_t_t_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; + //case 3: kernel_mul_mv_t_t_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; + //case 4: kernel_mul_mv_t_t_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; + } +} + +template +kernel void kernel_mul_mv_t_t( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_t_t_disp(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +typedef decltype(kernel_mul_mv_t_t) mul_mv_t_t; + +template [[host_name("kernel_mul_mv_f32_f32")]] kernel mul_mv_t_t kernel_mul_mv_t_t; +template [[host_name("kernel_mul_mv_f16_f32")]] kernel mul_mv_t_t kernel_mul_mv_t_t; +template [[host_name("kernel_mul_mv_f16_f16")]] kernel mul_mv_t_t kernel_mul_mv_t_t; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t_t kernel_mul_mv_t_t; +template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t_t kernel_mul_mv_t_t; +#endif + +template +void kernel_mul_mv_t_t_4_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + constexpr short NW = N_SIMDWIDTH; + constexpr short NB = 32; + constexpr short NF = 16; + constexpr short NF4 = NF/4; + + const int nb = args.ne00/NB; + + const int r0 = tgpig.x*NR0; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const T1 * y = (device const T1 *) (src1 + offset1); + device const T14 * y4 = (device const T14 *) (src1 + offset1); + + // pointers to src0 rows + device const T0 * ax [NR0]; + device const T04 * ax4[NR0]; + FOR_UNROLL (short row = 0; row < NR0; ++row) { + const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + + ax [row] = (device const T0 *) ((device char *) src0 + offset0); + ax4[row] = (device const T04 *) ((device char *) src0 + offset0); + } + + float sumf[NR0] = { 0.f }; + + const short ix = tiisg/(NW/NF); + const short il = tiisg%(NW/NF); + + const int ib0 = sgitg*NF + ix; + + T14 yl4[NF4]; + + device const T14 * yb4 = y4 + (ib0*NB + il*NF)/4; + + for (int ib = ib0; ib < nb; ib += NSG*NF) { + for (short i = 0; i < NF4; ++i) { + yl4[i] = yb4[i]; + } + + for (short row = 0; row < NR0; row++) { + device const T04 * xb4 = ax4[row] + (ib*NB + il*NF)/4; + + float sumq = 0.f; + FOR_UNROLL (short i = 0; i < NF4; ++i) { + sumq += dot(float4(xb4[i]), float4(yl4[i])); + } + + sumf[row] += sumq; + } + + yb4 += NSG*NF*NW/4; + } + + for (int i = nb*NB + sgitg*NW + tiisg; i < args.ne00; i += NW*NSG) { + for (short row = 0; row < NR0; row++) { + sumf[row] += ax[row][i] * y[i]; + } + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + helper_mv_reduce_and_write(dst_f32, sumf, r0, args.ne01, tiisg, sgitg, shmem); +} + +template +void kernel_mul_mv_t_t_4_disp( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + switch (args.nr0) { + //case 1: kernel_mul_mv_t_t_4_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; + case 2: kernel_mul_mv_t_t_4_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; + //case 3: kernel_mul_mv_t_t_4_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; + //case 4: kernel_mul_mv_t_t_4_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); break; + }; +} + +template +kernel void kernel_mul_mv_t_t_4( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_t_t_4_disp(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +typedef decltype(kernel_mul_mv_t_t_4) mul_mv_t_t_4; + +template [[host_name("kernel_mul_mv_f32_f32_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; +template [[host_name("kernel_mul_mv_f16_f32_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; +template [[host_name("kernel_mul_mv_f16_f16_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_mul_mv_bf16_f32_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; +template [[host_name("kernel_mul_mv_bf16_bf16_4")]] kernel mul_mv_t_t_4 kernel_mul_mv_t_t_4; +#endif + +template +void kernel_mul_mv_t_t_short_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig, + ushort tiisg) { + const int r0 = tgpig.x*32 + tiisg; + const int r1 = tgpig.y; + const int im = tgpig.z; + + if (r0 >= args.ne01) { + return; + } + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + + device const T0 * x = (device const T0 *) (src0 + offset0); + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1; + + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const T1 * y = (device const T1 *) (src1 + offset1); + + float res = 0.0f; + + for (int i = 0; i < args.ne00; ++i) { + res += (float) x[i] * (float) y[i]; + } + + dst_f32[(uint64_t)r1*args.ne0 + r0] = res; +} + +template +kernel void kernel_mul_mv_t_t_short( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]]) { + kernel_mul_mv_t_t_short_impl( + args, + src0, + src1, + dst, + tgpig, + tiisg); +} + +typedef decltype(kernel_mul_mv_t_t_short) mul_mv_t_t_short_t; + +template [[host_name("kernel_mul_mv_f32_f32_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; +template [[host_name("kernel_mul_mv_f16_f32_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; +template [[host_name("kernel_mul_mv_f16_f16_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_mul_mv_bf16_f32_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; +template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short; +#endif + +template +void kernel_mul_mv_q2_K_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_q2_K * x = (device const block_q2_K *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[32]; + float sumf[nr0]={0.f}; + + const short ix = tiisg/8; // 0...3 + const short it = tiisg%8; // 0...7 + const short iq = it/4; // 0 or 1 + const short ir = it%4; // 0...3 + const short is = (8*ir)/16;// 0 or 1 + + device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir; + + for (int ib = ix; ib < nb; ib += 4) { + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (short i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+64]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+96]; sumy[3] += yl[i+24]; + } + + device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*iq + is; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; + device const half * dh = &x[ib].d; + + for (short row = 0; row < nr0; row++) { + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); + acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); + acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); + acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); + acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); + acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); + acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); + acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); + } + float dall = dh[0]; + float dmin = dh[1] * 1.f/16.f; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + + (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f + + (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f + + (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) - + dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0)); + + qs += args.nb01/2; + sc += args.nb01; + dh += args.nb01/2; + } + + y4 += 4 * QK_K; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_q2_K_f32")]] +kernel void kernel_mul_mv_q2_K_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q2_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_q3_K_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_q3_K * x = (device const block_q3_K *) (src0 + offset0); + device const float * yy = (device const float *) (src1 + offset1); + + float yl[32]; + + //const uint16_t kmask1 = 0x3030; + //const uint16_t kmask2 = 0x0f0f; + + const short tid = tiisg/4; + const short ix = tiisg%4; + const short ip = tid/4; // 0 or 1 + const short il = 2*((tid%4)/2); // 0 or 2 + const short ir = tid%2; + const short l0 = 8*ir; + + // One would think that the Metal compiler would figure out that ip and il can only have + // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it + // with these two tales. + // + // Possible masks for the high bit + const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200}, // ip = 0, il = 0 + {0x0004, 0x0400, 0x0008, 0x0800}, // ip = 0, il = 2 + {0x0010, 0x1000, 0x0020, 0x2000}, // ip = 1, il = 0 + {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2 + + // Possible masks for the low 2 bits + const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}}; + + const ushort4 hm = mm[2*ip + il/2]; + + const short shift = 2*il; + + const float v1 = il == 0 ? 4.f : 64.f; + const float v2 = 4.f * v1; + + const uint16_t s_shift1 = 4*ip; + const uint16_t s_shift2 = s_shift1 + il; + + const short q_offset = 32*ip + l0; + const short y_offset = 128*ip + 32*il + l0; + + device const float * y1 = yy + ix*QK_K + y_offset; + + uint32_t scales32, aux32; + thread uint16_t * scales16 = (thread uint16_t *)&scales32; + thread const int8_t * scales = (thread const int8_t *)&scales32; + + float sumf1[nr0] = {0.f}; + float sumf2[nr0] = {0.f}; + + for (int i = ix; i < nb; i += 4) { + for (short l = 0; l < 8; ++l) { + yl[l+ 0] = y1[l+ 0]; + yl[l+ 8] = y1[l+16]; + yl[l+16] = y1[l+32]; + yl[l+24] = y1[l+48]; + } + + device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset); + device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0); + device const uint16_t * a = (device const uint16_t *)(x[i].scales); + device const half * dh = &x[i].d; + + for (short row = 0; row < nr0; ++row) { + const float d_all = (float)dh[0]; + + scales16[0] = a[4]; + scales16[1] = a[5]; + aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030; + scales16[0] = a[il+0]; + scales16[1] = a[il+1]; + scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32; + + float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0; + for (short l = 0; l < 8; l += 2) { + const int32_t qs = q[l/2]; + s1 += yl[l+0] * (qs & qm[il/2][0]); + s2 += yl[l+1] * (qs & qm[il/2][1]); + s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]); + s4 += yl[l+16] * (qs & qm[il/2][2]); + s5 += yl[l+17] * (qs & qm[il/2][3]); + s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]); + } + float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); + float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); + sumf1[row] += d1 * (scales[0] - 32); + sumf2[row] += d2 * (scales[2] - 32); + + s1 = s2 = s3 = s4 = s5 = s6 = 0; + for (short l = 0; l < 8; l += 2) { + const int32_t qs = q[l/2+8]; + s1 += yl[l+8] * (qs & qm[il/2][0]); + s2 += yl[l+9] * (qs & qm[il/2][1]); + s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]); + s4 += yl[l+24] * (qs & qm[il/2][2]); + s5 += yl[l+25] * (qs & qm[il/2][3]); + s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]); + } + d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); + d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); + sumf1[row] += d1 * (scales[1] - 32); + sumf2[row] += d2 * (scales[3] - 32); + + q += args.nb01/2; + h += args.nb01/2; + a += args.nb01/2; + dh += args.nb01/2; + } + + y1 += 4 * QK_K; + } + + for (int row = 0; row < nr0; ++row) { + const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift); + sumf1[row] = simd_sum(sumf); + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + if (tiisg == 0) { + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + dst_f32[first_row + row] = sumf1[row]; + } + } +} + +[[host_name("kernel_mul_mv_q3_K_f32")]] +kernel void kernel_mul_mv_q3_K_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q3_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_q4_K_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + constexpr uint16_t kmask1 = 0x3f3f; + constexpr uint16_t kmask2 = 0x0f0f; + constexpr uint16_t kmask3 = 0xc0c0; + + const short ix = tiisg/8; // 0...3 + const short it = tiisg%8; // 0...7 + const short iq = it/4; // 0 or 1 + const short ir = it%4; // 0...3 + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_q4_K * x = (device const block_q4_K *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[16]; + float yh[16]; + + float sumf[nr0]={0.f}; + + device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir; + + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; + + for (int ib = ix; ib < nb; ib += 4) { + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + + for (short i = 0; i < 8; ++i) { + yl[i+0] = y4[i+ 0]; sumy[0] += yl[i+0]; + yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8]; + yh[i+0] = y4[i+128]; sumy[2] += yh[i+0]; + yh[i+8] = y4[i+160]; sumy[3] += yh[i+8]; + } + + device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq; + device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; + device const half * dh = &x[ib].d; + + for (short row = 0; row < nr0; row++) { + sc16[0] = sc[0] & kmask1; + sc16[1] = sc[2] & kmask1; + sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2); + sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2); + + device const uint16_t * q2 = q1 + 32; + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + + FOR_UNROLL (short i = 0; i < 4; ++i) { + acc1[0] += yl[2*i + 0] * (q1[i] & 0x000F); + acc1[1] += yl[2*i + 1] * (q1[i] & 0x0F00); + acc1[2] += yl[2*i + 8] * (q1[i] & 0x00F0); + acc1[3] += yl[2*i + 9] * (q1[i] & 0xF000); + acc2[0] += yh[2*i + 0] * (q2[i] & 0x000F); + acc2[1] += yh[2*i + 1] * (q2[i] & 0x0F00); + acc2[2] += yh[2*i + 8] * (q2[i] & 0x00F0); + acc2[3] += yh[2*i + 9] * (q2[i] & 0xF000); + } + + sumf[row] += dh[0] * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] + + (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f + + (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] + + (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) - + dh[1] * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += args.nb01/2; + sc += args.nb01/2; + dh += args.nb01/2; + } + + y4 += 4 * QK_K; + } + + device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_q4_K_f32")]] +kernel void kernel_mul_mv_q4_K_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q4_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_q5_K_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_q5_K * x = (device const block_q5_K *) (src0 + offset0); + device const float * yy = (device const float *) (src1 + offset1); + + float sumf[nr0]={0.f}; + + float yl[16], yh[16]; + + constexpr uint16_t kmask1 = 0x3f3f; + constexpr uint16_t kmask2 = 0x0f0f; + constexpr uint16_t kmask3 = 0xc0c0; + + const short tid = tiisg/4; + const short ix = tiisg%4; + const short iq = tid/4; + const short ir = tid%4; + + const short l0 = 8*ir; + const short q_offset = 32*iq + l0; + const short y_offset = 64*iq + l0; + + const uint8_t hm1 = 1u << (2*iq); + const uint8_t hm2 = hm1 << 1; + const uint8_t hm3 = hm1 << 4; + const uint8_t hm4 = hm2 << 4; + + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; + + device const float * y1 = yy + ix*QK_K + y_offset; + + for (int i = ix; i < nb; i += 4) { + device const uint8_t * q1 = x[i].qs + q_offset; + device const uint8_t * qh = x[i].qh + l0; + device const half * dh = &x[i].d; + device const uint16_t * a = (device const uint16_t *)x[i].scales + iq; + + device const float * y2 = y1 + 128; + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (short l = 0; l < 8; ++l) { + yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0]; + yl[l+8] = y1[l+32]; sumy[1] += yl[l+8]; + yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0]; + yh[l+8] = y2[l+32]; sumy[3] += yh[l+8]; + } + + for (short row = 0; row < nr0; ++row) { + device const uint8_t * q2 = q1 + 64; + + sc16[0] = a[0] & kmask1; + sc16[1] = a[2] & kmask1; + sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); + sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); + + float4 acc1 = {0.f}; + float4 acc2 = {0.f}; + FOR_UNROLL (short l = 0; l < 8; ++l) { + uint8_t h = qh[l]; + acc1[0] += yl[l+0] * (q1[l] & 0x0F); + acc1[1] += yl[l+8] * (q1[l] & 0xF0); + acc1[2] += yh[l+0] * (q2[l] & 0x0F); + acc1[3] += yh[l+8] * (q2[l] & 0xF0); + acc2[0] += h & hm1 ? yl[l+0] : 0.f; + acc2[1] += h & hm2 ? yl[l+8] : 0.f; + acc2[2] += h & hm3 ? yh[l+0] : 0.f; + acc2[3] += h & hm4 ? yh[l+8] : 0.f; + } + + sumf[row] += dh[0] * (sc8[0] * (acc1[0] + 16.f*acc2[0]) + + sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) + + sc8[4] * (acc1[2] + 16.f*acc2[2]) + + sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) - + dh[1] * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += args.nb01; + qh += args.nb01; + dh += args.nb01/2; + a += args.nb01/2; + } + + y1 += 4 * QK_K; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = tot; + } + } +} + +[[host_name("kernel_mul_mv_q5_K_f32")]] +kernel void kernel_mul_mv_q5_K_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q5_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_q6_K_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + constexpr uint8_t kmask1 = 0x03; + constexpr uint8_t kmask2 = 0x0C; + constexpr uint8_t kmask3 = 0x30; + constexpr uint8_t kmask4 = 0xC0; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_q6_K * x = (device const block_q6_K *) (src0 + offset0); + device const float * yy = (device const float *) (src1 + offset1); + + float sumf[nr0] = { 0.f }; + + float yl[16]; + + const short tid = tiisg/2; + const short ix = tiisg%2; + const short ip = tid/8; // 0 or 1 + const short il = tid%8; + const short l0 = 4*il; + const short is = 8*ip + l0/16; + + const short y_offset = 128*ip + l0; + const short q_offset_l = 64*ip + l0; + const short q_offset_h = 32*ip + l0; + + for (int i = ix; i < nb; i += 2) { + device const uint8_t * q1 = x[i].ql + q_offset_l; + device const uint8_t * q2 = q1 + 32; + device const uint8_t * qh = x[i].qh + q_offset_h; + device const int8_t * sc = x[i].scales + is; + device const half * dh = &x[i].d; + + device const float * y = yy + i * QK_K + y_offset; + + for (short l = 0; l < 4; ++l) { + yl[4*l + 0] = y[l + 0]; + yl[4*l + 1] = y[l + 32]; + yl[4*l + 2] = y[l + 64]; + yl[4*l + 3] = y[l + 96]; + } + + for (short row = 0; row < nr0; ++row) { + float4 sums = {0.f, 0.f, 0.f, 0.f}; + + FOR_UNROLL (short l = 0; l < 4; ++l) { + sums[0] += yl[4*l + 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); + sums[1] += yl[4*l + 1] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); + sums[2] += yl[4*l + 2] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32); + sums[3] += yl[4*l + 3] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); + } + + sumf[row] += dh[0] * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]); + + q1 += args.nb01; + q2 += args.nb01; + qh += args.nb01; + sc += args.nb01; + dh += args.nb01/2; + } + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_q6_K_f32")]] +kernel void kernel_mul_mv_q6_K_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q6_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + +// ======================= "True" 2-bit + +template +void kernel_mul_mv_iq2_xxs_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq2_xxs * x = (device const block_iq2_xxs *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[32]; + float sumf[nr0]={0.f}; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint64_t * svalues = (threadgroup uint64_t *)(shmem); + threadgroup uint8_t * ssigns = (threadgroup uint8_t *)(svalues + 256); + { + int nval = 4; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2xxs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + for (short i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq2_xxs * xr = x + ibl; + device const uint16_t * q2 = xr->qs + 4 * ib; + device const half * dh = &xr->d; + + for (short row = 0; row < nr0; row++) { + const float db = dh[0]; + device const uint8_t * aux8 = (device const uint8_t *)q2; + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = db * (0.5f + (aux32 >> 28)); + + float sum = 0; + for (short l = 0; l < 4; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + aux8[l]); + const uint8_t signs = ssigns[(aux32 >> 7*l) & 127]; + for (short j = 0; j < 8; ++j) { + sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + sumf[row] += d * sum; + + dh += args.nb01/2; + q2 += args.nb01/2; + } + + y4 += 32 * 32; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all * 0.25f; + } + } +} + +[[host_name("kernel_mul_mv_iq2_xxs_f32")]] +kernel void kernel_mul_mv_iq2_xxs_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_iq2_xxs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_iq2_xs_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq2_xs * x = (device const block_iq2_xs *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[32]; + float sumf[nr0]={0.f}; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint64_t * svalues = (threadgroup uint64_t *)(shmem); + threadgroup uint8_t * ssigns = (threadgroup uint8_t *)(svalues + 512); + { + int nval = 8; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2xs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + for (short i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq2_xs * xr = x + ibl; + device const uint16_t * q2 = xr->qs + 4 * ib; + device const uint8_t * sc = xr->scales + ib; + device const half * dh = &xr->d; + + for (short row = 0; row < nr0; row++) { + const float db = dh[0]; + const uint8_t ls1 = sc[0] & 0xf; + const uint8_t ls2 = sc[0] >> 4; + const float d1 = db * (0.5f + ls1); + const float d2 = db * (0.5f + ls2); + + float sum1 = 0, sum2 = 0; + for (short l = 0; l < 2; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + (q2[l] & 511)); + const uint8_t signs = ssigns[(q2[l] >> 9)]; + for (short j = 0; j < 8; ++j) { + sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + for (short l = 2; l < 4; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + (q2[l] & 511)); + const uint8_t signs = ssigns[(q2[l] >> 9)]; + for (short j = 0; j < 8; ++j) { + sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + sumf[row] += d1 * sum1 + d2 * sum2; + + dh += args.nb01/2; + q2 += args.nb01/2; + sc += args.nb01; + } + + y4 += 32 * 32; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all * 0.25f; + } + } +} + +[[host_name("kernel_mul_mv_iq2_xs_f32")]] +kernel void kernel_mul_mv_iq2_xs_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq2_xs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_iq3_xxs_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq3_xxs * x = (device const block_iq3_xxs *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[32]; + float sumf[nr0]={0.f}; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint32_t * svalues = (threadgroup uint32_t *)(shmem); + threadgroup uint8_t * ssigns = (threadgroup uint8_t *)(svalues + 256); + { + int nval = 4; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) svalues[pos + i] = iq3xxs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + for (short i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq3_xxs * xr = x + ibl; + device const uint8_t * q3 = xr->qs + 8 * ib; + device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib; + device const half * dh = &xr->d; + + for (short row = 0; row < nr0; row++) { + const float db = dh[0]; + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = db * (0.5f + (aux32 >> 28)); + + float2 sum = {0}; + for (short l = 0; l < 4; ++l) { + const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(svalues + q3[2*l+0]); + const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(svalues + q3[2*l+1]); + const uint8_t signs = ssigns[(aux32 >> 7*l) & 127]; + for (short j = 0; j < 4; ++j) { + sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } + } + sumf[row] += d * (sum[0] + sum[1]); + + dh += args.nb01/2; + q3 += args.nb01; + gas += args.nb01/2; + } + + y4 += 32 * 32; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all * 0.5f; + } + } +} + +[[host_name("kernel_mul_mv_iq3_xxs_f32")]] +kernel void kernel_mul_mv_iq3_xxs_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq3_xxs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_iq3_s_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq3_s * x = (device const block_iq3_s *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[32]; + float sumf[nr0]={0.f}; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint32_t * svalues = (threadgroup uint32_t *) shmem; + { + int nval = 8; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) svalues[pos + i] = iq3s_grid[pos + i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + for (short i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq3_s * xr = x + ibl; + device const uint8_t * qs = xr->qs + 8 * ib; + device const uint8_t * qh = xr->qh + ib; + device const uint8_t * sc = xr->scales + (ib/2); + device const uint8_t * signs = xr->signs + 4 * ib; + device const half * dh = &xr->d; + + for (short row = 0; row < nr0; row++) { + const float db = dh[0]; + const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf)); + + float2 sum = {0}; + for (short l = 0; l < 4; ++l) { + const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? svalues + 256 : svalues; + const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? svalues + 256 : svalues; + const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]); + const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]); + for (short j = 0; j < 4; ++j) { + sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]); + sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]); + } + } + sumf[row] += d * (sum[0] + sum[1]); + + dh += args.nb01/2; + qs += args.nb01; + qh += args.nb01; + sc += args.nb01; + signs += args.nb01; + } + + y4 += 32 * 32; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_iq3_s_f32")]] +kernel void kernel_mul_mv_iq3_s_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq3_s_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_iq2_s_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq2_s * x = (device const block_iq2_s *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[32]; + float sumf[nr0]={0.f}; + + const int nb32 = nb * (QK_K / 32); + + //threadgroup uint64_t * svalues = (threadgroup uint64_t *) shmem; + //{ + // int nval = 32; + // int pos = (32*sgitg + tiisg)*nval; + // for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2s_grid[pos + i]; + // threadgroup_barrier(mem_flags::mem_threadgroup); + //} + + const short ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + for (short i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq2_s * xr = x + ibl; + device const uint8_t * qs = xr->qs + 4 * ib; + device const uint8_t * qh = xr->qh + ib; + device const uint8_t * sc = xr->scales + ib; + device const uint8_t * signs = qs + QK_K/8; + device const half * dh = &xr->d; + + for (short row = 0; row < nr0; row++) { + const float db = dh[0]; + const float d1 = db * (0.5f + (sc[0] & 0xf)); + const float d2 = db * (0.5f + (sc[0] >> 4)); + + float2 sum = {0}; + for (short l = 0; l < 2; ++l) { + //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(svalues + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300))); + //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(svalues + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300))); + constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300))); + constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300))); + for (short j = 0; j < 8; ++j) { + sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]); + sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]); + } + } + sumf[row] += d1 * sum[0] + d2 * sum[1]; + + dh += args.nb01/2; + qs += args.nb01; + qh += args.nb01; + sc += args.nb01; + signs += args.nb01; + } + + y4 += 32 * 32; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all * 0.25f; + } + } +} + +[[host_name("kernel_mul_mv_iq2_s_f32")]] +kernel void kernel_mul_mv_iq2_s_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq2_s_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_iq1_s_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq1_s * x = (device const block_iq1_s *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[32]; + float sumf[nr0]={0.f}; + + const int nb32 = nb * (QK_K / 32); + + const short ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + float sumy = 0; + for (short i = 0; i < 32; ++i) { + yl[i] = y4[i]; + sumy += yl[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq1_s * xr = x + ibl; + device const uint8_t * qs = xr->qs + 4 * ib; + device const uint16_t * qh = xr->qh + ib; + device const half * dh = &xr->d; + + for (short row = 0; row < nr0; row++) { + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 5) & 0x700))); + constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[0] << 2) & 0x700))); + constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[0] >> 1) & 0x700))); + + float sum = 0; + for (short j = 0; j < 4; ++j) { + sum += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4) + + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4) + + yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4) + + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4); + } + sumf[row] += (float)dh[0] * (sum + sumy * (qh[0] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA)) * (2*((qh[0] >> 12) & 7) + 1); + + dh += args.nb01/2; + qs += args.nb01; + qh += args.nb01/2; + } + + y4 += 32 * 32; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_iq1_s_f32")]] +kernel void kernel_mul_mv_iq1_s_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq1_s_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_iq1_m_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq1_m * x = (device const block_iq1_m *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + float yl[32]; + float sumf[nr0]={0.f}; + + const int nb32 = nb * (QK_K / 32); + + const short ix = tiisg; + + device const float * y4 = y + 32 * ix; + + iq1m_scale_t scale; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + float4 sumy = {0.f}; + for (short i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+ 8]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+16]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+24]; sumy[3] += yl[i+24]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq1_m * xr = x + ibl; + device const uint8_t * qs = xr->qs + 4 * ib; + device const uint8_t * qh = xr->qh + 2 * ib; + device const uint16_t * sc = (device const uint16_t *)xr->scales; + + for (short row = 0; row < nr0; row++) { + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); + constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[1] << 8) & 0x700))); + constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[1] << 4) & 0x700))); + + float2 sum = {0.f}; + for (short j = 0; j < 4; ++j) { + sum[0] += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4) + + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4); + sum[1] += yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4) + + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4); + } + const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + + sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) + + (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1)); + + sc += args.nb01/2; + qs += args.nb01; + qh += args.nb01; + } + + y4 += 32 * 32; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_iq1_m_f32")]] +kernel void kernel_mul_mv_iq1_m_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq1_m_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_iq4_nl_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + threadgroup float * shmem_f32 = (threadgroup float *) shmem; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * NR0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + const int nb = args.ne00/QK4_NL; + const int ns01 = args.nb01/args.nb00; + + const short ix = tiisg/2; // 0...15 + const short it = tiisg%2; // 0 or 1 + + shmem_f32[tiisg] = kvalues_iq4nl_f[tiisg%16]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + float4 yl[4]; + float sumf[NR0]={0.f}; + + device const float * yb = y + ix*QK4_NL + it*8; + + uint32_t aux32[2]; + thread const uint8_t * q8 = (thread const uint8_t *)aux32; + + float4 qf1, qf2; + + // [TAG_MUL_MV_WEIRD] + for (int ib = ix; ib < nb && ib < ns01; ib += 16) { + device const float4 * y4 = (device const float4 *)yb; + yl[0] = y4[0]; + yl[1] = y4[4]; + yl[2] = y4[1]; + yl[3] = y4[5]; + + for (short row = 0; row < NR0; row++) { + device const block_iq4_nl & xb = x[row*ns01 + ib]; + device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it); + + float4 acc1 = {0.f}, acc2 = {0.f}; + + aux32[0] = q4[0] | (q4[1] << 16); + aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; + aux32[0] &= 0x0f0f0f0f; + qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]}; + qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]}; + acc1 += yl[0] * qf1; + acc2 += yl[1] * qf2; + + aux32[0] = q4[2] | (q4[3] << 16); + aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; + aux32[0] &= 0x0f0f0f0f; + qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]}; + qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]}; + acc1 += yl[2] * qf1; + acc2 += yl[3] * qf2; + + acc1 += acc2; + + sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); + } + + yb += 16 * QK4_NL; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_iq4_nl_f32")]] +kernel void kernel_mul_mv_iq4_nl_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq4_nl_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_iq4_xs_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + threadgroup float * shmem_f32 = (threadgroup float *) shmem; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * NSG + sgitg) * NR0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + const int nb = args.ne00/QK_K; + const int ns01 = args.nb01/args.nb00; + + const short ix = tiisg/16; // 0 or 1 + const short it = tiisg%16; // 0...15 + const short ib = it/2; + const short il = it%2; + + shmem_f32[tiisg] = kvalues_iq4nl_f[tiisg%16]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + float4 yl[4]; + float sumf[NR0]={0.f}; + + device const float * yb = y + ix * QK_K + ib * 32 + il * 8; + + uint32_t aux32[2]; + thread const uint8_t * q8 = (thread const uint8_t *)aux32; + + float4 qf1, qf2; + + // [TAG_MUL_MV_WEIRD] + for (int ibl = ix; ibl < nb && ibl < ns01; ibl += 2) { + device const float4 * y4 = (device const float4 *)yb; + yl[0] = y4[0]; + yl[1] = y4[4]; + yl[2] = y4[1]; + yl[3] = y4[5]; + + for (short row = 0; row < NR0; ++row) { + device const block_iq4_xs & xb = x[row*ns01 + ibl]; + device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il); + + float4 acc1 = {0.f}, acc2 = {0.f}; + + aux32[0] = (q4[0] ) & 0x0f0f0f0f; + aux32[1] = (q4[0] >> 4) & 0x0f0f0f0f; + qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]}; + qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]}; + acc1 += yl[0] * qf1; + acc2 += yl[1] * qf2; + + aux32[0] = (q4[1] ) & 0x0f0f0f0f; + aux32[1] = (q4[1] >> 4) & 0x0f0f0f0f; + qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]}; + qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]}; + acc1 += yl[2] * qf1; + acc2 += yl[3] * qf2; + + acc1 += acc2; + + const int ls = (((xb.scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((xb.scales_h >> 2*ib) & 3) << 4)) - 32; + sumf[row] += (float)xb.d * ls * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); + } + + yb += 2 * QK_K; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_iq4_xs_f32")]] +kernel void kernel_mul_mv_iq4_xs_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq4_xs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +template +void kernel_mul_mv_mxfp4_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + threadgroup float * shmem_f32 = (threadgroup float *) shmem; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * NR0; + + const uint i12 = im%FC_mul_mv_ne12; + const uint i13 = im/FC_mul_mv_ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const block_mxfp4 * x = (device const block_mxfp4 *) (src0 + offset0); + device const float * y = (device const float *) (src1 + offset1); + + const int nb = args.ne00/QK_MXFP4; + const int ns01 = args.nb01/args.nb00; // this can be larger than nb for permuted src0 tensors + + const short ix = tiisg/2; // 0...15 + const short it = tiisg%2; // 0 or 1 + + shmem_f32[tiisg] = kvalues_mxfp4_f[tiisg%16]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + float4 yl[4]; + float sumf[NR0]={0.f}; + + device const float * yb = y + ix*QK_MXFP4 + it*8; + + // note: just the check `ib < nb` is enough, but adding the redundant `&& ib < ns01` check makes the kernel a bit faster + // no idea why that is - needs some deeper investigation [TAG_MUL_MV_WEIRD] + for (int ib = ix; ib < nb && ib < ns01; ib += 16) { + device const float4 * y4 = (device const float4 *) yb; + + yl[0] = y4[0]; + yl[1] = y4[4]; + yl[2] = y4[1]; + yl[3] = y4[5]; + + FOR_UNROLL (short row = 0; row < NR0; row++) { + device const block_mxfp4 & xb = x[row*ns01 + ib]; + device const uint8_t * q2 = (device const uint8_t *)(xb.qs + 8*it); + + float4 acc1 = yl[0]*float4(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]); + float4 acc2 = yl[1]*float4(shmem_f32[q2[0] >> 4 ], shmem_f32[q2[1] >> 4 ], shmem_f32[q2[2] >> 4 ], shmem_f32[q2[3] >> 4 ]); + float4 acc3 = yl[2]*float4(shmem_f32[q2[4] & 0x0F], shmem_f32[q2[5] & 0x0F], shmem_f32[q2[6] & 0x0F], shmem_f32[q2[7] & 0x0F]); + float4 acc4 = yl[3]*float4(shmem_f32[q2[4] >> 4 ], shmem_f32[q2[5] >> 4 ], shmem_f32[q2[6] >> 4 ], shmem_f32[q2[7] >> 4 ]); + + acc1 = (acc1 + acc3) + (acc2 + acc4); + + sumf[row] += e8m0_to_fp32(xb.e) * ((acc1[0] + acc1[1]) + (acc1[2] + acc1[3])); + } + + yb += 16 * QK_MXFP4; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { + float sum_all = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = sum_all; + } + } +} + +[[host_name("kernel_mul_mv_mxfp4_f32")]] +kernel void kernel_mul_mv_mxfp4_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +// +// matrix-vector multiplication +// + +typedef void (kernel_mul_mv_disp_t)( + ggml_metal_kargs_mul_mv args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig, + ushort tiisg); + +typedef void (kernel_mul_mv2_disp_t)( + ggml_metal_kargs_mul_mv args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg); + +template +void mmv_fn( + ggml_metal_kargs_mul_mv args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiitg, + ushort tiisg, + ushort sgitg) { + disp_fn(args, src0, src1, dst, tgpig, tiisg); +} + +template +void mmv_fn( + ggml_metal_kargs_mul_mv args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiitg, + ushort tiisg, + ushort sgitg) { + disp_fn(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); +} + +typedef decltype(mmv_fn>) mul_mv_disp_fn_t; + +template +kernel void kernel_mul_mv_id( + constant ggml_metal_kargs_mul_mv_id & args, + device const char * src0s, + device const char * src1, + device char * dst, + device const char * ids, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiitg[[thread_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + const int iid1 = tgpig.z/args.nei0; + const int idx = tgpig.z%args.nei0; + + tgpig.z = 0; + + const int32_t i02 = ((device const int32_t *) (ids + iid1*args.nbi1))[idx]; + + const int64_t i11 = idx % args.ne11; + const int64_t i12 = iid1; + + const int64_t i1 = idx; + const int64_t i2 = i12; + + device const char * src0_cur = src0s + i02*args.nb02; + device const char * src1_cur = src1 + i11*args.nb11 + i12*args.nb12; + + device char * dst_cur = dst + (i1*args.ne0 + i2*args.ne1*args.ne0)*sizeof(float); + + ggml_metal_kargs_mul_mv args0 = { + /*.ne00 =*/ args.ne00, + /*.ne01 =*/ args.ne01, + /*.ne02 =*/ 1, // args.ne02, + /*.nb00 =*/ args.nb00, + /*.nb01 =*/ args.nb01, + /*.nb02 =*/ args.nb02, + /*.nb03 =*/ args.nb02, // args.ne02 == 1 + /*.ne10 =*/ args.ne10, + /*.ne11 =*/ 1, // args.ne11, + /*.ne12 =*/ 1, // args.ne12, + /*.nb10 =*/ args.nb10, + /*.nb11 =*/ args.nb11, + /*.nb12 =*/ args.nb12, + /*.nb13 =*/ args.nb12, // ne12 == 1 + /*.ne0 =*/ args.ne0, + /*.ne1 =*/ 1, // args.ne1, + /*.nr0 =*/ args.nr0, + /*.r2 =*/ 1, + /*.r3 =*/ 1, + }; + + disp_fn( + args0, + /* src0 */ src0_cur, + /* src1 */ src1_cur, + /* dst */ dst_cur, + shmem, + tgpig, + tiitg, + tiisg, + sgitg); +} + +typedef decltype(kernel_mul_mv_id>>) kernel_mul_mv_id_t; + +typedef decltype(kernel_mul_mv_id>>) kernel_mul_mv_id_4_t; + +template [[host_name("kernel_mul_mv_id_f32_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_f16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_mul_mv_id_bf16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +#endif +template [[host_name("kernel_mul_mv_id_f32_f32_4")]] kernel kernel_mul_mv_id_4_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_f16_f32_4")]] kernel kernel_mul_mv_id_4_t kernel_mul_mv_id>>; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_mul_mv_id_bf16_f32_4")]] kernel kernel_mul_mv_id_4_t kernel_mul_mv_id>>; +#endif + +template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; + +template [[host_name("kernel_mul_mv_id_q1_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q5_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q5_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; + +template [[host_name("kernel_mul_mv_id_mxfp4_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; + +template [[host_name("kernel_mul_mv_id_q2_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q3_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q4_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q5_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq1_m_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq2_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; diff --git a/ggml/src/ggml-metal/kernels/norm.metal b/ggml/src/ggml-metal/kernels/norm.metal new file mode 100644 index 0000000000..7e42389fe5 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/norm.metal @@ -0,0 +1,308 @@ +#include "common.h" + +// F == 1 : norm (no fuse) +// F == 2 : norm + mul +// F == 3 : norm + mul + add +template +kernel void kernel_norm_fuse_impl( + constant ggml_metal_kargs_norm & args, + device const char * src0, + device const char * src1_0, + device const char * src1_1, + device char * dst, + threadgroup float * shmem_f32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + if (sgitg == 0) { + shmem_f32[tiisg] = 0.0f; + } + + const int i01 = tgpig.x; + const int i02 = tgpig.y; + const int i03 = tgpig.z; + + device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]); + + device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]); + device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]); + + T sumft(0.0f); + + float sumf = 0.0f; + + for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { + sumft += x[i00]; + } + sumf = dot(sumft, T(1.0f)); + sumf = simd_sum(sumf); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shmem_f32[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sumf = shmem_f32[tiisg]; + sumf = simd_sum(sumf); + + const float mean = sumf/args.ne00; + + device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1); + + sumf = 0.0f; + for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { + y[i00] = x[i00] - mean; + sumf += dot(y[i00], y[i00]); + } + sumf = simd_sum(sumf); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shmem_f32[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sumf = shmem_f32[tiisg]; + sumf = simd_sum(sumf); + + const float variance = sumf/args.ne00; + + const float scale = 1.0f/sqrt(variance + args.eps); + for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { + if (F == 1) { + y[i00] = (y[i00]*scale); + } + if (F == 2) { + y[i00] = (y[i00]*scale)*f0[i00]; + } + if (F == 3) { + y[i00] = (y[i00]*scale)*f0[i00] + f1[i00]; + } + } +} + +typedef decltype(kernel_norm_fuse_impl) kernel_norm_fuse_t; + +template [[host_name("kernel_norm_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; +template [[host_name("kernel_norm_mul_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; +template [[host_name("kernel_norm_mul_add_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; + +template [[host_name("kernel_norm_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; +template [[host_name("kernel_norm_mul_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; +template [[host_name("kernel_norm_mul_add_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl; + +// F == 1 : rms_norm (no fuse) +// F == 2 : rms_norm + mul +// F == 3 : rms_norm + mul + add +template +kernel void kernel_rms_norm_fuse_impl( + constant ggml_metal_kargs_norm & args, + device const char * src0, + device const char * src1_0, + device const char * src1_1, + device char * dst, + threadgroup float * shmem_f32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + if (sgitg == 0) { + shmem_f32[tiisg] = 0.0f; + } + + const int i01 = tgpig.x; + const int i02 = tgpig.y; + const int i03 = tgpig.z; + + device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]); + + device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]); + device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]); + + float sumf = 0.0f; + + // parallel sum + for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { + sumf += dot(x[i00], x[i00]); + } + sumf = simd_sum(sumf); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shmem_f32[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sumf = shmem_f32[tiisg]; + sumf = simd_sum(sumf); + + const float mean = sumf/args.ne00; + const float scale = 1.0f/sqrt(mean + args.eps); + + device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1); + for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) { + if (F == 1) { + y[i00] = (x[i00]*scale); + } + if (F == 2) { + y[i00] = (x[i00]*scale)*f0[i00]; + } + if (F == 3) { + y[i00] = (x[i00]*scale)*f0[i00] + f1[i00]; + } + } +} + +typedef decltype(kernel_rms_norm_fuse_impl) kernel_rms_norm_fuse_t; + +template [[host_name("kernel_rms_norm_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; +template [[host_name("kernel_rms_norm_mul_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; +template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; + +template [[host_name("kernel_rms_norm_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; +template [[host_name("kernel_rms_norm_mul_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; +template [[host_name("kernel_rms_norm_mul_add_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl; + +template +kernel void kernel_l2_norm_impl( + constant ggml_metal_kargs_l2_norm & args, + device const char * src0, + device char * dst, + threadgroup float * shmem_f32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i03 = tgpig.z; + const int i02 = tgpig.y; + const int i01 = tgpig.x; + + if (sgitg == 0) { + shmem_f32[tiisg] = 0.0f; + } + + device const T0 * x = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1); + + float sumf = 0.0f; + + // parallel sum + for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) { + sumf += dot(x[i00], x[i00]); + } + sumf = simd_sum(sumf); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shmem_f32[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sumf = shmem_f32[tiisg]; + sumf = simd_sum(sumf); + + const float scale = 1.0f/max(sqrt(sumf), args.eps); + + for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) { + y[i00] = x[i00] * scale; + } +} + +typedef decltype(kernel_l2_norm_impl) kernel_l2_norm_t; + +template [[host_name("kernel_l2_norm_f32_f32")]] kernel kernel_l2_norm_t kernel_l2_norm_impl; +template [[host_name("kernel_l2_norm_f32_f32_4")]] kernel kernel_l2_norm_t kernel_l2_norm_impl; + +kernel void kernel_group_norm_f32( + constant ggml_metal_kargs_group_norm & args, + device const float * src0, + device float * dst, + threadgroup float * buf [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + const int64_t ne = args.ne00*args.ne01*args.ne02; + const int64_t gs = args.ne00*args.ne01*((args.ne02 + args.ngrp - 1) / args.ngrp); + + int start = tgpig * gs; + int end = start + gs; + + start += tpitg; + + if (end >= ne) { + end = ne; + } + + float tmp = 0.0f; // partial sum for thread in warp + + for (int j = start; j < end; j += ntg) { + tmp += src0[j]; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + tmp = simd_sum(tmp); + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + buf[sgitg] = tmp; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + tmp = buf[tiisg]; + tmp = simd_sum(tmp); + } + + const float mean = tmp / gs; + tmp = 0.0f; + + for (int j = start; j < end; j += ntg) { + float xi = src0[j] - mean; + dst[j] = xi; + tmp += xi * xi; + } + + tmp = simd_sum(tmp); + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + buf[sgitg] = tmp; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + tmp = buf[tiisg]; + tmp = simd_sum(tmp); + } + + const float variance = tmp / gs; + const float scale = 1.0f/sqrt(variance + args.eps); + for (int j = start; j < end; j += ntg) { + dst[j] *= scale; + } +} diff --git a/ggml/src/ggml-metal/kernels/pool.metal b/ggml/src/ggml-metal/kernels/pool.metal new file mode 100644 index 0000000000..13d355b9de --- /dev/null +++ b/ggml/src/ggml-metal/kernels/pool.metal @@ -0,0 +1,148 @@ +#include "common.h" + +kernel void kernel_pool_2d_max_f32( + constant ggml_metal_kargs_pool_2d & args, + device const float * src0, + device float * dst, + uint gid[[thread_position_in_grid]]) { + + if (gid >= args.np) { + return; + } + + const int idx = gid; + const int I_HW = args.IH * args.IW; + const int O_HW = args.OH * args.OW; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / args.OW; + const int cur_ow = idx % O_HW % args.OW; + + device const float * i_ptr = src0 + nc * I_HW; + device float * o_ptr = dst + nc * O_HW; + + const int start_h = cur_oh * args.s1 - args.p1; + const int bh = MAX(0, start_h); + const int eh = MIN(args.IH, start_h + args.k1); + const int start_w = cur_ow * args.s0 - args.p0; + const int bw = MAX(0, start_w); + const int ew = MIN(args.IW, start_w + args.k0); + + float res = -INFINITY; + + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { + res = MAX(res, i_ptr[i * args.IW + j]); + } + } + + o_ptr[cur_oh * args.OW + cur_ow] = res; +} + +kernel void kernel_pool_2d_avg_f32( + constant ggml_metal_kargs_pool_2d & args, + device const float * src0, + device float * dst, + uint gid[[thread_position_in_grid]]) { + + if (gid >= args.np) { + return; + } + + const int idx = gid; + const int I_HW = args.IH * args.IW; + const int O_HW = args.OH * args.OW; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / args.OW; + const int cur_ow = idx % O_HW % args.OW; + + device const float * i_ptr = src0 + nc * I_HW; + device float * o_ptr = dst + nc * O_HW; + + const int start_h = cur_oh * args.s1 - args.p1; + const int bh = MAX(0, start_h); + const int eh = MIN(args.IH, start_h + args.k1); + const int start_w = cur_ow * args.s0 - args.p0; + const int bw = MAX(0, start_w); + const int ew = MIN(args.IW, start_w + args.k0); + // const float scale = 1. / ((eh - bh) * (ew - bw)); + const float scale = 1. / (args.k0 * args.k1); + + float res = 0; + + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { + float cur = i_ptr[i * args.IW + j]; + res += cur * scale; + } + } + + o_ptr[cur_oh * args.OW + cur_ow] = res; +} + + +kernel void kernel_pool_1d_max_f32( + constant ggml_metal_kargs_pool_1d & args, + device const float * src, + device float * dst, + uint gid [[thread_position_in_grid]] +) { + + if (gid >= args.np) { + return; + } + + const int ow = (int)gid % args.OW; + const int row = (int)gid / args.OW; + + const int base = ow * args.s0 - args.p0; + + float acc = -INFINITY; + + const int src_off = row * args.IW; + const int dst_off = row * args.OW; + + for (int ki = 0; ki < args.k0; ++ki) { + int j = base + ki; + if (j < 0 || j >= args.IW){ + continue; + } + float v = src[src_off + j]; + acc = max(acc, v); + } + + dst[dst_off + ow] = acc; +} + +kernel void kernel_pool_1d_avg_f32( + constant ggml_metal_kargs_pool_1d & args, + device const float * src, + device float * dst, + uint gid [[thread_position_in_grid]] +) { + + if (gid >= args.np) { + return; + } + + const int ow = (int)gid % args.OW; + const int row = (int)gid / args.OW; + + const int base = ow * args.s0 - args.p0; + + float acc = 0.0f; + int cnt = 0; + + const int src_off = row * args.IW; + const int dst_off = row * args.OW; + + for (int ki = 0; ki < args.k0; ++ki) { + const int j = base + ki; + if (j < 0 || j >= args.IW) { + continue; + } + acc += src[src_off + j]; + cnt += 1; + } + + dst[dst_off + ow] = (cnt > 0) ? (acc / (float)cnt) : 0.0f; +} diff --git a/ggml/src/ggml-metal/kernels/quantize.h b/ggml/src/ggml-metal/kernels/quantize.h new file mode 100644 index 0000000000..a3f299f3b1 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/quantize.h @@ -0,0 +1,213 @@ +#pragma once + +#include "common.h" + +void quantize_q1_0(device const float * src, device block_q1_0 & dst) { + float sum_abs = 0.0f; + for (int j = 0; j < QK1_0; j++) { + sum_abs += fabs(src[j]); + } + dst.d = sum_abs / QK1_0; + + for (int j = 0; j < QK1_0 / 8; j++) { + dst.qs[j] = 0; + } + for (int j = 0; j < QK1_0; j++) { + if (src[j] >= 0.0f) { + dst.qs[j / 8] |= (1 << (j % 8)); + } + } +} + +void quantize_q4_0(device const float * src, device block_q4_0 & dst) { +#pragma METAL fp math_mode(safe) + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_0/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + dst.qs[j] = xi0; + dst.qs[j] |= xi1 << 4; + } +} + +void quantize_q4_1(device const float * src, device block_q4_1 & dst) { +#pragma METAL fp math_mode(safe) + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < QK4_1; j++) { + const float v = src[j]; + if (min > v) min = v; + if (max < v) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + dst.m = min; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK4_1/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + dst.qs[j] = xi0; + dst.qs[j] |= xi1 << 4; + } +} + +void quantize_q5_0(device const float * src, device block_q5_0 & dst) { +#pragma METAL fp math_mode(safe) + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK5_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + uint32_t qh = 0; + for (int j = 0; j < QK5_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK5_0/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); + } + + thread const uint8_t * qh8 = (thread const uint8_t *)&qh; + + for (int j = 0; j < 4; ++j) { + dst.qh[j] = qh8[j]; + } +} + +void quantize_q5_1(device const float * src, device block_q5_1 & dst) { +#pragma METAL fp math_mode(safe) + float max = src[0]; + float min = src[0]; + + for (int j = 1; j < QK5_1; j++) { + const float v = src[j]; + min = v < min ? v : min; + max = v > max ? v : max; + } + + const float d = (max - min) / 31; + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + dst.m = min; + + uint32_t qh = 0; + for (int j = 0; j < QK5_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK5_1/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); + } + + thread const uint8_t * qh8 = (thread const uint8_t *)&qh; + + for (int j = 0; j < 4; ++j) { + dst.qh[j] = qh8[j]; + } +} + +void quantize_q8_0(device const float * src, device block_q8_0 & dst) { +#pragma METAL fp math_mode(safe) + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = src[j]; + amax = MAX(amax, fabs(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = src[j]*id; + + dst.qs[j] = round(x0); + } +} + +void quantize_iq4_nl(device const float * src, device block_iq4_nl & dst) { +#pragma METAL fp math_mode(safe) + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_NL; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / kvalues_iq4nl_f[0]; + const float id = d ? 1.0f/d : 0.0f; + + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_NL/2 + j]*id; + + const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0); + const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1); + + dst.qs[j] = xi0 | (xi1 << 4); + + const float v0 = kvalues_iq4nl_f[xi0]; + const float v1 = kvalues_iq4nl_f[xi1]; + const float w0 = src[0 + j]*src[0 + j]; + const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j]; + sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j]; + sumq2 += w0*v0*v0 + w1*v1*v1; + + } + + dst.d = sumq2 > 0 ? sumqx/sumq2 : d; +} diff --git a/ggml/src/ggml-metal/kernels/quantize.metal b/ggml/src/ggml-metal/kernels/quantize.metal new file mode 100644 index 0000000000..09ea973ec8 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/quantize.metal @@ -0,0 +1,389 @@ +#include "common.h" +#include "dequantize.h" +#include "quantize.h" + +template +kernel void kernel_cpy_t_t( + constant ggml_metal_kargs_cpy & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } + + const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; + + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + + device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.ne00;) { + device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); + dst_data[i00] = (T1) src[0]; + break; + } +} + +typedef decltype(kernel_cpy_t_t) kernel_cpy_t; + +template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy_t_t; +template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy_t_t; +template [[host_name("kernel_cpy_f32_i32")]] kernel kernel_cpy_t kernel_cpy_t_t; +template [[host_name("kernel_cpy_i32_f32")]] kernel kernel_cpy_t kernel_cpy_t_t; +template [[host_name("kernel_cpy_i32_i32")]] kernel kernel_cpy_t kernel_cpy_t_t; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_cpy_f32_bf16")]] kernel kernel_cpy_t kernel_cpy_t_t; +#endif +template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy_t_t; +template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy_t_t; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_cpy_bf16_f32")]] kernel kernel_cpy_t kernel_cpy_t_t; +template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy_t_t; +#endif + +template +kernel void kernel_cpy_f32_q( + constant ggml_metal_kargs_cpy & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } + + const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; + + const int32_t i3 = n / (args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; + + device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { + device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00); + + quantize_func(src, dst_data[i00]); + + break; + } +} + +typedef decltype(kernel_cpy_f32_q) cpy_f_q_t; + +template [[host_name("kernel_cpy_f32_q8_0")]] kernel cpy_f_q_t kernel_cpy_f32_q; +template [[host_name("kernel_cpy_f32_q1_0")]] kernel cpy_f_q_t kernel_cpy_f32_q; +template [[host_name("kernel_cpy_f32_q4_0")]] kernel cpy_f_q_t kernel_cpy_f32_q; +template [[host_name("kernel_cpy_f32_q4_1")]] kernel cpy_f_q_t kernel_cpy_f32_q; +template [[host_name("kernel_cpy_f32_q5_0")]] kernel cpy_f_q_t kernel_cpy_f32_q; +template [[host_name("kernel_cpy_f32_q5_1")]] kernel cpy_f_q_t kernel_cpy_f32_q; +template [[host_name("kernel_cpy_f32_iq4_nl")]] kernel cpy_f_q_t kernel_cpy_f32_q; + +template +kernel void kernel_cpy_q_f32( + constant ggml_metal_kargs_cpy & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } + + const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; + + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + + device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { + T4x4 temp; + dequantize_func(src_data + i00/nl, i00%nl, temp); + dst_data[i00] = temp; + + break; + } +} + +typedef decltype(kernel_cpy_q_f32) cpy_q_f_t; + +template [[host_name("kernel_cpy_q1_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32; + +template [[host_name("kernel_cpy_q1_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; + +template +kernel void kernel_concat( + constant ggml_metal_kargs_concat & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + + const int i3 = tgpig.z; + const int i2 = tgpig.y; + const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y; + + if (i1 >= args.ne1) { + return; + } + + int o[4] = {0, 0, 0, 0}; + o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03)); + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + device const T * x; + + if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { + x = (device const T *)(src0 + (i3 )*args.nb03 + (i2 )*args.nb02 + (i1 )*args.nb01 + (i0 )*args.nb00); + } else { + x = (device const T *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10); + } + + device T * y = (device T *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + *y = *x; + } +} + +typedef decltype(kernel_concat) kernel_concat_t; + +template [[host_name("kernel_concat_f32")]] kernel kernel_concat_t kernel_concat; +template [[host_name("kernel_concat_f16")]] kernel kernel_concat_t kernel_concat; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat; +#endif +template [[host_name("kernel_concat_i8")]] kernel kernel_concat_t kernel_concat; +template [[host_name("kernel_concat_i16")]] kernel kernel_concat_t kernel_concat; +template [[host_name("kernel_concat_i32")]] kernel kernel_concat_t kernel_concat; +template [[host_name("kernel_concat_i64")]] kernel kernel_concat_t kernel_concat; + +template +kernel void kernel_get_rows_q( + constant ggml_metal_kargs_get_rows & args, + device const void * src0, + device const void * src1, + device void * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 ntg [[threads_per_threadgroup]]) { + const int32_t iw0 = tgpig.x/args.ne10; + const int32_t i10 = tgpig.x%args.ne10; + const int32_t i11 = tgpig.y; + const int32_t i12 = tgpig.z; + + const int32_t r = ((const device int32_t *) ((const device char *) src1 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10))[0]; + + const int32_t i02 = i11; + const int32_t i03 = i12; + + auto psrc = (device const block_q *) ((const device char *) src0 + i03*args.nb03 + i02*args.nb02 + r*args.nb01); + auto pdst = (device float4x4 *) (( device char *) dst + i12*args.nb3 + i11*args.nb2 + i10*args.nb1); + + for (int ind = iw0*ntg.x + tiitg; ind < args.ne00t;) { + float4x4 temp; + dequantize_func(psrc + ind/nl, ind%nl, temp); + pdst[ind] = temp; + + break; + } +} + +template +kernel void kernel_get_rows_f( + constant ggml_metal_kargs_get_rows & args, + device const void * src0, + device const void * src1, + device void * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 ntg [[threads_per_threadgroup]]) { + const int32_t iw0 = tgpig.x/args.ne10; + const int32_t i10 = tgpig.x%args.ne10; + const int32_t i11 = tgpig.y; + const int32_t i12 = tgpig.z; + + const int32_t r = ((const device int32_t *) ((const device char *) src1 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10))[0]; + + const int32_t i02 = i11; + const int32_t i03 = i12; + + auto psrc = (const device T0 *) ((const device char *) src0 + i03*args.nb03 + i02*args.nb02 + r*args.nb01); + auto pdst = ( device T *) (( device char *) dst + i12*args.nb3 + i11*args.nb2 + i10*args.nb1); + + for (int ind = iw0*ntg.x + tiitg; ind < args.ne00t;) { + pdst[ind] = psrc[ind]; + + break; + } +} + +template +kernel void kernel_set_rows_q32( + constant ggml_metal_kargs_set_rows & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + + const int32_t i12 = i03%args.ne12; + const int32_t i11 = i02%args.ne11; + + const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; + if (i01 >= args.ne01) { + return; + } + + const int32_t i10 = i01; + const TI i1 = ((const device TI *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; + + device block_q * dst_row = ( device block_q *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3); + const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + + for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) { + quantize_func(src_row + 32*ind, dst_row[ind]); + } +} + +template +kernel void kernel_set_rows_f( + constant ggml_metal_kargs_set_rows & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + + const int32_t i12 = i03%args.ne12; + const int32_t i11 = i02%args.ne11; + + const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; + if (i01 >= args.ne01) { + return; + } + + const int32_t i10 = i01; + const TI i1 = ((const device TI *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; + + device T * dst_row = ( device T *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3); + const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + + for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) { + dst_row[ind] = (T) src_row[ind]; + } +} + +// +// get rows +// + +typedef decltype(kernel_get_rows_f) get_rows_f_t; + +template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f; +template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f; +template [[host_name("kernel_get_rows_i32")]] kernel get_rows_f_t kernel_get_rows_f; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f; +#endif + +typedef decltype(kernel_get_rows_q) get_rows_q_t; + +template [[host_name("kernel_get_rows_q1_0")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_mxfp4")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get_rows_q; + +// +// set rows +// + +typedef decltype(kernel_set_rows_f) set_rows_f_t; + +template [[host_name("kernel_set_rows_f32_i64")]] kernel set_rows_f_t kernel_set_rows_f; +template [[host_name("kernel_set_rows_f32_i32")]] kernel set_rows_f_t kernel_set_rows_f; +template [[host_name("kernel_set_rows_f16_i64")]] kernel set_rows_f_t kernel_set_rows_f; +template [[host_name("kernel_set_rows_f16_i32")]] kernel set_rows_f_t kernel_set_rows_f; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_set_rows_bf16_i64")]] kernel set_rows_f_t kernel_set_rows_f; +template [[host_name("kernel_set_rows_bf16_i32")]] kernel set_rows_f_t kernel_set_rows_f; +#endif + +typedef decltype(kernel_set_rows_q32) set_rows_q32_t; + +template [[host_name("kernel_set_rows_q8_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q8_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q4_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q4_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q4_1_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q4_1_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q5_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q5_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q5_1_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q5_1_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_iq4_nl_i64")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_iq4_nl_i32")]] kernel set_rows_q32_t kernel_set_rows_q32; diff --git a/ggml/src/ggml-metal/kernels/reduce.metal b/ggml/src/ggml-metal/kernels/reduce.metal new file mode 100644 index 0000000000..0af9e4f6c2 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/reduce.metal @@ -0,0 +1,228 @@ +#include "common.h" + +kernel void kernel_op_sum_f32( + constant ggml_metal_kargs_sum & args, + device const float * src0, + device float * dst, + threadgroup float * shmem_f32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + + if (args.np == 0) { + return; + } + + // TODO: become function constant + const uint nsg = (ntg.x + 31) / 32; + + float sumf = 0; + + for (uint64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) { + sumf += src0[i0]; + } + + sumf = simd_sum(sumf); + + if (tiisg == 0) { + shmem_f32[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + float total = 0; + + if (sgitg == 0) { + float v = 0; + + if (tpitg.x < nsg) { + v = shmem_f32[tpitg.x]; + } + + total = simd_sum(v); + + if (tpitg.x == 0) { + dst[0] = total; + } + } +} + +constant short FC_sum_rows_op [[function_constant(FC_SUM_ROWS + 0)]]; + +template +kernel void kernel_sum_rows_impl( + constant ggml_metal_kargs_sum_rows & args, + device const char * src0, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { +#define FC_OP FC_sum_rows_op + + const int i3 = tgpig.z; + const int i2 = tgpig.y; + const int i1 = tgpig.x; + + threadgroup T0 * shmem_t = (threadgroup T0 *) shmem; + + if (sgitg == 0) { + shmem_t[tiisg] = 0.0f; + } + + device const T0 * src_row = (device const T0 *) (src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03); + device T * dst_row = (device T *) (dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3); + + T0 sumf = T0(0.0f); + + for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { + sumf += src_row[i0]; + } + + sumf = simd_sum(sumf); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shmem_t[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sumf = shmem_t[tiisg]; + sumf = simd_sum(sumf); + + if (tpitg.x == 0) { + if (FC_OP == OP_SUM_ROWS_NUM_MEAN) { + if (is_same::value) { + dst_row[0] = sum(sumf) / (4*args.ne00); + } else { + dst_row[0] = sum(sumf) / args.ne00; + } + } else { + dst_row[0] = sum(sumf); + } + } + +#undef FC_OP +} + +typedef decltype(kernel_sum_rows_impl) kernel_sum_rows_t; + +template [[host_name("kernel_sum_rows_f32_f32")]] kernel kernel_sum_rows_t kernel_sum_rows_impl; +template [[host_name("kernel_sum_rows_f32_f32_4")]] kernel kernel_sum_rows_t kernel_sum_rows_impl; + +template +kernel void kernel_cumsum_blk( + constant ggml_metal_kargs_cumsum_blk & args, + device const char * src0, + device char * tmp, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int ib = tgpig[0]/args.ne01; + + const int i00 = ib*ntg.x; + const int i01 = tgpig[0]%args.ne01; + const int i02 = tgpig[1]; + const int i03 = tgpig[2]; + + device const float * src0_row = (device const float *) (src0 + + args.nb01*i01 + + args.nb02*i02 + + args.nb03*i03); + + threadgroup float * shmem_f32 = (threadgroup float *) shmem; + + float v = 0.0f; + + if (i00 + tpitg.x < args.ne00) { + v = src0_row[i00 + tpitg.x]; + } + + float s = simd_prefix_inclusive_sum(v); + + if (tiisg == N_SIMDWIDTH - 1) { + shmem_f32[sgitg] = s; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (sgitg == 0) { + shmem_f32[tiisg] = simd_prefix_exclusive_sum(shmem_f32[tiisg]); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + s += shmem_f32[sgitg]; + + device float * dst_row = (device float *) dst + + args.ne00*i01 + + args.ne00*args.ne01*i02 + + args.ne00*args.ne01*args.ne02*i03; + + if (i00 + tpitg.x < args.ne00) { + dst_row[i00 + tpitg.x] = s; + } + + if (args.outb && tpitg.x == ntg.x - 1) { + device float * tmp_row = (device float *) tmp + + args.net0*i01 + + args.net0*args.net1*i02 + + args.net0*args.net1*args.net2*i03; + + tmp_row[ib] = s; + } +} + +typedef decltype(kernel_cumsum_blk) kernel_cumsum_blk_t; + +template [[host_name("kernel_cumsum_blk_f32")]] kernel kernel_cumsum_blk_t kernel_cumsum_blk; + +template +kernel void kernel_cumsum_add( + constant ggml_metal_kargs_cumsum_add & args, + device const char * tmp, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int ib = tgpig[0]/args.ne01; + + if (ib == 0) { + return; + } + + const int i00 = ib*ntg.x; + const int i01 = tgpig[0]%args.ne01; + const int i02 = tgpig[1]; + const int i03 = tgpig[2]; + + device const float * tmp_row = (device const float *) (tmp + + args.nbt1*i01 + + args.nbt2*i02 + + args.nbt3*i03); + + device float * dst_row = (device float *) dst + + args.ne00*i01 + + args.ne00*args.ne01*i02 + + args.ne00*args.ne01*args.ne02*i03; + + if (i00 + tpitg.x < args.ne00) { + dst_row[i00 + tpitg.x] += tmp_row[ib - 1]; + } +} + +typedef decltype(kernel_cumsum_add) kernel_cumsum_add_t; + +template [[host_name("kernel_cumsum_add_f32")]] kernel kernel_cumsum_add_t kernel_cumsum_add; diff --git a/ggml/src/ggml-metal/kernels/rope.metal b/ggml/src/ggml-metal/kernels/rope.metal new file mode 100644 index 0000000000..57fa2241de --- /dev/null +++ b/ggml/src/ggml-metal/kernels/rope.metal @@ -0,0 +1,318 @@ +#include "common.h" + +constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]]; +constant bool FC_rope_is_back [[function_constant(FC_ROPE + 1)]]; + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int i0, float ext_factor, float mscale, + thread float * cos_theta, thread float * sin_theta) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); + } + *cos_theta = cos(theta) * mscale; + *sin_theta = sin(theta) * mscale; + if (FC_rope_is_back) { + *sin_theta *= -1.0f; + } +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +} + +static void rope_yarn_corr_dims( + int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base))); +} + +template +kernel void kernel_rope_norm( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float theta_base = (float) pos[i2]; + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < args.n_dims) { + const int ic = i0/2; + + const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); + + const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + const float x0 = src[0]; + const float x1 = src[1]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +template +kernel void kernel_rope_neox( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float theta_base = (float) pos[i2]; + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < args.n_dims) { + const int ic = i0/2; + + const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); + + const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +template +kernel void kernel_rope_multi( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < args.n_dims) { + const int ic = i0/2; + + // mrope theta calculations + // note: the rest is the same as kernel_rope_neox + const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3; + const int sec_w01 = args.sect_0 + args.sect_1; // end of section 1 + const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 + const int sector = ic % sect_dims; + + float theta_base; + if (FC_rope_is_imrope) { + if (sector % 3 == 1 && sector < 3 * args.sect_1) { // h + theta_base = (float) pos[i2 + args.ne02 * 1]; + } else if (sector % 3 == 2 && sector < 3 * args.sect_2) { // w + theta_base = (float) pos[i2 + args.ne02 * 2]; + } else if (sector % 3 == 0 && sector < 3 * args.sect_0) { // t + theta_base = (float) pos[i2 + args.ne02 * 0]; + } else { // e + theta_base = (float) pos[i2 + args.ne02 * 3]; + } + } else { + if (sector < args.sect_0) { + theta_base = (float) pos[i2]; + } else if (sector < sec_w01) { + theta_base = (float) pos[i2 + args.ne02 * 1]; + } else if (sector < sec_w012) { + theta_base = (float) pos[i2 + args.ne02 * 2]; + } else { + theta_base = (float) pos[i2 + args.ne02 * 3]; + } + } + // end of mrope + + const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); + + const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +template +kernel void kernel_rope_vision( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < 2*args.n_dims) { // different from kernel_rope_multi + const int ic = i0/2; + + // mrope theta calculations (only support 2 dimensions) + const int sect_dims = args.sect_0 + args.sect_1; + const int sector = ic % sect_dims; + + float p; + float theta_base; + if (sector < args.sect_1) { + p = (float) sector; + theta_base = (float) pos[i2]; + } else { + p = (float) sector - args.sect_0; + theta_base = (float) pos[i2 + args.ne02]; + } + + const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p); + // end of mrope + + const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims]; // different from kernel_rope_multi + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +typedef decltype(kernel_rope_norm) kernel_rope_norm_t; +typedef decltype(kernel_rope_neox) kernel_rope_neox_t; +typedef decltype(kernel_rope_multi) kernel_rope_multi_t; +typedef decltype(kernel_rope_vision) kernel_rope_vision_t; + +template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm; +template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm; + +template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox; +template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox; + +template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi; +template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi; + +template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision; +template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision; diff --git a/ggml/src/ggml-metal/kernels/softmax.metal b/ggml/src/ggml-metal/kernels/softmax.metal new file mode 100644 index 0000000000..f32fe29379 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/softmax.metal @@ -0,0 +1,223 @@ +#include "common.h" + +template +kernel void kernel_soft_max( + constant ggml_metal_kargs_soft_max & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + threadgroup float * buf [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint3 tptg[[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + const int32_t i01 = tgpig.x; + + const int32_t i13 = i03%args.ne13; + const int32_t i12 = i02%args.ne12; + const int32_t i11 = i01; + + device const float * psrc0 = (device const float *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr; + device const float * psrc2 = src2 != src0 ? (device const float *) (src2) : nullptr; + device float * pdst = (device float *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3); + + float slope = 1.0f; + + // ALiBi + if (args.max_bias > 0.0f) { + const int32_t h = i02; + + const float base = h < args.n_head_log2 ? args.m0 : args.m1; + const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; + + slope = pow(base, exp); + } + + // parallel max + float lmax = psrc2 ? psrc2[i02] : -INFINITY; + + for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { + lmax = MAX(lmax, psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)); + } + + // find the max value in the block + float max_val = simd_max(lmax); + if (tptg.x > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = -INFINITY; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + buf[sgitg] = max_val; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max_val = buf[tiisg]; + max_val = simd_max(max_val); + } + + // parallel sum + float lsum = 0.0f; + for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { + const float exp_psrc0 = exp((psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val); + lsum += exp_psrc0; + pdst[i00] = exp_psrc0; + } + + // This barrier fixes a failing test + // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335 + threadgroup_barrier(mem_flags::mem_none); + + float sum = simd_sum(lsum); + + if (tptg.x > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + buf[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[tiisg]; + sum = simd_sum(sum); + } + + if (psrc2) { + sum += exp(psrc2[i02] - max_val); + } + + const float inv_sum = 1.0f/sum; + + for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { + pdst[i00] *= inv_sum; + } +} + +template +kernel void kernel_soft_max_4( + constant ggml_metal_kargs_soft_max & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + threadgroup float * buf [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint3 tptg[[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + const int32_t i01 = tgpig.x; + + const int32_t i13 = i03%args.ne13; + const int32_t i12 = i02%args.ne12; + const int32_t i11 = i01; + + device const float4 * psrc4 = (device const float4 *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr; + device const float * psrc2 = src2 != src0 ? (device const float * ) (src2) : nullptr; + device float4 * pdst4 = (device float4 *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3); + + float slope = 1.0f; + + if (args.max_bias > 0.0f) { + const int32_t h = i02; + + const float base = h < args.n_head_log2 ? args.m0 : args.m1; + const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; + + slope = pow(base, exp); + } + + // parallel max + float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY; + + for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { + lmax4 = fmax(lmax4, psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))); + } + + const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); + + float max_val = simd_max(lmax); + if (tptg.x > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = -INFINITY; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + buf[sgitg] = max_val; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max_val = buf[tiisg]; + max_val = simd_max(max_val); + } + + // parallel sum + float4 lsum4 = 0.0f; + for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { + const float4 exp_psrc4 = exp((psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val); + lsum4 += exp_psrc4; + pdst4[i00] = exp_psrc4; + } + + const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; + + // This barrier fixes a failing test + // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335 + threadgroup_barrier(mem_flags::mem_none); + + float sum = simd_sum(lsum); + + if (tptg.x > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + buf[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[tiisg]; + sum = simd_sum(sum); + } + + if (psrc2) { + sum += exp(psrc2[i02] - max_val); + } + + const float inv_sum = 1.0f/sum; + + for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { + pdst4[i00] *= inv_sum; + } +} + +typedef decltype(kernel_soft_max) kernel_soft_max_t; +typedef decltype(kernel_soft_max_4) kernel_soft_max_4_t; + +template [[host_name("kernel_soft_max_f16")]] kernel kernel_soft_max_t kernel_soft_max; +template [[host_name("kernel_soft_max_f32")]] kernel kernel_soft_max_t kernel_soft_max; +template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4; +template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4; diff --git a/ggml/src/ggml-metal/kernels/solve_tri.metal b/ggml/src/ggml-metal/kernels/solve_tri.metal new file mode 100644 index 0000000000..50f16facbf --- /dev/null +++ b/ggml/src/ggml-metal/kernels/solve_tri.metal @@ -0,0 +1,75 @@ +#include "common.h" + +constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]]; +constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]]; +constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]]; + +kernel void kernel_solve_tri_f32( + constant ggml_metal_kargs_solve_tri & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + ushort3 tgpig[[threadgroup_position_in_grid]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + constexpr short NW = N_SIMDWIDTH; + + const short NSG = FC_solve_tri_nsg; + const short N = FC_solve_tri_n; + const short K = FC_solve_tri_k; + const short NP = PAD2(N, NW); + + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + const int32_t i01 = tgpig.x*NSG + sgitg; + + threadgroup float * sh0 = (threadgroup float *) shmem; + + device const float * src0_ptr = (device const float *)(src0 + i02 * args.nb02 + i03 * args.nb03) + sgitg*N; + device const float * src1_ptr = (device const float *)(src1 + i02 * args.nb12 + i03 * args.nb13) + i01; + device float * dst_ptr = (device float *)(dst + i02 * args.nb2 + i03 * args.nb3) + i01; + + for (short rr = 0; rr < N; rr += NSG) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + { + threadgroup float * sh0_cur = sh0 + sgitg*NP; + + for (short t = 0; t*NW < N; ++t) { + const short idx = t*NW + tiisg; + sh0_cur[idx] = src0_ptr[idx]; + } + + src0_ptr += NSG*N; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (i01 >= args.ne10) { + continue; + } + + for (short ir = 0; ir < NSG && rr + ir < N; ++ir) { + const short r = rr + ir; + + threadgroup float * sh0_cur = sh0 + ir*NP; + + float sum = 0.0f; + + for (short t = 0; t*NW < r; ++t) { + const short idx = t*NW + tiisg; + sum += sh0_cur[idx] * dst_ptr[idx*K] * (idx < r); + } + + sum = simd_sum(sum); + + if (tiisg == 0) { + const float diag = sh0_cur[r]; + + dst_ptr[r*K] = (src1_ptr[r*K] - sum) / diag; + } + } + } +} diff --git a/ggml/src/ggml-metal/kernels/ssm.metal b/ggml/src/ggml-metal/kernels/ssm.metal new file mode 100644 index 0000000000..3fc87384bf --- /dev/null +++ b/ggml/src/ggml-metal/kernels/ssm.metal @@ -0,0 +1,279 @@ +#include "common.h" + +// ref: ggml.c:ggml_compute_forward_ssm_conv_f32 +kernel void kernel_ssm_conv_f32_f32( + constant ggml_metal_kargs_ssm_conv & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t ir = tgpig.x; + const int64_t i2 = tgpig.y; + const int64_t i3 = tgpig.z; + + const int64_t nc = args.ne10; + //const int64_t ncs = args.ne00; + //const int64_t nr = args.ne01; + //const int64_t n_t = args.ne1; + //const int64_t n_s = args.ne2; + + device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02); + device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11); + device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2); + + float sumf = 0.0f; + + for (int64_t i0 = 0; i0 < nc; ++i0) { + sumf += s[i0] * c[i0]; + } + + x[0] = sumf; +} + +kernel void kernel_ssm_conv_f32_f32_4( + constant ggml_metal_kargs_ssm_conv & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t ir = tgpig.x; + const int64_t i2 = tgpig.y; + const int64_t i3 = tgpig.z; + + const int64_t nc = args.ne10; + //const int64_t ncs = args.ne00; + //const int64_t nr = args.ne01; + //const int64_t n_t = args.ne1; + //const int64_t n_s = args.ne2; + + device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02); + device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11); + device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2); + + float sumf = 0.0f; + + for (int64_t i0 = 0; i0 < nc/4; ++i0) { + sumf += dot(s[i0], c[i0]); + } + + x[0] = sumf; +} + +constant short FC_ssm_conv_bs [[function_constant(FC_SSM_CONV + 0)]]; + +// Batched version: each threadgroup processes multiple tokens for better efficiency +// Thread layout: each thread handles one token, threadgroup covers BATCH_SIZE tokens +kernel void kernel_ssm_conv_f32_f32_batched( + constant ggml_metal_kargs_ssm_conv & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + // tgpig.x = row index (ir) + // tgpig.y = batch of tokens (i2_base / BATCH_SIZE) + // tgpig.z = sequence index (i3) + // tpitg.x = thread within batch (0..BATCH_SIZE-1) + const short BATCH_SIZE = FC_ssm_conv_bs; + + const int64_t ir = tgpig.x; + const int64_t i2_base = tgpig.y * BATCH_SIZE; + const int64_t i3 = tgpig.z; + const int64_t i2_off = tpitg.x; + const int64_t i2 = i2_base + i2_off; + + const int64_t nc = args.ne10; // conv kernel size (typically 4) + const int64_t n_t = args.ne1; // number of tokens + + // Bounds check for partial batches at the end + if (i2 >= n_t) { + return; + } + + // Load conv weights (shared across all tokens for this row) + device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11); + + // Load source for this specific token + device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02); + + // Output location for this token + device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2); + + float sumf = 0.0f; + for (int64_t i0 = 0; i0 < nc; ++i0) { + sumf += s[i0] * c[i0]; + } + + x[0] = sumf; +} + +kernel void kernel_ssm_conv_f32_f32_batched_4( + constant ggml_metal_kargs_ssm_conv & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + // tgpig.x = row index (ir) + // tgpig.y = batch of tokens (i2_base / BATCH_SIZE) + // tgpig.z = sequence index (i3) + // tpitg.x = thread within batch (0..BATCH_SIZE-1) + const short BATCH_SIZE = FC_ssm_conv_bs; + + const int64_t ir = tgpig.x; + const int64_t i2_base = tgpig.y * BATCH_SIZE; + const int64_t i3 = tgpig.z; + const int64_t i2_off = tpitg.x; + const int64_t i2 = i2_base + i2_off; + + const int64_t nc = args.ne10; // conv kernel size (typically 4) + const int64_t n_t = args.ne1; // number of tokens + + // Bounds check for partial batches at the end + if (i2 >= n_t) { + return; + } + + // Load conv weights (shared across all tokens for this row) + device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11); + + // Load source for this specific token + device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02); + + // Output location for this token + device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2); + + float sumf = 0.0f; + for (int64_t i0 = 0; i0 < nc/4; ++i0) { + sumf += dot(s[i0], c[i0]); + } + + x[0] = sumf; +} + +// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part +// Optimized version: reduces redundant memory loads by having one thread load shared values +kernel void kernel_ssm_scan_f32( + constant ggml_metal_kargs_ssm_scan & args, + device const void * src0, + device const void * src1, + device const void * src2, + device const void * src3, + device const void * src4, + device const void * src5, + device const void * src6, + device float * dst, + threadgroup float * shared [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgptg[[simdgroups_per_threadgroup]], + uint3 tgpg[[threadgroups_per_grid]]) { + constexpr short NW = N_SIMDWIDTH; + + // Shared memory layout: + // [0..sgptg*NW-1]: partial sums for reduction (existing) + // [sgptg*NW..sgptg*NW+sgptg-1]: pre-computed x_dt values for each token in batch + // [sgptg*NW+sgptg..sgptg*NW+2*sgptg-1]: pre-computed dA values for each token in batch + threadgroup float * shared_sums = shared; + threadgroup float * shared_x_dt = shared + sgptg * NW; + threadgroup float * shared_dA = shared + sgptg * NW + sgptg; + + shared_sums[tpitg.x] = 0.0f; + + const int32_t i0 = tpitg.x; + const int32_t i1 = tgpig.x; + const int32_t ir = tgpig.y; // current head + const int32_t i3 = tgpig.z; // current seq + + const int32_t nc = args.d_state; + const int32_t nr = args.d_inner; + const int32_t nh = args.n_head; + const int32_t ng = args.n_group; + const int32_t n_t = args.n_seq_tokens; + + const int32_t s_off = args.s_off; + + device const int32_t * ids = (device const int32_t *) src6; + + device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03); + device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off); + + const int32_t i = i0 + i1*nc; + const int32_t g = ir / (nh / ng); // repeat_interleave + + float s0 = s0_buff[i]; + float s = 0.0f; + + device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {ne30, nh} + + const float A0 = A[i0%args.ne30]; + + device const float * x = (device const float *)((device const char *) src1 + i1*args.nb10 + ir*args.nb11 + i3*args.nb13); // {dim, nh, nt, ns} + device const float * dt = (device const float *)((device const char *) src2 + ir*args.nb20 + i3*args.nb22); // {nh, nt, ns} + device const float * B = (device const float *)((device const char *) src4 + g*args.nb41 + i3*args.nb43); // {d_state, ng, nt, ns} + device const float * C = (device const float *)((device const char *) src5 + g*args.nb51 + i3*args.nb53); // {d_state, ng, nt, ns} + + device float * y = dst + (i1 + ir*(nr) + i3*(n_t*nh*nr)); // {dim, nh, nt, ns} + + for (int i2 = 0; i2 < n_t; i2 += sgptg) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // Pre-compute x_dt and dA for this batch of tokens + // Only first sgptg threads do the loads and expensive math + if (i0 < sgptg && i2 + i0 < n_t) { + // ns12 and ns21 are element strides (nb12/nb10, nb21/nb20) + device const float * x_t = x + i0 * args.ns12; + device const float * dt_t = dt + i0 * args.ns21; + + const float dt0 = dt_t[0]; + const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0; + shared_x_dt[i0] = x_t[0] * dtsp; + shared_dA[i0] = dtsp; // Store dtsp, compute exp(dtsp * A0) per-thread since A0 varies + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (int t = 0; t < sgptg && i2 + t < n_t; t++) { + const float x_dt = shared_x_dt[t]; + const float dA = exp(shared_dA[t] * A0); + + s = (s0 * dA) + (B[i0] * x_dt); + + const float sumf = simd_sum(s * C[i0]); + + if (tiisg == 0) { + shared_sums[t*NW + sgitg] = sumf; + } + + // recurse + s0 = s; + + B += args.ns42; + C += args.ns52; + } + + // Advance pointers for next batch + x += sgptg * args.ns12; + dt += sgptg * args.ns21; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float sumf = simd_sum(shared_sums[sgitg*NW + tiisg]); + + if (tiisg == 0 && i2 + sgitg < n_t) { + y[sgitg*nh*nr] = sumf; + } + + y += sgptg*nh*nr; + } + + s_buff[i] = s; +} diff --git a/ggml/src/ggml-metal/kernels/tri.metal b/ggml/src/ggml-metal/kernels/tri.metal new file mode 100644 index 0000000000..862f78678b --- /dev/null +++ b/ggml/src/ggml-metal/kernels/tri.metal @@ -0,0 +1,69 @@ +#include "common.h" + +template +bool _ggml_vec_tri_cmp(const int i, const int r); + +template<> +bool _ggml_vec_tri_cmp(const int i, const int r) { + return i < r; +} + +template<> +bool _ggml_vec_tri_cmp(const int i, const int r) { + return i <= r; +} + +template<> +bool _ggml_vec_tri_cmp(const int i, const int r) { + return i > r; +} + +template<> +bool _ggml_vec_tri_cmp(const int i, const int r) { + return i >= r; +} + +template +kernel void kernel_tri( + constant ggml_metal_kargs_tri & args, + device const char * src0, + device const char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i3 = tgpig.z; + const int i2 = tgpig.y; + const int i1 = tgpig.x; + + if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { + return; + } + + device const T * src_row = (device const T *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03); + device T * dst_row = (device T *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3); + + // Each thread is a single element of the row if ne00 < max threads per + // threadgroup, so this will loop once for each index that this thread is + // responsible for + for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { + // Use the comparison as a mask for branchless + dst_row[i0] = static_cast(_ggml_vec_tri_cmp(i0, i1)) * src_row[i0]; + } +} + +typedef decltype(kernel_tri) kernel_tri_t; + +template [[host_name("kernel_tri_f32_0")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_f32_1")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_f32_2")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_f32_3")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_f16_0")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_f16_1")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_f16_2")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_f16_3")]] kernel kernel_tri_t kernel_tri; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_tri_bf16_0")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_bf16_1")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_bf16_2")]] kernel kernel_tri_t kernel_tri; +template [[host_name("kernel_tri_bf16_3")]] kernel kernel_tri_t kernel_tri; +#endif diff --git a/ggml/src/ggml-metal/kernels/unary.metal b/ggml/src/ggml-metal/kernels/unary.metal new file mode 100644 index 0000000000..39e0099984 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/unary.metal @@ -0,0 +1,360 @@ +#include "common.h" + +constant short FC_unary_op [[function_constant(FC_UNARY + 0)]]; +constant bool FC_unary_cnt[[function_constant(FC_UNARY + 1)]]; + +template +kernel void kernel_unary_impl( + constant ggml_metal_kargs_unary & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { +#define FC_OP FC_unary_op +#define FC_CNT FC_unary_cnt + + device const T0 * src0_ptr; + device T * dst_ptr; + + int i0; + + if (FC_CNT) { + i0 = tgpig.x; + + src0_ptr = (device const T0 *) (src0); + dst_ptr = (device T *) (dst); + } else { + const int i03 = tgpig.z; + const int i02 = tgpig.y; + const int k0 = tgpig.x/args.ne01; + const int i01 = tgpig.x - k0*args.ne01; + + i0 = k0*ntg.x + tpitg.x; + + src0_ptr = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 ); + } + + { + //threadgroup_barrier(mem_flags::mem_none); + + if (!FC_CNT) { + if (i0 >= args.ne0) { + return; + } + } + + const TC x = (TC) src0_ptr[i0]; + + if (FC_OP == OP_UNARY_NUM_SCALE) { + dst_ptr[i0] = (T) (args.scale * x + args.bias); + } + + if (FC_OP == OP_UNARY_NUM_FILL) { + dst_ptr[i0] = (T) args.val; + } + + if (FC_OP == OP_UNARY_NUM_CLAMP) { + dst_ptr[i0] = (T) clamp(x, args.min, args.max); + } + + if (FC_OP == OP_UNARY_NUM_SQR) { + dst_ptr[i0] = (T) (x * x); + } + + if (FC_OP == OP_UNARY_NUM_SQRT) { + dst_ptr[i0] = (T) sqrt(x); + } + + if (FC_OP == OP_UNARY_NUM_SIN) { + dst_ptr[i0] = (T) sin(x); + } + + if (FC_OP == OP_UNARY_NUM_COS) { + dst_ptr[i0] = (T) cos(x); + } + + if (FC_OP == OP_UNARY_NUM_LOG) { + dst_ptr[i0] = (T) log(x); + } + + if (FC_OP == OP_UNARY_NUM_LEAKY_RELU) { + dst_ptr[i0] = (T) (TC(x > 0)*x + TC(x <= 0)*(x * args.slope)); + } + + if (FC_OP == OP_UNARY_NUM_TANH) { + dst_ptr[i0] = (T) precise::tanh(x); + } + + if (FC_OP == OP_UNARY_NUM_RELU) { + dst_ptr[i0] = (T) fmax(0, x); + } + + if (FC_OP == OP_UNARY_NUM_SIGMOID) { + dst_ptr[i0] = (T) (1 / (1 + exp(-x))); + } + + if (FC_OP == OP_UNARY_NUM_GELU) { + dst_ptr[i0] = (T) (0.5*x*(1 + precise::tanh(SQRT_2_OVER_PI*x*(1 + GELU_COEF_A*x*x)))); + } + + if (FC_OP == OP_UNARY_NUM_GELU_ERF) { + dst_ptr[i0] = (T) (0.5*x*(1 + erf_approx(SQRT_2_INV*x))); + } + + if (FC_OP == OP_UNARY_NUM_GELU_QUICK) { + dst_ptr[i0] = (T) (x * (1/(1 + exp(GELU_QUICK_COEF*x)))); + } + + if (FC_OP == OP_UNARY_NUM_SILU) { + dst_ptr[i0] = (T) (x / (1 + exp(-x))); + } + + if (FC_OP == OP_UNARY_NUM_ELU) { + dst_ptr[i0] = (T) elu_approx(x); + } + + if (FC_OP == OP_UNARY_NUM_NEG) { + dst_ptr[i0] = (T) -x; + } + + if (FC_OP == OP_UNARY_NUM_ABS) { + dst_ptr[i0] = (T) fabs(x); + } + + if (FC_OP == OP_UNARY_NUM_SGN) { + dst_ptr[i0] = T(x > 0) - T(x < 0); + } + + if (FC_OP == OP_UNARY_NUM_STEP) { + dst_ptr[i0] = T(x > 0); + } + + if (FC_OP == OP_UNARY_NUM_HARDSWISH) { + dst_ptr[i0] = (T) (x * fmax(0, fmin(1, x/6 + 0.5))); + } + + if (FC_OP == OP_UNARY_NUM_HARDSIGMOID) { + dst_ptr[i0] = (T) fmax(0, fmin(1, x/6 + 0.5)); + } + + if (FC_OP == OP_UNARY_NUM_EXP) { + dst_ptr[i0] = (T) exp(x); + } + + if (FC_OP == OP_UNARY_NUM_SOFTPLUS) { + dst_ptr[i0] = (T) select(log(1 + exp(x)), x, x > 20); + } + + if (FC_OP == OP_UNARY_NUM_EXPM1) { + // TODO: precise implementation + dst_ptr[i0] = (T) (exp(x) - 1); + } + + if (FC_OP == OP_UNARY_NUM_FLOOR) { + dst_ptr[i0] = (T) floor(x); + } + + if (FC_OP == OP_UNARY_NUM_CEIL) { + dst_ptr[i0] = (T) ceil(x); + } + + if (FC_OP == OP_UNARY_NUM_ROUND) { + dst_ptr[i0] = (T) round(x); + } + + if (FC_OP == OP_UNARY_NUM_TRUNC) { + dst_ptr[i0] = (T) trunc(x); + } + + if (FC_OP == OP_UNARY_NUM_XIELU) { + const TC xi = x; + const TC gate = TC(xi > TC(0.0f)); + const TC clamped = fmin(xi, TC(args.val)); + const TC y_pos = TC(args.scale) * xi * xi + TC(args.bias) * xi; + const TC y_neg = (exp(clamped) - TC(1.0f) - xi) * TC(args.slope) + TC(args.bias) * xi; + dst_ptr[i0] = (T) (gate * y_pos + (TC(1.0f) - gate) * y_neg); + } + } + +#undef FC_OP +#undef FC_CNT +} + +typedef decltype(kernel_unary_impl) kernel_unary_t; + +template [[host_name("kernel_unary_f32_f32")]] kernel kernel_unary_t kernel_unary_impl; +template [[host_name("kernel_unary_f32_f32_4")]] kernel kernel_unary_t kernel_unary_impl; +template [[host_name("kernel_unary_f16_f16")]] kernel kernel_unary_t kernel_unary_impl; +template [[host_name("kernel_unary_f16_f16_4")]] kernel kernel_unary_t kernel_unary_impl; + +template +kernel void kernel_reglu( + constant ggml_metal_kargs_glu & args, + device const char * src0, + device const char * src1, + device char * dst, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + dst_row[i0] = (T)(x0*x1*(x0 > 0.0f)); + } +} + +typedef decltype(kernel_reglu) kernel_reglu_t; + +template [[host_name("kernel_reglu_f32")]] kernel kernel_reglu_t kernel_reglu; +template [[host_name("kernel_reglu_f16")]] kernel kernel_reglu_t kernel_reglu; + +template +kernel void kernel_geglu( + constant ggml_metal_kargs_glu & args, + device const char * src0, + device const char * src1, + device char * dst, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu = 0.5f*x0*(1.0f + precise::tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); + + dst_row[i0] = (T)(gelu*x1); + } +} + +typedef decltype(kernel_geglu) kernel_geglu_t; + +template [[host_name("kernel_geglu_f32")]] kernel kernel_geglu_t kernel_geglu; +template [[host_name("kernel_geglu_f16")]] kernel kernel_geglu_t kernel_geglu; + +template +kernel void kernel_swiglu( + constant ggml_metal_kargs_glu & args, + device const char * src0, + device const char * src1, + device char * dst, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float silu = x0 / (1.0f + exp(-x0)); + + dst_row[i0] = (T)(silu*x1); + } +} + +typedef decltype(kernel_swiglu) kernel_swiglu_t; + +template [[host_name("kernel_swiglu_f32")]] kernel kernel_swiglu_t kernel_swiglu; +template [[host_name("kernel_swiglu_f16")]] kernel kernel_swiglu_t kernel_swiglu; + +template +kernel void kernel_swiglu_oai( + constant ggml_metal_kargs_glu & args, + device const char * src0, + device const char * src1, + device char * dst, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + float x0 = src0_row[i0]; + float x1 = src1_row[i0]; + + x0 = min(x0, args.limit); + x1 = max(min(x1, args.limit), -args.limit); + + float out_glu = x0 / (1.0f + exp(-x0 * args.alpha)); + out_glu = out_glu * (1.0f + x1); + + dst_row[i0] = (T)out_glu; + } +} + +typedef decltype(kernel_swiglu_oai) kernel_swiglu_oai_t; + +template [[host_name("kernel_swiglu_oai_f32")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai; +template [[host_name("kernel_swiglu_oai_f16")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai; + +template +kernel void kernel_geglu_erf( + constant ggml_metal_kargs_glu & args, + device const char * src0, + device const char * src1, + device char * dst, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu_erf = 0.5f*x0*(1.0f+erf_approx(x0*SQRT_2_INV)); + + dst_row[i0] = (T)(gelu_erf*x1); + } +} + +typedef decltype(kernel_geglu_erf) kernel_geglu_erf_t; + +template [[host_name("kernel_geglu_erf_f32")]] kernel kernel_geglu_erf_t kernel_geglu_erf; +template [[host_name("kernel_geglu_erf_f16")]] kernel kernel_geglu_erf_t kernel_geglu_erf; + +template +kernel void kernel_geglu_quick( + constant ggml_metal_kargs_glu & args, + device const char * src0, + device const char * src1, + device char * dst, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0))); + + dst_row[i0] = (T)(gelu_quick*x1); + } +} + +typedef decltype(kernel_geglu_quick) kernel_geglu_quick_t; + +template [[host_name("kernel_geglu_quick_f32")]] kernel kernel_geglu_quick_t kernel_geglu_quick; +template [[host_name("kernel_geglu_quick_f16")]] kernel kernel_geglu_quick_t kernel_geglu_quick; diff --git a/ggml/src/ggml-metal/kernels/upscale.metal b/ggml/src/ggml-metal/kernels/upscale.metal new file mode 100644 index 0000000000..799c1513c8 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/upscale.metal @@ -0,0 +1,179 @@ +#include "common.h" + +constant bool FC_upscale_aa [[function_constant(FC_UPSCALE + 0)]]; + +kernel void kernel_upscale_nearest_f32( + constant ggml_metal_kargs_upscale & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + const int64_t i03 = i3/args.sf3; + const int64_t i02 = i2/args.sf2; + const int64_t i01 = i1/args.sf1; + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int64_t i00 = i0/args.sf0; + + device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); + device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_ptr[0] = src0_ptr[0]; + } +} + +static inline float bilinear_tri(float x) { + return MAX(0.0f, 1.0f - fabs(x)); +} + +kernel void kernel_upscale_bilinear_f32( + constant ggml_metal_kargs_upscale & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + const int64_t i03 = i3 / args.sf3; + const int64_t i02 = i2 / args.sf2; + + const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs; + const int64_t i01 = MAX(0, MIN(args.ne01 - 1, (int64_t)floor(f01))); + const int64_t i01p = MAX(0, MIN(args.ne01 - 1, i01 + 1)); + const float fd1 = MAX(0.0f, MIN(1.0f, f01 - (float)i01)); + + src0 += i03*args.nb03 + i02*args.nb02; + + device float * dst_ptr = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); + + if (FC_upscale_aa) { + const float support0 = MAX(1.0f, 1.0f / args.sf0); + const float invscale0 = 1.0f / support0; + const float support1 = MAX(1.0f, 1.0f / args.sf1); + const float invscale1 = 1.0f / support1; + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs; + + int64_t x_min = MAX((int64_t)0, (int64_t)floor(f00 - support0 + args.poffs)); + int64_t x_max = MIN(args.ne00, (int64_t)ceil (f00 + support0 + args.poffs)); + + int64_t y_min = MAX((int64_t)0, (int64_t)floor(f01 - support1 + args.poffs)); + int64_t y_max = MIN(args.ne01, (int64_t)ceil (f01 + support1 + args.poffs)); + + float sum = 0.0f; + float wsum = 0.0f; + + for (int64_t sy = y_min; sy < y_max; ++sy) { + const float wy = MAX(0.0f, 1.0f - fabs((float)sy - f01) * invscale1); + for (int64_t sx = x_min; sx < x_max; ++sx) { + const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0); + const float w = wx * wy; + device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); + sum += (*src_ptr) * w; + wsum += w; + } + } + + const float v = (wsum > 0.0f) ? (sum / wsum) : 0.0f; + dst_ptr[i0] = v; + } + } else { + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs; + const int64_t i00 = MAX(0, MIN(args.ne00 - 1, (int64_t)floor(f00))); + const int64_t i00p = MAX(0, MIN(args.ne00 - 1, i00 + 1)); + const float fd0 = MAX(0.0f, MIN(1.0f, f00 - (float)i00)); + + device const float * src00 = (device const float *)(src0 + i01*args.nb01 + i00*args.nb00); + device const float * src10 = (device const float *)(src0 + i01*args.nb01 + i00p*args.nb00); + device const float * src01 = (device const float *)(src0 + i01p*args.nb01 + i00*args.nb00); + device const float * src11 = (device const float *)(src0 + i01p*args.nb01 + i00p*args.nb00); + + const float v = + (*src00) * (1.0f - fd0) * (1.0f - fd1) + + (*src10) * fd0 * (1.0f - fd1) + + (*src01) * (1.0f - fd0) * fd1 + + (*src11) * fd0 * fd1; + + dst_ptr[i0] = v; + } + } +} + +static inline float bicubic_weight1(float x) { + const float a = -0.75f; + return ((a + 2) * x - (a + 3)) * x * x + 1; +} + +static inline float bicubic_weight2(float x) { + const float a = -0.75f; + return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; +} + +kernel void kernel_upscale_bicubic_f32( + constant ggml_metal_kargs_upscale & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + const int64_t i03 = i3 / args.sf3; + const int64_t i02 = i2 / args.sf2; + + const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs; + const int64_t i01 = (int64_t)floor(f01); + const float fd1 = f01 - (float)i01; + + const float w_y0 = bicubic_weight2(fd1 + 1.0f); + const float w_y1 = bicubic_weight1(fd1); + const float w_y2 = bicubic_weight1(1.0f - fd1); + const float w_y3 = bicubic_weight2(2.0f - fd1); + + const device const char * src_slice = src0 + i03 * args.nb03 + i02 * args.nb02; + + device float * dst_ptr = (device float *)(dst + i3 * args.nb3 + i2 * args.nb2 + i1 * args.nb1); + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs; + const int64_t i00 = (int64_t)floor(f00); + const float fd0 = f00 - (float)i00; + + const float w_x0 = bicubic_weight2(fd0 + 1.0f); + const float w_x1 = bicubic_weight1(fd0); + const float w_x2 = bicubic_weight1(1.0f - fd0); + const float w_x3 = bicubic_weight2(2.0f - fd0); + + float sum = 0.0f; + + for (int dy = -1; dy <= 2; ++dy) { + const int64_t iy = MAX(0, MIN(args.ne01 - 1, i01 + dy)); + const float wy = (dy == -1) ? w_y0 : (dy == 0) ? w_y1 : (dy == 1) ? w_y2 : w_y3; + + for (int dx = -1; dx <= 2; ++dx) { + const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx)); + const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3; + + device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); + sum += (*src_ptr) * wx * wy; + } + } + + dst_ptr[i0] = sum; + } +} diff --git a/ggml/src/ggml-metal/kernels/wkv.metal b/ggml/src/ggml-metal/kernels/wkv.metal new file mode 100644 index 0000000000..8767581c69 --- /dev/null +++ b/ggml/src/ggml-metal/kernels/wkv.metal @@ -0,0 +1,179 @@ +#include "common.h" + +kernel void kernel_rwkv_wkv6_f32( + device const float * k, + device const float * v, + device const float * r, + device const float * tf, + device const float * td, + device const float * state_in, + device float * dst, + constant uint & B, + constant uint & T, + constant uint & C, + constant uint & H, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const uint head_size = 64; // TODO: support head_size = 128 + const uint batch_id = tgpig.x / H; + const uint head_id = tgpig.x % H; + const uint tid = tpitg.x; + + if (batch_id >= B || head_id >= H) { + return; + } + + const uint state_size = C * head_size; + const uint n_seq_tokens = T / B; + + threadgroup float _k[head_size]; + threadgroup float _r[head_size]; + threadgroup float _tf[head_size]; + threadgroup float _td[head_size]; + + float state[head_size]; + + for (uint i = 0; i < head_size; i++) { + state[i] = state_in[batch_id * state_size + head_id * head_size * head_size + + i * head_size + tid]; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + _tf[tid] = tf[head_id * head_size + tid]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid; + const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid; + + for (uint t = start_t; t < end_t; t += C) { + threadgroup_barrier(mem_flags::mem_threadgroup); + _k[tid] = k[t]; + _r[tid] = r[t]; + _td[tid] = td[t]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float v_val = v[t]; + float y = 0.0; + + for (uint j = 0; j < head_size; j += 4) { + float4 k_vec = float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + float4 r_vec = float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + float4 tf_vec = float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]); + float4 td_vec = float4(_td[j], _td[j+1], _td[j+2], _td[j+3]); + float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]); + + float4 kv = k_vec * v_val; + + float4 temp = tf_vec * kv + s_vec; + y += dot(r_vec, temp); + + s_vec = s_vec * td_vec + kv; + state[j] = s_vec[0]; + state[j+1] = s_vec[1]; + state[j+2] = s_vec[2]; + state[j+3] = s_vec[3]; + } + + dst[t] = y; + } + + for (uint i = 0; i < head_size; i++) { + dst[T * C + batch_id * state_size + head_id * head_size * head_size + + i * head_size + tid] = state[i]; + } +} + +kernel void kernel_rwkv_wkv7_f32( + device const float * r, + device const float * w, + device const float * k, + device const float * v, + device const float * a, + device const float * b, + device const float * state_in, + device float * dst, + constant uint & B, + constant uint & T, + constant uint & C, + constant uint & H, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const uint head_size = 64; // TODO: support head_size = 128 + const uint batch_id = tgpig.x / H; + const uint head_id = tgpig.x % H; + const uint tid = tpitg.x; + + if (batch_id >= B || head_id >= H) { + return; + } + + const uint state_size = C * head_size; + const uint n_seq_tokens = T / B; + + threadgroup float _r[head_size]; + threadgroup float _w[head_size]; + threadgroup float _k[head_size]; + threadgroup float _a[head_size]; + threadgroup float _b[head_size]; + + float state[head_size]; + + for (uint i = 0; i < head_size; i++) { + state[i] = state_in[batch_id * state_size + head_id * head_size * head_size + + tid * head_size + i]; + } + + const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid; + const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid; + + for (uint t = start_t; t < end_t; t += C) { + threadgroup_barrier(mem_flags::mem_threadgroup); + _r[tid] = r[t]; + _w[tid] = w[t]; + _k[tid] = k[t]; + _a[tid] = a[t]; + _b[tid] = b[t]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float v_val = v[t]; + float y = 0.0, sa = 0.0; + + float4 sa_vec(0.0); + + for (uint j = 0; j < head_size; j += 4) { + float4 a_vec = float4(_a[j], _a[j+1], _a[j+2], _a[j+3]); + float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]); + sa_vec += a_vec * s_vec; + } + sa = sa_vec[0] + sa_vec[1] + sa_vec[2] + sa_vec[3]; + + for (uint j = 0; j < head_size; j += 4) { + float4 r_vec = float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + float4 w_vec = float4(_w[j], _w[j+1], _w[j+2], _w[j+3]); + float4 k_vec = float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + float4 b_vec = float4(_b[j], _b[j+1], _b[j+2], _b[j+3]); + float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]); + + float4 kv = k_vec * v_val; + + s_vec = s_vec * w_vec + kv + sa * b_vec; + y += dot(s_vec, r_vec); + + state[j] = s_vec[0]; + state[j+1] = s_vec[1]; + state[j+2] = s_vec[2]; + state[j+3] = s_vec[3]; + } + + dst[t] = y; + } + + for (uint i = 0; i < head_size; i++) { + dst[T * C + batch_id * state_size + head_id * head_size * head_size + + tid * head_size + i] = state[i]; + } +}