mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
Compare commits
No commits in common. "master" and "b9802" have entirely different histories.
@ -145,7 +145,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
# ==============================================================================
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
@ -156,7 +156,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@ -104,7 +104,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -115,7 +115,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@ -113,7 +113,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -124,7 +124,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@ -141,7 +141,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -153,7 +153,7 @@ FROM base AS server
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@ -115,7 +115,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -126,7 +126,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
ARG OPENVINO_VERSION_MAJOR=2026.2.1
|
||||
ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3
|
||||
ARG OPENVINO_VERSION_MAJOR=2026.2
|
||||
ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
|
||||
ARG IGC_VERSION=v2.36.3
|
||||
ARG IGC_VERSION_FULL=2_2.36.3+21719
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0
|
||||
ARG IGC_VERSION=v2.34.4
|
||||
ARG IGC_VERSION_FULL=2_2.34.4+21428
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
|
||||
ARG IGDGMM_VERSION=22.10.0
|
||||
|
||||
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
|
||||
@ -214,7 +214,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app/
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -225,7 +225,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app/
|
||||
COPY --from=build /app/full/llama-server /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@ -127,7 +127,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -138,7 +138,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@ -124,7 +124,7 @@ WORKDIR /llama.cpp/bin
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
|
||||
|
||||
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
|
||||
|
||||
@ -138,7 +138,7 @@ WORKDIR /llama.cpp/bin
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-server /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
|
||||
|
||||
EXPOSE 8080
|
||||
|
||||
|
||||
@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -118,7 +118,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@ -97,7 +97,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@ -108,7 +108,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
8
.github/workflows/build-cache.yml
vendored
8
.github/workflows/build-cache.yml
vendored
@ -68,8 +68,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@ -96,8 +96,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
8
.github/workflows/build-openvino.yml
vendored
8
.github/workflows/build-openvino.yml
vendored
@ -39,8 +39,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@ -96,8 +96,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
4
.github/workflows/build-self-hosted.yml
vendored
4
.github/workflows/build-self-hosted.yml
vendored
@ -266,8 +266,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
69
.github/workflows/release.yml
vendored
69
.github/workflows/release.yml
vendored
@ -446,8 +446,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
|
||||
steps:
|
||||
- name: Set OpenVINO version output
|
||||
@ -506,11 +506,8 @@ jobs:
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON \
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build/ReleaseOV --config Release --parallel
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
|
||||
cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
@ -524,26 +521,8 @@ jobs:
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
dest=./build/ReleaseOV/bin
|
||||
OPENVINO_ROOT=./openvino_toolkit
|
||||
ov_lib="$OPENVINO_ROOT/runtime/lib/intel64"
|
||||
|
||||
# Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN
|
||||
# load these siblings without setupvars.sh / LD_LIBRARY_PATH.
|
||||
cp -P "$ov_lib"/libopenvino.so* \
|
||||
"$ov_lib"/libopenvino_c.so* \
|
||||
"$ov_lib"/libopenvino_*_plugin.so \
|
||||
"$ov_lib"/libopenvino_intel_npu_compiler*.so \
|
||||
"$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \
|
||||
"$dest"
|
||||
cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true
|
||||
cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true
|
||||
|
||||
# OpenVINO licensing
|
||||
cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing
|
||||
|
||||
cp LICENSE "$dest"
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" .
|
||||
cp LICENSE ./build/ReleaseOV/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
@ -552,9 +531,6 @@ jobs:
|
||||
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
|
||||
|
||||
windows-openvino:
|
||||
needs: [check-release]
|
||||
if: ${{ needs.check-release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
outputs:
|
||||
@ -562,8 +538,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
|
||||
steps:
|
||||
- name: Set OpenVINO version output
|
||||
@ -631,9 +607,7 @@ jobs:
|
||||
-A x64 ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_OPENVINO=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
|
||||
|
||||
cmake --build build\ReleaseOV --config Release -- /m
|
||||
|
||||
@ -650,29 +624,8 @@ jobs:
|
||||
id: pack_artifacts
|
||||
shell: powershell
|
||||
run: |
|
||||
# Locate the extracted OpenVINO toolkit root (same pattern as the Build step).
|
||||
$OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName
|
||||
if (-not $OPENVINO_ROOT) {
|
||||
Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit"
|
||||
exit 1
|
||||
}
|
||||
|
||||
$dest = ".\build\ReleaseOV\bin\Release"
|
||||
|
||||
$ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release'
|
||||
Copy-Item -Path (Join-Path $ovBin '*.dll') -Destination $dest -Force
|
||||
Copy-Item -Path (Join-Path $ovBin 'cache.json') -Destination $dest -Force
|
||||
|
||||
$tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin'
|
||||
Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force
|
||||
|
||||
# OpenVINO licensing
|
||||
$licensingDest = Join-Path $dest 'openvino-licensing'
|
||||
New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null
|
||||
Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force
|
||||
|
||||
Copy-Item LICENSE $dest
|
||||
7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\*
|
||||
Copy-Item LICENSE .\build\ReleaseOV\bin\
|
||||
7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
|
||||
@ -80,7 +80,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
|
||||
### Untrusted environments or networks
|
||||
|
||||
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
|
||||
* Do not use the RPC backend, [ggml-rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
|
||||
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
|
||||
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
|
||||
* Encrypt your data if sending it over the network.
|
||||
|
||||
|
||||
@ -50,7 +50,6 @@ struct command {
|
||||
std::vector<std::string> aliases;
|
||||
bool hidden;
|
||||
int (*func)(int, char **);
|
||||
bool flags = false; // allow --name
|
||||
};
|
||||
|
||||
#ifdef LLAMA_INSTALL_BUILD
|
||||
@ -70,9 +69,9 @@ static const command cmds[] = {
|
||||
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
|
||||
{"quantize", "Quantize a model", {}, true, llama_quantize },
|
||||
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
|
||||
{"version", "Show version", {}, false, version, true },
|
||||
{"licenses", "Show third-party licenses", {"credits"}, false, licenses, true },
|
||||
{"help", "Show available commands", {}, false, help, true },
|
||||
{"version", "Show version", {}, false, version },
|
||||
{"licenses", "Show third-party licenses", {"credits"}, false, licenses },
|
||||
{"help", "Show available commands", {}, false, help },
|
||||
};
|
||||
|
||||
#undef UPDATE_HIDDEN
|
||||
@ -109,10 +108,7 @@ static int help(int argc, char ** argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool matches(std::string arg, const command & cmd) {
|
||||
if (cmd.flags && arg.size() > 2 && arg[0] == '-' && arg[1] == '-') {
|
||||
arg.erase(0, 2);
|
||||
}
|
||||
static bool matches(const std::string & arg, const command & cmd) {
|
||||
if (arg == cmd.name) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -352,8 +352,6 @@ static std::string get_default_local_path(const std::string & url) {
|
||||
|
||||
common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
|
||||
common_download_hf_plan plan;
|
||||
common_download_hf_plan plan_spec;
|
||||
common_download_hf_plan plan_voc;
|
||||
common_download_opts opts;
|
||||
|
||||
const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
|
||||
@ -379,15 +377,7 @@ common_models_handler common_models_handler_init(const common_params & params, l
|
||||
plan = common_download_get_hf_plan(params.model, opts);
|
||||
}
|
||||
|
||||
if (!params.speculative.draft.mparams.hf_repo.empty()) {
|
||||
plan_spec = common_download_get_hf_plan(params.speculative.draft.mparams, opts);
|
||||
}
|
||||
|
||||
if (!params.vocoder.model.hf_repo.empty()) {
|
||||
plan_voc = common_download_get_hf_plan(params.vocoder.model, opts);
|
||||
}
|
||||
|
||||
return common_models_handler{plan, plan_spec, plan_voc, opts};
|
||||
return common_models_handler{plan, opts};
|
||||
}
|
||||
|
||||
bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
|
||||
@ -435,9 +425,7 @@ static std::vector<common_download_task> build_url_tasks(const common_params_mod
|
||||
void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
|
||||
std::vector<common_download_task> tasks;
|
||||
|
||||
auto & plan = handler.plan;
|
||||
auto & plan_spec = handler.plan_spec;
|
||||
auto & plan_voc = handler.plan_voc;
|
||||
auto & plan = handler.plan;
|
||||
|
||||
auto opts = handler.opts; // copy
|
||||
opts.callback = callback;
|
||||
@ -496,22 +484,19 @@ void common_models_handler_apply(common_models_handler & handler, common_params
|
||||
}
|
||||
|
||||
// handle hf_plan tasks
|
||||
auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) {
|
||||
for (size_t i = 0; i < model_files.size(); ++i) {
|
||||
auto & model_file = model_files[i];
|
||||
if (!plan.model_files.empty()) {
|
||||
for (size_t i = 0; i < plan.model_files.size(); ++i) {
|
||||
auto & model_file = plan.model_files[i];
|
||||
bool is_first = (i == 0);
|
||||
tasks.emplace_back(model_file, opts, [&, is_first]() {
|
||||
if (is_first) {
|
||||
// only use first part as model path
|
||||
model.path = hf_cache::finalize_file(model_file);
|
||||
params.model.path = hf_cache::finalize_file(model_file);
|
||||
} else {
|
||||
hf_cache::finalize_file(model_file);
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
if (!plan.model_files.empty()) {
|
||||
add_tasks(plan.model_files, params.model);
|
||||
}
|
||||
if (!plan.mmproj.local_path.empty()) {
|
||||
tasks.emplace_back(plan.mmproj, opts, [&]() {
|
||||
@ -537,31 +522,9 @@ void common_models_handler_apply(common_models_handler & handler, common_params
|
||||
});
|
||||
}
|
||||
|
||||
// handle plan_spec (e.g. --spec-draft-hf)
|
||||
if (!plan_spec.model_files.empty()) {
|
||||
add_tasks(plan_spec.model_files, params.speculative.draft.mparams);
|
||||
}
|
||||
|
||||
// handle vocoder plan (e.g. --hf-repo-v)
|
||||
if (!plan_voc.model_files.empty()) {
|
||||
add_tasks(plan_voc.model_files, params.vocoder.model);
|
||||
}
|
||||
|
||||
// run all tasks in parallel
|
||||
if (!params.offline) {
|
||||
// if duplicated files are found, only download once (but still call on_done for each task)
|
||||
std::unordered_map<std::string, common_download_task *> unique_tasks;
|
||||
for (auto & task : tasks) {
|
||||
auto it = unique_tasks.find(task.local_path);
|
||||
if (it == unique_tasks.end()) {
|
||||
unique_tasks[task.local_path] = &task;
|
||||
}
|
||||
}
|
||||
std::vector<common_download_task> unique_tasks_vec;
|
||||
for (auto & pair : unique_tasks) {
|
||||
unique_tasks_vec.push_back(*pair.second);
|
||||
}
|
||||
common_download_run_tasks(unique_tasks_vec);
|
||||
common_download_run_tasks(tasks);
|
||||
}
|
||||
|
||||
// download successful, update params with the downloaded paths
|
||||
@ -3748,7 +3711,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"draft model for speculative decoding (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.draft.mparams.path = value;
|
||||
params.speculative.draft.mparams.hf_file = value; // will be used if --spec-draft-hf is set
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
|
||||
add_opt(common_arg(
|
||||
|
||||
@ -133,8 +133,6 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||
|
||||
struct common_models_handler {
|
||||
common_download_hf_plan plan;
|
||||
common_download_hf_plan plan_spec;
|
||||
common_download_hf_plan plan_voc;
|
||||
common_download_opts opts;
|
||||
};
|
||||
|
||||
|
||||
@ -114,8 +114,7 @@ class Mamba2Model(TextModel):
|
||||
hparams["text_config"] = hparams["llm_config"]
|
||||
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
|
||||
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
|
||||
self.expand = self.find_hparam(["mamba_expand", "expand"], optional=True) or 2
|
||||
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or self.expand * self.d_model
|
||||
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
|
||||
self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
|
||||
|
||||
def set_vocab(self):
|
||||
@ -145,9 +144,11 @@ class Mamba2Model(TextModel):
|
||||
|
||||
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
||||
|
||||
# Fail early for models which don't have a block expansion factor of 2
|
||||
# TODO: does this really matter?
|
||||
# skip the assertion for FalconH1 Model
|
||||
if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
|
||||
assert self.d_inner == self.expand * self.d_model
|
||||
assert self.d_inner == 2 * self.d_model
|
||||
assert self.d_inner % head_dim == 0
|
||||
|
||||
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
|
||||
|
||||
@ -237,8 +237,8 @@ chmod +x ubuntu-llamacpp-ov-install.sh
|
||||
# ============================================
|
||||
set -euo pipefail
|
||||
|
||||
OPENVINO_VERSION_MAJOR="2026.2.1"
|
||||
OPENVINO_VERSION_FULL="2026.2.1.21919.ede283a88e3"
|
||||
OPENVINO_VERSION_MAJOR="2026.2"
|
||||
OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857"
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}"
|
||||
@ -334,7 +334,7 @@ echo " ./build/ReleaseOV/bin/llama-cli -m model.gguf"
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
|
||||
> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
|
||||
|
||||
</details>
|
||||
|
||||
@ -364,8 +364,8 @@ REM ============================================
|
||||
REM llama.cpp OpenVINO Build Script (Ninja)
|
||||
REM ============================================
|
||||
|
||||
set "OPENVINO_VERSION_MAJOR=2026.2.1"
|
||||
set "OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3"
|
||||
set "OPENVINO_VERSION_MAJOR=2026.2"
|
||||
set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857"
|
||||
|
||||
set "SCRIPT_DIR=%~dp0"
|
||||
set "VCPKG_DIR=C:\vcpkg"
|
||||
@ -547,7 +547,7 @@ endlocal
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
|
||||
> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 15)
|
||||
set(GGML_VERSION_PATCH 3)
|
||||
set(GGML_VERSION_PATCH 2)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
|
||||
@ -1551,8 +1551,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
int split_backend_id = split->backend_id;
|
||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||
|
||||
ggml_backend_synchronize(split_backend);
|
||||
|
||||
// copy the input tensors to the split backend
|
||||
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
||||
@ -1563,15 +1561,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else if (!split_backend->iface.cpy_tensor_async) {
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
} else {
|
||||
// wait for the split backend to finish using the input before overwriting it
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else if (!split_backend->iface.cpy_tensor_async) {
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
|
||||
@ -1676,8 +1674,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_synchronize(split_backend);
|
||||
|
||||
if (!sched->callback_eval) {
|
||||
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
|
||||
@ -75,12 +75,12 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
|
||||
}
|
||||
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmla on available elements only
|
||||
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b32(np2, n);
|
||||
ax1 = svld1_f32(pg, x + np2);
|
||||
ay1 = svld1_f32(pg, y + np2);
|
||||
sum1 = svmla_f32_m(pg, sum1, ax1, ay1);
|
||||
sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
|
||||
}
|
||||
// reduce sum1,sum2 to sum1
|
||||
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
|
||||
|
||||
@ -386,46 +386,6 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
// check if a same-type copy reduces to a 2D strided copy (height rows of width
|
||||
// contiguous bytes), so it can use cudaMemcpy2DAsync instead of the scalar kernel
|
||||
static bool ggml_cuda_cpy_as_memcpy_2d(const ggml_tensor * src0, const ggml_tensor * src1,
|
||||
size_t & width, size_t & height, size_t & spitch, size_t & dpitch) {
|
||||
// require matching shape: a reshaped copy maps elements by flat order, which the
|
||||
// prefix walk below does not handle
|
||||
if (src0->type != src1->type || !ggml_are_same_shape(src0, src1)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// grow the contiguous prefix block shared by both tensors
|
||||
size_t block_nb = ggml_element_size(src0);
|
||||
int d = 0;
|
||||
for (; d < GGML_MAX_DIMS; ++d) {
|
||||
if (src0->nb[d] != block_nb || src1->nb[d] != block_nb) {
|
||||
break;
|
||||
}
|
||||
block_nb *= src0->ne[d];
|
||||
}
|
||||
|
||||
// d == 0: nothing contiguous; d == GGML_MAX_DIMS: fully contiguous (handled by memcpy)
|
||||
if (d == 0 || d == GGML_MAX_DIMS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// dim d carries the rows; everything above it must be a single element
|
||||
for (int i = d + 1; i < GGML_MAX_DIMS; ++i) {
|
||||
if (src0->ne[i] != 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
width = block_nb;
|
||||
height = src0->ne[d];
|
||||
spitch = src0->nb[d];
|
||||
dpitch = src1->nb[d];
|
||||
|
||||
return spitch >= width && dpitch >= width;
|
||||
}
|
||||
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
@ -461,8 +421,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
|
||||
src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
|
||||
|
||||
size_t mc_width = 0, mc_height = 0, mc_spitch = 0, mc_dpitch = 0;
|
||||
|
||||
if (src0->type == src1->type && contiguous_srcs) {
|
||||
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
||||
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
|
||||
@ -473,9 +431,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
{
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
}
|
||||
} else if (ggml_cuda_cpy_as_memcpy_2d(src0, src1, mc_width, mc_height, mc_spitch, mc_dpitch)) {
|
||||
CUDA_CHECK(cudaMemcpy2DAsync(src1_ddc, mc_dpitch, src0_ddc, mc_spitch,
|
||||
mc_width, mc_height, cudaMemcpyDeviceToDevice, main_stream));
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<float, float, true>
|
||||
|
||||
@ -3192,24 +3192,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
||||
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||
|
||||
// Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
|
||||
// Excluding this path for HIP and MUSA as a precaution.
|
||||
// According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for hip anyways.
|
||||
// Additionally, there is a lot of anectodal evidence that hip/musa stream behavior might not always 1:1 match CUDA behavior.
|
||||
// e.g. https://github.com/ROCm/rocm-systems/issues/5109
|
||||
// It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed these backends, the majority of testing happened on CUDA.
|
||||
// This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends.
|
||||
#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
|
||||
const bool copy_from_host = false;
|
||||
#else
|
||||
const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
#endif
|
||||
|
||||
if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
|
||||
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
|
||||
if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -3220,17 +3207,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
|
||||
|
||||
if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
|
||||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
|
||||
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
|
||||
#endif // NDEBUG
|
||||
return false;
|
||||
}
|
||||
|
||||
if (copy_from_host) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
|
||||
} else if (backend_src != backend_dst) {
|
||||
if (backend_src != backend_dst) {
|
||||
// copy on src stream
|
||||
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
|
||||
@ -2,28 +2,6 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
static __global__ void k_compute_out_prod_ptrs(
|
||||
const float * src0_d, const float * src1_d, float * dst_d,
|
||||
const float ** ptrs_a, const float ** ptrs_b, float ** ptrs_c,
|
||||
const int64_t ne2, const int64_t ne3,
|
||||
const int64_t dps2, const int64_t dps3,
|
||||
const size_t s02, const size_t s03,
|
||||
const size_t s12, const size_t s13,
|
||||
const size_t s2, const size_t s3) {
|
||||
const int64_t i2 = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
const int64_t i3 = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
|
||||
if (i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t idx = i3*ne2 + i2;
|
||||
|
||||
ptrs_a[idx] = src0_d + (i3/dps3)*s03 + (i2/dps2)*s02;
|
||||
ptrs_b[idx] = src1_d + i3 *s13 + i2 *s12;
|
||||
ptrs_c[idx] = dst_d + i3 *s3 + i2 *s2;
|
||||
}
|
||||
|
||||
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
@ -89,39 +67,18 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
&beta, dst_d + i3 *s3, ldc, s2,
|
||||
batch_count));
|
||||
}
|
||||
} else if (ne2 > 1 || ne3 > 1) {
|
||||
// dps2 > 1 (src0 broadcast along dim 2 with non-uniform stride) or multiple GEMMs
|
||||
// along dim 3: compute per-GEMM pointers on the device and use a single batched GEMM.
|
||||
GGML_ASSERT(ne3 > 0);
|
||||
GGML_ASSERT(ne2 <= (int64_t) std::numeric_limits<int>::max() / ne3);
|
||||
const int batch_count = (int) (ne2 * ne3);
|
||||
|
||||
ggml_cuda_pool_alloc<const float *> ptrs_a(ctx.pool(), batch_count);
|
||||
ggml_cuda_pool_alloc<const float *> ptrs_b(ctx.pool(), batch_count);
|
||||
ggml_cuda_pool_alloc< float *> ptrs_c(ctx.pool(), batch_count);
|
||||
|
||||
const dim3 block_dims(16, 16);
|
||||
const dim3 grid_dims((ne2 + block_dims.x - 1)/block_dims.x, (ne3 + block_dims.y - 1)/block_dims.y);
|
||||
k_compute_out_prod_ptrs<<<grid_dims, block_dims, 0, stream>>>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ptrs_a.get(), ptrs_b.get(), ptrs_c.get(),
|
||||
ne2, ne3, dps2, dps3, s02, s03, s12, s13, s2, s3);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasSgemmBatched(handle, CUBLAS_OP_N, src1_cublas_op,
|
||||
ne0, ne1, ne01,
|
||||
&alpha, ptrs_a.get(), lda,
|
||||
ptrs_b.get(), ldb,
|
||||
&beta, ptrs_c.get(), ldc,
|
||||
batch_count));
|
||||
} else {
|
||||
// ne2 == 1 && ne3 == 1: single GEMM
|
||||
CUBLAS_CHECK(
|
||||
cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
|
||||
ne0, ne1, ne01,
|
||||
&alpha, src0_d, lda,
|
||||
src1_d, ldb,
|
||||
&beta, dst_d, ldc));
|
||||
// Fallback: ne2 == 1 (no batching benefit) or dps2 > 1 (src0 broadcast along dim 2
|
||||
// with non-uniform stride; would need cublasSgemmBatched with pointer arrays).
|
||||
for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
||||
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
||||
CUBLAS_CHECK(
|
||||
cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
|
||||
ne0, ne1, ne01,
|
||||
&alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
|
||||
src1_d + i3 *s13 + i2 *s12, ldb,
|
||||
&beta, dst_d + i3 *s3 + i2 *s2, ldc));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
1
ggml/src/ggml-cuda/vendors/hip.h
vendored
1
ggml/src/ggml-cuda/vendors/hip.h
vendored
@ -48,7 +48,6 @@
|
||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetStream hipblasSetStream
|
||||
#define cublasSgemm hipblasSgemm
|
||||
#define cublasSgemmBatched hipblasSgemmBatched
|
||||
#define cublasSgemmStridedBatched hipblasSgemmStridedBatched
|
||||
#define cublasStatus_t hipblasStatus_t
|
||||
#define cublasOperation_t hipblasOperation_t
|
||||
|
||||
1
ggml/src/ggml-cuda/vendors/musa.h
vendored
1
ggml/src/ggml-cuda/vendors/musa.h
vendored
@ -32,7 +32,6 @@
|
||||
#define cublasSetMathMode mublasSetMathMode
|
||||
#define cublasSetStream mublasSetStream
|
||||
#define cublasSgemm mublasSgemm
|
||||
#define cublasSgemmBatched mublasSgemmBatched
|
||||
#define cublasSgemmStridedBatched mublasSgemmStridedBatched
|
||||
#define cublasStatus_t mublasStatus_t
|
||||
#define cublasOperation_t mublasOperation_t
|
||||
|
||||
@ -192,10 +192,7 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mm_f16_f32_kq_kqv
|
||||
conv2d
|
||||
conv2d_f16_f32
|
||||
flash_attn_pre_f16
|
||||
flash_attn_f32_f16
|
||||
flash_attn_f32_q8_0
|
||||
flash_attn_f32_q4_0
|
||||
flash_attn_f16
|
||||
flash_attn_f32
|
||||
)
|
||||
|
||||
@ -1,91 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
// Flash-attention per-(dk,dv) tile tuning for the Adreno OpenCL backend.
|
||||
// Isolated from ggml-opencl.cpp so the tuning numbers are easy to find and
|
||||
// edit; the FA dispatch and kernel-compile logic stay in the main file.
|
||||
// This header is a file section — it is #included exactly once, at the point
|
||||
// in ggml-opencl.cpp where the ggml logging macros are already in scope.
|
||||
|
||||
// Per-(dk, dv) FA config; shared by dispatch and supports_op.
|
||||
struct ggml_opencl_fa_dim {
|
||||
int dk; int dv; int bm; int bn; int n_split; int nkv_split_threshold;
|
||||
};
|
||||
|
||||
// Split variant fires when n_kv >= threshold (threshold=0 -> always split).
|
||||
// Default tuning covers Adreno 7xx/8xx mobile and X1-series laptop GPUs.
|
||||
static const ggml_opencl_fa_dim g_fa_dims_adreno_default[] = {
|
||||
{ 40, 40, 64, 32, 1, 0}, { 64, 64, 64, 32, 2, 64},
|
||||
{ 80, 80, 64, 32, 2, 64}, { 96, 96, 64, 32, 2, 64},
|
||||
{112, 112, 64, 32, 2, 64}, {128, 128, 64, 32, 2, 64},
|
||||
{192, 128, 16, 16, 1, 0},
|
||||
{192, 192, 16, 16, 1, 0},
|
||||
{256, 256, 16, 16, 16, 0},
|
||||
};
|
||||
|
||||
struct ggml_opencl_fa_dim_table {
|
||||
const ggml_opencl_fa_dim * data;
|
||||
size_t count;
|
||||
|
||||
const ggml_opencl_fa_dim * begin() const { return data; }
|
||||
const ggml_opencl_fa_dim * end() const { return data + count; }
|
||||
};
|
||||
|
||||
// Mutable copy of the active table; GGML_OPENCL_FA_TUNE patches entries here
|
||||
// at backend init without touching the const source table.
|
||||
static ggml_opencl_fa_dim g_fa_dims_runtime[
|
||||
sizeof(g_fa_dims_adreno_default) / sizeof(g_fa_dims_adreno_default[0])];
|
||||
|
||||
static ggml_opencl_fa_dim_table g_opencl_fa_dims = {
|
||||
g_fa_dims_adreno_default,
|
||||
sizeof(g_fa_dims_adreno_default) / sizeof(g_fa_dims_adreno_default[0]),
|
||||
};
|
||||
|
||||
// GGML_OPENCL_FA_TUNE=dk:dv:bm:bn:nsplit:thr[,…] — patches matching entries
|
||||
// in the active table at backend init, before the first FA kernel compiles.
|
||||
// Unmatched (dk,dv) pairs are warned and ignored.
|
||||
static void ggml_opencl_fa_apply_env_overrides() {
|
||||
const char * e = std::getenv("GGML_OPENCL_FA_TUNE");
|
||||
if (!e || !e[0]) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::string s = e;
|
||||
size_t pos = 0;
|
||||
while (pos < s.size()) {
|
||||
size_t comma = s.find(',', pos);
|
||||
std::string entry = s.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
|
||||
int dk, dv, bm, bn, nsplit, thr;
|
||||
if (std::sscanf(entry.c_str(), "%d:%d:%d:%d:%d:%d", &dk, &dv, &bm, &bn, &nsplit, &thr) == 6) {
|
||||
bool patched = false;
|
||||
for (size_t i = 0; i < g_opencl_fa_dims.count; ++i) {
|
||||
ggml_opencl_fa_dim & d = g_fa_dims_runtime[i];
|
||||
if (d.dk == dk && d.dv == dv) {
|
||||
d.bm = bm; d.bn = bn; d.n_split = nsplit; d.nkv_split_threshold = thr;
|
||||
GGML_LOG_INFO("ggml_opencl: FA tune override DK=%d DV=%d -> bm=%d bn=%d n_split=%d thr=%d\n",
|
||||
dk, dv, bm, bn, nsplit, thr);
|
||||
patched = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!patched) {
|
||||
GGML_LOG_WARN("ggml_opencl: FA tune override DK=%d DV=%d ignored (no matching dim)\n", dk, dv);
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: FA tune override entry malformed: '%s'\n", entry.c_str());
|
||||
}
|
||||
if (comma == std::string::npos) break;
|
||||
pos = comma + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the default table into the mutable runtime buffer and apply any
|
||||
// GGML_OPENCL_FA_TUNE overrides. A per-generation table can be added here
|
||||
// once it has been tuned on hardware.
|
||||
static void ggml_cl_init_fa_dims_table() {
|
||||
const size_t count = sizeof(g_fa_dims_adreno_default) / sizeof(g_fa_dims_adreno_default[0]);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
g_fa_dims_runtime[i] = g_fa_dims_adreno_default[i];
|
||||
}
|
||||
g_opencl_fa_dims = { g_fa_dims_runtime, count };
|
||||
ggml_opencl_fa_apply_env_overrides();
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1582,158 +1582,6 @@ kernel void kernel_restore_block_q8_0(
|
||||
}
|
||||
}
|
||||
|
||||
// View-aware AoS q8_0 -> f32 dequant (f32/f32 FA path).
|
||||
kernel void kernel_dequant_q8_0_f32_view_aos(
|
||||
global char * src,
|
||||
ulong src_offset,
|
||||
ulong src_nb1,
|
||||
ulong src_nb2,
|
||||
ulong src_nb3,
|
||||
int nblk0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
global float * dst
|
||||
) {
|
||||
int blk_i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int batch = get_global_id(2);
|
||||
|
||||
if (blk_i0 >= nblk0) return;
|
||||
if (i1 >= ne1) return;
|
||||
|
||||
int i2 = batch % ne2;
|
||||
int i3 = batch / ne2;
|
||||
if (i3 >= ne3) return;
|
||||
|
||||
global char * block = src + src_offset + (ulong)i3*src_nb3 + (ulong)i2*src_nb2 + (ulong)i1*src_nb1 + (ulong)blk_i0 * (2 + QK8_0);
|
||||
float d = vload_half(0, (global half *)block);
|
||||
global char * qs = block + 2;
|
||||
|
||||
ulong dst_row_base = ((ulong)i3 * ne2 * ne1 + (ulong)i2 * ne1 + (ulong)i1) * nblk0;
|
||||
global float * out = dst + (dst_row_base + blk_i0) * QK8_0;
|
||||
|
||||
for (int i = 0; i < QK8_0; ++i) {
|
||||
out[i] = d * (float)qs[i];
|
||||
}
|
||||
}
|
||||
|
||||
// View-aware AoS q8_0 -> f16 dequant. Rows tight, batch strides may be gapped.
|
||||
kernel void kernel_dequant_q8_0_f16_view_aos(
|
||||
global char * src,
|
||||
ulong src_offset,
|
||||
ulong src_nb1,
|
||||
ulong src_nb2,
|
||||
ulong src_nb3,
|
||||
int nblk0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
global half * dst
|
||||
) {
|
||||
int blk_i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int batch = get_global_id(2);
|
||||
|
||||
if (blk_i0 >= nblk0) return;
|
||||
if (i1 >= ne1) return;
|
||||
|
||||
int i2 = batch % ne2;
|
||||
int i3 = batch / ne2;
|
||||
if (i3 >= ne3) return;
|
||||
|
||||
global char * block = src + src_offset + (ulong)i3*src_nb3 + (ulong)i2*src_nb2 + (ulong)i1*src_nb1 + (ulong)blk_i0 * (2 + QK8_0);
|
||||
float d = vload_half(0, (global half *)block);
|
||||
global char * qs = block + 2;
|
||||
|
||||
ulong dst_row_base = ((ulong)i3 * ne2 * ne1 + (ulong)i2 * ne1 + (ulong)i1) * nblk0;
|
||||
global half * out = dst + (dst_row_base + blk_i0) * QK8_0;
|
||||
|
||||
for (int i = 0; i < QK8_0; ++i) {
|
||||
out[i] = (half)(d * (float)qs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// View-aware AoS q4_0 -> f32 dequant (mirrors the q8_0 view variant).
|
||||
kernel void kernel_dequant_q4_0_f32_view_aos(
|
||||
global char * src,
|
||||
ulong src_offset,
|
||||
ulong src_nb1,
|
||||
ulong src_nb2,
|
||||
ulong src_nb3,
|
||||
int nblk0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
global float * dst
|
||||
) {
|
||||
int blk_i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int batch = get_global_id(2);
|
||||
|
||||
if (blk_i0 >= nblk0) return;
|
||||
if (i1 >= ne1) return;
|
||||
|
||||
int i2 = batch % ne2;
|
||||
int i3 = batch / ne2;
|
||||
if (i3 >= ne3) return;
|
||||
|
||||
global char * block = src + src_offset + (ulong)i3*src_nb3 + (ulong)i2*src_nb2 + (ulong)i1*src_nb1 + (ulong)blk_i0 * (2 + QK4_0/2);
|
||||
float d = vload_half(0, (global half *)block);
|
||||
global uchar * qs = (global uchar *)(block + 2);
|
||||
|
||||
ulong dst_row_base = ((ulong)i3 * ne2 * ne1 + (ulong)i2 * ne1 + (ulong)i1) * nblk0;
|
||||
global float * out = dst + (dst_row_base + blk_i0) * QK4_0;
|
||||
|
||||
for (int i = 0; i < QK4_0/2; ++i) {
|
||||
uchar byte = qs[i];
|
||||
int q0 = (int)(byte & 0x0F) - 8;
|
||||
int q1 = (int)(byte >> 4) - 8;
|
||||
out[i] = d * (float)q0;
|
||||
out[i + QK4_0/2] = d * (float)q1;
|
||||
}
|
||||
}
|
||||
|
||||
// View-aware AoS q4_0 -> f16 dequant (mirrors the q8_0 view variant).
|
||||
kernel void kernel_dequant_q4_0_f16_view_aos(
|
||||
global char * src,
|
||||
ulong src_offset,
|
||||
ulong src_nb1,
|
||||
ulong src_nb2,
|
||||
ulong src_nb3,
|
||||
int nblk0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
global half * dst
|
||||
) {
|
||||
int blk_i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int batch = get_global_id(2);
|
||||
|
||||
if (blk_i0 >= nblk0) return;
|
||||
if (i1 >= ne1) return;
|
||||
|
||||
int i2 = batch % ne2;
|
||||
int i3 = batch / ne2;
|
||||
if (i3 >= ne3) return;
|
||||
|
||||
global char * block = src + src_offset + (ulong)i3*src_nb3 + (ulong)i2*src_nb2 + (ulong)i1*src_nb1 + (ulong)blk_i0 * (2 + QK4_0/2);
|
||||
float d = vload_half(0, (global half *)block);
|
||||
global uchar * qs = (global uchar *)(block + 2);
|
||||
|
||||
ulong dst_row_base = ((ulong)i3 * ne2 * ne1 + (ulong)i2 * ne1 + (ulong)i1) * nblk0;
|
||||
global half * out = dst + (dst_row_base + blk_i0) * QK4_0;
|
||||
|
||||
for (int i = 0; i < QK4_0/2; ++i) {
|
||||
uchar byte = qs[i];
|
||||
int q0 = (int)(byte & 0x0F) - 8;
|
||||
int q1 = (int)(byte >> 4) - 8;
|
||||
out[i] = (half)(d * (float)q0);
|
||||
out[i + QK4_0/2] = (half)(d * (float)q1);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q8_0_trans(
|
||||
global uchar * src_q,
|
||||
global half * src_d,
|
||||
|
||||
@ -4,26 +4,14 @@
|
||||
#define ACC_TYPE4 float4
|
||||
#define DATA_TYPE half
|
||||
#define DATA_TYPE4 half4
|
||||
#define CONVERT_ACC4(x) ((float4)((float)(x).s0, (float)(x).s1, (float)(x).s2, (float)(x).s3))
|
||||
#define CONVERT_DATA4(x) ((half4)((half)(x).s0, (half)(x).s1, (half)(x).s2, (half)(x).s3))
|
||||
#define CONVERT_ACC4(x) convert_float4(x)
|
||||
#define CONVERT_DATA4(x) convert_half4(x)
|
||||
|
||||
#define DK_VEC (DK/4)
|
||||
#define DV_VEC (DV/4)
|
||||
#define WG_SIZE (BLOCK_M)
|
||||
#define Q1_WG_SIZE 64
|
||||
|
||||
// The kernels are built with -cl-finite-math-only. On some older Adreno GPUs,
|
||||
// infinite operand can cause undefined behavior and miscompilation for exp.
|
||||
// Therefore, a large negative value is used instead.
|
||||
#define FA_M_INIT (-3.0e38f)
|
||||
|
||||
// Drop full unroll at DK>=192 — Adreno compiler host-memory budget.
|
||||
#if DK >= 192
|
||||
#define FA_UNROLL
|
||||
#else
|
||||
#define FA_UNROLL _Pragma("unroll")
|
||||
#endif
|
||||
|
||||
inline float get_alibi_slope(
|
||||
const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
|
||||
) {
|
||||
@ -93,18 +81,18 @@ __kernel void flash_attn_f16(
|
||||
if (my_query_row < n_q) {
|
||||
const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
|
||||
const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DK_VEC; ++i) {
|
||||
q_priv[i] = CONVERT_ACC4(q_ptr[i]);
|
||||
}
|
||||
}
|
||||
|
||||
ACC_TYPE4 o_acc[DV_VEC];
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] = (ACC_TYPE4)(0.0f);
|
||||
}
|
||||
ACC_TYPE m_i = FA_M_INIT;
|
||||
ACC_TYPE m_i = -INFINITY;
|
||||
ACC_TYPE l_i = 0.0f;
|
||||
|
||||
float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
|
||||
@ -137,72 +125,49 @@ __kernel void flash_attn_f16(
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int j = 0; j < BLOCK_N; j += 4) {
|
||||
for (int j = 0; j < BLOCK_N; j += 2) {
|
||||
const int k_row0 = k_start + j;
|
||||
const int k_row1 = k_start + j + 1;
|
||||
const int k_row2 = k_start + j + 2;
|
||||
const int k_row3 = k_start + j + 3;
|
||||
|
||||
ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc2 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc3 = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
const ACC_TYPE4 qk = q_priv[k];
|
||||
dot_acc0 = mad(qk, CONVERT_ACC4(l_k[j][k]), dot_acc0);
|
||||
dot_acc1 = mad(qk, CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
|
||||
dot_acc2 = mad(qk, CONVERT_ACC4(l_k[j+2][k]), dot_acc2);
|
||||
dot_acc3 = mad(qk, CONVERT_ACC4(l_k[j+3][k]), dot_acc3);
|
||||
dot_acc0 = mad(q_priv[k], CONVERT_ACC4(l_k[j][k]), dot_acc0);
|
||||
dot_acc1 = mad(q_priv[k], CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
|
||||
}
|
||||
ACC_TYPE s0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
|
||||
ACC_TYPE s1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
|
||||
ACC_TYPE s2 = (dot_acc2.s0 + dot_acc2.s1 + dot_acc2.s2 + dot_acc2.s3) * scale;
|
||||
ACC_TYPE s3 = (dot_acc3.s0 + dot_acc3.s1 + dot_acc3.s2 + dot_acc3.s3) * scale;
|
||||
ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
|
||||
ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
|
||||
|
||||
if (is_causal) {
|
||||
const int causal_limit = n_kv - n_q + my_query_row;
|
||||
if (k_row0 > causal_limit) s0 = FA_M_INIT;
|
||||
if (k_row1 > causal_limit) s1 = FA_M_INIT;
|
||||
if (k_row2 > causal_limit) s2 = FA_M_INIT;
|
||||
if (k_row3 > causal_limit) s3 = FA_M_INIT;
|
||||
if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY;
|
||||
if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY;
|
||||
}
|
||||
if (k_row0 >= n_kv) s0 = FA_M_INIT;
|
||||
if (k_row1 >= n_kv) s1 = FA_M_INIT;
|
||||
if (k_row2 >= n_kv) s2 = FA_M_INIT;
|
||||
if (k_row3 >= n_kv) s3 = FA_M_INIT;
|
||||
|
||||
if (k_row0 >= n_kv) score0 = -INFINITY;
|
||||
if (k_row1 >= n_kv) score1 = -INFINITY;
|
||||
|
||||
if (mask_base != NULL) {
|
||||
const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
|
||||
if (k_row0 < n_kv) s0 += slope * (ACC_TYPE)mask_ptr[k_row0];
|
||||
if (k_row1 < n_kv) s1 += slope * (ACC_TYPE)mask_ptr[k_row1];
|
||||
if (k_row2 < n_kv) s2 += slope * (ACC_TYPE)mask_ptr[k_row2];
|
||||
if (k_row3 < n_kv) s3 += slope * (ACC_TYPE)mask_ptr[k_row3];
|
||||
if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
|
||||
if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
|
||||
}
|
||||
|
||||
if (logit_softcap > 0.0f) {
|
||||
s0 = logit_softcap * tanh(s0 / logit_softcap);
|
||||
s1 = logit_softcap * tanh(s1 / logit_softcap);
|
||||
s2 = logit_softcap * tanh(s2 / logit_softcap);
|
||||
s3 = logit_softcap * tanh(s3 / logit_softcap);
|
||||
score0 = logit_softcap * tanh(score0 / logit_softcap);
|
||||
score1 = logit_softcap * tanh(score1 / logit_softcap);
|
||||
}
|
||||
|
||||
const ACC_TYPE m_new = max(m_i, max(max(s0, s1), max(s2, s3)));
|
||||
const ACC_TYPE scale_prev = native_exp(m_i - m_new);
|
||||
const ACC_TYPE p0 = native_exp(s0 - m_new);
|
||||
const ACC_TYPE p1 = native_exp(s1 - m_new);
|
||||
const ACC_TYPE p2 = native_exp(s2 - m_new);
|
||||
const ACC_TYPE p3 = native_exp(s3 - m_new);
|
||||
const ACC_TYPE m_new = max(m_i, max(score0, score1));
|
||||
const ACC_TYPE p0 = exp(score0 - m_new);
|
||||
const ACC_TYPE p1 = exp(score1 - m_new);
|
||||
const ACC_TYPE scale_prev = exp(m_i - m_new);
|
||||
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] = mad(p3, CONVERT_ACC4(l_v[j+3][i]),
|
||||
mad(p2, CONVERT_ACC4(l_v[j+2][i]),
|
||||
mad(p1, CONVERT_ACC4(l_v[j+1][i]),
|
||||
mad(p0, CONVERT_ACC4(l_v[j][i]),
|
||||
o_acc[i] * scale_prev))));
|
||||
o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_ACC4(l_v[j][i]) + p1 * CONVERT_ACC4(l_v[j+1][i]);
|
||||
}
|
||||
l_i = l_i * scale_prev + p0 + p1 + p2 + p3;
|
||||
l_i = l_i * scale_prev + p0 + p1;
|
||||
m_i = m_new;
|
||||
}
|
||||
}
|
||||
@ -214,7 +179,7 @@ __kernel void flash_attn_f16(
|
||||
const ACC_TYPE m_final = max(m_i, m_sink);
|
||||
|
||||
const ACC_TYPE scale_o = exp(m_i - m_final);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] *= scale_o;
|
||||
}
|
||||
@ -226,12 +191,12 @@ __kernel void flash_attn_f16(
|
||||
global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
if (l_i > 0.0f) {
|
||||
const ACC_TYPE l_inv = 1.0f / l_i;
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_row[i] = CONVERT_DATA4(o_acc[i] * l_inv);
|
||||
}
|
||||
} else {
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_row[i] = (DATA_TYPE4)(0.0f);
|
||||
}
|
||||
@ -293,7 +258,7 @@ __kernel void flash_attn_f16_q1(
|
||||
ACC_TYPE4 q_priv[DK_VEC];
|
||||
const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
|
||||
const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DK_VEC; ++i) {
|
||||
q_priv[i] = CONVERT_ACC4(q_ptr[i]);
|
||||
}
|
||||
@ -305,12 +270,12 @@ __kernel void flash_attn_f16_q1(
|
||||
sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
}
|
||||
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : FA_M_INIT;
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
|
||||
for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
|
||||
const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
|
||||
}
|
||||
@ -328,7 +293,7 @@ __kernel void flash_attn_f16_q1(
|
||||
__local ACC_TYPE local_m[Q1_WG_SIZE];
|
||||
local_m[tid] = m_i;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -336,7 +301,7 @@ __kernel void flash_attn_f16_q1(
|
||||
const ACC_TYPE m_final = local_m[0];
|
||||
|
||||
ACC_TYPE4 o_acc[DV_VEC];
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE l_i = 0.0f;
|
||||
|
||||
@ -346,7 +311,7 @@ __kernel void flash_attn_f16_q1(
|
||||
const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
|
||||
const global DATA_TYPE4* v_ptr = (const global DATA_TYPE4*)(v_base + v_row_offset);
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
|
||||
}
|
||||
@ -360,7 +325,7 @@ __kernel void flash_attn_f16_q1(
|
||||
}
|
||||
const ACC_TYPE p = exp(score - m_final);
|
||||
l_i += p;
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; i++) {
|
||||
o_acc[i] = mad(p, CONVERT_ACC4(v_ptr[i]), o_acc[i]);
|
||||
}
|
||||
@ -370,7 +335,7 @@ __kernel void flash_attn_f16_q1(
|
||||
__local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
|
||||
local_l[tid] = l_i;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_l[tid] += local_l[tid + s];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -389,7 +354,7 @@ __kernel void flash_attn_f16_q1(
|
||||
for (int i = 0; i < DV_VEC; i++) {
|
||||
local_o_comp[tid] = o_acc[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -399,7 +364,7 @@ __kernel void flash_attn_f16_q1(
|
||||
}
|
||||
}
|
||||
} else if (tid == 0) {
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) o_row[i] = (DATA_TYPE4)(0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
@ -13,18 +13,6 @@
|
||||
#define WG_SIZE (BLOCK_M)
|
||||
#define Q1_WG_SIZE 64
|
||||
|
||||
// The kernels are built with -cl-finite-math-only. On some older Adreno GPUs,
|
||||
// infinite operand can cause undefined behavior and miscompilation for exp.
|
||||
// Therefore, a large negative value is used instead.
|
||||
#define FA_M_INIT (-3.0e38f)
|
||||
|
||||
// Drop full unroll at DK>=192 — Adreno compiler host-memory budget.
|
||||
#if DK >= 192
|
||||
#define FA_UNROLL
|
||||
#else
|
||||
#define FA_UNROLL _Pragma("unroll")
|
||||
#endif
|
||||
|
||||
inline float get_alibi_slope(
|
||||
const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
|
||||
) {
|
||||
@ -94,18 +82,18 @@ __kernel void flash_attn_f32(
|
||||
if (my_query_row < n_q) {
|
||||
const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
|
||||
const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DK_VEC; ++i) {
|
||||
q_priv[i] = CONVERT_ACC4(q_ptr[i]);
|
||||
}
|
||||
}
|
||||
|
||||
ACC_TYPE4 o_acc[DV_VEC];
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] = (ACC_TYPE4)(0.0f);
|
||||
}
|
||||
ACC_TYPE m_i = FA_M_INIT;
|
||||
ACC_TYPE m_i = -INFINITY;
|
||||
ACC_TYPE l_i = 0.0f;
|
||||
|
||||
float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
|
||||
@ -138,72 +126,49 @@ __kernel void flash_attn_f32(
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int j = 0; j < BLOCK_N; j += 4) {
|
||||
for (int j = 0; j < BLOCK_N; j += 2) {
|
||||
const int k_row0 = k_start + j;
|
||||
const int k_row1 = k_start + j + 1;
|
||||
const int k_row2 = k_start + j + 2;
|
||||
const int k_row3 = k_start + j + 3;
|
||||
|
||||
ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc2 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc3 = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
const ACC_TYPE4 qk = q_priv[k];
|
||||
dot_acc0 = mad(qk, CONVERT_ACC4(l_k[j][k]), dot_acc0);
|
||||
dot_acc1 = mad(qk, CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
|
||||
dot_acc2 = mad(qk, CONVERT_ACC4(l_k[j+2][k]), dot_acc2);
|
||||
dot_acc3 = mad(qk, CONVERT_ACC4(l_k[j+3][k]), dot_acc3);
|
||||
dot_acc0 = mad(q_priv[k], CONVERT_ACC4(l_k[j][k]), dot_acc0);
|
||||
dot_acc1 = mad(q_priv[k], CONVERT_ACC4(l_k[j+1][k]), dot_acc1);
|
||||
}
|
||||
ACC_TYPE s0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
|
||||
ACC_TYPE s1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
|
||||
ACC_TYPE s2 = (dot_acc2.s0 + dot_acc2.s1 + dot_acc2.s2 + dot_acc2.s3) * scale;
|
||||
ACC_TYPE s3 = (dot_acc3.s0 + dot_acc3.s1 + dot_acc3.s2 + dot_acc3.s3) * scale;
|
||||
ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
|
||||
ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
|
||||
|
||||
if (is_causal) {
|
||||
const int causal_limit = n_kv - n_q + my_query_row;
|
||||
if (k_row0 > causal_limit) s0 = FA_M_INIT;
|
||||
if (k_row1 > causal_limit) s1 = FA_M_INIT;
|
||||
if (k_row2 > causal_limit) s2 = FA_M_INIT;
|
||||
if (k_row3 > causal_limit) s3 = FA_M_INIT;
|
||||
if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY;
|
||||
if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY;
|
||||
}
|
||||
if (k_row0 >= n_kv) s0 = FA_M_INIT;
|
||||
if (k_row1 >= n_kv) s1 = FA_M_INIT;
|
||||
if (k_row2 >= n_kv) s2 = FA_M_INIT;
|
||||
if (k_row3 >= n_kv) s3 = FA_M_INIT;
|
||||
|
||||
if (k_row0 >= n_kv) score0 = -INFINITY;
|
||||
if (k_row1 >= n_kv) score1 = -INFINITY;
|
||||
|
||||
if (mask_base != NULL) {
|
||||
const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
|
||||
if (k_row0 < n_kv) s0 += slope * (ACC_TYPE)mask_ptr[k_row0];
|
||||
if (k_row1 < n_kv) s1 += slope * (ACC_TYPE)mask_ptr[k_row1];
|
||||
if (k_row2 < n_kv) s2 += slope * (ACC_TYPE)mask_ptr[k_row2];
|
||||
if (k_row3 < n_kv) s3 += slope * (ACC_TYPE)mask_ptr[k_row3];
|
||||
if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
|
||||
if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
|
||||
}
|
||||
|
||||
if (logit_softcap > 0.0f) {
|
||||
s0 = logit_softcap * tanh(s0 / logit_softcap);
|
||||
s1 = logit_softcap * tanh(s1 / logit_softcap);
|
||||
s2 = logit_softcap * tanh(s2 / logit_softcap);
|
||||
s3 = logit_softcap * tanh(s3 / logit_softcap);
|
||||
score0 = logit_softcap * tanh(score0 / logit_softcap);
|
||||
score1 = logit_softcap * tanh(score1 / logit_softcap);
|
||||
}
|
||||
|
||||
const ACC_TYPE m_new = max(m_i, max(max(s0, s1), max(s2, s3)));
|
||||
const ACC_TYPE scale_prev = native_exp(m_i - m_new);
|
||||
const ACC_TYPE p0 = native_exp(s0 - m_new);
|
||||
const ACC_TYPE p1 = native_exp(s1 - m_new);
|
||||
const ACC_TYPE p2 = native_exp(s2 - m_new);
|
||||
const ACC_TYPE p3 = native_exp(s3 - m_new);
|
||||
const ACC_TYPE m_new = max(m_i, max(score0, score1));
|
||||
const ACC_TYPE p0 = exp(score0 - m_new);
|
||||
const ACC_TYPE p1 = exp(score1 - m_new);
|
||||
const ACC_TYPE scale_prev = exp(m_i - m_new);
|
||||
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] = mad(p3, CONVERT_ACC4(l_v[j+3][i]),
|
||||
mad(p2, CONVERT_ACC4(l_v[j+2][i]),
|
||||
mad(p1, CONVERT_ACC4(l_v[j+1][i]),
|
||||
mad(p0, CONVERT_ACC4(l_v[j][i]),
|
||||
o_acc[i] * scale_prev))));
|
||||
o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_ACC4(l_v[j][i]) + p1 * CONVERT_ACC4(l_v[j+1][i]);
|
||||
}
|
||||
l_i = l_i * scale_prev + p0 + p1 + p2 + p3;
|
||||
l_i = l_i * scale_prev + p0 + p1;
|
||||
m_i = m_new;
|
||||
}
|
||||
}
|
||||
@ -215,7 +180,7 @@ __kernel void flash_attn_f32(
|
||||
const ACC_TYPE m_final = max(m_i, m_sink);
|
||||
|
||||
const ACC_TYPE scale_o = exp(m_i - m_final);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] *= scale_o;
|
||||
}
|
||||
@ -227,12 +192,12 @@ __kernel void flash_attn_f32(
|
||||
global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
if (l_i > 0.0f) {
|
||||
const ACC_TYPE l_inv = 1.0f / l_i;
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_row[i] = CONVERT_DATA4(o_acc[i] * l_inv);
|
||||
}
|
||||
} else {
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_row[i] = (DATA_TYPE4)(0.0f);
|
||||
}
|
||||
@ -294,7 +259,7 @@ __kernel void flash_attn_f32_q1(
|
||||
ACC_TYPE4 q_priv[DK_VEC];
|
||||
const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
|
||||
const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DK_VEC; ++i) {
|
||||
q_priv[i] = CONVERT_ACC4(q_ptr[i]);
|
||||
}
|
||||
@ -306,12 +271,12 @@ __kernel void flash_attn_f32_q1(
|
||||
sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
}
|
||||
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : FA_M_INIT;
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
|
||||
for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
|
||||
const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
|
||||
}
|
||||
@ -329,7 +294,7 @@ __kernel void flash_attn_f32_q1(
|
||||
__local ACC_TYPE local_m[Q1_WG_SIZE];
|
||||
local_m[tid] = m_i;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -337,7 +302,7 @@ __kernel void flash_attn_f32_q1(
|
||||
const ACC_TYPE m_final = local_m[0];
|
||||
|
||||
ACC_TYPE4 o_acc[DV_VEC];
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE l_i = 0.0f;
|
||||
|
||||
@ -347,7 +312,7 @@ __kernel void flash_attn_f32_q1(
|
||||
const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset);
|
||||
const global DATA_TYPE4* v_ptr = (const global DATA_TYPE4*)(v_base + v_row_offset);
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc);
|
||||
}
|
||||
@ -361,7 +326,7 @@ __kernel void flash_attn_f32_q1(
|
||||
}
|
||||
const ACC_TYPE p = exp(score - m_final);
|
||||
l_i += p;
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; i++) {
|
||||
o_acc[i] = mad(p, CONVERT_ACC4(v_ptr[i]), o_acc[i]);
|
||||
}
|
||||
@ -371,7 +336,7 @@ __kernel void flash_attn_f32_q1(
|
||||
__local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
|
||||
local_l[tid] = l_i;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_l[tid] += local_l[tid + s];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -390,7 +355,7 @@ __kernel void flash_attn_f32_q1(
|
||||
for (int i = 0; i < DV_VEC; i++) {
|
||||
local_o_comp[tid] = o_acc[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -400,7 +365,7 @@ __kernel void flash_attn_f32_q1(
|
||||
}
|
||||
}
|
||||
} else if (tid == 0) {
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) o_row[i] = (DATA_TYPE4)(0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,13 +1,5 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_khr_subgroup_shuffle
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable
|
||||
#define HAS_SUBGROUP_SHUFFLE 1
|
||||
#elif defined(cl_qcom_subgroup_shuffle)
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
|
||||
#define HAS_SUBGROUP_SHUFFLE 1
|
||||
#endif
|
||||
|
||||
#define ACC_TYPE float
|
||||
#define ACC_TYPE4 float4
|
||||
#define Q_DATA_TYPE4 float4
|
||||
@ -20,33 +12,8 @@
|
||||
|
||||
#define DK_VEC (DK/4)
|
||||
#define DV_VEC (DV/4)
|
||||
#define Q1_WG_SIZE 64
|
||||
|
||||
// The kernels are built with -cl-finite-math-only. On some older Adreno GPUs,
|
||||
// infinite operand can cause undefined behavior and miscompilation for exp.
|
||||
// Therefore, a large negative value is used instead.
|
||||
#define FA_M_INIT (-3.0e38f)
|
||||
|
||||
// Drop full unroll at DK>=192 — Adreno compiler host-memory budget.
|
||||
#if DK >= 192
|
||||
#define FA_UNROLL
|
||||
#else
|
||||
#define FA_UNROLL _Pragma("unroll")
|
||||
#endif
|
||||
|
||||
// N_SPLIT>1 splits DK/DV across threads to cut per-thread register use.
|
||||
#ifndef N_SPLIT
|
||||
#define N_SPLIT 1
|
||||
#endif
|
||||
|
||||
#define SPLIT_DK_VEC (DK_VEC / N_SPLIT)
|
||||
#define SPLIT_DV_VEC (DV_VEC / N_SPLIT)
|
||||
|
||||
#if N_SPLIT > 1
|
||||
#define WG_SIZE (BLOCK_M * N_SPLIT)
|
||||
#else
|
||||
#define WG_SIZE (BLOCK_M)
|
||||
#endif
|
||||
#define Q1_WG_SIZE 64
|
||||
|
||||
inline float get_alibi_slope(
|
||||
const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1
|
||||
@ -87,38 +54,19 @@ __kernel void flash_attn_f32_f16(
|
||||
const int mask_ne2,
|
||||
const int mask_ne3,
|
||||
const global void* sinks_void,
|
||||
const ulong sinks_offset,
|
||||
const global void * k_pad_void,
|
||||
const global void * v_pad_void,
|
||||
const global void * mask_pad_void,
|
||||
const global char * blk,
|
||||
const int n_kv_blocks,
|
||||
const ulong mask_pad_nb1,
|
||||
const ulong mask_pad_nb2,
|
||||
const ulong mask_pad_nb3
|
||||
const ulong sinks_offset
|
||||
) {
|
||||
const int tid = get_local_id(0);
|
||||
const int block_q_idx = get_group_id(0);
|
||||
const int head_batch_idx = get_global_id(1);
|
||||
|
||||
#if N_SPLIT > 1
|
||||
const int q_lane = tid / N_SPLIT;
|
||||
const int split_idx = tid % N_SPLIT;
|
||||
#else
|
||||
const int q_lane = tid;
|
||||
const int split_idx = 0;
|
||||
#endif
|
||||
|
||||
const int my_query_row = block_q_idx * BLOCK_M + q_lane;
|
||||
const int query_valid = my_query_row < n_q;
|
||||
const int my_query_row = block_q_idx * BLOCK_M + tid;
|
||||
|
||||
const int batch_idx = head_batch_idx / n_head;
|
||||
const int head_idx = head_batch_idx % n_head;
|
||||
|
||||
const int gqa_ratio = n_head / n_head_kv;
|
||||
const int head_kv_idx = head_idx / gqa_ratio;
|
||||
const int mask_head_idx = mask_void != NULL ? head_idx % mask_ne2 : 0;
|
||||
const int mask_batch_idx = mask_void != NULL ? batch_idx % mask_ne3 : 0;
|
||||
|
||||
const global char* q_base = (const global char*)q_void + q_offset;
|
||||
const global char* k_base = (const global char*)k_void + k_offset;
|
||||
@ -127,41 +75,27 @@ __kernel void flash_attn_f32_f16(
|
||||
|
||||
const global char* mask_base = NULL;
|
||||
if (mask_void != NULL) {
|
||||
const int mask_head_idx = head_idx % mask_ne2;
|
||||
const int mask_batch_idx = batch_idx % mask_ne3;
|
||||
mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
|
||||
}
|
||||
const global char* mask_pad_base = NULL;
|
||||
if (mask_pad_void != NULL) {
|
||||
mask_pad_base = (const global char*)mask_pad_void + mask_batch_idx * mask_pad_nb3 + mask_head_idx * mask_pad_nb2;
|
||||
}
|
||||
const global char* blk_base = NULL;
|
||||
if (blk != NULL) {
|
||||
const int n_q_blocks = (n_q + BLOCK_M - 1) / BLOCK_M;
|
||||
blk_base = blk + (((mask_batch_idx * mask_ne2) + mask_head_idx) * n_q_blocks + block_q_idx) * n_kv_blocks;
|
||||
}
|
||||
|
||||
ACC_TYPE4 q_priv[SPLIT_DK_VEC];
|
||||
const int dk_off = split_idx * SPLIT_DK_VEC;
|
||||
if (query_valid) {
|
||||
ACC_TYPE4 q_priv[DK_VEC];
|
||||
if (my_query_row < n_q) {
|
||||
const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1;
|
||||
const global Q_DATA_TYPE4* q_ptr = (const global Q_DATA_TYPE4*)(q_base + q_row_offset);
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DK_VEC; ++i) {
|
||||
q_priv[i] = CONVERT_Q_ACC4(q_ptr[dk_off + i]);
|
||||
}
|
||||
} else {
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DK_VEC; ++i) {
|
||||
q_priv[i] = (ACC_TYPE4)(0.0f);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DK_VEC; ++i) {
|
||||
q_priv[i] = CONVERT_Q_ACC4(q_ptr[i]);
|
||||
}
|
||||
}
|
||||
|
||||
ACC_TYPE4 o_acc[SPLIT_DV_VEC];
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DV_VEC; ++i) {
|
||||
ACC_TYPE4 o_acc[DV_VEC];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] = (ACC_TYPE4)(0.0f);
|
||||
}
|
||||
|
||||
ACC_TYPE m_i = FA_M_INIT;
|
||||
ACC_TYPE m_i = -INFINITY;
|
||||
ACC_TYPE l_i = 0.0f;
|
||||
|
||||
float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
|
||||
@ -169,369 +103,86 @@ __kernel void flash_attn_f32_f16(
|
||||
__local KV_DATA_TYPE4 l_k[BLOCK_N][DK_VEC];
|
||||
__local KV_DATA_TYPE4 l_v[BLOCK_N][DV_VEC];
|
||||
|
||||
#if N_SPLIT > 1 && !defined(HAS_SUBGROUP_SHUFFLE)
|
||||
__local ACC_TYPE local_partial[BLOCK_N][WG_SIZE];
|
||||
__local ACC_TYPE local_p[BLOCK_M][BLOCK_N];
|
||||
__local ACC_TYPE local_softmax_scale[BLOCK_M];
|
||||
__local ACC_TYPE local_l_inv[BLOCK_M];
|
||||
#endif
|
||||
|
||||
for (int k_start = 0; k_start < n_kv; k_start += BLOCK_N) {
|
||||
char blk_cur = 1;
|
||||
if (blk_base != NULL) {
|
||||
blk_cur = blk_base[k_start / BLOCK_N];
|
||||
if (blk_cur == 0) continue;
|
||||
}
|
||||
|
||||
const int use_kv_pad = k_pad_void != NULL && k_start + BLOCK_N > n_kv;
|
||||
const int k_tile_start = use_kv_pad ? 0 : k_start;
|
||||
const ulong k_tile_nb2 = use_kv_pad ? (ulong) BLOCK_N * k_nb1 : k_nb2;
|
||||
const ulong k_tile_nb3 = use_kv_pad ? (ulong) n_head_kv * k_tile_nb2 : k_nb3;
|
||||
const ulong v_tile_nb2 = use_kv_pad ? (ulong) BLOCK_N * v_nb1 : v_nb2;
|
||||
const ulong v_tile_nb3 = use_kv_pad ? (ulong) n_head_kv * v_tile_nb2 : v_nb3;
|
||||
const global char* k_tile_base = use_kv_pad ? (const global char*) k_pad_void : k_base;
|
||||
const global char* v_tile_base = use_kv_pad ? (const global char*) v_pad_void : v_base;
|
||||
|
||||
for (int i = tid; i < BLOCK_N * DK_VEC; i += WG_SIZE) {
|
||||
const int row = i / DK_VEC;
|
||||
const int col = i % DK_VEC;
|
||||
const int k_row_idx = k_tile_start + row;
|
||||
if (use_kv_pad || k_row_idx < n_kv) {
|
||||
const ulong k_row_offset = batch_idx * k_tile_nb3 + head_kv_idx * k_tile_nb2 + k_row_idx * k_nb1;
|
||||
l_k[row][col] = ((__global KV_DATA_TYPE4*)(k_tile_base + k_row_offset))[col];
|
||||
} else {
|
||||
l_k[row][col] = (KV_DATA_TYPE4)(0.0h);
|
||||
const int k_row_idx = k_start + row;
|
||||
if (k_row_idx < n_kv) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_row_idx * k_nb1;
|
||||
l_k[row][col] = ((__global KV_DATA_TYPE4*)(k_base + k_row_offset))[col];
|
||||
}
|
||||
}
|
||||
for (int i = tid; i < BLOCK_N * DV_VEC; i += WG_SIZE) {
|
||||
const int row = i / DV_VEC;
|
||||
const int col = i % DV_VEC;
|
||||
const int v_row_idx = k_tile_start + row;
|
||||
if (use_kv_pad || v_row_idx < n_kv) {
|
||||
const ulong v_row_offset = batch_idx * v_tile_nb3 + head_kv_idx * v_tile_nb2 + v_row_idx * v_nb1;
|
||||
l_v[row][col] = ((__global KV_DATA_TYPE4*)(v_tile_base + v_row_offset))[col];
|
||||
} else {
|
||||
l_v[row][col] = (KV_DATA_TYPE4)(0.0h);
|
||||
const int v_row_idx = k_start + row;
|
||||
if (v_row_idx < n_kv) {
|
||||
const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + v_row_idx * v_nb1;
|
||||
l_v[row][col] = ((__global KV_DATA_TYPE4*)(v_base + v_row_offset))[col];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#if N_SPLIT > 1 && defined(HAS_SUBGROUP_SHUFFLE)
|
||||
{
|
||||
const int dv_off = split_idx * SPLIT_DV_VEC;
|
||||
for (int j = 0; j < BLOCK_N; j += 2) {
|
||||
const int k_row0 = k_start + j;
|
||||
const int k_row1 = k_start + j + 1;
|
||||
|
||||
ACC_TYPE partial0 = 0.0f;
|
||||
ACC_TYPE partial1 = 0.0f;
|
||||
FA_UNROLL
|
||||
for (int k = 0; k < SPLIT_DK_VEC; k++) {
|
||||
const ACC_TYPE4 qk = q_priv[k];
|
||||
ACC_TYPE4 dot0 = qk * CONVERT_KV_ACC4(l_k[j ][dk_off + k]);
|
||||
ACC_TYPE4 dot1 = qk * CONVERT_KV_ACC4(l_k[j+1][dk_off + k]);
|
||||
partial0 += dot0.s0 + dot0.s1 + dot0.s2 + dot0.s3;
|
||||
partial1 += dot1.s0 + dot1.s1 + dot1.s2 + dot1.s3;
|
||||
}
|
||||
|
||||
FA_UNROLL
|
||||
for (int step = 1; step < N_SPLIT; step <<= 1) {
|
||||
partial0 += sub_group_shuffle_xor(partial0, step);
|
||||
partial1 += sub_group_shuffle_xor(partial1, step);
|
||||
}
|
||||
|
||||
ACC_TYPE score0 = partial0 * scale;
|
||||
ACC_TYPE score1 = partial1 * scale;
|
||||
|
||||
if (!query_valid) { score0 = FA_M_INIT; score1 = FA_M_INIT; }
|
||||
if (is_causal) {
|
||||
if (k_row0 > (n_kv - n_q + my_query_row)) score0 = FA_M_INIT;
|
||||
if (k_row1 > (n_kv - n_q + my_query_row)) score1 = FA_M_INIT;
|
||||
}
|
||||
if (k_row0 >= n_kv) score0 = FA_M_INIT;
|
||||
if (k_row1 >= n_kv) score1 = FA_M_INIT;
|
||||
|
||||
if (query_valid && mask_base != NULL && blk_cur != 2) {
|
||||
if (use_kv_pad && mask_pad_base != NULL) {
|
||||
const global MASK_DATA_TYPE* mask_ptr =
|
||||
(const global MASK_DATA_TYPE*)(mask_pad_base + my_query_row * mask_pad_nb1);
|
||||
score0 += slope * (ACC_TYPE)mask_ptr[j];
|
||||
score1 += slope * (ACC_TYPE)mask_ptr[j + 1];
|
||||
} else {
|
||||
const global MASK_DATA_TYPE* mask_ptr =
|
||||
(const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
|
||||
if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
|
||||
if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
|
||||
}
|
||||
}
|
||||
|
||||
if (logit_softcap > 0.0f) {
|
||||
score0 = logit_softcap * tanh(score0 / logit_softcap);
|
||||
score1 = logit_softcap * tanh(score1 / logit_softcap);
|
||||
}
|
||||
|
||||
const ACC_TYPE m_new = max(m_i, max(score0, score1));
|
||||
// Whole tile masked (m_new == FA_M_INIT): force the exp() args
|
||||
// far negative so the tile contributes 0, not exp(0)=1.
|
||||
const ACC_TYPE m_exp = (m_new == FA_M_INIT) ? 0.0f : m_new;
|
||||
const ACC_TYPE sp = native_exp(m_i - m_exp);
|
||||
const ACC_TYPE p0 = native_exp(score0 - m_exp);
|
||||
const ACC_TYPE p1 = native_exp(score1 - m_exp);
|
||||
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DV_VEC; ++i) {
|
||||
o_acc[i] = o_acc[i] * sp
|
||||
+ p0 * CONVERT_KV_ACC4(l_v[j ][dv_off + i])
|
||||
+ p1 * CONVERT_KV_ACC4(l_v[j+1][dv_off + i]);
|
||||
}
|
||||
l_i = l_i * sp + p0 + p1;
|
||||
m_i = m_new;
|
||||
}
|
||||
if (my_query_row >= n_q) {
|
||||
continue;
|
||||
}
|
||||
#elif N_SPLIT > 1
|
||||
// N_SPLIT>1 fallback (no shuffle): 3-phase local-memory reduction.
|
||||
// Phase 1 — partial dots for all BLOCK_N tokens.
|
||||
for (int j = 0; j < BLOCK_N; ++j) {
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
for (int k = 0; k < SPLIT_DK_VEC; k++) {
|
||||
dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j][dk_off + k]), dot_acc);
|
||||
|
||||
for (int j = 0; j < BLOCK_N; j += 2) {
|
||||
const int k_row0 = k_start + j;
|
||||
const int k_row1 = k_start + j + 1;
|
||||
|
||||
ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
dot_acc0 = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j][k]), dot_acc0);
|
||||
dot_acc1 = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j+1][k]), dot_acc1);
|
||||
}
|
||||
local_partial[j][tid] =
|
||||
dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE); // 1 barrier: partial dots visible
|
||||
ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
|
||||
ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
|
||||
|
||||
// Phase 2 — split_idx==0 reduces partial sums and computes block softmax.
|
||||
if (split_idx == 0) {
|
||||
if (query_valid) {
|
||||
ACC_TYPE m_new = m_i;
|
||||
for (int j = 0; j < BLOCK_N; ++j) {
|
||||
const int k_row = k_start + j;
|
||||
ACC_TYPE score = 0.0f;
|
||||
FA_UNROLL
|
||||
for (int s = 0; s < N_SPLIT; s++) {
|
||||
score += local_partial[j][q_lane * N_SPLIT + s];
|
||||
}
|
||||
score *= scale;
|
||||
|
||||
if (is_causal && k_row > (n_kv - n_q + my_query_row)) score = FA_M_INIT;
|
||||
if (k_row >= n_kv) score = FA_M_INIT;
|
||||
|
||||
if (mask_base != NULL && blk_cur != 2) {
|
||||
if (use_kv_pad && mask_pad_base != NULL) {
|
||||
const global MASK_DATA_TYPE* mask_ptr =
|
||||
(const global MASK_DATA_TYPE*)(mask_pad_base + my_query_row * mask_pad_nb1);
|
||||
score += slope * (ACC_TYPE)mask_ptr[j];
|
||||
} else {
|
||||
const global MASK_DATA_TYPE* mask_ptr =
|
||||
(const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
|
||||
if (k_row < n_kv) score += slope * (ACC_TYPE)mask_ptr[k_row];
|
||||
}
|
||||
}
|
||||
|
||||
if (logit_softcap > 0.0f) {
|
||||
score = logit_softcap * tanh(score / logit_softcap);
|
||||
}
|
||||
|
||||
m_new = max(m_new, score);
|
||||
local_p[q_lane][j] = score;
|
||||
}
|
||||
|
||||
const ACC_TYPE m_exp = (m_new == FA_M_INIT) ? 0.0f : m_new;
|
||||
const ACC_TYPE sp = native_exp(m_i - m_exp);
|
||||
ACC_TYPE l_new = l_i * sp;
|
||||
for (int j = 0; j < BLOCK_N; ++j) {
|
||||
const ACC_TYPE p = native_exp(local_p[q_lane][j] - m_exp);
|
||||
local_p[q_lane][j] = p;
|
||||
l_new += p;
|
||||
}
|
||||
local_softmax_scale[q_lane] = sp;
|
||||
l_i = l_new;
|
||||
m_i = m_new;
|
||||
} else {
|
||||
local_softmax_scale[q_lane] = 1.0f;
|
||||
for (int j = 0; j < BLOCK_N; ++j) local_p[q_lane][j] = 0.0f;
|
||||
if (is_causal) {
|
||||
if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY;
|
||||
if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY;
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Phase 3 — V accumulate using broadcast probabilities.
|
||||
{
|
||||
const ACC_TYPE sp_block = local_softmax_scale[q_lane];
|
||||
const int dv_off = split_idx * SPLIT_DV_VEC;
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DV_VEC; ++i) {
|
||||
o_acc[i] *= sp_block;
|
||||
if (k_row0 >= n_kv) score0 = -INFINITY;
|
||||
if (k_row1 >= n_kv) score1 = -INFINITY;
|
||||
|
||||
if (mask_base != NULL) {
|
||||
const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
|
||||
if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0];
|
||||
if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1];
|
||||
}
|
||||
for (int j = 0; j < BLOCK_N; ++j) {
|
||||
const ACC_TYPE p = local_p[q_lane][j];
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DV_VEC; ++i) {
|
||||
o_acc[i] = mad(p, CONVERT_KV_ACC4(l_v[j][dv_off + i]), o_acc[i]);
|
||||
}
|
||||
|
||||
if (logit_softcap > 0.0f) {
|
||||
score0 = logit_softcap * tanh(score0 / logit_softcap);
|
||||
score1 = logit_softcap * tanh(score1 / logit_softcap);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// N_SPLIT==1: j+=4 unroll. Requires BLOCK_N % 4 == 0.
|
||||
if (query_valid) {
|
||||
for (int j = 0; j < BLOCK_N; j += 4) {
|
||||
const int k_row0 = k_start + j;
|
||||
const int k_row1 = k_start + j + 1;
|
||||
const int k_row2 = k_start + j + 2;
|
||||
const int k_row3 = k_start + j + 3;
|
||||
|
||||
ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc2 = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE4 dot_acc3 = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
const ACC_TYPE4 qk = q_priv[k];
|
||||
dot_acc0 = mad(qk, CONVERT_KV_ACC4(l_k[j][k]), dot_acc0);
|
||||
dot_acc1 = mad(qk, CONVERT_KV_ACC4(l_k[j+1][k]), dot_acc1);
|
||||
dot_acc2 = mad(qk, CONVERT_KV_ACC4(l_k[j+2][k]), dot_acc2);
|
||||
dot_acc3 = mad(qk, CONVERT_KV_ACC4(l_k[j+3][k]), dot_acc3);
|
||||
}
|
||||
ACC_TYPE s0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale;
|
||||
ACC_TYPE s1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale;
|
||||
ACC_TYPE s2 = (dot_acc2.s0 + dot_acc2.s1 + dot_acc2.s2 + dot_acc2.s3) * scale;
|
||||
ACC_TYPE s3 = (dot_acc3.s0 + dot_acc3.s1 + dot_acc3.s2 + dot_acc3.s3) * scale;
|
||||
const ACC_TYPE m_new = max(m_i, max(score0, score1));
|
||||
const ACC_TYPE p0 = exp(score0 - m_new);
|
||||
const ACC_TYPE p1 = exp(score1 - m_new);
|
||||
const ACC_TYPE scale_prev = exp(m_i - m_new);
|
||||
|
||||
if (is_causal) {
|
||||
const int causal_limit = n_kv - n_q + my_query_row;
|
||||
if (k_row0 > causal_limit) s0 = FA_M_INIT;
|
||||
if (k_row1 > causal_limit) s1 = FA_M_INIT;
|
||||
if (k_row2 > causal_limit) s2 = FA_M_INIT;
|
||||
if (k_row3 > causal_limit) s3 = FA_M_INIT;
|
||||
}
|
||||
if (k_row0 >= n_kv) s0 = FA_M_INIT;
|
||||
if (k_row1 >= n_kv) s1 = FA_M_INIT;
|
||||
if (k_row2 >= n_kv) s2 = FA_M_INIT;
|
||||
if (k_row3 >= n_kv) s3 = FA_M_INIT;
|
||||
|
||||
if (mask_base != NULL && blk_cur != 2) {
|
||||
if (use_kv_pad && mask_pad_base != NULL) {
|
||||
const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_pad_base + my_query_row * mask_pad_nb1);
|
||||
s0 += slope * (ACC_TYPE)mask_ptr[j];
|
||||
s1 += slope * (ACC_TYPE)mask_ptr[j + 1];
|
||||
s2 += slope * (ACC_TYPE)mask_ptr[j + 2];
|
||||
s3 += slope * (ACC_TYPE)mask_ptr[j + 3];
|
||||
} else {
|
||||
const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1);
|
||||
if (k_row0 < n_kv) s0 += slope * (ACC_TYPE)mask_ptr[k_row0];
|
||||
if (k_row1 < n_kv) s1 += slope * (ACC_TYPE)mask_ptr[k_row1];
|
||||
if (k_row2 < n_kv) s2 += slope * (ACC_TYPE)mask_ptr[k_row2];
|
||||
if (k_row3 < n_kv) s3 += slope * (ACC_TYPE)mask_ptr[k_row3];
|
||||
}
|
||||
}
|
||||
|
||||
if (logit_softcap > 0.0f) {
|
||||
s0 = logit_softcap * tanh(s0 / logit_softcap);
|
||||
s1 = logit_softcap * tanh(s1 / logit_softcap);
|
||||
s2 = logit_softcap * tanh(s2 / logit_softcap);
|
||||
s3 = logit_softcap * tanh(s3 / logit_softcap);
|
||||
}
|
||||
|
||||
const ACC_TYPE m_new = max(m_i, max(max(s0, s1), max(s2, s3)));
|
||||
// Whole tile masked (m_new == FA_M_INIT): force the exp() args
|
||||
// far negative so the tile contributes 0, not exp(0)=1.
|
||||
const ACC_TYPE m_exp = (m_new == FA_M_INIT) ? 0.0f : m_new;
|
||||
const ACC_TYPE scale_prev = native_exp(m_i - m_exp);
|
||||
const ACC_TYPE p0 = native_exp(s0 - m_exp);
|
||||
const ACC_TYPE p1 = native_exp(s1 - m_exp);
|
||||
const ACC_TYPE p2 = native_exp(s2 - m_exp);
|
||||
const ACC_TYPE p3 = native_exp(s3 - m_exp);
|
||||
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] = mad(p3, CONVERT_KV_ACC4(l_v[j+3][i]),
|
||||
mad(p2, CONVERT_KV_ACC4(l_v[j+2][i]),
|
||||
mad(p1, CONVERT_KV_ACC4(l_v[j+1][i]),
|
||||
mad(p0, CONVERT_KV_ACC4(l_v[j][i]),
|
||||
o_acc[i] * scale_prev))));
|
||||
}
|
||||
l_i = l_i * scale_prev + p0 + p1 + p2 + p3;
|
||||
m_i = m_new;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// End of tile: every thread must finish reading l_k/l_v before the
|
||||
// next iteration's load overwrites them (WAR hazard on local memory).
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// Write output.
|
||||
#if N_SPLIT > 1 && defined(HAS_SUBGROUP_SHUFFLE)
|
||||
if (query_valid) {
|
||||
ACC_TYPE sinks_sp = 1.0f;
|
||||
if (sinks_void != NULL) {
|
||||
const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
const ACC_TYPE m_sink = sinks_ptr[head_idx];
|
||||
const ACC_TYPE m_final = max(m_i, m_sink);
|
||||
sinks_sp = exp(m_i - m_final);
|
||||
l_i = l_i * sinks_sp + exp(m_sink - m_final);
|
||||
m_i = m_final;
|
||||
}
|
||||
const ACC_TYPE l_inv = (l_i > 0.0f) ? (1.0f / l_i) : 0.0f;
|
||||
const int dv_off = split_idx * SPLIT_DV_VEC;
|
||||
const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
|
||||
global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
if (l_inv > 0.0f) {
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DV_VEC; ++i) {
|
||||
o_row[dv_off + i] = CONVERT_O_DATA4(o_acc[i] * sinks_sp * l_inv);
|
||||
}
|
||||
} else {
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DV_VEC; ++i) {
|
||||
o_row[dv_off + i] = (O_DATA_TYPE4)(0.0f);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_KV_ACC4(l_v[j][i]) + p1 * CONVERT_KV_ACC4(l_v[j+1][i]);
|
||||
}
|
||||
l_i = l_i * scale_prev + p0 + p1;
|
||||
m_i = m_new;
|
||||
}
|
||||
}
|
||||
#elif N_SPLIT > 1
|
||||
if (split_idx == 0) {
|
||||
ACC_TYPE sinks_sp = 1.0f;
|
||||
if (query_valid && sinks_void != NULL) {
|
||||
const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
const ACC_TYPE m_sink = sinks_ptr[head_idx];
|
||||
const ACC_TYPE m_final = max(m_i, m_sink);
|
||||
sinks_sp = exp(m_i - m_final);
|
||||
l_i = l_i * sinks_sp + exp(m_sink - m_final);
|
||||
m_i = m_final;
|
||||
}
|
||||
local_softmax_scale[q_lane] = sinks_sp;
|
||||
local_l_inv[q_lane] = (query_valid && l_i > 0.0f) ? (1.0f / l_i) : 0.0f;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (query_valid) {
|
||||
const ACC_TYPE sinks_sp = local_softmax_scale[q_lane];
|
||||
const ACC_TYPE l_inv = local_l_inv[q_lane];
|
||||
const int dv_off = split_idx * SPLIT_DV_VEC;
|
||||
const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1;
|
||||
global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
if (l_inv > 0.0f) {
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DV_VEC; ++i) {
|
||||
o_row[dv_off + i] = CONVERT_O_DATA4(o_acc[i] * sinks_sp * l_inv);
|
||||
}
|
||||
} else {
|
||||
FA_UNROLL
|
||||
for (int i = 0; i < SPLIT_DV_VEC; ++i) {
|
||||
o_row[dv_off + i] = (O_DATA_TYPE4)(0.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (query_valid) {
|
||||
if (my_query_row < n_q) {
|
||||
if (sinks_void != NULL) {
|
||||
const global ACC_TYPE* sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
const ACC_TYPE m_sink = sinks_ptr[head_idx];
|
||||
const ACC_TYPE m_final = max(m_i, m_sink);
|
||||
|
||||
const ACC_TYPE scale_o = exp(m_i - m_final);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] *= scale_o;
|
||||
}
|
||||
@ -543,18 +194,17 @@ __kernel void flash_attn_f32_f16(
|
||||
global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset);
|
||||
if (l_i > 0.0f) {
|
||||
const ACC_TYPE l_inv = 1.0f / l_i;
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_row[i] = CONVERT_O_DATA4(o_acc[i] * l_inv);
|
||||
}
|
||||
} else {
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_row[i] = (O_DATA_TYPE4)(0.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__kernel void flash_attn_f32_f16_q1(
|
||||
@ -608,16 +258,13 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2;
|
||||
}
|
||||
|
||||
// Q is uniform across WG threads (n_q=1). Share via local memory to
|
||||
// avoid per-thread q_priv[DK_VEC] dynamic-indexed private array that
|
||||
// spills to DDR on Adreno.
|
||||
__local ACC_TYPE4 q_shared[DK_VEC];
|
||||
ACC_TYPE4 q_priv[DK_VEC];
|
||||
const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2;
|
||||
const global Q_DATA_TYPE4* q_ptr = (const global Q_DATA_TYPE4*)(q_base + q_row_offset);
|
||||
for (int i = tid; i < DK_VEC; i += Q1_WG_SIZE) {
|
||||
q_shared[i] = CONVERT_Q_ACC4(q_ptr[i]);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DK_VEC; ++i) {
|
||||
q_priv[i] = CONVERT_Q_ACC4(q_ptr[i]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
|
||||
|
||||
@ -626,14 +273,14 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
sinks_ptr = (const global ACC_TYPE*)((const global char*)sinks_void + sinks_offset);
|
||||
}
|
||||
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : FA_M_INIT;
|
||||
ACC_TYPE m_i = (sinks_ptr != NULL) ? sinks_ptr[head_idx] : -INFINITY;
|
||||
for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
|
||||
const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset);
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
dot_acc = mad(q_shared[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
|
||||
dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
|
||||
}
|
||||
ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
|
||||
if (mask_base != NULL) {
|
||||
@ -649,7 +296,7 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
__local ACC_TYPE local_m[Q1_WG_SIZE];
|
||||
local_m[tid] = m_i;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -657,7 +304,7 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
const ACC_TYPE m_final = local_m[0];
|
||||
|
||||
ACC_TYPE4 o_acc[DV_VEC];
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE l_i = 0.0f;
|
||||
|
||||
@ -667,9 +314,9 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset);
|
||||
const global KV_DATA_TYPE4* v_ptr = (const global KV_DATA_TYPE4*)(v_base + v_row_offset);
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; k++) {
|
||||
dot_acc = mad(q_shared[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
|
||||
dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
|
||||
}
|
||||
ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
|
||||
if (mask_base != NULL) {
|
||||
@ -681,7 +328,7 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
}
|
||||
const ACC_TYPE p = exp(score - m_final);
|
||||
l_i += p;
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; i++) {
|
||||
o_acc[i] = mad(p, CONVERT_KV_ACC4(v_ptr[i]), o_acc[i]);
|
||||
}
|
||||
@ -691,7 +338,7 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
__local ACC_TYPE4 local_o_comp[Q1_WG_SIZE];
|
||||
local_l[tid] = l_i;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_l[tid] += local_l[tid + s];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -710,7 +357,7 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
for (int i = 0; i < DV_VEC; i++) {
|
||||
local_o_comp[tid] = o_acc[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_o_comp[tid] += local_o_comp[tid + s];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -720,257 +367,7 @@ __kernel void flash_attn_f32_f16_q1(
|
||||
}
|
||||
}
|
||||
} else if (tid == 0) {
|
||||
FA_UNROLL
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) o_row[i] = (O_DATA_TYPE4)(0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
// Flash-decoding split pass. gid(2) = q_idx * n_splits + split_idx.
|
||||
// Partial record per split: [m, l, O[DV]]. Merge kernel applies sink + norm.
|
||||
#define FA_PARTIAL_FLOATS (2 + DV)
|
||||
|
||||
__kernel void flash_attn_f32_f16_q1_split(
|
||||
const global void * q_void, ulong q_offset,
|
||||
const global void * k_void, ulong k_offset,
|
||||
const global void * v_void, ulong v_offset,
|
||||
const float scale,
|
||||
const int n_q,
|
||||
const int n_kv,
|
||||
const int n_head,
|
||||
const ulong q_nb1, const ulong q_nb2, const ulong q_nb3,
|
||||
const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
|
||||
const ulong v_nb1, const ulong v_nb2, const ulong v_nb3,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const int n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int n_head_kv,
|
||||
const global void * mask_void,
|
||||
const ulong mask_offset,
|
||||
const ulong mask_nb1,
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3,
|
||||
global float * partial_void,
|
||||
const int n_splits,
|
||||
const int kv_per_split
|
||||
) {
|
||||
const int tid = get_local_id(0);
|
||||
const int head_batch_idx = get_global_id(1);
|
||||
const int split_q_idx = get_global_id(2);
|
||||
const int split_idx = split_q_idx % n_splits;
|
||||
const int q_idx = split_q_idx / n_splits;
|
||||
const int batch_idx = head_batch_idx / n_head;
|
||||
const int head_idx = head_batch_idx % n_head;
|
||||
const int gqa_ratio = n_head / n_head_kv;
|
||||
const int head_kv_idx = head_idx / gqa_ratio;
|
||||
|
||||
const int kv_start = split_idx * kv_per_split;
|
||||
const int kv_end = min(kv_start + kv_per_split, n_kv);
|
||||
|
||||
const ulong record_stride = (ulong) FA_PARTIAL_FLOATS;
|
||||
const ulong record_idx = ((((ulong) batch_idx * n_head + head_idx) * n_q + q_idx)
|
||||
* n_splits + split_idx);
|
||||
global float * rec = partial_void + record_idx * record_stride;
|
||||
global float4 * rec_o = (global float4 *) (rec + 2);
|
||||
|
||||
if (kv_start >= kv_end) {
|
||||
// Empty split: leave sentinel partial for merge.
|
||||
if (tid == 0) {
|
||||
rec[0] = FA_M_INIT;
|
||||
rec[1] = 0.0f;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const global char * q_base = (const global char *) q_void + q_offset;
|
||||
const global char * k_base = (const global char *) k_void + k_offset;
|
||||
const global char * v_base = (const global char *) v_void + v_offset;
|
||||
|
||||
const global char * mask_base = NULL;
|
||||
if (mask_void != NULL) {
|
||||
const int mask_head_idx = head_idx % mask_ne2;
|
||||
const int mask_batch_idx = batch_idx % mask_ne3;
|
||||
mask_base = (const global char *) mask_void + mask_offset +
|
||||
mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2 +
|
||||
(ulong) q_idx * mask_nb1;
|
||||
}
|
||||
|
||||
// Share Q via local memory (n_q=1 per split -> uniform across WG).
|
||||
__local ACC_TYPE4 q_shared[DK_VEC];
|
||||
const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + (ulong) q_idx * q_nb1;
|
||||
const global Q_DATA_TYPE4 * q_ptr = (const global Q_DATA_TYPE4 *) (q_base + q_row_offset);
|
||||
for (int i = tid; i < DK_VEC; i += Q1_WG_SIZE) {
|
||||
q_shared[i] = CONVERT_Q_ACC4(q_ptr[i]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1);
|
||||
|
||||
// Pass 1a — split-local max.
|
||||
ACC_TYPE m_i = FA_M_INIT;
|
||||
for (int k_idx = kv_start + tid; k_idx < kv_end; k_idx += Q1_WG_SIZE) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
|
||||
const global KV_DATA_TYPE4 * k_ptr = (const global KV_DATA_TYPE4 *) (k_base + k_row_offset);
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; ++k) {
|
||||
dot_acc = mad(q_shared[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
|
||||
}
|
||||
ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
|
||||
if (mask_base != NULL) {
|
||||
const global MASK_DATA_TYPE * mask_ptr = (const global MASK_DATA_TYPE *) (mask_base);
|
||||
score += slope * (ACC_TYPE) mask_ptr[k_idx];
|
||||
}
|
||||
if (logit_softcap > 0.0f) {
|
||||
score = logit_softcap * tanh(score / logit_softcap);
|
||||
}
|
||||
m_i = max(m_i, score);
|
||||
}
|
||||
|
||||
__local ACC_TYPE local_m[Q1_WG_SIZE];
|
||||
local_m[tid] = m_i;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
const ACC_TYPE m_c = local_m[0];
|
||||
|
||||
// Pass 1b — softmax-weighted V accumulate.
|
||||
ACC_TYPE4 o_acc[DV_VEC];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f);
|
||||
ACC_TYPE l_i = 0.0f;
|
||||
|
||||
for (int k_idx = kv_start + tid; k_idx < kv_end; k_idx += Q1_WG_SIZE) {
|
||||
const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1;
|
||||
const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + k_idx * v_nb1;
|
||||
const global KV_DATA_TYPE4 * k_ptr = (const global KV_DATA_TYPE4 *) (k_base + k_row_offset);
|
||||
const global KV_DATA_TYPE4 * v_ptr = (const global KV_DATA_TYPE4 *) (v_base + v_row_offset);
|
||||
ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < DK_VEC; ++k) {
|
||||
dot_acc = mad(q_shared[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc);
|
||||
}
|
||||
ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale;
|
||||
if (mask_base != NULL) {
|
||||
const global MASK_DATA_TYPE * mask_ptr = (const global MASK_DATA_TYPE *) (mask_base);
|
||||
score += slope * (ACC_TYPE) mask_ptr[k_idx];
|
||||
}
|
||||
if (logit_softcap > 0.0f) {
|
||||
score = logit_softcap * tanh(score / logit_softcap);
|
||||
}
|
||||
const ACC_TYPE p = exp(score - m_c);
|
||||
l_i += p;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
o_acc[i] = mad(p, CONVERT_KV_ACC4(v_ptr[i]), o_acc[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__local ACC_TYPE local_l[Q1_WG_SIZE];
|
||||
__local ACC_TYPE4 local_o[Q1_WG_SIZE];
|
||||
local_l[tid] = l_i;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_l[tid] += local_l[tid + s];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
const ACC_TYPE l_c = local_l[0];
|
||||
|
||||
if (tid == 0) {
|
||||
rec[0] = (float) m_c;
|
||||
rec[1] = (float) l_c;
|
||||
}
|
||||
for (int i = 0; i < DV_VEC; ++i) {
|
||||
local_o[tid] = o_acc[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#pragma unroll
|
||||
for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) local_o[tid] += local_o[tid + s];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (tid == 0) {
|
||||
rec_o[i] = local_o[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FD Pass 2: merge per-split partials into final O. Empty splits drop via exp(-INF)=0.
|
||||
__kernel void flash_attn_f32_merge(
|
||||
const global float * partial_void,
|
||||
global void * o_void,
|
||||
const ulong o_offset,
|
||||
const int n_head,
|
||||
const int n_splits,
|
||||
const ulong o_nb1, const ulong o_nb2, const ulong o_nb3,
|
||||
const global void * sinks_void,
|
||||
const ulong sinks_offset,
|
||||
const int n_q
|
||||
) {
|
||||
const int lane = get_local_id(0); // 0..DV_VEC-1
|
||||
const int head_batch_idx = get_global_id(1);
|
||||
const int q_idx = get_global_id(2);
|
||||
const int batch_idx = head_batch_idx / n_head;
|
||||
const int head_idx = head_batch_idx % n_head;
|
||||
|
||||
const ulong record_stride = (ulong) FA_PARTIAL_FLOATS;
|
||||
const ulong record_idx_0 = (((ulong) batch_idx * n_head + head_idx) * n_q + q_idx) * n_splits;
|
||||
const global float * rec0 = partial_void + record_idx_0 * record_stride;
|
||||
|
||||
__local ACC_TYPE m_final_shared;
|
||||
__local ACC_TYPE l_final_shared;
|
||||
if (lane == 0) {
|
||||
ACC_TYPE m = FA_M_INIT;
|
||||
for (int c = 0; c < n_splits; ++c) {
|
||||
const ACC_TYPE m_c = rec0[c * record_stride + 0];
|
||||
m = max(m, m_c);
|
||||
}
|
||||
ACC_TYPE m_sink = 0.0f;
|
||||
bool has_sink = false;
|
||||
if (sinks_void != NULL) {
|
||||
const global ACC_TYPE * sinks_ptr =
|
||||
(const global ACC_TYPE *) ((const global char *) sinks_void + sinks_offset);
|
||||
m_sink = sinks_ptr[head_idx];
|
||||
has_sink = true;
|
||||
m = max(m, m_sink);
|
||||
}
|
||||
ACC_TYPE l = 0.0f;
|
||||
for (int c = 0; c < n_splits; ++c) {
|
||||
const ACC_TYPE m_c = rec0[c * record_stride + 0];
|
||||
const ACC_TYPE l_c = rec0[c * record_stride + 1];
|
||||
if (m_c > FA_M_INIT) {
|
||||
l += l_c * exp(m_c - m);
|
||||
}
|
||||
}
|
||||
if (has_sink) {
|
||||
l += exp(m_sink - m);
|
||||
}
|
||||
m_final_shared = m;
|
||||
l_final_shared = l;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
const ACC_TYPE m_final = m_final_shared;
|
||||
const ACC_TYPE l_final = l_final_shared;
|
||||
const ACC_TYPE l_inv = (l_final > 0.0f) ? (1.0f / l_final) : 0.0f;
|
||||
|
||||
ACC_TYPE4 o = (ACC_TYPE4)(0.0f);
|
||||
for (int c = 0; c < n_splits; ++c) {
|
||||
const global float * rec_c = rec0 + c * record_stride;
|
||||
const ACC_TYPE m_c = rec_c[0];
|
||||
if (m_c <= FA_M_INIT) continue;
|
||||
const global float4 * rec_oc = (const global float4 *) (rec_c + 2);
|
||||
const ACC_TYPE scale_c = exp(m_c - m_final);
|
||||
o = mad((ACC_TYPE4)(scale_c), rec_oc[lane], o);
|
||||
}
|
||||
o = o * l_inv;
|
||||
|
||||
const ulong o_row_offset = (ulong) batch_idx * o_nb3 + (ulong) q_idx * o_nb2 + (ulong) head_idx * o_nb1;
|
||||
global O_DATA_TYPE4 * o_row = (global O_DATA_TYPE4 *) ((global char *) o_void + o_offset + o_row_offset);
|
||||
o_row[lane] = CONVERT_O_DATA4(o);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,156 +0,0 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
__kernel void flash_attn_kv_pad_f16(
|
||||
const global void * k_void, ulong k_offset,
|
||||
const global void * v_void, ulong v_offset,
|
||||
global void * k_pad_void,
|
||||
global void * v_pad_void,
|
||||
const int n_kv,
|
||||
const int n_head_kv,
|
||||
const int n_batch,
|
||||
const ulong k_nb1, const ulong k_nb2, const ulong k_nb3,
|
||||
const ulong v_nb1, const ulong v_nb2, const ulong v_nb3
|
||||
) {
|
||||
const int row_idx = get_global_id(0);
|
||||
const int head_kv_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
if (row_idx >= BLOCK_N || head_kv_idx >= n_head_kv || batch_idx >= n_batch) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int tail_start = n_kv - (n_kv % BLOCK_N);
|
||||
const int src_row_idx = tail_start + row_idx;
|
||||
|
||||
const global char * k_src = (const global char *) k_void + k_offset;
|
||||
const global char * v_src = (const global char *) v_void + v_offset;
|
||||
global char * k_pad = (global char *) k_pad_void;
|
||||
global char * v_pad = (global char *) v_pad_void;
|
||||
|
||||
const ulong k_dst_offset = ((ulong) batch_idx * (ulong) n_head_kv + (ulong) head_kv_idx) * ((ulong) BLOCK_N * k_nb1) + (ulong) row_idx * k_nb1;
|
||||
const ulong v_dst_offset = ((ulong) batch_idx * (ulong) n_head_kv + (ulong) head_kv_idx) * ((ulong) BLOCK_N * v_nb1) + (ulong) row_idx * v_nb1;
|
||||
|
||||
if (src_row_idx < n_kv) {
|
||||
const ulong k_src_offset = (ulong) batch_idx * k_nb3 + (ulong) head_kv_idx * k_nb2 + (ulong) src_row_idx * k_nb1;
|
||||
const ulong v_src_offset = (ulong) batch_idx * v_nb3 + (ulong) head_kv_idx * v_nb2 + (ulong) src_row_idx * v_nb1;
|
||||
|
||||
for (ulong i = 0; i < k_nb1; ++i) {
|
||||
k_pad[k_dst_offset + i] = k_src[k_src_offset + i];
|
||||
}
|
||||
for (ulong i = 0; i < v_nb1; ++i) {
|
||||
v_pad[v_dst_offset + i] = v_src[v_src_offset + i];
|
||||
}
|
||||
} else {
|
||||
for (ulong i = 0; i < k_nb1; ++i) {
|
||||
k_pad[k_dst_offset + i] = 0;
|
||||
}
|
||||
for (ulong i = 0; i < v_nb1; ++i) {
|
||||
v_pad[v_dst_offset + i] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void flash_attn_mask_pad_f16(
|
||||
const global void * mask_void, ulong mask_offset,
|
||||
global void * mask_pad_void,
|
||||
const int n_q,
|
||||
const int n_kv,
|
||||
const ulong mask_nb1,
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3
|
||||
) {
|
||||
const int col_idx = get_global_id(0);
|
||||
const int q_row = get_global_id(1);
|
||||
const int mask_slice = get_global_id(2);
|
||||
|
||||
if (col_idx >= BLOCK_N || q_row >= n_q || mask_slice >= mask_ne2 * mask_ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int tail_start = n_kv - (n_kv % BLOCK_N);
|
||||
const int src_col_idx = tail_start + col_idx;
|
||||
const int mask_head_idx = mask_slice % mask_ne2;
|
||||
const int mask_batch_idx = mask_slice / mask_ne2;
|
||||
|
||||
const global char * mask_src_base = (const global char *) mask_void + mask_offset +
|
||||
(ulong) mask_batch_idx * mask_nb3 +
|
||||
(ulong) mask_head_idx * mask_nb2 +
|
||||
(ulong) q_row * mask_nb1;
|
||||
const global half * mask_src = (const global half *) mask_src_base;
|
||||
|
||||
global half * mask_pad = (global half *) mask_pad_void;
|
||||
const ulong dst_idx =
|
||||
(((ulong) mask_batch_idx * (ulong) mask_ne2 + (ulong) mask_head_idx) * (ulong) n_q + (ulong) q_row) * (ulong) BLOCK_N +
|
||||
(ulong) col_idx;
|
||||
|
||||
mask_pad[dst_idx] = src_col_idx < n_kv ? mask_src[src_col_idx] : (half) (-INFINITY);
|
||||
}
|
||||
|
||||
// Per-KV-tile mask class. 0=all -inf (skip tile), 1=mixed (apply mask),
|
||||
// 2=all zero, no -inf (skip mask lookup). Causal diagonal tiles are class 1.
|
||||
__kernel void flash_attn_blk_f16(
|
||||
const global void * mask_void, ulong mask_offset,
|
||||
global char * blk,
|
||||
const int n_q,
|
||||
const int n_kv,
|
||||
const ulong mask_nb1,
|
||||
const ulong mask_nb2,
|
||||
const ulong mask_nb3,
|
||||
const int mask_ne2,
|
||||
const int mask_ne3
|
||||
) {
|
||||
const int kv_block_idx = get_global_id(0);
|
||||
const int q_block_idx = get_global_id(1);
|
||||
const int mask_slice = get_global_id(2);
|
||||
|
||||
const int n_q_blocks = (n_q + BLOCK_M - 1) / BLOCK_M;
|
||||
const int n_kv_blocks = (n_kv + BLOCK_N - 1) / BLOCK_N;
|
||||
if (kv_block_idx >= n_kv_blocks || q_block_idx >= n_q_blocks || mask_slice >= mask_ne2 * mask_ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int mask_head_idx = mask_slice % mask_ne2;
|
||||
const int mask_batch_idx = mask_slice / mask_ne2;
|
||||
const int q_start = q_block_idx * BLOCK_M;
|
||||
const int k_start = kv_block_idx * BLOCK_N;
|
||||
const int q_count = min(BLOCK_M, n_q - q_start);
|
||||
const int k_count = min(BLOCK_N, n_kv - k_start);
|
||||
|
||||
const half neg_max_half = (half) (-65504.0f);
|
||||
char has_unmasked = 0;
|
||||
char has_masked = 0;
|
||||
char has_nonzero = 0;
|
||||
|
||||
const global char * mask_base = (const global char *) mask_void + mask_offset +
|
||||
(ulong) mask_batch_idx * mask_nb3 +
|
||||
(ulong) mask_head_idx * mask_nb2;
|
||||
|
||||
for (int qi = 0; qi < q_count; ++qi) {
|
||||
const global half * mask_row = (const global half *) (mask_base + (ulong) (q_start + qi) * mask_nb1) + k_start;
|
||||
for (int ki = 0; ki < k_count; ++ki) {
|
||||
const half v = mask_row[ki];
|
||||
if (v <= neg_max_half) {
|
||||
has_masked = 1;
|
||||
} else {
|
||||
has_unmasked = 1;
|
||||
if (v != (half) 0.0f) {
|
||||
has_nonzero = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (has_masked && has_unmasked) break; // mixed tile — short-circuit.
|
||||
}
|
||||
|
||||
char res;
|
||||
if (has_unmasked == 0) {
|
||||
res = 0;
|
||||
} else if (has_masked || has_nonzero) {
|
||||
res = 1;
|
||||
} else {
|
||||
res = 2;
|
||||
}
|
||||
|
||||
blk[((ulong) mask_slice * (ulong) n_q_blocks + (ulong) q_block_idx) * (ulong) n_kv_blocks + (ulong) kv_block_idx] = res;
|
||||
}
|
||||
@ -158,239 +158,6 @@ kernel void kernel_set_rows_f32_i32(
|
||||
}
|
||||
}
|
||||
|
||||
// f32 -> q8_0 quantize set_rows. Block = half d + char qs[32].
|
||||
#define QK8_0 32
|
||||
|
||||
inline void quantize_q8_0_block(global float * x, global char * qs, global half * d_out) {
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
amax = fmax(amax, fabs(x[j]));
|
||||
}
|
||||
|
||||
float d = amax / 127.0f;
|
||||
float id = (d != 0.0f) ? 127.0f / amax : 0.0f;
|
||||
|
||||
vstore_half(d, 0, d_out);
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
qs[j] = (char)((int)round(x[j] * id));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_q8_0_i64(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
global char * dst_row = (global char *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
||||
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int blk = get_local_id(0); blk < nblk0; blk += get_local_size(0)) {
|
||||
global float * x = src_row + blk * QK8_0;
|
||||
global char * y = dst_row + blk * (2 + QK8_0);
|
||||
|
||||
quantize_q8_0_block(x, y + 2, (global half *)y);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_q8_0_i32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
global char * dst_row = (global char *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
||||
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int blk = get_local_id(0); blk < nblk0; blk += get_local_size(0)) {
|
||||
global float * x = src_row + blk * QK8_0;
|
||||
global char * y = dst_row + blk * (2 + QK8_0);
|
||||
|
||||
quantize_q8_0_block(x, y + 2, (global half *)y);
|
||||
}
|
||||
}
|
||||
|
||||
// SoA q8_0 variants. dst_q: int8[QK8_0] per block; dst_d: fp16 scale per block.
|
||||
// Layout matches kernel_convert_block_q8_0; block index follows dst element order.
|
||||
kernel void kernel_set_rows_q8_0_soa_i64(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst_q,
|
||||
ulong offset_q,
|
||||
global char * dst_d,
|
||||
ulong offset_d,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
int ne1_dst,
|
||||
int ne2_dst,
|
||||
int ne3_dst
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst_q = dst_q + offset_q;
|
||||
dst_d = dst_d + offset_d;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
long row_blk_base = ((long)i03 * ne2_dst * ne1_dst + (long)i02 * ne1_dst + i1) * nblk0;
|
||||
|
||||
global half * d_row = (global half *)(dst_d) + row_blk_base;
|
||||
global char * q_row = (global char *)(dst_q) + row_blk_base * QK8_0;
|
||||
global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int blk = get_local_id(0); blk < nblk0; blk += get_local_size(0)) {
|
||||
global float * x = src_row + blk * QK8_0;
|
||||
global char * q = q_row + blk * QK8_0;
|
||||
|
||||
quantize_q8_0_block(x, q, d_row + blk);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_q8_0_soa_i32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst_q,
|
||||
ulong offset_q,
|
||||
global char * dst_d,
|
||||
ulong offset_d,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
int ne1_dst,
|
||||
int ne2_dst,
|
||||
int ne3_dst
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst_q = dst_q + offset_q;
|
||||
dst_d = dst_d + offset_d;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
long row_blk_base = ((long)i03 * ne2_dst * ne1_dst + (long)i02 * ne1_dst + i1) * nblk0;
|
||||
|
||||
global half * d_row = (global half *)(dst_d) + row_blk_base;
|
||||
global char * q_row = (global char *)(dst_q) + row_blk_base * QK8_0;
|
||||
global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int blk = get_local_id(0); blk < nblk0; blk += get_local_size(0)) {
|
||||
global float * x = src_row + blk * QK8_0;
|
||||
global char * q = q_row + blk * QK8_0;
|
||||
|
||||
quantize_q8_0_block(x, q, d_row + blk);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_f16_i32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
@ -439,270 +206,3 @@ kernel void kernel_set_rows_f16_i32(
|
||||
dst_row[ind] = src_row[ind];
|
||||
}
|
||||
}
|
||||
|
||||
// f32 -> q4_0 quantize set_rows. Block = half d + uchar qs[16] (shuffled
|
||||
// nibbles: qs[j] low/high = elem j / j+16).
|
||||
// Dequant: val[i] = d * (nibble_i - 8)
|
||||
// nblk0 = number of q4_0 blocks per row = ne00 / 32.
|
||||
#define QK4_0 32
|
||||
#define Q4_0_BLOCK_SIZE 18
|
||||
|
||||
inline void quantize_q4_0_block(global float * x, global uchar * qs, global half * d_out) {
|
||||
// Find the signed value with the largest absolute magnitude (matches ggml ref).
|
||||
float max = 0.0f;
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < QK4_0; j++) {
|
||||
float v = x[j];
|
||||
float a = fabs(v);
|
||||
if (a > amax) {
|
||||
amax = a;
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
float d = max / -8.0f;
|
||||
float id = (d != 0.0f) ? 1.0f / d : 0.0f;
|
||||
|
||||
vstore_half(d, 0, d_out);
|
||||
|
||||
for (int j = 0; j < QK4_0/2; j++) {
|
||||
float x0 = x[j] * id;
|
||||
float x1 = x[j + QK4_0/2] * id;
|
||||
|
||||
int i0 = (int)(x0 + 8.5f);
|
||||
int i1 = (int)(x1 + 8.5f);
|
||||
if (i0 < 0) i0 = 0;
|
||||
if (i0 > 15) i0 = 15;
|
||||
if (i1 < 0) i1 = 0;
|
||||
if (i1 > 15) i1 = 15;
|
||||
|
||||
qs[j] = (uchar)i0 | ((uchar)i1 << 4);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_q4_0_i64(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
global char * dst_row = (global char *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
||||
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int blk = get_local_id(0); blk < nblk0; blk += get_local_size(0)) {
|
||||
global float * x = src_row + blk * QK4_0;
|
||||
global char * y = dst_row + blk * Q4_0_BLOCK_SIZE;
|
||||
global half * yd = (global half *)(y);
|
||||
global uchar * yqs = (global uchar *)(y + 2);
|
||||
|
||||
quantize_q4_0_block(x, yqs, yd);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_q4_0_i32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
global char * dst_row = (global char *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
||||
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int blk = get_local_id(0); blk < nblk0; blk += get_local_size(0)) {
|
||||
global float * x = src_row + blk * QK4_0;
|
||||
global char * y = dst_row + blk * Q4_0_BLOCK_SIZE;
|
||||
global half * yd = (global half *)(y);
|
||||
global uchar * yqs = (global uchar *)(y + 2);
|
||||
|
||||
quantize_q4_0_block(x, yqs, yd);
|
||||
}
|
||||
}
|
||||
|
||||
// SoA variants for q4_0 dst. Used when the backend has split block_q4_0 records
|
||||
// into separate quant (dst_q) and scale (dst_d) sub-buffers — same pattern as
|
||||
// the q8_0 SoA variants above.
|
||||
//
|
||||
// Layout (matches kernel_convert_block_q4_0, the "shuffled" variant):
|
||||
// dst_q: contiguous 16 packed nibbles per block, block i at offset i * 16 bytes.
|
||||
// dst_d: contiguous fp16 scales, block i at offset i * 2 bytes.
|
||||
// Nibble layout inside each byte is unchanged from AoS: qs[j] low nibble = element j,
|
||||
// qs[j] high nibble = element j+16. kernel_restore_block_q4_0 copies bytes as-is.
|
||||
kernel void kernel_set_rows_q4_0_soa_i64(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst_q,
|
||||
ulong offset_q,
|
||||
global char * dst_d,
|
||||
ulong offset_d,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
int ne1_dst,
|
||||
int ne2_dst,
|
||||
int ne3_dst
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst_q = dst_q + offset_q;
|
||||
dst_d = dst_d + offset_d;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
long row_blk_base = ((long)i03 * ne2_dst * ne1_dst + (long)i02 * ne1_dst + i1) * nblk0;
|
||||
|
||||
global half * d_row = (global half *)(dst_d) + row_blk_base;
|
||||
global uchar * q_row = (global uchar *)(dst_q) + row_blk_base * (QK4_0/2);
|
||||
global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int blk = get_local_id(0); blk < nblk0; blk += get_local_size(0)) {
|
||||
global float * x = src_row + blk * QK4_0;
|
||||
global uchar * qs = q_row + blk * (QK4_0/2);
|
||||
global half * d_bk = d_row + blk;
|
||||
|
||||
quantize_q4_0_block(x, qs, d_bk);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_q4_0_soa_i32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst_q,
|
||||
ulong offset_q,
|
||||
global char * dst_d,
|
||||
ulong offset_d,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
int ne1_dst,
|
||||
int ne2_dst,
|
||||
int ne3_dst
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst_q = dst_q + offset_q;
|
||||
dst_d = dst_d + offset_d;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
long row_blk_base = ((long)i03 * ne2_dst * ne1_dst + (long)i02 * ne1_dst + i1) * nblk0;
|
||||
|
||||
global half * d_row = (global half *)(dst_d) + row_blk_base;
|
||||
global uchar * q_row = (global uchar *)(dst_q) + row_blk_base * (QK4_0/2);
|
||||
global float * src_row = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int blk = get_local_id(0); blk < nblk0; blk += get_local_size(0)) {
|
||||
global float * x = src_row + blk * QK4_0;
|
||||
global uchar * qs = q_row + blk * (QK4_0/2);
|
||||
global half * d_bk = d_row + blk;
|
||||
|
||||
quantize_q4_0_block(x, qs, d_bk);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1270,14 +1270,77 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode
|
||||
}
|
||||
|
||||
std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
|
||||
static const std::map<ggml_op, std::string> ops = {
|
||||
{GGML_OP_NONE, "GGML_OP_NONE" },
|
||||
{GGML_OP_ACC, "GGML_OP_ACC" },
|
||||
{GGML_OP_ADD, "GGML_OP_ADD" },
|
||||
{GGML_OP_ADD1, "GGML_OP_ADD1" },
|
||||
{GGML_OP_ADD_ID, "GGML_OP_ADD_ID" },
|
||||
{GGML_OP_CONCAT, "GGML_OP_CONCAT" },
|
||||
{GGML_OP_CONT, "GGML_OP_CONT" },
|
||||
{GGML_OP_DIV, "GGML_OP_DIV" },
|
||||
{GGML_OP_DUP, "GGML_OP_DUP" },
|
||||
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" },
|
||||
{GGML_OP_MUL, "GGML_OP_MUL" },
|
||||
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" },
|
||||
{GGML_OP_MUL_MAT_ID, "GGML_OP_MUL_MAT_ID" },
|
||||
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE" },
|
||||
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE" },
|
||||
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" },
|
||||
{GGML_OP_NORM, "GGML_OP_NORM" },
|
||||
{GGML_OP_ROPE, "GGML_OP_ROPE" },
|
||||
{GGML_OP_SCALE, "GGML_OP_SCALE" },
|
||||
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" },
|
||||
{GGML_OP_SUM_ROWS, "GGML_OP_SUM_ROWS" },
|
||||
{GGML_OP_SUB, "GGML_OP_SUB" },
|
||||
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" },
|
||||
{GGML_OP_VIEW, "GGML_OP_VIEW" },
|
||||
{GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" },
|
||||
{GGML_OP_CPY, "GGML_OP_CPY" },
|
||||
{GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT" },
|
||||
{GGML_OP_L2_NORM, "GGML_OP_L2_NORM" },
|
||||
{GGML_OP_CLAMP, "GGML_OP_CLAMP" },
|
||||
{GGML_OP_PAD, "GGML_OP_PAD" },
|
||||
{GGML_OP_SSM_CONV, "GGML_OP_SSM_CONV" },
|
||||
{GGML_OP_GATED_DELTA_NET, "GGML_OP_GATED_DELTA_NET"},
|
||||
{GGML_OP_ARGSORT, "GGML_OP_ARGSORT" },
|
||||
{GGML_OP_REPEAT, "GGML_OP_REPEAT" },
|
||||
{GGML_OP_IM2COL, "GGML_OP_IM2COL" }
|
||||
};
|
||||
static const std::map<ggml_unary_op, std::string> unary_ops = {
|
||||
{GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" },
|
||||
{GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" },
|
||||
{GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" },
|
||||
{GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" },
|
||||
{GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" },
|
||||
{GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" },
|
||||
{GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" },
|
||||
{GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" },
|
||||
{GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" },
|
||||
{GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" },
|
||||
{GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" },
|
||||
{GGML_UNARY_OP_SOFTPLUS, "GGML_UNARY_OP_SOFTPLUS" },
|
||||
{GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" },
|
||||
{GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
|
||||
{GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" },
|
||||
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" }
|
||||
};
|
||||
static const std::map<ggml_glu_op, std::string> glu_ops = {
|
||||
{GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
|
||||
{GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" },
|
||||
{GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" }
|
||||
};
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_UNARY:
|
||||
return std::string("GGML_UNARY_OP_") + ggml_unary_op_name(ggml_get_unary_op(node));
|
||||
return unary_ops.at(ggml_get_unary_op(node));
|
||||
case GGML_OP_GLU:
|
||||
return std::string("GGML_GLU_OP_") + ggml_glu_op_name(ggml_get_glu_op(node));
|
||||
return glu_ops.at(ggml_get_glu_op(node));
|
||||
default:
|
||||
return std::string("GGML_OP_") + ggml_op_name(node->op);
|
||||
return ops.at(node->op);
|
||||
}
|
||||
static const std::string unknown_op = "UNKNOWN_GGML_OP";
|
||||
return unknown_op;
|
||||
}
|
||||
|
||||
const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
|
||||
|
||||
@ -1053,10 +1053,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||
(op->ne[0] == 2 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2)) {
|
||||
return true;
|
||||
}
|
||||
// CPY into a strided view of a larger buffer (recurrent-state snapshots) not supported
|
||||
if (op->view_src && ggml_nbytes(op) != ggml_nbytes(op->view_src)) {
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_MUL_MAT: {
|
||||
|
||||
@ -17,22 +17,6 @@ namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
static ov::Output<ov::Node> reshape_add_id_input_to_2d(const ov::Output<ov::Node> & input,
|
||||
const ov::PartialShape & input_shape,
|
||||
const std::vector<int> & dims) {
|
||||
const auto actual_shape = input.get_partial_shape();
|
||||
if (actual_shape.rank().is_static() && actual_shape.rank().get_length() == 2) {
|
||||
return input;
|
||||
}
|
||||
|
||||
if (input_shape.rank().is_static() && input_shape.rank().get_length() == 2) {
|
||||
return input;
|
||||
}
|
||||
|
||||
auto shape = std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64);
|
||||
return std::make_shared<ov::op::v1::Reshape>(input, get_dimensions(shape, dims), false);
|
||||
}
|
||||
|
||||
OutputVector translate_add_id(const NodeContext & context) {
|
||||
num_inputs_check(context, 3, 3);
|
||||
|
||||
@ -44,9 +28,11 @@ OutputVector translate_add_id(const NodeContext & context) {
|
||||
// input: [1, n_token, n_used, n_embd]
|
||||
// bias: [1, 1, n_expert, n_embd]
|
||||
// ids: [1, 1, n_token, n_used]
|
||||
// Model bias constants may already be stored as [n_expert, n_embd].
|
||||
bias = reshape_add_id_input_to_2d(bias, context.get_input_shape(1), {2, 3});
|
||||
ids = reshape_add_id_input_to_2d(ids, context.get_input_shape(2), {2, 3});
|
||||
auto bias_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(bias, ov::element::i64);
|
||||
auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
|
||||
|
||||
bias = std::make_shared<ov::op::v1::Reshape>(bias, get_dimensions(bias_shape_4d, {2, 3}), false);
|
||||
ids = std::make_shared<ov::op::v1::Reshape>(ids, get_dimensions(ids_shape_4d, {2, 3}), false);
|
||||
|
||||
if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
|
||||
ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
|
||||
|
||||
@ -3,11 +3,8 @@
|
||||
#include "../utils.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/clamp.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/sigmoid.hpp>
|
||||
@ -18,7 +15,7 @@ namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
static std::pair<ov::Output<ov::Node>, ov::Output<ov::Node>> get_glu_inputs(const NodeContext & context) {
|
||||
OutputVector translate_glu_swiglu(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 2);
|
||||
|
||||
ov::Output<ov::Node> src0;
|
||||
@ -55,12 +52,6 @@ static std::pair<ov::Output<ov::Node>, ov::Output<ov::Node>> get_glu_inputs(cons
|
||||
std::swap(src0, src1);
|
||||
}
|
||||
|
||||
return {src0, src1};
|
||||
}
|
||||
|
||||
OutputVector translate_glu_swiglu(const NodeContext & context) {
|
||||
auto [src0, src1] = get_glu_inputs(context);
|
||||
|
||||
auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(src0);
|
||||
auto silu = std::make_shared<ov::op::v1::Multiply>(src0, sigmoid);
|
||||
auto res = std::make_shared<ov::op::v1::Multiply>(silu, src1);
|
||||
@ -68,27 +59,6 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
OutputVector translate_glu_swiglu_oai(const NodeContext & context) {
|
||||
auto [src0, src1] = get_glu_inputs(context);
|
||||
|
||||
const int32_t * params = context.get_output_op_params();
|
||||
const float alpha = reinterpret_cast<const float *>(params)[2];
|
||||
const float limit = reinterpret_cast<const float *>(params)[3];
|
||||
|
||||
auto gate = std::make_shared<ov::op::v0::Clamp>(src0, -std::numeric_limits<float>::infinity(), limit);
|
||||
auto alpha_const = ov::op::v0::Constant::create(ov::element::f32, {}, {alpha});
|
||||
auto scaled_gate = std::make_shared<ov::op::v1::Multiply>(gate, alpha_const);
|
||||
auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(scaled_gate);
|
||||
auto out_glu = std::make_shared<ov::op::v1::Multiply>(gate, sigmoid);
|
||||
|
||||
auto up = std::make_shared<ov::op::v0::Clamp>(src1, -limit, limit);
|
||||
auto one = ov::op::v0::Constant::create(ov::element::f32, {}, {1.0f});
|
||||
auto up_plus_one = std::make_shared<ov::op::v1::Add>(up, one);
|
||||
auto res = std::make_shared<ov::op::v1::Multiply>(out_glu, up_plus_one);
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
|
||||
@ -2,135 +2,23 @@
|
||||
#include "../op_table.h"
|
||||
#include "../utils.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <openvino/op/bitwise_and.hpp>
|
||||
#include <openvino/op/bitwise_right_shift.hpp>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/matmul.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/shape_of.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <vector>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
namespace {
|
||||
|
||||
std::shared_ptr<ov::op::v0::Constant> const_i64(const std::vector<int64_t> & values) {
|
||||
return ov::op::v0::Constant::create(ov::element::i64, ov::Shape{values.size()}, values);
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> slice_axis(const ov::Output<ov::Node> & input, int64_t axis, int64_t begin, int64_t end) {
|
||||
return std::make_shared<ov::op::v8::Slice>(input, const_i64({begin}), const_i64({end}), const_i64({1}),
|
||||
const_i64({axis}));
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> translate_mul_mat_id_mxfp4_packed(const NodeContext & context,
|
||||
ov::Output<ov::Node> expert_weights,
|
||||
ov::Output<ov::Node> activations,
|
||||
ov::Output<ov::Node> ids) {
|
||||
auto packed_shape = expert_weights.get_partial_shape().to_shape();
|
||||
FRONT_END_OP_CONVERSION_CHECK(packed_shape.size() == 5 && packed_shape[4] == 17,
|
||||
"Expected packed MXFP4 expert weights with shape [1, n_expert, m, k_blocks, 17]");
|
||||
|
||||
const int64_t n_expert = static_cast<int64_t>(packed_shape[1]);
|
||||
const int64_t rows = static_cast<int64_t>(packed_shape[2]);
|
||||
const int64_t k_blocks = static_cast<int64_t>(packed_shape[3]);
|
||||
const int64_t qk = 32;
|
||||
const int64_t cols = k_blocks * qk;
|
||||
|
||||
auto packed_shape_4d = const_i64({n_expert, rows, k_blocks, 17});
|
||||
expert_weights = std::make_shared<ov::op::v1::Reshape>(expert_weights, packed_shape_4d, false);
|
||||
|
||||
auto activations_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
|
||||
auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
|
||||
auto activations_shape_3d = get_dimensions(activations_shape_4d, {1, 2, 3});
|
||||
auto ids_shape_2d = get_dimensions(ids_shape_4d, {2, 3});
|
||||
|
||||
activations = std::make_shared<ov::op::v1::Reshape>(activations, activations_shape_3d, false);
|
||||
ids = std::make_shared<ov::op::v1::Reshape>(ids, ids_shape_2d, false);
|
||||
if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
|
||||
ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
|
||||
}
|
||||
|
||||
auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
|
||||
|
||||
static const std::vector<float> f4e2m1_lut = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f,
|
||||
-0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
|
||||
std::vector<float> e8m0_lut(256);
|
||||
for (size_t i = 0; i < e8m0_lut.size(); ++i) {
|
||||
uint32_t bits = static_cast<uint32_t>(i) << 23;
|
||||
memcpy(&e8m0_lut[i], &bits, sizeof(float));
|
||||
}
|
||||
e8m0_lut[0] = std::numeric_limits<float>::min() / 2.0f;
|
||||
e8m0_lut[255] = std::numeric_limits<float>::quiet_NaN();
|
||||
|
||||
auto f4_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{f4e2m1_lut.size()}, f4e2m1_lut);
|
||||
auto scale_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{e8m0_lut.size()}, e8m0_lut);
|
||||
|
||||
auto selected_packed_weights = std::make_shared<ov::op::v8::Gather>(expert_weights, ids, gather_axis);
|
||||
auto scale_byte = slice_axis(selected_packed_weights, 4, 0, 1);
|
||||
auto qs = slice_axis(selected_packed_weights, 4, 1, 17);
|
||||
auto low = std::make_shared<ov::op::v13::BitwiseAnd>(
|
||||
qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {0x0F}), ov::op::AutoBroadcastType::NUMPY);
|
||||
auto high_shift = std::make_shared<ov::op::v15::BitwiseRightShift>(
|
||||
qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {4}), ov::op::AutoBroadcastType::NUMPY);
|
||||
auto nibbles = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{low, high_shift}, 4);
|
||||
auto nibble_indices = std::make_shared<ov::op::v0::Convert>(nibbles, ov::element::i32);
|
||||
auto weights_f32 = std::make_shared<ov::op::v8::Gather>(f4_lut, nibble_indices, gather_axis);
|
||||
|
||||
auto scale_indices = std::make_shared<ov::op::v0::Convert>(scale_byte, ov::element::i32);
|
||||
auto scales_f32 = std::make_shared<ov::op::v8::Gather>(scale_lut, scale_indices, gather_axis);
|
||||
ov::Output<ov::Node> selected_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales_f32,
|
||||
ov::op::AutoBroadcastType::NUMPY);
|
||||
|
||||
auto ids_shape = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
|
||||
auto selected_weights_target_dims = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{get_dimensions(ids_shape, {0, 1}), const_i64({rows, cols})}, 0);
|
||||
selected_weights = std::make_shared<ov::op::v1::Reshape>(selected_weights, selected_weights_target_dims, false);
|
||||
|
||||
auto activations_shape = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
|
||||
ov::Output<ov::Node> acts_target_dims = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{
|
||||
get_dimensions(activations_shape, {0}),
|
||||
get_dimensions(ids_shape, {1}),
|
||||
get_dimensions(activations_shape, {2}),
|
||||
},
|
||||
0);
|
||||
ov::Output<ov::Node> acts_broadcasted =
|
||||
std::make_shared<ov::op::v3::Broadcast>(activations, acts_target_dims, ov::op::BroadcastType::BIDIRECTIONAL);
|
||||
|
||||
auto activations_expanded = std::make_shared<ov::op::v0::Unsqueeze>(acts_broadcasted, const_i64({2}));
|
||||
ov::Output<ov::Node> result =
|
||||
std::make_shared<ov::op::v0::MatMul>(activations_expanded, selected_weights, false, true);
|
||||
|
||||
auto batch_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto row_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {rows});
|
||||
auto result_target_dims = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{batch_dim, get_dimensions(ids_shape, {0, 1}), row_dim}, 0);
|
||||
result = std::make_shared<ov::op::v1::Reshape>(result, result_target_dims, false);
|
||||
|
||||
const auto output_type = context.get_output_type();
|
||||
if (result.get_element_type() != output_type) {
|
||||
result = std::make_shared<ov::op::v0::Convert>(result, output_type);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
OutputVector translate_mul_mat_id(const NodeContext & context) {
|
||||
num_inputs_check(context, 3, 3);
|
||||
|
||||
@ -138,12 +26,6 @@ OutputVector translate_mul_mat_id(const NodeContext & context) {
|
||||
auto activations = process_view_input_new(context, 1);
|
||||
auto ids = process_view_input_new(context, 2);
|
||||
|
||||
if (expert_weights.get_element_type() == ov::element::u8 && expert_weights.get_partial_shape().rank().is_static() &&
|
||||
expert_weights.get_partial_shape().rank().get_length() == 5) {
|
||||
return rename_outputs_with_suffix({translate_mul_mat_id_mxfp4_packed(context, expert_weights, activations, ids)},
|
||||
context.get_name());
|
||||
}
|
||||
|
||||
// OpenVINO sees GGML tensors in reversed dimension order:
|
||||
// weights: [1, n_expert, m, k]
|
||||
// activations: [1, n_tokens, n_used_or_1, k]
|
||||
|
||||
@ -6,16 +6,12 @@
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/frontend/exception.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/shape_of.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/softmax.hpp>
|
||||
#include <vector>
|
||||
|
||||
@ -24,31 +20,12 @@ namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
static bool is_static_one(const ov::Dimension & dim) {
|
||||
return dim.is_static() && dim.get_length() == 1;
|
||||
}
|
||||
|
||||
static bool same_static_dim(const ov::Dimension & lhs, const ov::Dimension & rhs) {
|
||||
return lhs.is_static() && rhs.is_static() && lhs.get_length() == rhs.get_length();
|
||||
}
|
||||
|
||||
static bool is_attention_sinks_input_shape(const ov::PartialShape & candidate, const ov::PartialShape & logits_shape) {
|
||||
if (candidate.rank().is_dynamic() || logits_shape.rank().is_dynamic() || candidate.rank().get_length() != 4 ||
|
||||
logits_shape.rank().get_length() != 4) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return is_static_one(candidate[0]) && is_static_one(candidate[1]) && is_static_one(candidate[2]) &&
|
||||
same_static_dim(candidate[3], logits_shape[1]);
|
||||
}
|
||||
|
||||
// Reimplementation of GGML_OP_SOFT_MAX semantics for OpenVINO backend:
|
||||
// 1) logits = src0 * scale
|
||||
// 2) logits += mask (if provided)
|
||||
// 3) append attention sinks as hidden logits (if provided)
|
||||
// 4) softmax over the last dimension and remove the hidden sink column
|
||||
// 3) softmax over the last dimension
|
||||
OutputVector translate_soft_max(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 3);
|
||||
num_inputs_check(context, 1, 2);
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
@ -56,11 +33,6 @@ OutputVector translate_soft_max(const NodeContext & context) {
|
||||
memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float));
|
||||
|
||||
ov::Output<ov::Node> logits = context.get_input(0);
|
||||
const bool second_input_is_sinks =
|
||||
context.get_input_size() == 2 && is_attention_sinks_input_shape(context.get_input_shape(1), context.get_output_shape());
|
||||
const bool has_mask = context.get_input_size() > 1 && !second_input_is_sinks;
|
||||
const bool has_sinks = second_input_is_sinks || context.get_input_size() > 2;
|
||||
const size_t sinks_input_idx = second_input_is_sinks ? 1 : 2;
|
||||
|
||||
// Apply scale first: logits = src0 * scale
|
||||
if (scale != 1.0f) {
|
||||
@ -69,12 +41,12 @@ OutputVector translate_soft_max(const NodeContext & context) {
|
||||
logits = std::make_shared<ov::op::v1::Multiply>(logits, scale_const);
|
||||
}
|
||||
|
||||
FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && !has_mask),
|
||||
FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && context.get_input_size() < 2),
|
||||
"OpenVINO softmax ALiBi path requires mask input");
|
||||
|
||||
// Optional mask add: logits += mask
|
||||
// For max_bias > 0 (ALiBi), apply per-head slope to mask before adding.
|
||||
if (has_mask) {
|
||||
if (context.get_input_size() > 1) {
|
||||
ov::Output<ov::Node> mask = context.get_input(1);
|
||||
|
||||
// For stateful
|
||||
@ -122,40 +94,8 @@ OutputVector translate_soft_max(const NodeContext & context) {
|
||||
logits = std::make_shared<ov::op::v1::Add>(logits, mask);
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> softmax_input = logits;
|
||||
if (has_sinks) {
|
||||
ov::Output<ov::Node> sinks = context.get_input(sinks_input_idx);
|
||||
if (sinks.get_element_type() != logits.get_element_type()) {
|
||||
sinks = std::make_shared<ov::op::v0::Convert>(sinks, logits.get_element_type());
|
||||
}
|
||||
|
||||
auto sink_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, {1, -1, 1, 1});
|
||||
auto sinks_4d = std::make_shared<ov::op::v1::Reshape>(sinks, sink_shape, false);
|
||||
|
||||
auto logits_shape = std::make_shared<ov::op::v3::ShapeOf>(logits, ov::element::i64);
|
||||
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
|
||||
auto four = ov::op::v0::Constant::create(ov::element::i64, {1}, {4});
|
||||
auto shape_axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
|
||||
auto sink_prefix_shape = std::make_shared<ov::op::v8::Slice>(logits_shape, zero, three, one, shape_axis);
|
||||
auto sink_last_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto sink_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{sink_prefix_shape, sink_last_dim}, 0);
|
||||
auto sink_column = std::make_shared<ov::op::v3::Broadcast>(sinks_4d, sink_broadcast_shape,
|
||||
ov::op::BroadcastType::BIDIRECTIONAL);
|
||||
softmax_input = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{logits, sink_column}, 3);
|
||||
|
||||
auto softmax_with_sink = std::make_shared<ov::op::v8::Softmax>(softmax_input, -1);
|
||||
auto original_last_dim = std::make_shared<ov::op::v8::Slice>(logits_shape, three, four, one, shape_axis);
|
||||
auto res = std::make_shared<ov::op::v8::Slice>(softmax_with_sink, zero, original_last_dim, one, three);
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
// Softmax along last dimension (equivalent to ggml softmax over ne[0]).
|
||||
auto res = std::make_shared<ov::op::v8::Softmax>(softmax_input, -1);
|
||||
auto res = std::make_shared<ov::op::v8::Softmax>(logits, -1);
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
@ -47,7 +47,6 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
|
||||
{"GGML_UNARY_OP_TANH", op::translate_1to1_match_1_input<v0::Tanh> },
|
||||
{"GGML_OP_VIEW", op::translate_view },
|
||||
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
|
||||
{"GGML_GLU_OP_SWIGLU_OAI", op::translate_glu_swiglu_oai },
|
||||
{"GGML_GLU_OP_GEGLU", op::translate_glu_geglu },
|
||||
{"GGML_OP_SET_ROWS", op::translate_set_rows },
|
||||
{"GGML_OP_CPY", op::translate_cpy },
|
||||
|
||||
@ -32,7 +32,6 @@ GGML_OP_CONVERTER(translate_soft_max);
|
||||
GGML_OP_CONVERTER(translate_transpose);
|
||||
GGML_OP_CONVERTER(translate_view);
|
||||
GGML_OP_CONVERTER(translate_glu_swiglu);
|
||||
GGML_OP_CONVERTER(translate_glu_swiglu_oai);
|
||||
GGML_OP_CONVERTER(translate_glu_geglu);
|
||||
GGML_OP_CONVERTER(translate_set_rows);
|
||||
GGML_OP_CONVERTER(translate_cpy);
|
||||
|
||||
@ -2,10 +2,8 @@
|
||||
#include "ggml-sycl/common.hpp"
|
||||
#include "ggml-sycl/presets.hpp"
|
||||
|
||||
static void norm_f32(const float* x, float* dst, const int ncols,
|
||||
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
|
||||
const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
|
||||
const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
|
||||
static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
|
||||
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
|
||||
|
||||
const int nrows = item_ct1.get_group_range(2);
|
||||
const int nchannels = item_ct1.get_group_range(1);
|
||||
@ -18,16 +16,16 @@ static void norm_f32(const float* x, float* dst, const int ncols,
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
|
||||
const auto src_offset = calculate_offset<3>({src_stride_sample, src_stride_channel, src_stride_row}, {sample, channel, row});
|
||||
const auto dst_offset = calculate_offset<3>({dst_stride_sample, dst_stride_channel, dst_stride_row}, {sample, channel, row});
|
||||
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
|
||||
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
|
||||
|
||||
x += src_offset;
|
||||
dst += dst_offset;
|
||||
x += strided_offset;
|
||||
dst += packed_offset;
|
||||
|
||||
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[col * src_stride_col];
|
||||
const float xi = x[col];
|
||||
mean_var.x() += xi;
|
||||
mean_var.y() += xi * xi;
|
||||
}
|
||||
@ -56,7 +54,7 @@ static void norm_f32(const float* x, float* dst, const int ncols,
|
||||
const float inv_std = sycl::rsqrt(var + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[col * dst_stride_col] = (x[col * src_stride_col] - mean) * inv_std;
|
||||
dst[col] = (x[col] - mean) * inv_std;
|
||||
}
|
||||
}
|
||||
|
||||
@ -147,10 +145,8 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_f32(const float* x, float* dst, const int ncols,
|
||||
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
|
||||
const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
|
||||
const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
||||
static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
|
||||
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
||||
|
||||
const int nrows = item_ct1.get_group_range(2);
|
||||
const int nchannels = item_ct1.get_group_range(1);
|
||||
@ -164,17 +160,17 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols,
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
|
||||
const auto src_offset = calculate_offset<3>({src_stride_sample, src_stride_channel, src_stride_row}, {sample, channel, row});
|
||||
const auto dst_offset = calculate_offset<3>({dst_stride_sample, dst_stride_channel, dst_stride_row}, {sample, channel, row});
|
||||
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
|
||||
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
|
||||
|
||||
x += src_offset;
|
||||
dst += dst_offset;
|
||||
x += strided_offset;
|
||||
dst += packed_offset;
|
||||
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[col * src_stride_col];
|
||||
const float xi = x[col];
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
@ -202,15 +198,14 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols,
|
||||
const float scale = sycl::rsqrt(mean + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[col * dst_stride_col] = scale * x[col * src_stride_col];
|
||||
dst[col] = scale * x[col];
|
||||
}
|
||||
}
|
||||
|
||||
template<int warp_size>
|
||||
static void l2_norm_f32(const float * x, float * dst, const int ncols,
|
||||
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel,
|
||||
const int64_t src_stride_sample, const int64_t dst_stride_col, const int64_t dst_stride_row,
|
||||
const int64_t dst_stride_channel, const int64_t dst_stride_sample, const float eps,
|
||||
const int64_t stride_row, const int64_t stride_channel,
|
||||
const int64_t stride_sample, const float eps,
|
||||
const sycl::nd_item<3>& item_ct1, float* s_sum, const int block_size) {
|
||||
const int nrows = item_ct1.get_group_range(2);
|
||||
const int nchannels = item_ct1.get_group_range(1);
|
||||
@ -220,13 +215,13 @@ static void l2_norm_f32(const float * x, float * dst, const int ncols,
|
||||
const int sample = item_ct1.get_group(0);
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
|
||||
x += sample*src_stride_sample + channel*src_stride_channel + row*src_stride_row;
|
||||
dst += sample*dst_stride_sample + channel*dst_stride_channel + row*dst_stride_row;
|
||||
x += sample*stride_sample + channel*stride_channel + row*stride_row;
|
||||
dst += ((sample*nchannels + channel)*nrows + row)*ncols;
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[col * src_stride_col];
|
||||
const float xi = x[col];
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
@ -234,13 +229,12 @@ static void l2_norm_f32(const float * x, float * dst, const int ncols,
|
||||
const float scale = sycl::rsqrt(sycl::fmax(tmp, eps * eps));
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[col * dst_stride_col] = scale * x[col * src_stride_col];
|
||||
dst[col] = scale * x[col];
|
||||
}
|
||||
}
|
||||
|
||||
static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
|
||||
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
|
||||
const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
|
||||
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
|
||||
const float eps, queue_ptr stream, int device) {
|
||||
|
||||
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
|
||||
@ -251,10 +245,7 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols,
|
||||
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
|
||||
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
|
||||
eps, item_ct1, nullptr, WARP_SIZE);
|
||||
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -274,10 +265,7 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols,
|
||||
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
|
||||
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
|
||||
eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -331,9 +319,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
||||
}
|
||||
|
||||
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
|
||||
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
|
||||
const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
|
||||
const float eps, queue_ptr stream, int device) {
|
||||
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
|
||||
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
||||
|
||||
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
|
||||
@ -344,10 +330,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols,
|
||||
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
|
||||
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
|
||||
eps, item_ct1, nullptr, WARP_SIZE);
|
||||
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -367,10 +350,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
|
||||
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols,
|
||||
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
|
||||
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
|
||||
eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -383,14 +363,9 @@ static void l2_norm_f32_sycl(const float * x,
|
||||
const int nrows,
|
||||
const int nchannels,
|
||||
const int nsamples,
|
||||
const int64_t src_stride_col,
|
||||
const int64_t src_stride_row,
|
||||
const int64_t src_stride_channel,
|
||||
const int64_t src_stride_sample,
|
||||
const int64_t dst_stride_col,
|
||||
const int64_t dst_stride_row,
|
||||
const int64_t dst_stride_channel,
|
||||
const int64_t dst_stride_sample,
|
||||
const int64_t stride_row,
|
||||
const int64_t stride_channel,
|
||||
const int64_t stride_sample,
|
||||
const float eps,
|
||||
queue_ptr stream,
|
||||
int device) {
|
||||
@ -404,10 +379,7 @@ static void l2_norm_f32_sycl(const float * x,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(warp_size)]] {
|
||||
l2_norm_f32<warp_size>(x, dst, ncols,
|
||||
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
|
||||
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
|
||||
eps, item_ct1,
|
||||
l2_norm_f32<warp_size>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
||||
nullptr, warp_size);
|
||||
});
|
||||
});
|
||||
@ -426,9 +398,7 @@ static void l2_norm_f32_sycl(const float * x,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[sycl::reqd_sub_group_size(warp_size)]] {
|
||||
l2_norm_f32<warp_size>(x, dst, ncols,
|
||||
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
|
||||
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
|
||||
l2_norm_f32<warp_size>(x, dst, ncols, stride_row, stride_channel, stride_sample,
|
||||
eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
@ -451,20 +421,12 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
const size_t ts0 = ggml_type_size(src0->type);
|
||||
const size_t tdst = ggml_type_size(dst->type);
|
||||
GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
|
||||
GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
|
||||
const int64_t ss0 = nb00 / ts0;
|
||||
const int64_t ss1 = nb01 / ts0;
|
||||
const int64_t ss2 = nb02 / ts0;
|
||||
const int64_t ss3 = nb03 / ts0;
|
||||
const int64_t ds0 = nb0 / tdst;
|
||||
const int64_t ds1 = nb1 / tdst;
|
||||
const int64_t ds2 = nb2 / tdst;
|
||||
const int64_t ds3 = nb3 / tdst;
|
||||
GGML_ASSERT(nb00 == ts0);
|
||||
const int64_t s01 = nb01 / ts0;
|
||||
const int64_t s02 = nb02 / ts0;
|
||||
const int64_t s03 = nb03 / ts0;
|
||||
|
||||
norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03,
|
||||
ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, main_stream, ctx.device);
|
||||
norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
|
||||
}
|
||||
|
||||
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
@ -503,19 +465,11 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
const size_t ts0 = ggml_type_size(src0->type);
|
||||
const size_t tdst = ggml_type_size(dst->type);
|
||||
GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
|
||||
GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
|
||||
const int64_t ss0 = nb00 / ts0;
|
||||
const int64_t ss1 = nb01 / ts0;
|
||||
const int64_t ss2 = nb02 / ts0;
|
||||
const int64_t ss3 = nb03 / ts0;
|
||||
const int64_t ds0 = nb0 / tdst;
|
||||
const int64_t ds1 = nb1 / tdst;
|
||||
const int64_t ds2 = nb2 / tdst;
|
||||
const int64_t ds3 = nb3 / tdst;
|
||||
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03,
|
||||
ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, main_stream, ctx.device);
|
||||
GGML_ASSERT(nb00 == ts0);
|
||||
const int64_t s01 = nb01 / ts0;
|
||||
const int64_t s02 = nb02 / ts0;
|
||||
const int64_t s03 = nb03 / ts0;
|
||||
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
|
||||
}
|
||||
|
||||
void ggml_sycl_op_rms_norm_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
@ -690,21 +644,13 @@ void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
|
||||
const size_t ts0 = ggml_type_size(src0->type);
|
||||
const size_t tdst = ggml_type_size(dst->type);
|
||||
GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
|
||||
GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
|
||||
const int64_t ss0 = nb00 / ts0;
|
||||
const int64_t ss1 = nb01 / ts0;
|
||||
const int64_t ss2 = nb02 / ts0;
|
||||
const int64_t ss3 = nb03 / ts0;
|
||||
const int64_t ds0 = nb0 / tdst;
|
||||
const int64_t ds1 = nb1 / tdst;
|
||||
const int64_t ds2 = nb2 / tdst;
|
||||
const int64_t ds3 = nb3 / tdst;
|
||||
GGML_ASSERT(nb00 == ts0);
|
||||
const int64_t s01 = nb01 / ts0;
|
||||
const int64_t s02 = nb02 / ts0;
|
||||
const int64_t s03 = nb03 / ts0;
|
||||
|
||||
/*support both WARP_SIZE or WARP_32_SIZE in code
|
||||
choose by hardware for better performance
|
||||
*/
|
||||
l2_norm_f32_sycl<WARP_SIZE>(src0_d, dst_d, ne00, ne01, ne02, ne03,
|
||||
ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, stream, ctx.device);
|
||||
l2_norm_f32_sycl<WARP_SIZE>(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream, ctx.device);
|
||||
}
|
||||
|
||||
@ -126,7 +126,7 @@ static void soft_max_f32(const float * x,
|
||||
break;
|
||||
}
|
||||
|
||||
const float val = sycl::native::exp(sycl::max(vals[col] - max_val, -80.0f));
|
||||
const float val = sycl::native::exp(vals[col] - max_val);
|
||||
tmp += val;
|
||||
vals[col] = val;
|
||||
}
|
||||
@ -154,7 +154,7 @@ static void soft_max_f32(const float * x,
|
||||
tmp = warp_reduce_sum<WARP_SIZE>(tmp);
|
||||
}
|
||||
if (sinks) {
|
||||
tmp += sycl::native::exp(sycl::max(sinks[i02] - max_val, -80.0f));
|
||||
tmp += sycl::native::exp(sinks[i02] - max_val);
|
||||
}
|
||||
const float inv_sum = 1.0f / tmp;
|
||||
|
||||
|
||||
@ -308,7 +308,6 @@ enum vk_device_architecture {
|
||||
AMD_RDNA1,
|
||||
AMD_RDNA2,
|
||||
AMD_RDNA3,
|
||||
INTEL_XE1,
|
||||
INTEL_XE2,
|
||||
NVIDIA_PRE_TURING,
|
||||
NVIDIA_TURING,
|
||||
@ -366,26 +365,21 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||
|
||||
bool subgroup_size_control = false;
|
||||
bool integer_dot_product = false;
|
||||
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
|
||||
subgroup_size_control = true;
|
||||
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) {
|
||||
integer_dot_product = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!subgroup_size_control || !integer_dot_product) {
|
||||
if (!subgroup_size_control) {
|
||||
return vk_device_architecture::OTHER;
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props2;
|
||||
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
|
||||
vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
|
||||
|
||||
props2.pNext = &subgroup_size_control_props;
|
||||
subgroup_size_control_props.pNext = &integer_dot_props;
|
||||
device.getProperties2(&props2);
|
||||
|
||||
if (subgroup_size_control_props.minSubgroupSize == 16) {
|
||||
@ -394,9 +388,6 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
||||
// https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
|
||||
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
|
||||
return vk_device_architecture::INTEL_XE2;
|
||||
} else if (subgroup_size_control_props.minSubgroupSize == 8 &&
|
||||
integer_dot_product && integer_dot_props.integerDotProduct4x8BitPackedSignedAccelerated) {
|
||||
return vk_device_architecture::INTEL_XE1;
|
||||
}
|
||||
} else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
|
||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||
@ -3846,7 +3837,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support) {
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
|
||||
// Xe2/Xe3 with coopmat enabled - warptile performance tuning
|
||||
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
@ -4719,7 +4710,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
}
|
||||
uint32_t rm_iq = 2 * rm_kq;
|
||||
|
||||
const bool use_subgroups = device->subgroup_arithmetic;
|
||||
const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN;
|
||||
// Ensure a subgroup size >= 16 is available
|
||||
const bool use_subgroups16 = use_subgroups && subgroup_min_size_16;
|
||||
|
||||
@ -6370,8 +6361,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
break;
|
||||
case VK_VENDOR_ID_INTEL: {
|
||||
// Current Windows driver does not expose BF16 support.
|
||||
// We only want to use l_warptile if coopmat is available
|
||||
const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && device->coopmat_support) : device->coopmat_support;
|
||||
// We only want to use l_warptile if coopmat is available and is Xe2+
|
||||
const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2;
|
||||
const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat;
|
||||
device->mul_mat_l[i] = use_l_warptile;
|
||||
device->mul_mat_id_l[i] = use_l_warptile;
|
||||
device->mul_mat_m[i] = true;
|
||||
@ -17898,9 +17890,9 @@ static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
|
||||
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
|
||||
switch (props.vendorID) {
|
||||
case VK_VENDOR_ID_INTEL:
|
||||
// Only allowing Xe2/Xe3 GPU and integrated Xe GPUs at the moment since older hardware (ex. Arc A770) has performance regressions.
|
||||
return (arch == vk_device_architecture::INTEL_XE2) ||
|
||||
(arch == vk_device_architecture::INTEL_XE1 && props.deviceType == vk::PhysicalDeviceType::eIntegratedGpu && driver_props.driverID == vk::DriverId::eIntelProprietaryWindows);
|
||||
// Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
|
||||
// while some older hardware (ex. Arc A770) has performance regressions
|
||||
return arch == vk_device_architecture::INTEL_XE2;
|
||||
case VK_VENDOR_ID_AMD:
|
||||
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
|
||||
// Workaround for AMD proprietary driver reporting support on all GPUs
|
||||
@ -17948,8 +17940,6 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev)
|
||||
case 0xE20B: // B580
|
||||
case 0xE211: // Pro B60
|
||||
return 20;
|
||||
case 0xB080: // PTL Xe3 LPG 2x6 (12 subslices)
|
||||
return 12;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -158,7 +158,7 @@ const uint32_t Csh_stride = BS_NPQ;
|
||||
#ifdef COOPMAT
|
||||
const uint32_t Csh_len = BS_K * Csh_stride;
|
||||
#else
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 8; // 8 to workaround compiler bug
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1;
|
||||
#endif
|
||||
shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ
|
||||
#endif
|
||||
|
||||
@ -144,7 +144,7 @@ const uint32_t Csh_stride = BS_NPQ;
|
||||
#ifdef COOPMAT
|
||||
const uint32_t Csh_len = BS_K * Csh_stride;
|
||||
#else
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 8; // 8 to workaround compiler bug
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1;
|
||||
#endif
|
||||
shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ
|
||||
#endif
|
||||
|
||||
@ -28,10 +28,13 @@ vec2 cache_b_ds;
|
||||
|
||||
#include "mul_mat_vecq_funcs.glsl"
|
||||
|
||||
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint col, const uint b_qs_idx) {
|
||||
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const uint col = i*BLOCK_SIZE + tid*K_PER_ITER;
|
||||
|
||||
// Preload data_b block
|
||||
const uint b_block_idx = (j*p.batch_stride_b + col) / QUANT_K_Q8_1 + b_offset;
|
||||
const uint b_qs_idx = tid % (32 / K_PER_ITER);
|
||||
const uint b_block_idx_outer = b_block_idx / 4;
|
||||
const uint b_block_idx_inner = b_block_idx % 4;
|
||||
cache_b_ds = vec2(data_b[b_block_idx_outer].ds[b_block_idx_inner]);
|
||||
@ -88,35 +91,35 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
}
|
||||
}
|
||||
|
||||
const uint col_stride = K_PER_ITER * BLOCK_SIZE;
|
||||
uint num_iters = p.ncols / col_stride;
|
||||
if (num_iters * col_stride + K_PER_ITER * tid < p.ncols) {
|
||||
uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
|
||||
if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) {
|
||||
num_iters++;
|
||||
}
|
||||
int unroll_count = 4;
|
||||
uint unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
const uint b_qs_idx = tid % (32 / K_PER_ITER);
|
||||
uint col = tid * K_PER_ITER;
|
||||
while (num_iters >= 4) {
|
||||
uint i = 0;
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < 4; ++k) {
|
||||
iter(temp, first_row, num_rows, col, b_qs_idx);
|
||||
col += col_stride;
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
|
||||
num_iters -= 4;
|
||||
}
|
||||
|
||||
if (num_iters >= 2) {
|
||||
unroll_count = 2;
|
||||
unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
iter(temp, first_row, num_rows, col, b_qs_idx);
|
||||
col += col_stride;
|
||||
iter(temp, first_row, num_rows, col, b_qs_idx);
|
||||
col += col_stride;
|
||||
num_iters -= 2;
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
if (num_iters > 0) {
|
||||
iter(temp, first_row, num_rows, col, b_qs_idx);
|
||||
while (i < num_iters) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
|
||||
@ -42,7 +42,7 @@ float op_leaky_relu(float x) {
|
||||
}
|
||||
|
||||
float op_step(float x) {
|
||||
return x > 0.0f ? 1.0f : 0.0f;
|
||||
return x >= 0.0f ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
float op_tanh(float x) {
|
||||
|
||||
@ -1 +1 @@
|
||||
eced84c86f8b012c752c016f7fe789adea168e1e
|
||||
707321c4cf6d21cb4bc831aa8b687dbf01a521ce
|
||||
|
||||
@ -169,6 +169,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
GGML_ASSERT(d_inner % n_head == 0);
|
||||
GGML_ASSERT(d_inner % d_state == 0);
|
||||
GGML_ASSERT(d_inner % n_group == 0);
|
||||
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
|
||||
@ -39,11 +39,10 @@ void llama_model_mamba2::load_arch_tensors(llama_model_loader &) {
|
||||
const int64_t d_inner = hparams.ssm_d_inner;
|
||||
const int64_t d_state = hparams.ssm_d_state;
|
||||
const int64_t n_group = hparams.ssm_n_group;
|
||||
const int64_t dt_rank = hparams.ssm_dt_rank;
|
||||
|
||||
const int64_t conv_dim = d_inner + 2 * n_group * d_state;
|
||||
const int64_t d_in_proj = d_inner + conv_dim + dt_rank;
|
||||
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
|
||||
|
||||
// only an expansion factor of 2 is supported for now
|
||||
GGML_ASSERT(2 * n_embd == d_inner);
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
@ -69,11 +68,11 @@ void llama_model_mamba2::load_arch_tensors(llama_model_loader &) {
|
||||
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
|
||||
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
|
||||
|
||||
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {dt_rank}, 0);
|
||||
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
|
||||
|
||||
// no "weight" suffix for these
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, dt_rank}, 0);
|
||||
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, dt_rank}, 0);
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
|
||||
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
|
||||
|
||||
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
|
||||
|
||||
|
||||
@ -302,9 +302,9 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
||||
llama_build_and_test(test-alloc.cpp)
|
||||
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
|
||||
llama_build(test-export-graph-ops.cpp)
|
||||
target_include_directories(test-export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
llama_build(export-graph-ops.cpp)
|
||||
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
if (TARGET gguf-model-data)
|
||||
target_link_libraries(test-export-graph-ops PRIVATE gguf-model-data)
|
||||
target_compile_definitions(test-export-graph-ops PRIVATE LLAMA_HF_FETCH)
|
||||
target_link_libraries(export-graph-ops PRIVATE gguf-model-data)
|
||||
target_compile_definitions(export-graph-ops PRIVATE LLAMA_HF_FETCH)
|
||||
endif()
|
||||
|
||||
@ -185,7 +185,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
LOG_ERR("test-export-graph-ops compiled without HF fetch support\n");
|
||||
LOG_ERR("export-graph-ops compiled without HF fetch support\n");
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
@ -2890,17 +2890,12 @@ struct test_cpy : public test_case {
|
||||
const std::array<int64_t, 4> ne_dst;
|
||||
const std::array<int64_t, 4> permute_src;
|
||||
const std::array<int64_t, 4> permute_dst;
|
||||
const std::array<int64_t, 4> dst_alloc; // if set, dst is a view into a larger buffer (strided)
|
||||
bool _src_use_permute;
|
||||
bool _dst_use_permute;
|
||||
bool _src_transpose;
|
||||
bool _use_dst_shape;
|
||||
bool _use_dst_alloc;
|
||||
|
||||
std::string vars() override {
|
||||
if (_use_dst_alloc) {
|
||||
return VARS_TO_STR8(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose, dst_alloc);
|
||||
}
|
||||
if (_use_dst_shape) {
|
||||
return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose);
|
||||
}
|
||||
@ -2948,15 +2943,12 @@ struct test_cpy : public test_case {
|
||||
std::array<int64_t, 4> ne_dst = {-1, -1, -1, -1},
|
||||
std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
|
||||
std::array<int64_t, 4> permute_dst = {0, 0, 0, 0},
|
||||
bool transpose_src = false,
|
||||
std::array<int64_t, 4> dst_alloc = {0, 0, 0, 0})
|
||||
bool transpose_src = false)
|
||||
: type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst),
|
||||
dst_alloc(dst_alloc),
|
||||
_src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
|
||||
_dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0),
|
||||
_src_transpose(transpose_src),
|
||||
_use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0),
|
||||
_use_dst_alloc(dst_alloc[0] > 0){}
|
||||
_use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data());
|
||||
@ -2974,23 +2966,12 @@ struct test_cpy : public test_case {
|
||||
}
|
||||
|
||||
std::array<int64_t, 4> dst_ne = _use_dst_shape ? ne_dst : std::array<int64_t, 4>{src->ne[0], src->ne[1], src->ne[2], src->ne[3]};
|
||||
ggml_tensor * dst;
|
||||
ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
|
||||
ggml_set_name(dst, "dst");
|
||||
|
||||
if (_use_dst_alloc) {
|
||||
// view a sub-block of a larger buffer -> strided dst
|
||||
ggml_tensor * dst_buf = ggml_new_tensor(ctx, type_dst, 4, dst_alloc.data());
|
||||
ggml_set_name(dst_buf, "dst_buf");
|
||||
dst = ggml_view_4d(ctx, dst_buf, dst_ne[0], dst_ne[1], dst_ne[2], dst_ne[3],
|
||||
dst_buf->nb[1], dst_buf->nb[2], dst_buf->nb[3], 0);
|
||||
ggml_set_name(dst, "dst_view");
|
||||
} else {
|
||||
dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
|
||||
ggml_set_name(dst, "dst");
|
||||
|
||||
if (_dst_use_permute) {
|
||||
dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
|
||||
ggml_set_name(dst, "dst_permuted");
|
||||
}
|
||||
if (_dst_use_permute) {
|
||||
dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
|
||||
ggml_set_name(dst, "dst_permuted");
|
||||
}
|
||||
|
||||
ggml_tensor * out = ggml_cpy(ctx, src, dst);
|
||||
@ -7992,9 +7973,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_conv_2d({ 256, 256, 192, 1 }, { 3, 3, 192, 96 }, kernel_type, 1, 1, 1, 1, 1, 1, false));
|
||||
}
|
||||
|
||||
// sycl backend will limit task global_range < MAX_INT
|
||||
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
||||
@ -8200,8 +8178,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2097121, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 524281, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst
|
||||
|
||||
// CPY - different src/dst shapes (reshaping via CPY)
|
||||
// Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.
|
||||
@ -8696,12 +8672,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
256, 16, 16, {ne2, 1}, {1, 1}));
|
||||
}
|
||||
|
||||
// nr2 sweep to cover the cublasSgemmBatched pointer-array path (dps2 > 1)
|
||||
for (int64_t nr2 : {8, 16, 32}) {
|
||||
test_cases.emplace_back(new test_out_prod(GGML_TYPE_F32, GGML_TYPE_F32,
|
||||
256, 16, 16, {1, 1}, {nr2, 1}));
|
||||
}
|
||||
|
||||
// add_id
|
||||
for (ggml_type type_a : {GGML_TYPE_F32}) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||
@ -9964,7 +9934,7 @@ static void usage(char ** argv) {
|
||||
printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
|
||||
printf(" --list-ops lists all available GGML operations\n");
|
||||
printf(" --show-coverage shows test coverage\n");
|
||||
printf(" --test-file reads test operators from a test file generated by test-export-graph-ops\n");
|
||||
printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
|
||||
printf(" -j <n> runs tests using <n> parallel worker threads (default: 1, test mode only)\n");
|
||||
}
|
||||
|
||||
|
||||
@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
|
||||
output_path = args[i + 1];
|
||||
i++;
|
||||
} else if (args[i] == "--no-common") {
|
||||
use_common = false;
|
||||
use_common = true;
|
||||
} else if (tmpl_path.empty()) {
|
||||
tmpl_path = args[i];
|
||||
} else {
|
||||
|
||||
@ -102,34 +102,21 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr
|
||||
return fabsf(result - dot_ref) / test_size;
|
||||
}
|
||||
|
||||
static int test_vec_dot_f32(bool verbose) {
|
||||
const auto * f32 = ggml_get_type_traits_cpu(GGML_TYPE_F32);
|
||||
int num_failed = 0;
|
||||
for (int n : {1, 2, 3, 5, 7, 8, 15, 16, 17, 31, 33, 63, 67, 127, 129, 193, 255, 1023}) {
|
||||
std::vector<float> a(n);
|
||||
std::vector<float> b(n);
|
||||
generate_data(0.0, n, a.data());
|
||||
generate_data(1.0, n, b.data());
|
||||
int main(int argc, char * argv[]) {
|
||||
bool verbose = false;
|
||||
const size_t test_size = 32 * 128;
|
||||
|
||||
float result = 0.0f;
|
||||
f32->vec_dot(n, &result, 0, a.data(), 0, b.data(), 0, 1);
|
||||
const float ref = dot_product(a.data(), b.data(), n);
|
||||
const float error = fabsf(result - ref) / n;
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
const bool failed = !(error < MAX_QUANTIZATION_REFERENCE_ERROR);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
printf(" f32 vec_dot n=%4d: %s (ref=%f got=%f err=%f)\n",
|
||||
n, RESULT_STR[failed], ref, result, error);
|
||||
if (arg == "-v") {
|
||||
verbose = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return num_failed;
|
||||
}
|
||||
|
||||
static int test_vec_dot_q(bool verbose) {
|
||||
int num_failed = 0;
|
||||
|
||||
const size_t test_size = 32 * 128;
|
||||
|
||||
std::vector<float> test_data(test_size);
|
||||
std::vector<float> test_data2(test_size);
|
||||
@ -137,6 +124,11 @@ static int test_vec_dot_q(bool verbose) {
|
||||
generate_data(0.0, test_data.size(), test_data.data());
|
||||
generate_data(1.0, test_data2.size(), test_data2.data());
|
||||
|
||||
ggml_cpu_init();
|
||||
|
||||
int num_failed = 0;
|
||||
bool failed = false;
|
||||
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
@ -164,7 +156,7 @@ static int test_vec_dot_q(bool verbose) {
|
||||
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
|
||||
type == GGML_TYPE_NVFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
bool failed = !(total_error < max_quantization_error);
|
||||
failed = !(total_error < max_quantization_error);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
|
||||
@ -179,15 +171,15 @@ static int test_vec_dot_q(bool verbose) {
|
||||
|
||||
const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
|
||||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
||||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
: type == GGML_TYPE_Q1_0
|
||||
? MAX_DOT_PRODUCT_ERROR_BINARY
|
||||
: type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
|
||||
? MAX_DOT_PRODUCT_ERROR_TERNARY
|
||||
: type == GGML_TYPE_NVFP4
|
||||
? MAX_DOT_PRODUCT_ERROR_FP4
|
||||
: MAX_DOT_PRODUCT_ERROR;
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
||||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
: type == GGML_TYPE_Q1_0
|
||||
? MAX_DOT_PRODUCT_ERROR_BINARY
|
||||
: type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
|
||||
? MAX_DOT_PRODUCT_ERROR_TERNARY
|
||||
: type == GGML_TYPE_NVFP4
|
||||
? MAX_DOT_PRODUCT_ERROR_FP4
|
||||
: MAX_DOT_PRODUCT_ERROR;
|
||||
failed = !(vec_dot_error < max_allowed_error);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
@ -196,31 +188,6 @@ static int test_vec_dot_q(bool verbose) {
|
||||
}
|
||||
}
|
||||
|
||||
return num_failed;
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
bool verbose = false;
|
||||
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-v") {
|
||||
verbose = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_cpu_init();
|
||||
|
||||
int num_failed = 0;
|
||||
|
||||
num_failed += test_vec_dot_f32(verbose);
|
||||
num_failed += test_vec_dot_q(verbose);
|
||||
|
||||
if (num_failed || verbose) {
|
||||
printf("%d tests failed\n", num_failed);
|
||||
}
|
||||
|
||||
@ -55,7 +55,8 @@ struct clip_hparams {
|
||||
int32_t n_head = 0;
|
||||
int32_t n_head_kv = 0;
|
||||
int32_t n_layer = 0;
|
||||
int32_t n_merge = 1; // number of patch merges **per-side**
|
||||
// idefics3
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
|
||||
// for preprocessor
|
||||
int32_t image_longest_edge = 0;
|
||||
@ -134,7 +135,8 @@ struct clip_hparams {
|
||||
int32_t custom_image_max_tokens = -1;
|
||||
|
||||
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
|
||||
const int patch_area = patch_size * patch_size * n_merge * n_merge;
|
||||
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
||||
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
|
||||
image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
|
||||
image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
|
||||
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
|
||||
@ -143,7 +145,8 @@ struct clip_hparams {
|
||||
void set_warmup_n_tokens(int n_tokens) {
|
||||
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
|
||||
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
|
||||
warmup_image_size = n_tok_per_side * patch_size * n_merge;
|
||||
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
||||
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
|
||||
// TODO: support warmup size for custom token numbers
|
||||
}
|
||||
// sam vit deepseek-ocr
|
||||
|
||||
@ -1210,9 +1210,6 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<int> pinpoints;
|
||||
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
|
||||
if (pinpoints.size() % 2 != 0) {
|
||||
throw std::runtime_error(string_format("%s: image_grid_pinpoints must have an even number of elements, got %zu\n", __func__, pinpoints.size()));
|
||||
}
|
||||
if (!pinpoints.empty()) {
|
||||
for (size_t i = 0; i < pinpoints.size(); i += 2) {
|
||||
hparams.image_res_candidates.push_back({
|
||||
@ -1255,16 +1252,15 @@ struct clip_model_loader {
|
||||
}
|
||||
|
||||
if (is_vision) {
|
||||
std::vector<float> image_mean;
|
||||
std::vector<float> image_std;
|
||||
get_arr_f32(KEY_IMAGE_MEAN, image_mean, false);
|
||||
get_arr_f32(KEY_IMAGE_STD , image_std, false);
|
||||
if (image_mean.size() < 3 || image_std.size() < 3) {
|
||||
throw std::runtime_error(string_format("%s: image_mean/image_std arrays must have at least 3 elements, got %zu and %zu\n", __func__, image_mean.size(), image_std.size()));
|
||||
}
|
||||
int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
|
||||
int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
|
||||
GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
|
||||
GGML_ASSERT(idx_std >= 0 && "image_std not found");
|
||||
const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
|
||||
const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
hparams.image_mean[i] = image_mean[i];
|
||||
hparams.image_std[i] = image_std[i];
|
||||
hparams.image_mean[i] = mean_data[i];
|
||||
hparams.image_std[i] = std_data[i];
|
||||
}
|
||||
}
|
||||
|
||||
@ -1690,8 +1686,8 @@ struct clip_model_loader {
|
||||
if (hparams.image_size > 65536) {
|
||||
throw std::runtime_error(string_format("%s: image_size (%d) is too large (max 65536)\n", __func__, hparams.image_size));
|
||||
}
|
||||
if (hparams.patch_size <= 0 || hparams.patch_size >= 65536) {
|
||||
throw std::runtime_error(string_format("%s: patch_size (%d) must be positive and less than 65536\n", __func__, hparams.patch_size));
|
||||
if (hparams.patch_size <= 0) {
|
||||
throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
|
||||
}
|
||||
if (hparams.n_embd <= 0) {
|
||||
throw std::runtime_error(string_format("%s: n_embd (%d) must be greater than 0\n", __func__, hparams.n_embd));
|
||||
@ -1699,9 +1695,6 @@ struct clip_model_loader {
|
||||
if (hparams.image_max_pixels < hparams.image_min_pixels) {
|
||||
throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
|
||||
}
|
||||
if (hparams.n_merge < 0 || hparams.n_merge >= 65536) {
|
||||
throw std::runtime_error(string_format("%s: n_merge (%d) must be greater than 0 and less than 65536\n", __func__, hparams.n_merge));
|
||||
}
|
||||
}
|
||||
|
||||
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
|
||||
@ -3074,29 +3067,6 @@ struct clip_model_loader {
|
||||
output = gguf_get_val_f32(ctx_gguf.get(), i);
|
||||
}
|
||||
|
||||
void get_arr_f32(const std::string & key, std::vector<float> & output, bool required = true) const {
|
||||
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
|
||||
if (i < 0) {
|
||||
if (required) {
|
||||
throw std::runtime_error("Key not found: " + key);
|
||||
}
|
||||
return;
|
||||
}
|
||||
const auto type = gguf_get_arr_type(ctx_gguf.get(), i);
|
||||
if (type != GGUF_TYPE_FLOAT32) {
|
||||
throw std::runtime_error(string_format("%s: array '%s' has type %d, expected %d (GGUF_TYPE_FLOAT32)\n", __func__, key.c_str(), type, GGUF_TYPE_FLOAT32));
|
||||
}
|
||||
const size_t n = gguf_get_arr_n(ctx_gguf.get(), i);
|
||||
if (n > (size_t) std::numeric_limits<int>::max()) {
|
||||
throw std::runtime_error(string_format("%s: array '%s' is too large (%zu elements)\n", __func__, key.c_str(), n));
|
||||
}
|
||||
output.resize(n);
|
||||
const float * values = (const float *)gguf_get_arr_data(ctx_gguf.get(), i);
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
output[j] = values[j];
|
||||
}
|
||||
}
|
||||
|
||||
void get_string(const std::string & key, std::string & output, bool required = true) const {
|
||||
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
|
||||
if (i < 0) {
|
||||
@ -3116,18 +3086,11 @@ struct clip_model_loader {
|
||||
}
|
||||
return;
|
||||
}
|
||||
const auto type = gguf_get_arr_type(ctx_gguf.get(), i);
|
||||
if (type != GGUF_TYPE_INT32) {
|
||||
throw std::runtime_error(string_format("%s: array '%s' has type %d, expected %d (GGUF_TYPE_INT32)\n", __func__, key.c_str(), type, GGUF_TYPE_INT32));
|
||||
}
|
||||
const size_t n = gguf_get_arr_n(ctx_gguf.get(), i);
|
||||
if (n > (size_t) std::numeric_limits<int>::max()) {
|
||||
throw std::runtime_error(string_format("%s: array '%s' is too large (%zu elements)\n", __func__, key.c_str(), n));
|
||||
}
|
||||
int n = gguf_get_arr_n(ctx_gguf.get(), i);
|
||||
output.resize(n);
|
||||
const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i);
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
output[j] = values[j];
|
||||
for (int i = 0; i < n; ++i) {
|
||||
output[i] = values[i];
|
||||
}
|
||||
}
|
||||
|
||||
@ -3401,8 +3364,8 @@ int clip_n_output_tokens(const clip_ctx * ctx, const clip_image_f32 * img) {
|
||||
{
|
||||
// dynamic size
|
||||
int n_merge = ctx->model.hparams.n_merge;
|
||||
int n_patches_x = img->nx() / patch_size / n_merge;
|
||||
int n_patches_y = img->ny() / patch_size / n_merge;
|
||||
int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
if (ctx->model.token_embd_img_break) {
|
||||
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
||||
} else {
|
||||
|
||||
@ -63,8 +63,8 @@ ggml_cgraph * clip_graph_pixtral::build() {
|
||||
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
|
||||
// after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
|
||||
|
||||
const int p_y = n_patches_y / n_merge;
|
||||
const int p_x = n_patches_x / n_merge;
|
||||
const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
|
||||
const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
|
||||
const int p_total = p_x * p_y;
|
||||
const int n_embd_text = cur->ne[0];
|
||||
const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
|
||||
|
||||
@ -628,7 +628,7 @@ mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_
|
||||
mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_llava_uhd::get_slice_instructions(const clip_image_size & original_size) {
|
||||
mtmd_image_preprocessor_llava_uhd::slice_instructions res;
|
||||
// align slices by patch_size * n_merge so an integer number of merger output tokens fits per slice
|
||||
const int n_merge = hparams.n_merge;
|
||||
const int n_merge = hparams.n_merge > 0 ? hparams.n_merge : 1;
|
||||
const int patch_size = hparams.patch_size * n_merge;
|
||||
const int slice_size = hparams.image_size;
|
||||
const int original_width = original_size.width;
|
||||
@ -894,7 +894,7 @@ mtmd_image_preproc_out mtmd_image_preprocessor_dyn_size::preprocess(const clip_i
|
||||
clip_image_u8 resized_image;
|
||||
const clip_image_size original_size = img.get_size();
|
||||
// the original pixtral model doesn't have n_merge
|
||||
const int cur_merge = hparams.n_merge;
|
||||
const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
|
||||
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
|
||||
original_size,
|
||||
hparams.patch_size * cur_merge,
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
set(TARGET ggml-rpc-server)
|
||||
set(TARGET rpc-server)
|
||||
add_executable(${TARGET} rpc-server.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE ggml)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@ -4,8 +4,8 @@
|
||||
> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
|
||||
> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
|
||||
|
||||
The `ggml-rpc-server` allows exposing `ggml` devices on a remote host.
|
||||
The RPC backend communicates with one or several instances of `ggml-rpc-server` and offloads computations to them.
|
||||
The `rpc-server` allows exposing `ggml` devices on a remote host.
|
||||
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
|
||||
This can be used for distributed LLM inference with `llama.cpp` in the following way:
|
||||
|
||||
```mermaid
|
||||
@ -14,15 +14,15 @@ flowchart TD
|
||||
rpcb<-->|TCP|srvb
|
||||
rpcb<-.->|TCP|srvn
|
||||
subgraph hostn[Host N]
|
||||
srvn[ggml-rpc-server]<-.->dev4["CUDA0"]
|
||||
srvn[ggml-rpc-server]<-.->dev5["CPU"]
|
||||
srvn[rpc-server]<-.->dev4["CUDA0"]
|
||||
srvn[rpc-server]<-.->dev5["CPU"]
|
||||
end
|
||||
subgraph hostb[Host B]
|
||||
srvb[ggml-rpc-server]<-->dev3["Metal"]
|
||||
srvb[rpc-server]<-->dev3["Metal"]
|
||||
end
|
||||
subgraph hosta[Host A]
|
||||
srva[ggml-rpc-server]<-->dev["CUDA0"]
|
||||
srva[ggml-rpc-server]<-->dev2["CUDA1"]
|
||||
srva[rpc-server]<-->dev["CUDA0"]
|
||||
srva[rpc-server]<-->dev2["CUDA1"]
|
||||
end
|
||||
subgraph host[Main Host]
|
||||
local["Local devices"]<-->ggml[llama-cli]
|
||||
@ -33,7 +33,7 @@ flowchart TD
|
||||
class local,dev,dev2,dev3,dev4,dev5 devcls
|
||||
```
|
||||
|
||||
By default, `ggml-rpc-server` exposes all available accelerator devices on the host.
|
||||
By default, `rpc-server` exposes all available accelerator devices on the host.
|
||||
If there are no accelerators, it exposes a single `CPU` device.
|
||||
|
||||
## Usage
|
||||
@ -41,7 +41,7 @@ If there are no accelerators, it exposes a single `CPU` device.
|
||||
### Remote hosts
|
||||
|
||||
On each remote host, build the backends for each accelerator by adding `-DGGML_RPC=ON` to the build options.
|
||||
For example, to build the `ggml-rpc-server` with support for CUDA accelerators:
|
||||
For example, to build the `rpc-server` with support for CUDA accelerators:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc-cuda
|
||||
@ -50,10 +50,10 @@ cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
When started, the `ggml-rpc-server` will detect and expose all available `CUDA` devices:
|
||||
When started, the `rpc-server` will detect and expose all available `CUDA` devices:
|
||||
|
||||
```bash
|
||||
$ bin/ggml-rpc-server
|
||||
$ bin/rpc-server
|
||||
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
||||
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
||||
ggml_cuda_init: found 1 CUDA devices:
|
||||
@ -67,14 +67,14 @@ Devices:
|
||||
|
||||
You can control the set of exposed CUDA devices with the `CUDA_VISIBLE_DEVICES` environment variable or the `--device` command line option. The following two commands have the same effect:
|
||||
```bash
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/ggml-rpc-server -p 50052
|
||||
$ bin/ggml-rpc-server --device CUDA0 -p 50052
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
|
||||
$ bin/rpc-server --device CUDA0 -p 50052
|
||||
```
|
||||
|
||||
### Main host
|
||||
|
||||
On the main host build `llama.cpp` with the backends for the local devices and add `-DGGML_RPC=ON` to the build options.
|
||||
Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `ggml-rpc-server`:
|
||||
Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||
|
||||
```bash
|
||||
$ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF -ngl 99 --rpc 192.168.88.10:50052,192.168.88.11:50052
|
||||
@ -90,7 +90,7 @@ This can speed up model loading significantly, especially when using large model
|
||||
To enable the cache, use the `-c` option:
|
||||
|
||||
```bash
|
||||
$ bin/ggml-rpc-server -c
|
||||
$ bin/rpc-server -c
|
||||
```
|
||||
|
||||
By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.
|
||||
@ -103,8 +103,8 @@ RDMA is enabled by default when `libibverbs` is found at build time.
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `ggml-rpc-server`:
|
||||
Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `rpc-server`:
|
||||
```bash
|
||||
$ GGML_RPC_DEBUG=1 bin/ggml-rpc-server
|
||||
$ GGML_RPC_DEBUG=1 bin/rpc-server
|
||||
```
|
||||
|
||||
|
||||
@ -15,8 +15,6 @@ add_library(${TARGET} STATIC
|
||||
server-common.h
|
||||
server-context.cpp
|
||||
server-context.h
|
||||
server-stream.cpp
|
||||
server-stream.h
|
||||
server-tools.cpp
|
||||
server-tools.h
|
||||
server-schema.cpp
|
||||
|
||||
@ -57,7 +57,6 @@ The core architecture consists of the following components:
|
||||
- `server_tokens`: Unified representation of token sequences (supports both text and multimodal tokens); used by `server_task` and `server_slot`.
|
||||
- `server_prompt_checkpoint`: For recurrent (e.g., RWKV) and SWA models, stores snapshots of KV cache state. Enables reuse when subsequent requests share the same prompt prefix, saving redundant computation.
|
||||
- `server_models`: Standalone component for managing multiple backend instances (used in router mode). It is completely independent of `server_context`.
|
||||
- `stream_session_manager`: Process wide owner of resumable SSE stream sessions (`g_stream_sessions`), keyed by conversation id. Backs the replay buffer that lets a client reattach to a generation after an HTTP disconnect. See the "Resumable streaming" section below.
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
@ -118,58 +117,6 @@ Here is an example trace of an API request for text completion:
|
||||
- As the response is stateless, `server_res_generator` calls `response->update()` to update the response with the current state.
|
||||
- `server_res_generator` then calls `response->to_json()` and passes the response to the HTTP layer.
|
||||
|
||||
### Resumable streaming (SSE replay buffer)
|
||||
|
||||
By default a streaming generation is bound to its HTTP socket: when the socket drops (refresh, tab close, mobile background, transient network) the generation aborts and the live stream is lost. This feature keeps the generation running server side and lets a client reattach.
|
||||
|
||||
It is opt in via the `X-Conversation-Id` header on `POST /v1/chat/completions`. Without the header the OAI strict path is unchanged. The conversation id is the only identity end to end (server map key, client localStorage key, route path), with an optional `::model` suffix for direct routing in router mode.
|
||||
|
||||
The feature lives entirely in `server-stream.{h,cpp}` and rests on three types:
|
||||
|
||||
- `stream_session`: a bounded ring buffer (4 MiB cap, oldest bytes drop first) plus a condvar. `append` pushes raw SSE bytes, `read_from` drains from any offset and blocks for live bytes or finalize, `finalize` wakes readers, `cancel` stops the producer. One conv maps to at most one live session.
|
||||
- `stream_session_manager` (`g_stream_sessions`): owns all sessions keyed by conv id, enforces the one conv one session invariant via `create_or_replace`, and runs a GC thread that drops completed sessions past their TTL.
|
||||
- `stream_pipe_producer` / `stream_pipe_consumer`: the write and read ends. The producer owns the session lifetime and finalizes it on destruction; the consumer is read only and never finalizes, so a reader detaching cannot kill a running generation.
|
||||
|
||||
Producer side: `server_res_generator` attaches a producer pipe when the header is present. The HTTP content provider mirrors every chunk into the ring before writing it to the socket. While a pipe is attached, `stream_aware_should_stop` ignores peer disconnect, so a dropped socket does not stop generation: only an explicit `DELETE` does. When the peer leaves early, `on_complete` calls `close()`, which drains the rest of the generation into the ring on the http worker.
|
||||
|
||||
Lifetime safety: the producer pipe holds a shared `alive` flag also captured by the session cancel hook. `~server_res_generator` calls `cleanup()` to clear that hook while the reader is still alive, so a `cancel` arriving during teardown can never call `stop()` on a freed response. This ordering is the most fragile part of the feature: finalizing or destroying the producer before `cleanup()` runs reintroduces a use after free.
|
||||
|
||||
Consumer side: `GET /v1/stream/<conv_id>?from=N` opens a `text/event-stream` that replays buffered bytes from offset `N` and blocks for live bytes, so the browser reattaches like a fresh EventSource. An offset below the dropped prefix returns 400.
|
||||
|
||||
Routes:
|
||||
|
||||
- `GET /v1/stream/:conv_id?from=N`: replay or live reattach.
|
||||
- `POST /v1/streams/lookup` with `{"conversation_ids": [...]}`: returns session status only for ids the caller already owns. There is no listing route, so live sessions cannot be enumerated (an earlier `GET /v1/streams` was removed for exactly this reason).
|
||||
- `DELETE /v1/stream/:conv_id`: explicit Stop, idempotent (`evict_and_cancel`).
|
||||
|
||||
Router mode binds the same paths to proxy handlers. A `conv_id -> child` map (`conv_models`), populated when a POST is routed, resolves the owning child in one lookup with no polling. The lookup groups ids per child; GET and DELETE proxy straight to the owner. This loopback REST hop is expected to move to a websocket IPC later, swapping only the transport.
|
||||
|
||||
Lifecycle: `g_stream_sessions.start_gc()` runs in main after common init, `stop_gc()` runs first in `clean_up()` and finalizes every live session so no reader hangs. Reader blocking and the post drop drain both run on httplib worker threads, which block on a condvar rather than spin.
|
||||
|
||||
| Constant | Value | Role |
|
||||
| --- | --- | --- |
|
||||
| `STREAM_SESSION_TTL_SECONDS` | 300 | retention of a completed session before GC |
|
||||
| `STREAM_SESSION_MAX_BYTES` | 4 MiB | ring cap per session |
|
||||
| `STREAM_SESSION_GC_INTERVAL_SECONDS` | 60 | GC tick |
|
||||
| `STREAM_READ_WAKE_INTERVAL_MS` | 200 | read_from wake to recheck should_stop |
|
||||
| `STREAM_LOOKUP_TIMEOUT_MS` | 250 | router to child loopback budget |
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
Client -- "POST + X-Conversation-Id" --> RG[server_res_generator]
|
||||
RG -- attach --> Prod[stream_pipe_producer]
|
||||
Prod -- "write, drain on peer drop" --> Sess
|
||||
subgraph g_stream_sessions
|
||||
Sess[stream_session: ring buffer, 4 MiB]
|
||||
GC[GC thread] -- drop after TTL --> Sess
|
||||
end
|
||||
Sess -- read_from offset --> Cons[stream_pipe_consumer]
|
||||
Cons -- "GET /v1/stream/:id?from=N" --> Client
|
||||
DEL[DELETE /v1/stream/:id] -- evict_and_cancel --> Sess
|
||||
```
|
||||
|
||||
The diagram shows the buffer touch points. The live wire (chunks streamed to the original client during a normal generation) is the producer's default output, described under "Producer side" above.
|
||||
|
||||
### Testing
|
||||
|
||||
`llama-server` includes an automated test suite based on `pytest`.
|
||||
@ -276,7 +223,6 @@ The flow for downloading a new model:
|
||||
- Speculative decoding: https://github.com/ggml-org/llama.cpp/pull/17808 and rework in https://github.com/ggml-org/llama.cpp/pull/17808
|
||||
- INI presets: https://github.com/ggml-org/llama.cpp/pull/17859 (+ refactoring: https://github.com/ggml-org/llama.cpp/pull/18169)
|
||||
- Sleeping mode: https://github.com/ggml-org/llama.cpp/pull/18228
|
||||
- Resumable streaming (SSE replay buffer): https://github.com/ggml-org/llama.cpp/pull/23226
|
||||
|
||||
|
||||
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
#include "server-task.h"
|
||||
#include "server-queue.h"
|
||||
#include "server-schema.h"
|
||||
#include "server-stream.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
@ -4023,15 +4022,6 @@ struct server_res_generator : server_http_res {
|
||||
queue_tasks.wait_until_no_sleep();
|
||||
}
|
||||
}
|
||||
~server_res_generator() override {
|
||||
// cleanup() must run while rd is still alive (rd is destroyed after this body returns)
|
||||
if (spipe) {
|
||||
spipe->cleanup();
|
||||
}
|
||||
}
|
||||
void stop() override {
|
||||
rd.stop();
|
||||
}
|
||||
void ok(const json & response_data) {
|
||||
status = 200;
|
||||
data = safe_json_to_str(response_data);
|
||||
@ -4220,10 +4210,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
}
|
||||
};
|
||||
|
||||
auto effective_should_stop = stream_aware_should_stop(res_this, req.should_stop);
|
||||
|
||||
try {
|
||||
if (effective_should_stop()) {
|
||||
if (req.should_stop()) {
|
||||
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
|
||||
return false; // should_stop condition met
|
||||
}
|
||||
@ -4257,8 +4245,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
// receive subsequent results
|
||||
bool timeout = false;
|
||||
int64_t start_time = ggml_time_ms();
|
||||
auto result = rd.next([&timeout, &start_time, ¶ms, &effective_should_stop]() {
|
||||
if (effective_should_stop()) {
|
||||
auto result = rd.next([&timeout, &req, &start_time, ¶ms]() {
|
||||
if (req.should_stop()) {
|
||||
return true; // should_stop condition met
|
||||
} else if (params.sse_ping_interval > 0 && ggml_time_ms() - start_time > (int64_t)params.sse_ping_interval * 1000) {
|
||||
timeout = true;
|
||||
@ -4276,7 +4264,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
|
||||
if (result == nullptr) {
|
||||
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
|
||||
GGML_ASSERT(effective_should_stop());
|
||||
GGML_ASSERT(req.should_stop());
|
||||
return false; // should_stop condition met
|
||||
}
|
||||
|
||||
@ -4314,10 +4302,6 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
};
|
||||
}
|
||||
|
||||
// attach a producer pipe to the response when X-Conversation-Id is present.
|
||||
// the pipe mirrors SSE chunks into the ring buffer and wires up the cancel hook.
|
||||
stream_session_attach_pipe(*res, req.headers);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#include "common.h"
|
||||
#include "server-http.h"
|
||||
#include "server-stream.h"
|
||||
#include "server-common.h"
|
||||
#include "ui.h"
|
||||
|
||||
@ -457,40 +456,13 @@ static void set_headers(httplib::Response & res, const std::map<std::string, std
|
||||
}
|
||||
}
|
||||
|
||||
// percent-decode a path component (%XX). path params arrive raw from httplib, unlike query
|
||||
// params, so a conv id like "conv::model" sent as "conv%3A%3Amodel" must be decoded here to
|
||||
// match the value the client put in the X-Conversation-Id header
|
||||
static std::string decode_path_component(const std::string & in) {
|
||||
std::string out;
|
||||
out.reserve(in.size());
|
||||
for (size_t i = 0; i < in.size(); i++) {
|
||||
if (in[i] == '%' && i + 2 < in.size()) {
|
||||
auto hex = [](char c) -> int {
|
||||
if (c >= '0' && c <= '9') return c - '0';
|
||||
if (c >= 'a' && c <= 'f') return c - 'a' + 10;
|
||||
if (c >= 'A' && c <= 'F') return c - 'A' + 10;
|
||||
return -1;
|
||||
};
|
||||
int hi = hex(in[i + 1]);
|
||||
int lo = hex(in[i + 2]);
|
||||
if (hi >= 0 && lo >= 0) {
|
||||
out.push_back(char((hi << 4) | lo));
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
out.push_back(in[i]);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
static std::map<std::string, std::string> get_params(const httplib::Request & req) {
|
||||
std::map<std::string, std::string> params;
|
||||
for (const auto & [key, value] : req.params) {
|
||||
params[key] = value;
|
||||
}
|
||||
for (const auto & [key, value] : req.path_params) {
|
||||
params[key] = decode_path_component(value);
|
||||
params[key] = value;
|
||||
}
|
||||
return params;
|
||||
}
|
||||
@ -525,41 +497,26 @@ static void process_handler_response(server_http_req_ptr && request, server_http
|
||||
set_headers(res, response->headers);
|
||||
const std::string content_type = response->content_type;
|
||||
// convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
|
||||
std::shared_ptr<server_http_req> q_ptr = std::move(request);
|
||||
std::shared_ptr<server_http_res> r_ptr = std::move(response);
|
||||
|
||||
const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool {
|
||||
std::shared_ptr q_ptr = std::move(request);
|
||||
std::shared_ptr r_ptr = std::move(response);
|
||||
const auto chunked_content_provider = [response = r_ptr](size_t, const httplib::DataSink & sink) -> bool {
|
||||
std::string chunk;
|
||||
const bool has_next = response->next(chunk);
|
||||
if (!chunk.empty()) {
|
||||
// mirror into the ring buffer first, the session must reflect every SSE chunk
|
||||
// whether or not the wire write below succeeds
|
||||
if (response->spipe) {
|
||||
response->spipe->write(chunk.data(), chunk.size());
|
||||
}
|
||||
if (!sink.write(chunk.data(), chunk.size())) {
|
||||
// peer is gone, stop the wire path here
|
||||
return false;
|
||||
}
|
||||
SRV_DBG("http: streamed chunk: %s\n", chunk.c_str());
|
||||
}
|
||||
if (!has_next) {
|
||||
// producer reached its natural end on the wire, a later close() skips the drain
|
||||
if (response->spipe) {
|
||||
response->spipe->done();
|
||||
}
|
||||
sink.done();
|
||||
SRV_DBG("%s", "http: stream ended\n");
|
||||
}
|
||||
return has_next;
|
||||
};
|
||||
const auto on_complete = [request = q_ptr, response = r_ptr](bool) mutable {
|
||||
// on a dropped peer, close() drains the rest of the generation into the ring buffer
|
||||
if (response->spipe) {
|
||||
response->spipe->close();
|
||||
}
|
||||
response.reset(); // spipe destructor finalizes the session if attached
|
||||
request.reset();
|
||||
response.reset(); // trigger the destruction of the response object
|
||||
request.reset(); // trigger the destruction of the request object
|
||||
};
|
||||
res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete);
|
||||
} else {
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
@ -11,7 +10,6 @@
|
||||
#include <unordered_map>
|
||||
|
||||
struct common_params;
|
||||
struct stream_pipe_producer; // defined in server-stream.h
|
||||
|
||||
// generator-like API for HTTP response generation
|
||||
// this object response with one of the 2 modes:
|
||||
@ -25,20 +23,12 @@ struct server_http_res {
|
||||
std::string data;
|
||||
std::map<std::string, std::string> headers;
|
||||
|
||||
// if set, the stream survives a client disconnect: the producer pipe keeps draining into the
|
||||
// ring buffer and finalizes the session on destruction, so no explicit on_stream_end is needed.
|
||||
// shared_ptr (not unique_ptr) so the forward-declared type is safe to delete here.
|
||||
std::shared_ptr<stream_pipe_producer> spipe;
|
||||
|
||||
// TODO: move this to a virtual function once we have proper polymorphism support
|
||||
std::function<bool(std::string &)> next = nullptr;
|
||||
bool is_stream() const {
|
||||
return next != nullptr;
|
||||
}
|
||||
|
||||
// called when the session is cancelled (e.g. DELETE /v1/stream/<conv_id>).
|
||||
// server_res_generator overrides this to stop its reader; the default is a no-op.
|
||||
virtual void stop() {}
|
||||
|
||||
virtual ~server_http_res() = default;
|
||||
};
|
||||
|
||||
|
||||
@ -1,14 +1,12 @@
|
||||
#include "server-common.h"
|
||||
#include "server-models.h"
|
||||
#include "server-context.h"
|
||||
#include "server-stream.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "preset.h"
|
||||
#include "download.h"
|
||||
|
||||
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
|
||||
#include <optional>
|
||||
#include <sheredom/subprocess.h>
|
||||
|
||||
#include <functional>
|
||||
@ -94,9 +92,6 @@ struct server_subproc {
|
||||
}
|
||||
};
|
||||
|
||||
// short loopback budget for the resumable stream router to child JSON calls (probe, lookup,
|
||||
// delete). distinct from params.timeout_read/write which only applies to the generation proxy
|
||||
static constexpr int STREAM_LOOKUP_TIMEOUT_MS = 250;
|
||||
|
||||
static std::filesystem::path get_server_exec_path() {
|
||||
#if defined(_WIN32)
|
||||
@ -1585,45 +1580,6 @@ static bool is_autoload(const common_params & params, const server_http_req & re
|
||||
}
|
||||
}
|
||||
|
||||
// percent encode one query or path component, covers reserved chars without pulling in
|
||||
// httplib::detail. used by the stream routes to forward conversation_id to children safely
|
||||
static std::string encode_qs(const std::string & in) {
|
||||
std::string out;
|
||||
out.reserve(in.size() * 3);
|
||||
for (unsigned char c : in) {
|
||||
bool safe = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')
|
||||
|| c == '-' || c == '_' || c == '.' || c == '~';
|
||||
if (safe) {
|
||||
out.push_back(char(c));
|
||||
} else {
|
||||
char buf[4];
|
||||
std::snprintf(buf, sizeof(buf), "%%%02X", c);
|
||||
out.append(buf, 3);
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// resolve the child that owns a conversation's stream session via the conv_id -> model map
|
||||
// populated when the POST was routed. single map lookup then a meta lookup, no polling, no
|
||||
// parsing of the conv id. returns nullopt when nothing maps, the caller answers not found and
|
||||
// the client recovers
|
||||
static std::optional<server_model_meta> resolve_child_for_conv(
|
||||
server_models & models, const std::string & conversation_id) {
|
||||
if (conversation_id.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto tracked = models.conv_models.lookup(conversation_id);
|
||||
if (!tracked.has_value()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto meta = models.get_meta(*tracked);
|
||||
if (meta.has_value() && meta->is_ready()) {
|
||||
return meta;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
void server_models_routes::init_routes() {
|
||||
this->get_router_props = [this](const server_http_req & req) {
|
||||
std::string name = req.get_param("model");
|
||||
@ -1672,12 +1628,6 @@ void server_models_routes::init_routes() {
|
||||
if (!router_validate_model(name, models, autoload, error_res)) {
|
||||
return error_res;
|
||||
}
|
||||
// remember which child serves this conversation so the stream routes can route straight
|
||||
// to it without polling, keyed on the exact conv id from the header
|
||||
std::string conv_id = stream_conv_id_from_headers(req.headers);
|
||||
if (!conv_id.empty()) {
|
||||
models.conv_models.remember(conv_id, name);
|
||||
}
|
||||
return models.proxy_request(req, method, name, true); // update last usage for POST request only
|
||||
};
|
||||
|
||||
@ -1869,128 +1819,6 @@ void server_models_routes::init_routes() {
|
||||
res_ok(res, {{"success", true}});
|
||||
return res;
|
||||
};
|
||||
|
||||
this->router_stream_get = [this](const server_http_req & req) {
|
||||
// GET /v1/stream/<conv_id>?from=N. resolve the owning child from the conv_id -> model
|
||||
// map, 404 when nothing maps
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
std::string conv_id = req.get_param("conv_id");
|
||||
if (conv_id.empty()) {
|
||||
res_err(res, format_error_response("Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
std::optional<server_model_meta> owner = resolve_child_for_conv(models, conv_id);
|
||||
if (!owner.has_value()) {
|
||||
res_err(res, format_error_response("Stream not found or expired", ERROR_TYPE_NOT_FOUND));
|
||||
return res;
|
||||
}
|
||||
std::string from = req.get_param("from");
|
||||
std::string child_path = "/v1/stream/" + encode_qs(conv_id);
|
||||
if (!from.empty()) {
|
||||
child_path += "?from=" + from;
|
||||
}
|
||||
SRV_INF("proxying stream resume to model %s on port %d, path=%s\n",
|
||||
owner->name.c_str(), owner->port, child_path.c_str());
|
||||
auto proxy = std::make_unique<server_http_proxy>(
|
||||
"GET",
|
||||
"http",
|
||||
CHILD_ADDR,
|
||||
owner->port,
|
||||
child_path,
|
||||
req.headers,
|
||||
req.body,
|
||||
req.files,
|
||||
req.should_stop,
|
||||
params.timeout_read,
|
||||
params.timeout_write);
|
||||
return std::unique_ptr<server_http_res>(std::move(proxy));
|
||||
};
|
||||
|
||||
this->router_streams_lookup = [this](const server_http_req & req) {
|
||||
// POST /v1/streams/lookup. resolve each requested conv id to its owning child via the
|
||||
// map, group the ids per child, and query only the children that actually own some of
|
||||
// them instead of fanning out to every ready child. a child only answers for the ids
|
||||
// it owns, never lists anything else
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
std::vector<std::string> requested;
|
||||
try {
|
||||
json body = json::parse(req.body);
|
||||
if (body.contains("conversation_ids") && body["conversation_ids"].is_array()) {
|
||||
for (const auto & v : body["conversation_ids"]) {
|
||||
if (v.is_string() && !v.get<std::string>().empty()) {
|
||||
requested.push_back(v.get<std::string>());
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception &) {
|
||||
res_ok(res, json::array());
|
||||
return res;
|
||||
}
|
||||
|
||||
// group requested ids by the child port that owns them, drop ids that map to nothing
|
||||
std::unordered_map<int, json> per_child;
|
||||
for (const auto & cid : requested) {
|
||||
auto owner = resolve_child_for_conv(models, cid);
|
||||
if (!owner.has_value()) {
|
||||
continue;
|
||||
}
|
||||
per_child[owner->port].push_back(cid);
|
||||
}
|
||||
|
||||
json aggregated = json::array();
|
||||
for (auto & [port, ids] : per_child) {
|
||||
json child_body = {{"conversation_ids", ids}};
|
||||
httplib::Client cli(CHILD_ADDR, port);
|
||||
cli.set_connection_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
|
||||
cli.set_read_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
|
||||
cli.set_write_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
|
||||
auto resp = cli.Post("/v1/streams/lookup", child_body.dump(), "application/json");
|
||||
if (!resp || resp->status != 200) {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
json child_arr = json::parse(resp->body);
|
||||
if (!child_arr.is_array()) {
|
||||
continue;
|
||||
}
|
||||
for (auto & entry : child_arr) {
|
||||
if (entry.is_object()) {
|
||||
aggregated.push_back(entry);
|
||||
}
|
||||
}
|
||||
} catch (const std::exception &) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
res_ok(res, aggregated);
|
||||
return res;
|
||||
};
|
||||
|
||||
this->router_stream_delete = [this](const server_http_req & req) {
|
||||
// DELETE /v1/stream/<conv_id>. resolve the owning child via the map and forward only to
|
||||
// it, evict_and_cancel is idempotent on the child
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
std::string conv_id = req.get_param("conv_id");
|
||||
if (conv_id.empty()) {
|
||||
res_err(res, format_error_response("Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
std::string child_path = "/v1/stream/" + encode_qs(conv_id);
|
||||
auto owner = resolve_child_for_conv(models, conv_id);
|
||||
if (owner.has_value()) {
|
||||
httplib::Client cli(CHILD_ADDR, owner->port);
|
||||
cli.set_connection_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
|
||||
cli.set_read_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
|
||||
cli.set_write_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000);
|
||||
auto resp = cli.Delete(child_path.c_str());
|
||||
(void) resp; // best effort, 404 and network errors are equivalent to no op
|
||||
}
|
||||
// drop the tracking entry, the session is being torn down
|
||||
models.conv_models.forget(conv_id);
|
||||
res->status = 204;
|
||||
res->content_type = "application/json";
|
||||
return res;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -11,10 +11,7 @@
|
||||
#include <condition_variable>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
/**
|
||||
* state diagram:
|
||||
@ -129,44 +126,6 @@ private:
|
||||
// if true, the next get_meta() will trigger a reload of model list
|
||||
bool need_reload = false;
|
||||
|
||||
// conv_id -> model name that currently serves its stream session, lets the resumable stream
|
||||
// routes go straight to the owning child instead of polling every one. populated when
|
||||
// proxy_request forwards a POST carrying an X-Conversation-Id. best effort: a stale entry just
|
||||
// makes the child answer not found and the client recovers. owns its lock, one mutex per struct
|
||||
struct conv_model_tracker {
|
||||
void remember(const std::string & conv_id, const std::string & model) {
|
||||
if (conv_id.empty() || model.empty()) {
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> lock(mu);
|
||||
map[conv_id] = model;
|
||||
}
|
||||
|
||||
std::optional<std::string> lookup(const std::string & conv_id) {
|
||||
if (conv_id.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
std::lock_guard<std::mutex> lock(mu);
|
||||
auto it = map.find(conv_id);
|
||||
if (it == map.end()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
|
||||
void forget(const std::string & conv_id) {
|
||||
if (conv_id.empty()) {
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> lock(mu);
|
||||
map.erase(conv_id);
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mu;
|
||||
std::unordered_map<std::string, std::string> map;
|
||||
};
|
||||
|
||||
common_preset_context ctx_preset;
|
||||
|
||||
common_params base_params;
|
||||
@ -186,9 +145,6 @@ private:
|
||||
void notify_sse(const std::string & event, const std::string & model_id, const json & data = nullptr);
|
||||
|
||||
public:
|
||||
// conv_id -> model tracker for the resumable stream routes, owns its lock
|
||||
conv_model_tracker conv_models;
|
||||
|
||||
server_models(const common_params & params, int argc, char ** argv);
|
||||
|
||||
server_response sse; // for real-time updates via SSE endpoint
|
||||
@ -312,12 +268,6 @@ struct server_models_routes {
|
||||
server_http_context::handler_t get_router_models_sse;
|
||||
server_http_context::handler_t post_router_models;
|
||||
server_http_context::handler_t del_router_models;
|
||||
|
||||
// router side handlers for the resumable streaming routes. each resolves the child that owns
|
||||
// a conversation through the conv_id -> model map, no probing or fan out
|
||||
server_http_context::handler_t router_stream_get;
|
||||
server_http_context::handler_t router_streams_lookup;
|
||||
server_http_context::handler_t router_stream_delete;
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@ -1,569 +0,0 @@
|
||||
#include "server-stream.h"
|
||||
#include "server-common.h"
|
||||
#include "server-http.h"
|
||||
#include "server-queue.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
namespace {
|
||||
constexpr int64_t STREAM_SESSION_TTL_SECONDS = 300;
|
||||
constexpr size_t STREAM_SESSION_MAX_BYTES = 4 * 1024 * 1024;
|
||||
constexpr int64_t STREAM_SESSION_GC_INTERVAL_SECONDS = 60;
|
||||
constexpr int64_t STREAM_READ_WAKE_INTERVAL_MS = 200;
|
||||
|
||||
// returns unix time in seconds
|
||||
int64_t now_seconds() {
|
||||
return std::chrono::duration_cast<std::chrono::seconds>(
|
||||
std::chrono::system_clock::now().time_since_epoch()
|
||||
).count();
|
||||
}
|
||||
}
|
||||
|
||||
stream_session::stream_session(std::string conversation_id_, size_t max_bytes_)
|
||||
: conversation_id(std::move(conversation_id_))
|
||||
, started_ts(now_seconds())
|
||||
, prefix_dropped(0)
|
||||
, cap_bytes(max_bytes_)
|
||||
, done(false)
|
||||
, cancelled(false)
|
||||
, completed_ts(0) {
|
||||
buffer.reserve(64 * 1024);
|
||||
}
|
||||
|
||||
bool stream_session::append(const char * data, size_t len) {
|
||||
if (len == 0) {
|
||||
return true;
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mu);
|
||||
if (done.load(std::memory_order_relaxed)) {
|
||||
return false;
|
||||
}
|
||||
if (len >= cap_bytes) {
|
||||
// single chunk bigger than the cap, keep only the tail that fits
|
||||
size_t skip = len - cap_bytes;
|
||||
prefix_dropped += buffer.size() + skip;
|
||||
buffer.clear();
|
||||
buffer.insert(buffer.end(), data + skip, data + len);
|
||||
} else {
|
||||
size_t needed = buffer.size() + len;
|
||||
if (needed > cap_bytes) {
|
||||
size_t to_drop = needed - cap_bytes;
|
||||
buffer.erase(buffer.begin(), buffer.begin() + to_drop);
|
||||
prefix_dropped += to_drop;
|
||||
}
|
||||
buffer.insert(buffer.end(), data, data + len);
|
||||
}
|
||||
}
|
||||
cv.notify_all();
|
||||
return true;
|
||||
}
|
||||
|
||||
void stream_session::finalize() {
|
||||
bool was_done = done.exchange(true, std::memory_order_acq_rel);
|
||||
if (was_done) {
|
||||
return;
|
||||
}
|
||||
completed_ts.store(now_seconds(), std::memory_order_release);
|
||||
cv.notify_all();
|
||||
}
|
||||
|
||||
stream_read_status stream_session::read_from(size_t offset,
|
||||
const std::function<bool(const char *, size_t)> & sink,
|
||||
const std::function<bool()> & should_stop) {
|
||||
std::unique_lock<std::mutex> lock(mu);
|
||||
while (true) {
|
||||
if (should_stop && should_stop()) {
|
||||
return stream_read_status::OK;
|
||||
}
|
||||
if (offset < prefix_dropped) {
|
||||
return stream_read_status::OFFSET_LOST;
|
||||
}
|
||||
size_t logical_end = prefix_dropped + buffer.size();
|
||||
if (offset < logical_end) {
|
||||
size_t local_off = offset - prefix_dropped;
|
||||
size_t n = buffer.size() - local_off;
|
||||
// copy the available chunk under the lock, release before calling the sink
|
||||
std::vector<char> chunk(buffer.begin() + local_off, buffer.begin() + local_off + n);
|
||||
offset += n;
|
||||
lock.unlock();
|
||||
bool keep_going = sink(chunk.data(), chunk.size());
|
||||
if (!keep_going) {
|
||||
return stream_read_status::OK;
|
||||
}
|
||||
lock.lock();
|
||||
continue;
|
||||
}
|
||||
if (done.load(std::memory_order_acquire)) {
|
||||
return stream_read_status::OK;
|
||||
}
|
||||
// wait for new bytes, finalize, or a periodic wake to re check should_stop
|
||||
cv.wait_for(lock, std::chrono::milliseconds(STREAM_READ_WAKE_INTERVAL_MS));
|
||||
}
|
||||
}
|
||||
|
||||
bool stream_session::is_done() const {
|
||||
return done.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
size_t stream_session::total_size() const {
|
||||
std::lock_guard<std::mutex> lock(mu);
|
||||
return prefix_dropped + buffer.size();
|
||||
}
|
||||
|
||||
size_t stream_session::dropped_prefix() const {
|
||||
std::lock_guard<std::mutex> lock(mu);
|
||||
return prefix_dropped;
|
||||
}
|
||||
|
||||
int64_t stream_session::completed_at() const {
|
||||
return completed_ts.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
void stream_session::set_stop_producer(std::function<void()> fn) {
|
||||
std::lock_guard<std::mutex> lock(mu);
|
||||
stop_producer = std::move(fn);
|
||||
}
|
||||
|
||||
void stream_session::cancel() {
|
||||
// flip cancelled first so the producer-side stream_aware_should_stop can break out of the
|
||||
// recv() wait even if remove_waiting_task_ids does not notify the condvar (the cancel task
|
||||
// posted by rd.stop() will eventually notify, but we do not want to depend on that timing)
|
||||
cancelled.store(true, std::memory_order_release);
|
||||
// copy the hook under the lock then invoke outside, the producer side may grab queue locks
|
||||
// and we do not want to hold our mu across that path
|
||||
std::function<void()> fn;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mu);
|
||||
fn = stop_producer;
|
||||
}
|
||||
if (fn) {
|
||||
fn();
|
||||
}
|
||||
}
|
||||
|
||||
bool stream_session::is_cancelled() const {
|
||||
return cancelled.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
stream_session_manager::stream_session_manager()
|
||||
: running(false) {
|
||||
}
|
||||
|
||||
stream_session_manager::~stream_session_manager() {
|
||||
stop_gc();
|
||||
}
|
||||
|
||||
stream_session_ptr stream_session_manager::create_or_replace(const std::string & conversation_id) {
|
||||
// evict any previous session on the same conv, this guarantees the invariant
|
||||
// "one conv = at most one live session" and propagates cancel to its producer
|
||||
stream_session_ptr previous;
|
||||
auto fresh = std::make_shared<stream_session>(conversation_id, STREAM_SESSION_MAX_BYTES);
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(map_mu);
|
||||
auto it = sessions.find(conversation_id);
|
||||
if (it != sessions.end()) {
|
||||
previous = it->second;
|
||||
it->second = fresh;
|
||||
} else {
|
||||
sessions.emplace(conversation_id, fresh);
|
||||
}
|
||||
}
|
||||
if (previous) {
|
||||
previous->cancel();
|
||||
previous->finalize();
|
||||
}
|
||||
return fresh;
|
||||
}
|
||||
|
||||
stream_session_ptr stream_session_manager::get(const std::string & conversation_id) {
|
||||
std::shared_lock<std::shared_mutex> lock(map_mu);
|
||||
auto it = sessions.find(conversation_id);
|
||||
if (it == sessions.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<stream_session_ptr> stream_session_manager::list_all() const {
|
||||
std::vector<stream_session_ptr> out;
|
||||
std::shared_lock<std::shared_mutex> lock(map_mu);
|
||||
out.reserve(sessions.size());
|
||||
for (auto & kv : sessions) {
|
||||
out.push_back(kv.second);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
void stream_session_manager::evict(const std::string & conversation_id) {
|
||||
stream_session_ptr s;
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(map_mu);
|
||||
auto it = sessions.find(conversation_id);
|
||||
if (it == sessions.end()) {
|
||||
return;
|
||||
}
|
||||
s = it->second;
|
||||
sessions.erase(it);
|
||||
}
|
||||
// finalize outside the map lock so any pending readers wake up and exit
|
||||
s->finalize();
|
||||
}
|
||||
|
||||
void stream_session_manager::evict_and_cancel(const std::string & conversation_id) {
|
||||
stream_session_ptr s;
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(map_mu);
|
||||
auto it = sessions.find(conversation_id);
|
||||
if (it == sessions.end()) {
|
||||
return;
|
||||
}
|
||||
s = it->second;
|
||||
sessions.erase(it);
|
||||
}
|
||||
// signal the producer side first so the inference is cancelled at the queue level,
|
||||
// then finalize, which wakes any pending HTTP reader and lets the drain exit naturally
|
||||
s->cancel();
|
||||
s->finalize();
|
||||
}
|
||||
|
||||
void stream_session_manager::start_gc() {
|
||||
if (running.exchange(true)) {
|
||||
return;
|
||||
}
|
||||
gc_thread = std::thread([this] { gc_loop(); });
|
||||
}
|
||||
|
||||
void stream_session_manager::stop_gc() {
|
||||
bool was_running = running.exchange(false);
|
||||
if (was_running) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(gc_wake_mu);
|
||||
}
|
||||
gc_wake_cv.notify_all();
|
||||
if (gc_thread.joinable()) {
|
||||
gc_thread.join();
|
||||
}
|
||||
}
|
||||
// finalize all live sessions so no reader ever hangs
|
||||
std::vector<stream_session_ptr> snapshot;
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(map_mu);
|
||||
snapshot.reserve(sessions.size());
|
||||
for (auto & kv : sessions) {
|
||||
snapshot.push_back(kv.second);
|
||||
}
|
||||
sessions.clear();
|
||||
}
|
||||
for (auto & s : snapshot) {
|
||||
s->finalize();
|
||||
}
|
||||
}
|
||||
|
||||
void stream_session_manager::gc_loop() {
|
||||
while (running.load(std::memory_order_acquire)) {
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(gc_wake_mu);
|
||||
gc_wake_cv.wait_for(lock,
|
||||
std::chrono::seconds(STREAM_SESSION_GC_INTERVAL_SECONDS),
|
||||
[this] { return !running.load(std::memory_order_acquire); });
|
||||
}
|
||||
if (!running.load(std::memory_order_acquire)) {
|
||||
return;
|
||||
}
|
||||
int64_t cutoff = now_seconds() - STREAM_SESSION_TTL_SECONDS;
|
||||
std::vector<stream_session_ptr> to_drop;
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(map_mu);
|
||||
for (auto it = sessions.begin(); it != sessions.end(); ) {
|
||||
int64_t completed = it->second->completed_at();
|
||||
if (completed != 0 && completed <= cutoff) {
|
||||
to_drop.push_back(it->second);
|
||||
it = sessions.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
// finalize outside the map lock, idempotent if the session was already done
|
||||
for (auto & s : to_drop) {
|
||||
s->finalize();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// process wide manager, lifecycle controlled by llama-server main() via start_gc/stop_gc
|
||||
stream_session_manager g_stream_sessions;
|
||||
|
||||
// stream_pipe ---------------------------------------------------------------------------------
|
||||
|
||||
stream_pipe::stream_pipe(stream_session_ptr session)
|
||||
: session_(std::move(session)) {
|
||||
}
|
||||
|
||||
bool stream_pipe::is_cancelled() const {
|
||||
return session_->is_cancelled();
|
||||
}
|
||||
|
||||
// stream_pipe_producer
|
||||
|
||||
stream_pipe_producer::stream_pipe_producer(stream_session_ptr session)
|
||||
: stream_pipe(std::move(session)) {
|
||||
}
|
||||
|
||||
stream_pipe_producer::~stream_pipe_producer() {
|
||||
cleanup();
|
||||
session_->finalize();
|
||||
}
|
||||
|
||||
void stream_pipe_producer::cleanup() {
|
||||
if (!alive_) {
|
||||
return;
|
||||
}
|
||||
alive_->store(false, std::memory_order_release);
|
||||
session_->set_stop_producer(nullptr);
|
||||
alive_.reset();
|
||||
}
|
||||
|
||||
bool stream_pipe_producer::write(const char * data, size_t len) {
|
||||
return session_->append(data, len);
|
||||
}
|
||||
|
||||
void stream_pipe_producer::done() {
|
||||
done_ = true;
|
||||
}
|
||||
|
||||
void stream_pipe_producer::close() {
|
||||
// httplib bails its content provider the moment is_peer_alive() goes false, so pump the rest
|
||||
// of the generation into the ring buffer here. a DELETE flips is_cancelled and cuts it short
|
||||
if (done_ || session_->is_cancelled()) {
|
||||
SRV_INF("stream_pipe close: skip drain (done=%d cancelled=%d) conv=%s\n",
|
||||
done_ ? 1 : 0, session_->is_cancelled() ? 1 : 0, session_->conversation_id.c_str());
|
||||
return;
|
||||
}
|
||||
SRV_INF("stream_pipe close: draining conv=%s\n", session_->conversation_id.c_str());
|
||||
size_t drained = 0;
|
||||
std::string chunk;
|
||||
while (true) {
|
||||
chunk.clear();
|
||||
bool has_next = res_->next(chunk);
|
||||
if (!chunk.empty()) {
|
||||
write(chunk.data(), chunk.size());
|
||||
drained += chunk.size();
|
||||
}
|
||||
if (!has_next) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
SRV_INF("stream_pipe close: drain ended conv=%s bytes=%zu\n", session_->conversation_id.c_str(), drained);
|
||||
}
|
||||
|
||||
std::shared_ptr<stream_pipe_producer> stream_pipe_producer::create(stream_session_ptr session,
|
||||
server_http_res & res) {
|
||||
auto alive = std::make_shared<std::atomic<bool>>(true);
|
||||
auto * res_ptr = &res;
|
||||
session->set_stop_producer([alive, res_ptr]() {
|
||||
if (alive->load(std::memory_order_acquire)) {
|
||||
res_ptr->stop();
|
||||
}
|
||||
});
|
||||
auto pipe = std::shared_ptr<stream_pipe_producer>(new stream_pipe_producer(std::move(session)));
|
||||
pipe->alive_ = std::move(alive);
|
||||
pipe->res_ = res_ptr;
|
||||
return pipe;
|
||||
}
|
||||
|
||||
// stream_pipe_consumer
|
||||
|
||||
stream_pipe_consumer::stream_pipe_consumer(stream_session_ptr session)
|
||||
: stream_pipe(std::move(session)) {
|
||||
}
|
||||
|
||||
stream_read_status stream_pipe_consumer::read(size_t & offset,
|
||||
const std::function<bool(const char *, size_t)> & sink,
|
||||
const std::function<bool()> & should_stop) {
|
||||
return session_->read_from(offset, sink, should_stop);
|
||||
}
|
||||
|
||||
std::shared_ptr<stream_pipe_consumer> stream_pipe_consumer::create(stream_session_ptr session) {
|
||||
return std::shared_ptr<stream_pipe_consumer>(new stream_pipe_consumer(std::move(session)));
|
||||
}
|
||||
|
||||
// helper, builds the standard error response and assigns it to a brand new http_res
|
||||
static server_http_res_ptr make_error_response(int status, const std::string & message, error_type type) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
json err = format_error_response(message, type);
|
||||
res->status = json_value(err, "code", status);
|
||||
res->content_type = "application/json; charset=utf-8";
|
||||
res->data = safe_json_to_str({{"error", err}});
|
||||
return res;
|
||||
}
|
||||
|
||||
server_http_context::handler_t make_stream_get_handler() {
|
||||
return [](const server_http_req & req) -> server_http_res_ptr {
|
||||
// GET /v1/stream/<conv_id>?from=N replays the SSE bytes already buffered for the
|
||||
// session, blocks for more bytes when the session is still running, returns when
|
||||
// the session is finalized. the body is streamed back as text/event-stream so the
|
||||
// browser EventSource can attach to it like a fresh request
|
||||
std::string conv_id = req.get_param("conv_id");
|
||||
if (conv_id.empty()) {
|
||||
return make_error_response(400, "Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST);
|
||||
}
|
||||
auto session = g_stream_sessions.get(conv_id);
|
||||
if (!session) {
|
||||
return make_error_response(404, "Stream not found or expired", ERROR_TYPE_NOT_FOUND);
|
||||
}
|
||||
size_t from = 0;
|
||||
std::string from_str = req.get_param("from");
|
||||
if (!from_str.empty()) {
|
||||
try {
|
||||
from = static_cast<size_t>(std::stoull(from_str));
|
||||
} catch (const std::exception &) {
|
||||
return make_error_response(400, "Invalid 'from' offset", ERROR_TYPE_INVALID_REQUEST);
|
||||
}
|
||||
}
|
||||
if (from < session->dropped_prefix()) {
|
||||
return make_error_response(400, "Stream offset lost, please restart", ERROR_TYPE_INVALID_REQUEST);
|
||||
}
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 200;
|
||||
res->content_type = "text/event-stream";
|
||||
// the next closure reads from the ring buffer at the requested offset, blocks until
|
||||
// bytes arrive or the session finalizes. exit each call after draining the available
|
||||
// chunk so set_chunked_content_provider gets a chance to flush to the socket
|
||||
auto offset_ptr = std::make_shared<size_t>(from);
|
||||
// consumer pipe: read-only, does not finalize the session on destruction
|
||||
auto pipe = stream_pipe_consumer::create(session);
|
||||
res->next = [pipe, offset_ptr, &req](std::string & output) -> bool {
|
||||
bool got_any = false;
|
||||
pipe->read(*offset_ptr,
|
||||
[&](const char * d, size_t n) {
|
||||
output.append(d, n);
|
||||
*offset_ptr += n;
|
||||
got_any = true;
|
||||
return false;
|
||||
},
|
||||
req.should_stop);
|
||||
return got_any;
|
||||
};
|
||||
return res;
|
||||
};
|
||||
}
|
||||
|
||||
server_http_context::handler_t make_streams_lookup_handler() {
|
||||
return [](const server_http_req & req) -> server_http_res_ptr {
|
||||
// POST /v1/streams/lookup with body {"conversation_ids": ["X", "Y", ...]} returns the
|
||||
// matching sessions, only for ids the caller already knows. each id matches the exact key
|
||||
// and any "<id>::<model>" variant, so one lookup covers every per model session for a conv
|
||||
std::vector<std::string> requested;
|
||||
try {
|
||||
json body = json::parse(req.body);
|
||||
if (body.contains("conversation_ids") && body["conversation_ids"].is_array()) {
|
||||
for (const auto & v : body["conversation_ids"]) {
|
||||
if (v.is_string()) {
|
||||
std::string id = v.get<std::string>();
|
||||
if (!id.empty()) {
|
||||
requested.push_back(std::move(id));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 400;
|
||||
res->content_type = "application/json; charset=utf-8";
|
||||
res->data = safe_json_to_str({{"error", {{"message", std::string("invalid body: ") + e.what()},
|
||||
{"type", "invalid_request_error"}}}});
|
||||
return res;
|
||||
}
|
||||
|
||||
std::vector<stream_session_ptr> sessions;
|
||||
if (!requested.empty()) {
|
||||
auto all = g_stream_sessions.list_all();
|
||||
for (const auto & rid : requested) {
|
||||
const std::string with_sep = rid + "::";
|
||||
for (auto & s : all) {
|
||||
if (s->conversation_id == rid ||
|
||||
s->conversation_id.compare(0, with_sep.size(), with_sep) == 0) {
|
||||
sessions.push_back(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json arr = json::array();
|
||||
for (auto & s : sessions) {
|
||||
arr.push_back({
|
||||
{"conversation_id", s->conversation_id},
|
||||
{"is_done", s->is_done()},
|
||||
{"total_bytes", s->total_size()},
|
||||
{"started_at", s->started_ts},
|
||||
{"completed_at", s->completed_at()},
|
||||
});
|
||||
}
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 200;
|
||||
res->content_type = "application/json; charset=utf-8";
|
||||
res->data = safe_json_to_str(arr);
|
||||
return res;
|
||||
};
|
||||
}
|
||||
|
||||
server_http_context::handler_t make_stream_delete_handler() {
|
||||
return [](const server_http_req & req) -> server_http_res_ptr {
|
||||
// DELETE /v1/stream/<conv_id> is the explicit user Stop, cancels the producer hook
|
||||
// wired by handle_completions_impl and evicts the buffer. idempotent, a session that
|
||||
// already finalized or was never created returns 204 either way
|
||||
std::string conv_id = req.get_param("conv_id");
|
||||
if (conv_id.empty()) {
|
||||
return make_error_response(400, "Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST);
|
||||
}
|
||||
SRV_INF("DELETE /v1/stream/%s -> evict_and_cancel\n", conv_id.c_str());
|
||||
g_stream_sessions.evict_and_cancel(conv_id);
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 204;
|
||||
res->content_type = "application/json";
|
||||
return res;
|
||||
};
|
||||
}
|
||||
|
||||
std::string stream_conv_id_from_headers(const std::map<std::string, std::string> & headers) {
|
||||
// case-insensitive scan for x-conversation-id
|
||||
static constexpr char target[] = "x-conversation-id";
|
||||
static constexpr size_t target_len = sizeof(target) - 1;
|
||||
for (const auto & [hk, hv] : headers) {
|
||||
if (hk.size() != target_len) continue;
|
||||
bool match = true;
|
||||
for (size_t i = 0; i < target_len; ++i) {
|
||||
char c = hk[i];
|
||||
if (c >= 'A' && c <= 'Z') c = char(c + 32);
|
||||
if (c != target[i]) { match = false; break; }
|
||||
}
|
||||
if (match) {
|
||||
return hv;
|
||||
}
|
||||
}
|
||||
return std::string();
|
||||
}
|
||||
|
||||
void stream_session_attach_pipe(server_http_res & res, const std::map<std::string, std::string> & headers) {
|
||||
std::string conversation_id = stream_conv_id_from_headers(headers);
|
||||
SRV_INF("stream_session_attach_pipe: conv_id=%s (empty=%d)\n",
|
||||
conversation_id.c_str(), conversation_id.empty() ? 1 : 0);
|
||||
if (conversation_id.empty()) {
|
||||
return;
|
||||
}
|
||||
auto session = g_stream_sessions.create_or_replace(conversation_id);
|
||||
res.spipe = stream_pipe_producer::create(session, res);
|
||||
}
|
||||
|
||||
std::function<bool()> stream_aware_should_stop(server_http_res * res, std::function<bool()> fallback) {
|
||||
return [res, fallback = std::move(fallback)]() -> bool {
|
||||
if (res->spipe) {
|
||||
return res->spipe->is_cancelled();
|
||||
}
|
||||
return fallback();
|
||||
};
|
||||
}
|
||||
@ -1,203 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "server-http.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <shared_mutex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
enum class stream_read_status {
|
||||
OK,
|
||||
OFFSET_LOST,
|
||||
};
|
||||
|
||||
// streaming buffer for one generation, survives HTTP disconnect. the producer appends raw SSE
|
||||
// bytes, readers drain from any offset via read_from and block until more bytes or finalize.
|
||||
// keyed by conversation_id: one conv = at most one live session
|
||||
struct stream_session {
|
||||
std::string conversation_id;
|
||||
int64_t started_ts; // unix seconds at construction, used by /v1/streams listing
|
||||
|
||||
stream_session(std::string conversation_id_, size_t max_bytes_);
|
||||
stream_session(const stream_session &) = delete;
|
||||
stream_session & operator=(const stream_session &) = delete;
|
||||
|
||||
// append raw bytes, drops from the front if the cap is reached.
|
||||
// returns false if the session is already finalized
|
||||
bool append(const char * data, size_t len);
|
||||
|
||||
// mark the session as complete, wakes all pending readers
|
||||
void finalize();
|
||||
|
||||
// drain bytes from offset, calling sink for each chunk. blocks until more
|
||||
// bytes arrive or finalize is called. returns OK on clean exit, OFFSET_LOST
|
||||
// if offset falls below the dropped prefix
|
||||
stream_read_status read_from(size_t offset,
|
||||
const std::function<bool(const char *, size_t)> & sink,
|
||||
const std::function<bool()> & should_stop);
|
||||
|
||||
bool is_done() const;
|
||||
bool is_cancelled() const;
|
||||
size_t total_size() const; // bytes that ever entered the session
|
||||
size_t dropped_prefix() const; // bytes evicted from the front due to cap
|
||||
int64_t completed_at() const; // 0 while alive, unix seconds after finalize
|
||||
|
||||
// attach the producer stop hook used to cancel its reader, pass an empty function to detach
|
||||
void set_stop_producer(std::function<void()> fn);
|
||||
|
||||
// signal the producer to abort its inference asap via the stop hook, idempotent
|
||||
void cancel();
|
||||
|
||||
private:
|
||||
mutable std::mutex mu;
|
||||
std::condition_variable cv;
|
||||
std::vector<char> buffer;
|
||||
size_t prefix_dropped;
|
||||
size_t cap_bytes;
|
||||
std::atomic<bool> done;
|
||||
std::atomic<bool> cancelled;
|
||||
std::atomic<int64_t> completed_ts;
|
||||
std::function<void()> stop_producer; // protected by mu
|
||||
};
|
||||
|
||||
using stream_session_ptr = std::shared_ptr<stream_session>;
|
||||
|
||||
// one end of a stream_session pipe. the base holds the session and the shared query, the
|
||||
// producer and consumer ends derive from it. virtual dtor so each end runs its own teardown:
|
||||
// the producer finalizes the session, the consumer leaves it untouched
|
||||
struct stream_pipe {
|
||||
virtual ~stream_pipe() = default;
|
||||
|
||||
// true if the session was cancelled (e.g. via DELETE /v1/stream/<conv_id>)
|
||||
bool is_cancelled() const;
|
||||
|
||||
protected:
|
||||
explicit stream_pipe(stream_session_ptr session);
|
||||
|
||||
stream_session_ptr session_;
|
||||
};
|
||||
|
||||
// producer end: writes chunks into the ring buffer and owns the session lifetime, finalizing it
|
||||
// on destruction.
|
||||
//
|
||||
// lifetime safety: holds a shared_ptr<atomic<bool>> alive also captured by the session's
|
||||
// stop_producer hook. cleanup() sets alive=false and clears the hook; it must run while the
|
||||
// response the hook calls stop() on is still alive. ~server_res_generator() does this explicitly.
|
||||
struct stream_pipe_producer : stream_pipe {
|
||||
~stream_pipe_producer() override;
|
||||
|
||||
// append raw bytes to the session's ring buffer, returns false if already finalized
|
||||
bool write(const char * data, size_t len);
|
||||
|
||||
// mark the natural end on the wire so a later close() is a no-op
|
||||
void done();
|
||||
|
||||
// on a peer drop, pump the response next() into the ring buffer until done. runs on the http
|
||||
// worker from on_complete, no-op after done() or cancel
|
||||
void close();
|
||||
|
||||
// disarm the stop hook and drop the alive guard, must run while the response the hook
|
||||
// references is still alive. idempotent, the destructor calls it too
|
||||
void cleanup();
|
||||
|
||||
// res.stop() is invoked when the session is cancelled, the alive guard ensures stop() is not
|
||||
// called after cleanup() has run
|
||||
static std::shared_ptr<stream_pipe_producer> create(stream_session_ptr session, server_http_res & res);
|
||||
|
||||
private:
|
||||
explicit stream_pipe_producer(stream_session_ptr session);
|
||||
|
||||
bool done_ = false;
|
||||
std::shared_ptr<std::atomic<bool>> alive_;
|
||||
server_http_res * res_ = nullptr;
|
||||
};
|
||||
|
||||
// consumer end: read-only replay of the ring buffer, the destructor does not finalize the session
|
||||
struct stream_pipe_consumer : stream_pipe {
|
||||
// drain bytes from offset, calling sink for each available chunk. blocks until more data
|
||||
// arrives or the session finalizes. should_stop is polled, returns OFFSET_LOST if offset
|
||||
// fell below the dropped prefix
|
||||
stream_read_status read(size_t & offset,
|
||||
const std::function<bool(const char *, size_t)> & sink,
|
||||
const std::function<bool()> & should_stop);
|
||||
|
||||
static std::shared_ptr<stream_pipe_consumer> create(stream_session_ptr session);
|
||||
|
||||
private:
|
||||
explicit stream_pipe_consumer(stream_session_ptr session);
|
||||
};
|
||||
|
||||
// owns all live sessions, runs a periodic GC to evict expired ones.
|
||||
// the map is keyed by conversation_id, so the invariant "one conv = at most one
|
||||
// live session" is enforced at the type level
|
||||
class stream_session_manager {
|
||||
public:
|
||||
stream_session_manager();
|
||||
~stream_session_manager();
|
||||
|
||||
stream_session_manager(const stream_session_manager &) = delete;
|
||||
stream_session_manager & operator=(const stream_session_manager &) = delete;
|
||||
|
||||
// install a new session for this conversation, evicting and cancelling any previous one.
|
||||
// the conversation_id must be non empty, the caller is responsible for that check.
|
||||
// returns the new session
|
||||
stream_session_ptr create_or_replace(const std::string & conversation_id);
|
||||
|
||||
// lookup, returns null if unknown or already evicted
|
||||
stream_session_ptr get(const std::string & conversation_id);
|
||||
|
||||
// list every live or recently completed session, used by GET /v1/streams without filter
|
||||
std::vector<stream_session_ptr> list_all() const;
|
||||
|
||||
// remove from the map and finalize, wakes any pending readers
|
||||
void evict(const std::string & conversation_id);
|
||||
|
||||
// signal the producer to cancel asap then evict, used by the explicit user Stop path
|
||||
void evict_and_cancel(const std::string & conversation_id);
|
||||
|
||||
void start_gc();
|
||||
void stop_gc();
|
||||
|
||||
private:
|
||||
void gc_loop();
|
||||
|
||||
mutable std::shared_mutex map_mu;
|
||||
std::unordered_map<std::string, stream_session_ptr> sessions; // key: conversation_id
|
||||
std::thread gc_thread;
|
||||
std::atomic<bool> running;
|
||||
std::mutex gc_wake_mu;
|
||||
std::condition_variable gc_wake_cv;
|
||||
};
|
||||
|
||||
// process wide manager, linked by both llama-server and llama-cli. llama-server main() drives
|
||||
// start_gc/stop_gc, llama-cli leaves it idle. the dtor calls stop_gc() unconditionally so exit
|
||||
// is safe whether or not the GC thread ran
|
||||
extern stream_session_manager g_stream_sessions;
|
||||
|
||||
// route handler factories operating on g_stream_sessions, wired under /v1/stream/* by server.cpp.
|
||||
// keeps the resumable stream surface confined to server-stream
|
||||
server_http_context::handler_t make_stream_get_handler();
|
||||
server_http_context::handler_t make_streams_lookup_handler();
|
||||
server_http_context::handler_t make_stream_delete_handler();
|
||||
|
||||
// extract the X-Conversation-Id header value (case-insensitive), empty when absent. exposed so
|
||||
// the router can track which child serves a forwarded POST
|
||||
std::string stream_conv_id_from_headers(const std::map<std::string, std::string> & headers);
|
||||
|
||||
// on an X-Conversation-Id header, create or replace the session and attach a producer pipe to
|
||||
// res. no-op when absent, called from the server_res_generator constructor
|
||||
void stream_session_attach_pipe(server_http_res & res, const std::map<std::string, std::string> & headers);
|
||||
|
||||
// should_stop closure that ignores peer disconnect when a pipe is attached, so only an explicit
|
||||
// DELETE stops the producer and generation keeps flowing into the ring buffer. without a pipe it
|
||||
// delegates to fallback, the legacy non-resumable flow
|
||||
std::function<bool()> stream_aware_should_stop(server_http_res * res, std::function<bool()> fallback);
|
||||
@ -2,7 +2,6 @@
|
||||
#include "server-http.h"
|
||||
#include "server-models.h"
|
||||
#include "server-cors-proxy.h"
|
||||
#include "server-stream.h"
|
||||
#include "server-tools.h"
|
||||
|
||||
#include "arg.h"
|
||||
@ -83,10 +82,6 @@ int llama_server(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
// start the stream session manager GC right after common init, before any HTTP route can
|
||||
// touch it. lifecycle is symmetric, stop_gc() runs in clean_up() before backend free
|
||||
g_stream_sessions.start_gc();
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
|
||||
return 1;
|
||||
}
|
||||
@ -244,29 +239,6 @@ int llama_server(int argc, char ** argv) {
|
||||
ctx_http.get ("/slots", ex_wrapper(routes.get_slots));
|
||||
ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots));
|
||||
|
||||
// resumable streaming, the conversation_id is the session identity end to end. router and
|
||||
// child wire different handlers under the same paths: a child binds the local g_stream_sessions
|
||||
// backed factories, the router binds proxies that resolve the owning child through the
|
||||
// conv_id -> model map
|
||||
server_http_context::handler_t stream_get_h;
|
||||
server_http_context::handler_t streams_lookup_h;
|
||||
server_http_context::handler_t stream_delete_h;
|
||||
if (is_router_server) {
|
||||
stream_get_h = models_routes->router_stream_get;
|
||||
streams_lookup_h = models_routes->router_streams_lookup;
|
||||
stream_delete_h = models_routes->router_stream_delete;
|
||||
} else {
|
||||
stream_get_h = make_stream_get_handler();
|
||||
streams_lookup_h = make_streams_lookup_handler();
|
||||
stream_delete_h = make_stream_delete_handler();
|
||||
}
|
||||
ctx_http.get ("/v1/stream/:conv_id", ex_wrapper(stream_get_h));
|
||||
// POST /v1/streams/lookup with body {"conversation_ids": [...]}. you can only ask for ids
|
||||
// you already own (the WebUI passes the convs visible in its sidebar). the server never
|
||||
// lists ids it has not been asked about, so a random caller cannot enumerate live sessions
|
||||
ctx_http.post("/v1/streams/lookup", ex_wrapper(streams_lookup_h));
|
||||
ctx_http.del ("/v1/stream/:conv_id", ex_wrapper(stream_delete_h));
|
||||
|
||||
// Google Cloud Platform (Vertex AI) compat
|
||||
ctx_http.register_gcp_compat();
|
||||
|
||||
@ -342,8 +314,6 @@ int llama_server(int argc, char ** argv) {
|
||||
|
||||
clean_up = [&models_routes]() {
|
||||
SRV_INF("%s: cleaning up before exit...\n", __func__);
|
||||
// stop the session GC first, it finalizes live sessions and wakes pending readers
|
||||
g_stream_sessions.stop_gc();
|
||||
if (models_routes.has_value()) {
|
||||
models_routes->stopping.store(true); // maybe redundant, but just to be safe
|
||||
models_routes->models.unload_all();
|
||||
@ -370,8 +340,6 @@ int llama_server(int argc, char ** argv) {
|
||||
// setup clean up function, to be called before exit
|
||||
clean_up = [&ctx_http, &ctx_server]() {
|
||||
SRV_INF("%s: cleaning up before exit...\n", __func__);
|
||||
// stop the session GC first, it finalizes live sessions and wakes pending readers
|
||||
g_stream_sessions.stop_gc();
|
||||
ctx_http.stop();
|
||||
ctx_server.terminate();
|
||||
llama_backend_free();
|
||||
|
||||
@ -33,7 +33,7 @@
|
||||
|
||||
{#if !readonly && onRemove}
|
||||
<div
|
||||
class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-focus-within:opacity-100 group-hover:opacity-100"
|
||||
class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-hover:opacity-100"
|
||||
>
|
||||
<ActionIcon icon={X} tooltip="Remove" stopPropagationOnClick onclick={() => onRemove?.()} />
|
||||
</div>
|
||||
|
||||
@ -56,7 +56,7 @@
|
||||
<div class="relative flex h-6 items-center justify-between">
|
||||
<div class="right-0 flex items-center gap-2 opacity-100 transition-opacity">
|
||||
<div
|
||||
class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-focus-within:opacity-100 group-hover:opacity-100"
|
||||
class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-hover:opacity-100"
|
||||
>
|
||||
<ActionIcon icon={Edit} tooltip="Edit" onclick={editCtx.handleEdit} />
|
||||
<ActionIcon icon={Trash2} tooltip="Delete" onclick={onDelete} />
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
ChatMessages,
|
||||
ChatScreenDragOverlay,
|
||||
ChatScreenProcessingInfo,
|
||||
ChatScreenStreamResumeStatus,
|
||||
ServerLoadingSplash,
|
||||
ChatScreenServerError
|
||||
} from '$lib/components/app';
|
||||
@ -282,10 +281,6 @@
|
||||
|
||||
<ChatScreenServerError />
|
||||
|
||||
{#if page.params.id}
|
||||
<ChatScreenStreamResumeStatus />
|
||||
{/if}
|
||||
|
||||
<div class="pointer-events-none flex flex-col gap-6 items-center w-full">
|
||||
{#if (isMobile.current ? mobileScrollDownHint || isMobileUserScrolledUp : autoScroll.userScrolledUp) && page.url.hash.includes(ROUTES.CHAT) && page.params.id}
|
||||
<ChatScreenActionScrollDown
|
||||
|
||||
@ -1,18 +0,0 @@
|
||||
<script lang="ts">
|
||||
import { chatStore } from '$lib/stores/chat.svelte';
|
||||
import { StreamConnectionState } from '$lib/enums';
|
||||
import { Loader2 } from '@lucide/svelte';
|
||||
|
||||
let state = $derived(chatStore.streamConnectionState);
|
||||
</script>
|
||||
|
||||
{#if state === StreamConnectionState.RESUMING}
|
||||
<div
|
||||
class="pointer-events-auto mx-auto mt-2 mb-2 flex max-w-[48rem] items-center gap-2 rounded-md border border-blue-400/40 bg-blue-50/60 px-3 py-1.5 text-sm text-blue-700 dark:bg-blue-950/40 dark:text-blue-200"
|
||||
role="status"
|
||||
aria-live="polite"
|
||||
>
|
||||
<Loader2 class="h-3.5 w-3.5 animate-spin" />
|
||||
<span>Reconnecting to the stream...</span>
|
||||
</div>
|
||||
{/if}
|
||||
@ -683,11 +683,3 @@ export { default as ChatScreenProcessingInfo } from './ChatScreen/ChatScreenProc
|
||||
* Rendered inside ChatScreen when `serverError` store has a value.
|
||||
*/
|
||||
export { default as ChatScreenServerError } from './ChatScreen/ChatScreenServerError.svelte';
|
||||
|
||||
/**
|
||||
* Stream resume status indicator. Shows a small "Reconnecting to the stream..."
|
||||
* banner with a spinner while `chatStore.streamConnectionState` is `resuming`,
|
||||
* i.e. after a dropped connection is reattaching to the live SSE replay buffer.
|
||||
* Renders nothing otherwise. Shown inside ChatScreen only on an active conversation route.
|
||||
*/
|
||||
export { default as ChatScreenStreamResumeStatus } from './ChatScreen/ChatScreenStreamResumeStatus.svelte';
|
||||
|
||||
@ -39,6 +39,7 @@
|
||||
depth = 0
|
||||
}: Props = $props();
|
||||
|
||||
let renderActionsDropdown = $state(false);
|
||||
let dropdownOpen = $state(false);
|
||||
|
||||
let isLoading = $derived(getAllLoadingChats().includes(conversation.id));
|
||||
@ -70,10 +71,26 @@
|
||||
}
|
||||
}
|
||||
|
||||
function handleMouseLeave() {
|
||||
if (!dropdownOpen) {
|
||||
renderActionsDropdown = false;
|
||||
}
|
||||
}
|
||||
|
||||
function handleMouseOver() {
|
||||
renderActionsDropdown = true;
|
||||
}
|
||||
|
||||
function handleSelect() {
|
||||
onSelect?.(conversation.id);
|
||||
}
|
||||
|
||||
$effect(() => {
|
||||
if (!dropdownOpen) {
|
||||
renderActionsDropdown = false;
|
||||
}
|
||||
});
|
||||
|
||||
onMount(() => {
|
||||
document.addEventListener('edit-active-conversation', handleGlobalEditEvent as EventListener);
|
||||
|
||||
@ -86,19 +103,23 @@
|
||||
});
|
||||
</script>
|
||||
|
||||
<div
|
||||
class="conversation-item group relative flex min-h-9 w-full items-center justify-between space-x-3 rounded-lg py-1.5 transition-colors hover:bg-foreground/10 {isActive
|
||||
<!-- svelte-ignore a11y_mouse_events_have_key_events -->
|
||||
<button
|
||||
class="group flex min-h-9 w-full cursor-pointer items-center justify-between space-x-3 rounded-lg py-1.5 text-left transition-colors hover:bg-foreground/10 {isActive
|
||||
? 'bg-foreground/5 text-accent-foreground'
|
||||
: ''} px-3"
|
||||
onclick={handleSelect}
|
||||
onmouseover={handleMouseOver}
|
||||
onmouseleave={handleMouseLeave}
|
||||
onfocusin={handleMouseOver}
|
||||
onfocusout={(e) => {
|
||||
if (!e.currentTarget.contains(e.relatedTarget as Node | null)) {
|
||||
handleMouseLeave();
|
||||
}
|
||||
}}
|
||||
>
|
||||
<button
|
||||
class="absolute inset-0 z-0 cursor-pointer rounded-lg focus:outline-none focus-visible:ring-2 focus-visible:ring-ring"
|
||||
onclick={handleSelect}
|
||||
aria-label={conversation.name}
|
||||
>
|
||||
</button>
|
||||
<div
|
||||
class="pointer-events-none relative z-10 flex min-w-0 flex-1 items-center gap-2"
|
||||
class="flex min-w-0 flex-1 items-center gap-2"
|
||||
style:padding-left="{depth * FORK_TREE_DEPTH_PADDING}px"
|
||||
>
|
||||
{#if depth > 0}
|
||||
@ -109,7 +130,7 @@
|
||||
<a
|
||||
{...props}
|
||||
href={RouterService.chat(conversation.forkedFromConversationId)}
|
||||
class="pointer-events-auto flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
|
||||
class="flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
|
||||
>
|
||||
<GitBranch class="h-3.5 w-3.5" />
|
||||
</a>
|
||||
@ -125,15 +146,18 @@
|
||||
{#if isLoading}
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger>
|
||||
<button
|
||||
class="stop-button pointer-events-auto flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
|
||||
<div
|
||||
class="stop-button flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
|
||||
onclick={handleStop}
|
||||
onkeydown={(e) => e.key === 'Enter' && handleStop(e)}
|
||||
role="button"
|
||||
tabindex="0"
|
||||
aria-label="Stop generation"
|
||||
>
|
||||
<Loader2 class="loading-icon h-3.5 w-3.5 animate-spin" />
|
||||
|
||||
<Square class="stop-icon hidden h-3 w-3 fill-current text-destructive" />
|
||||
</button>
|
||||
</div>
|
||||
</Tooltip.Trigger>
|
||||
|
||||
<Tooltip.Content>
|
||||
@ -145,50 +169,52 @@
|
||||
<TruncatedText text={conversation.name} class="text-sm font-medium" showTooltip={false} />
|
||||
</div>
|
||||
|
||||
<div class="actions pointer-events-auto relative z-20 flex items-center">
|
||||
<DropdownMenuActions
|
||||
triggerIcon={MoreHorizontal}
|
||||
triggerTooltip="More actions"
|
||||
bind:open={dropdownOpen}
|
||||
actions={[
|
||||
{
|
||||
icon: conversation.pinned ? PinOff : Pin,
|
||||
label: conversation.pinned ? 'Unpin' : 'Pin',
|
||||
onclick: (e: Event) => {
|
||||
e.stopPropagation();
|
||||
handleTogglePin();
|
||||
}
|
||||
},
|
||||
{
|
||||
icon: Pencil,
|
||||
label: 'Edit',
|
||||
onclick: handleEdit,
|
||||
shortcut: ['shift', 'cmd', 'e']
|
||||
},
|
||||
{
|
||||
icon: Download,
|
||||
label: 'Export',
|
||||
onclick: (e: Event) => {
|
||||
e.stopPropagation();
|
||||
conversationsStore.downloadConversation(conversation.id);
|
||||
{#if renderActionsDropdown}
|
||||
<div class="actions flex items-center">
|
||||
<DropdownMenuActions
|
||||
triggerIcon={MoreHorizontal}
|
||||
triggerTooltip="More actions"
|
||||
bind:open={dropdownOpen}
|
||||
actions={[
|
||||
{
|
||||
icon: conversation.pinned ? PinOff : Pin,
|
||||
label: conversation.pinned ? 'Unpin' : 'Pin',
|
||||
onclick: (e: Event) => {
|
||||
e.stopPropagation();
|
||||
handleTogglePin();
|
||||
}
|
||||
},
|
||||
shortcut: ['shift', 'cmd', 's']
|
||||
},
|
||||
{
|
||||
icon: Trash2,
|
||||
label: 'Delete',
|
||||
onclick: handleDelete,
|
||||
variant: 'destructive',
|
||||
shortcut: ['shift', 'cmd', 'd'],
|
||||
separator: true
|
||||
}
|
||||
]}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
{
|
||||
icon: Pencil,
|
||||
label: 'Edit',
|
||||
onclick: handleEdit,
|
||||
shortcut: ['shift', 'cmd', 'e']
|
||||
},
|
||||
{
|
||||
icon: Download,
|
||||
label: 'Export',
|
||||
onclick: (e: Event) => {
|
||||
e.stopPropagation();
|
||||
conversationsStore.downloadConversation(conversation.id);
|
||||
},
|
||||
shortcut: ['shift', 'cmd', 's']
|
||||
},
|
||||
{
|
||||
icon: Trash2,
|
||||
label: 'Delete',
|
||||
onclick: handleDelete,
|
||||
variant: 'destructive',
|
||||
shortcut: ['shift', 'cmd', 'd'],
|
||||
separator: true
|
||||
}
|
||||
]}
|
||||
/>
|
||||
</div>
|
||||
{/if}
|
||||
</button>
|
||||
|
||||
<style>
|
||||
.conversation-item {
|
||||
button {
|
||||
:global([data-slot='dropdown-menu-trigger']:not([data-state='open'])) {
|
||||
opacity: 0;
|
||||
}
|
||||
@ -213,8 +239,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
&:is(:hover) .stop-button,
|
||||
&:focus-within .stop-button {
|
||||
&:is(:hover) .stop-button {
|
||||
:global(.stop-icon) {
|
||||
display: block;
|
||||
}
|
||||
|
||||
@ -21,11 +21,5 @@ export const API_TOOLS = {
|
||||
EXECUTE: '/tools'
|
||||
};
|
||||
|
||||
// resumable stream routes, the conv::model identity is appended as a path segment
|
||||
export const API_STREAM = {
|
||||
BASE: './v1/stream',
|
||||
LOOKUP: './v1/streams/lookup'
|
||||
};
|
||||
|
||||
/** CORS proxy endpoint path */
|
||||
export const CORS_PROXY_ENDPOINT = '/cors-proxy';
|
||||
|
||||
@ -46,7 +46,6 @@ export * from './routes';
|
||||
export * from './sandbox';
|
||||
export * from './settings-keys';
|
||||
export * from './settings-registry';
|
||||
export * from './stream';
|
||||
export * from './supported-file-types';
|
||||
export * from './table-html-restorer';
|
||||
export * from './title-generation';
|
||||
|
||||
@ -26,9 +26,6 @@ export const THINKING_ENABLED_DEFAULT_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.th
|
||||
export const REASONING_EFFORT_DEFAULT_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.reasoningEffortDefault`;
|
||||
export const USER_OVERRIDES_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME}.userOverrides`;
|
||||
|
||||
/** Key prefix for per-conversation resumable stream state, conversationId is appended */
|
||||
export const STREAM_RESUME_LOCALSTORAGE_KEY_PREFIX = `${STORAGE_APP_NAME}.streamResume.`;
|
||||
|
||||
// Deprecated old key names (kept for backward compat while users migrate)
|
||||
/** @deprecated Use {@link ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY} instead */
|
||||
export const DEPRECATED_ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY = `${STORAGE_APP_NAME_DEPRECATED}.alwaysAllowedTools`;
|
||||
|
||||
@ -1,3 +0,0 @@
|
||||
// grace window after a visibilitychange before we kick a reader whose socket likely died
|
||||
// while the tab was hidden. covers brief background pauses without thrashing live streams
|
||||
export const STREAM_VISIBILITY_KICK_MS = 1000;
|
||||
@ -5,15 +5,6 @@ export enum ChatMessageStatsView {
|
||||
SUMMARY = 'summary'
|
||||
}
|
||||
|
||||
/**
|
||||
* Connection state of a streamed completion, drives the resume status indicator.
|
||||
*/
|
||||
export enum StreamConnectionState {
|
||||
STREAMING = 'streaming',
|
||||
RESUMING = 'resuming',
|
||||
LOST = 'lost'
|
||||
}
|
||||
|
||||
/**
|
||||
* Reasoning format options for API requests.
|
||||
*/
|
||||
|
||||
@ -10,7 +10,6 @@ export { AgenticSectionType, ContinueIntentKind, ToolCallType } from './agentic.
|
||||
|
||||
export {
|
||||
ChatMessageStatsView,
|
||||
StreamConnectionState,
|
||||
ContentPartType,
|
||||
ConversationSelectionMode,
|
||||
ErrorDialogType,
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
import { getAuthHeaders, getJsonHeaders } from '$lib/utils/api-headers';
|
||||
import { getJsonHeaders } from '$lib/utils/api-headers';
|
||||
import { formatAttachmentText } from '$lib/utils/formatters';
|
||||
import { isAbortError } from '$lib/utils/abort';
|
||||
import { streamIdentity } from '$lib/utils/stream-identity';
|
||||
import {
|
||||
ATTACHMENT_LABEL_PDF_FILE,
|
||||
ATTACHMENT_LABEL_MCP_PROMPT,
|
||||
@ -14,10 +13,7 @@ import {
|
||||
CONTROL_ACTION,
|
||||
SSE_LINE_SEPARATOR,
|
||||
SSE_DATA_PREFIX,
|
||||
SSE_DONE_MARKER,
|
||||
STREAM_VISIBILITY_KICK_MS,
|
||||
STREAM_RESUME_LOCALSTORAGE_KEY_PREFIX,
|
||||
API_STREAM
|
||||
SSE_DONE_MARKER
|
||||
} from '$lib/constants';
|
||||
import {
|
||||
AttachmentType,
|
||||
@ -25,14 +21,12 @@ import {
|
||||
FileTypeAudio,
|
||||
MessageRole,
|
||||
MimeTypeAudio,
|
||||
ReasoningFormat,
|
||||
StreamConnectionState
|
||||
ReasoningFormat
|
||||
} from '$lib/enums';
|
||||
import type {
|
||||
ApiChatMessageContentPart,
|
||||
ApiChatMessageData,
|
||||
ApiChatCompletionToolCall,
|
||||
ApiStreamSession
|
||||
ApiChatCompletionToolCall
|
||||
} from '$lib/types/api';
|
||||
import type {
|
||||
AudioInputFormat,
|
||||
@ -60,19 +54,6 @@ function getAudioInputFormat(mimeType: string): AudioInputFormat {
|
||||
return FileTypeAudio.MP3;
|
||||
}
|
||||
|
||||
interface ResumableStreamState {
|
||||
bytesReceived: number;
|
||||
updatedAt: number;
|
||||
|
||||
// model frozen at POST time, lets a reload rebuild the exact conv::model identity the
|
||||
// server keyed the session under. null when the POST carried no explicit model
|
||||
model?: string | null;
|
||||
}
|
||||
|
||||
function streamStorageKey(conversationId: string): string {
|
||||
return STREAM_RESUME_LOCALSTORAGE_KEY_PREFIX + conversationId;
|
||||
}
|
||||
|
||||
export class ChatService {
|
||||
/**
|
||||
*
|
||||
@ -147,7 +128,6 @@ export class ChatService {
|
||||
onChunk,
|
||||
onComplete,
|
||||
onError,
|
||||
onConnectionState,
|
||||
onReasoningChunk,
|
||||
onToolCallChunk,
|
||||
onModel,
|
||||
@ -332,16 +312,9 @@ export class ChatService {
|
||||
}
|
||||
|
||||
try {
|
||||
const headers: Record<string, string> = { ...getJsonHeaders() };
|
||||
// tag streaming requests with the conversation id, this single header is the opt in for the
|
||||
// server side replay buffer and powers discoverActiveStream on tab reopen. with an explicit
|
||||
// model the ::model suffix keeps the per model session distinct
|
||||
if (stream && conversationId) {
|
||||
headers['X-Conversation-Id'] = streamIdentity(conversationId, options.model);
|
||||
}
|
||||
const response = await fetch(API_CHAT.COMPLETIONS, {
|
||||
method: 'POST',
|
||||
headers,
|
||||
headers: getJsonHeaders(),
|
||||
body: JSON.stringify(requestBody),
|
||||
signal
|
||||
});
|
||||
@ -368,9 +341,7 @@ export class ChatService {
|
||||
onCompletionId,
|
||||
onTimings,
|
||||
conversationId,
|
||||
signal,
|
||||
onConnectionState,
|
||||
options.model
|
||||
signal
|
||||
);
|
||||
|
||||
return;
|
||||
@ -502,116 +473,6 @@ export class ChatService {
|
||||
* @param excludeReasoning - Whether to strip reasoning content (should match excludeReasoningFromContext setting)
|
||||
* @param signal - Optional AbortSignal to cancel the pre-encode request
|
||||
*/
|
||||
static async cancelServerStream(conversationId: string, model?: string | null): Promise<void> {
|
||||
if (!conversationId) return;
|
||||
try {
|
||||
const id = streamIdentity(conversationId, model);
|
||||
await fetch(`${API_STREAM.BASE}/${encodeURIComponent(id)}`, {
|
||||
method: 'DELETE',
|
||||
headers: getAuthHeaders()
|
||||
});
|
||||
} catch (e) {
|
||||
console.warn('cancelServerStream failed:', e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Pick the running session to splice into when discoverActiveStream lists candidates for a
|
||||
* conversation. Finalized sessions are not candidates: their final content was already written
|
||||
* to the DB by the original onComplete handler, so attaching to them would replay a buffer that
|
||||
* may not match what the DB holds. A continue session's buffer holds only the appended deltas,
|
||||
* not the pre continue prefix, so replaying it as a fresh generation would erase the original.
|
||||
*
|
||||
* Among running sessions we tie break on the most recent started_at, which covers the case of
|
||||
* multiple inferences left running on the same conversation.
|
||||
*/
|
||||
static selectActiveStream(
|
||||
sessions: ApiStreamSession[] | null | undefined
|
||||
): ApiStreamSession | null {
|
||||
if (!Array.isArray(sessions) || sessions.length === 0) {
|
||||
return null;
|
||||
}
|
||||
const running = sessions.filter((s) => !s.is_done);
|
||||
if (running.length === 0) {
|
||||
return null;
|
||||
}
|
||||
return running.reduce((best, cur) => (cur.started_at > best.started_at ? cur : best));
|
||||
}
|
||||
|
||||
// persist the running byte count and the frozen model for a conversation, a later visit
|
||||
// resumes the SSE replay at the right offset under the same conv::model identity
|
||||
static saveStreamState(
|
||||
conversationId: string,
|
||||
bytesReceived: number,
|
||||
model?: string | null
|
||||
): void {
|
||||
if (!conversationId) return;
|
||||
try {
|
||||
const state: ResumableStreamState = {
|
||||
bytesReceived,
|
||||
updatedAt: Date.now(),
|
||||
model: model ?? null
|
||||
};
|
||||
localStorage.setItem(streamStorageKey(conversationId), JSON.stringify(state));
|
||||
} catch {
|
||||
// localStorage may be full or disabled, silently ignore
|
||||
}
|
||||
}
|
||||
|
||||
static getStreamState(conversationId: string): ResumableStreamState | null {
|
||||
if (!conversationId) return null;
|
||||
try {
|
||||
const raw = localStorage.getItem(streamStorageKey(conversationId));
|
||||
if (!raw) return null;
|
||||
const parsed = JSON.parse(raw) as ResumableStreamState;
|
||||
if (!parsed || typeof parsed.bytesReceived !== 'number') return null;
|
||||
return parsed;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
static clearStreamState(conversationId: string): void {
|
||||
if (!conversationId) return;
|
||||
try {
|
||||
localStorage.removeItem(streamStorageKey(conversationId));
|
||||
} catch {
|
||||
// nothing to do
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Rebuild the stream identity for a resume. The model persisted at POST time wins, including a
|
||||
* stored null which means the POST carried no explicit model so the identity stays the bare conv
|
||||
* id. Only fall back to the caller supplied current model when nothing was persisted.
|
||||
*/
|
||||
static resumeStreamIdentity(
|
||||
conversationId: string,
|
||||
state: ResumableStreamState | null,
|
||||
fallbackModel: string | null
|
||||
): string {
|
||||
const model = state && state.model !== undefined ? state.model : fallbackModel;
|
||||
return streamIdentity(conversationId, model);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconnect to an interrupted stream for this conversation. Returns the fetch Response so the
|
||||
* existing SSE parser drains it like a fresh stream. The server returns 200 on success, 404 if
|
||||
* no session exists for the conv_id, and 400 if the offset is below the dropped prefix.
|
||||
*/
|
||||
static async resumeStream(
|
||||
conversationId: string,
|
||||
signal?: AbortSignal,
|
||||
model?: string | null
|
||||
): Promise<Response | null> {
|
||||
if (!conversationId) return null;
|
||||
const state = ChatService.getStreamState(conversationId);
|
||||
const from = state?.bytesReceived ?? 0;
|
||||
const id = streamIdentity(conversationId, model);
|
||||
const url = `${API_STREAM.BASE}/${encodeURIComponent(id)}?from=${from}`;
|
||||
return await fetch(url, { method: 'GET', signal, headers: getAuthHeaders() });
|
||||
}
|
||||
|
||||
static async preEncode(
|
||||
messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
|
||||
model?: string | null,
|
||||
@ -696,7 +557,7 @@ export class ChatService {
|
||||
* @returns {Promise<void>} Promise that resolves when streaming is complete
|
||||
* @throws {Error} if the stream cannot be read or parsed
|
||||
*/
|
||||
static async handleStreamResponse(
|
||||
private static async handleStreamResponse(
|
||||
response: Response,
|
||||
onChunk?: (chunk: string) => void,
|
||||
onComplete?: (
|
||||
@ -712,34 +573,15 @@ export class ChatService {
|
||||
onCompletionId?: (id: string) => void,
|
||||
onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
|
||||
conversationId?: string,
|
||||
abortSignal?: AbortSignal,
|
||||
onConnectionState?: (state: StreamConnectionState) => void,
|
||||
streamModel?: string | null
|
||||
abortSignal?: AbortSignal
|
||||
): Promise<void> {
|
||||
let reader = response.body?.getReader();
|
||||
const reader = response.body?.getReader();
|
||||
|
||||
if (!reader) {
|
||||
throw new Error('No response body');
|
||||
}
|
||||
|
||||
// bytesParsed is the absolute server side buffer offset of the next byte to parse
|
||||
// segmentStartOffset is the absolute offset where the current reader started, reset on resume
|
||||
// segmentBytesRead is wire bytes read by the current reader
|
||||
let bytesParsed = 0;
|
||||
let segmentStartOffset = 0;
|
||||
let segmentBytesRead = 0;
|
||||
let lastByteAt = Date.now();
|
||||
// each resume must produce at least one byte to be retried again
|
||||
// if a resume returns 200 but yields nothing, we abandon
|
||||
// since the session has a bounded size, the total number of retries is bounded by construction
|
||||
let madeProgress = true;
|
||||
const encoder = new TextEncoder();
|
||||
if (conversationId) {
|
||||
ChatService.saveStreamState(conversationId, 0, streamModel);
|
||||
}
|
||||
onConnectionState?.(StreamConnectionState.STREAMING);
|
||||
|
||||
let decoder = new TextDecoder();
|
||||
const decoder = new TextDecoder();
|
||||
let aggregatedContent = '';
|
||||
let fullReasoningContent = '';
|
||||
let aggregatedToolCalls: ApiChatCompletionToolCall[] = [];
|
||||
@ -791,180 +633,84 @@ export class ChatService {
|
||||
}
|
||||
};
|
||||
|
||||
const onVisibilityChange = () => {
|
||||
if (typeof document === 'undefined') return;
|
||||
if (document.visibilityState !== 'visible') return;
|
||||
if (streamFinished) return;
|
||||
if (!conversationId) return;
|
||||
// the bytes have been quiet for too long, the OS likely killed the socket
|
||||
// kicking the reader unblocks reader.read with done=true so the outer loop can resume
|
||||
if (Date.now() - lastByteAt > STREAM_VISIBILITY_KICK_MS) {
|
||||
reader!.cancel().catch(() => {});
|
||||
}
|
||||
};
|
||||
if (typeof document !== 'undefined') {
|
||||
document.addEventListener('visibilitychange', onVisibilityChange);
|
||||
}
|
||||
|
||||
try {
|
||||
let chunk = '';
|
||||
// outer loop drives the resume cycle, swaps reader on premature end of stream
|
||||
while (true) {
|
||||
while (true) {
|
||||
if (abortSignal?.aborted) break;
|
||||
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
|
||||
if (abortSignal?.aborted) break;
|
||||
|
||||
chunk += decoder.decode(value, { stream: true });
|
||||
const lines = chunk.split(SSE_LINE_SEPARATOR);
|
||||
chunk = lines.pop() || '';
|
||||
|
||||
for (const line of lines) {
|
||||
if (abortSignal?.aborted) break;
|
||||
|
||||
let done: boolean;
|
||||
let value: Uint8Array | undefined;
|
||||
try {
|
||||
const r = await reader.read();
|
||||
done = r.done;
|
||||
value = r.value;
|
||||
} catch (readErr) {
|
||||
// reader.read() rejects with TypeError when the underlying connection drops
|
||||
// instead of just resolving with done=true. treat it like done so the outer
|
||||
// loop swaps reader via the resume path
|
||||
if (isAbortError(readErr)) {
|
||||
throw readErr;
|
||||
if (line.startsWith(SSE_DATA_PREFIX)) {
|
||||
const data = line.slice(SSE_DATA_PREFIX.length).trim();
|
||||
if (data === SSE_DONE_MARKER) {
|
||||
streamFinished = true;
|
||||
|
||||
continue;
|
||||
}
|
||||
console.warn('reader.read() rejected, treating as premature end:', readErr);
|
||||
done = true;
|
||||
value = undefined;
|
||||
}
|
||||
if (done) break;
|
||||
|
||||
if (abortSignal?.aborted) break;
|
||||
try {
|
||||
const parsed: ApiChatCompletionStreamChunk = JSON.parse(data);
|
||||
const choice = parsed.choices?.[0];
|
||||
const content = choice?.delta?.content;
|
||||
const reasoningContent = choice?.delta?.reasoning_content;
|
||||
const toolCalls = choice?.delta?.tool_calls;
|
||||
const timings = parsed.timings;
|
||||
const promptProgress = parsed.prompt_progress;
|
||||
|
||||
if (value && value.byteLength > 0) {
|
||||
segmentBytesRead += value.byteLength;
|
||||
lastByteAt = Date.now();
|
||||
if (!madeProgress) {
|
||||
madeProgress = true;
|
||||
onConnectionState?.(StreamConnectionState.STREAMING);
|
||||
}
|
||||
}
|
||||
|
||||
chunk += decoder.decode(value, { stream: true });
|
||||
const lines = chunk.split(SSE_LINE_SEPARATOR);
|
||||
chunk = lines.pop() || '';
|
||||
|
||||
// the persisted offset must point right after the last fully parsed line,
|
||||
// the trailing `chunk` is partial bytes still waiting for a newline
|
||||
if (conversationId) {
|
||||
const tailBytes = encoder.encode(chunk).byteLength;
|
||||
bytesParsed = segmentStartOffset + segmentBytesRead - tailBytes;
|
||||
ChatService.saveStreamState(conversationId, bytesParsed, streamModel);
|
||||
}
|
||||
|
||||
for (const line of lines) {
|
||||
if (abortSignal?.aborted) break;
|
||||
|
||||
if (line.startsWith(SSE_DATA_PREFIX)) {
|
||||
const data = line.slice(SSE_DATA_PREFIX.length).trim();
|
||||
if (data === SSE_DONE_MARKER) {
|
||||
streamFinished = true;
|
||||
|
||||
continue;
|
||||
const chunkModel = ChatService.extractModelName(parsed);
|
||||
if (chunkModel && !modelEmitted) {
|
||||
modelEmitted = true;
|
||||
onModel?.(chunkModel);
|
||||
}
|
||||
|
||||
try {
|
||||
const parsed: ApiChatCompletionStreamChunk = JSON.parse(data);
|
||||
const choice = parsed.choices?.[0];
|
||||
const content = choice?.delta?.content;
|
||||
const reasoningContent = choice?.delta?.reasoning_content;
|
||||
const toolCalls = choice?.delta?.tool_calls;
|
||||
const timings = parsed.timings;
|
||||
const promptProgress = parsed.prompt_progress;
|
||||
|
||||
const chunkModel = ChatService.extractModelName(parsed);
|
||||
if (chunkModel && !modelEmitted) {
|
||||
modelEmitted = true;
|
||||
onModel?.(chunkModel);
|
||||
}
|
||||
|
||||
if (parsed.id && !idEmitted) {
|
||||
idEmitted = true;
|
||||
onCompletionId?.(parsed.id);
|
||||
}
|
||||
|
||||
if (promptProgress) {
|
||||
ChatService.notifyTimings(undefined, promptProgress, onTimings);
|
||||
}
|
||||
|
||||
if (timings) {
|
||||
ChatService.notifyTimings(timings, promptProgress, onTimings);
|
||||
lastTimings = timings;
|
||||
}
|
||||
|
||||
if (content) {
|
||||
finalizeOpenToolCallBatch();
|
||||
aggregatedContent += content;
|
||||
if (!abortSignal?.aborted) {
|
||||
onChunk?.(content);
|
||||
}
|
||||
}
|
||||
|
||||
if (reasoningContent) {
|
||||
finalizeOpenToolCallBatch();
|
||||
fullReasoningContent += reasoningContent;
|
||||
if (!abortSignal?.aborted) {
|
||||
onReasoningChunk?.(reasoningContent);
|
||||
}
|
||||
}
|
||||
|
||||
processToolCallDelta(toolCalls);
|
||||
} catch (e) {
|
||||
console.error('Error parsing JSON chunk:', e);
|
||||
if (parsed.id && !idEmitted) {
|
||||
idEmitted = true;
|
||||
onCompletionId?.(parsed.id);
|
||||
}
|
||||
|
||||
if (promptProgress) {
|
||||
ChatService.notifyTimings(undefined, promptProgress, onTimings);
|
||||
}
|
||||
|
||||
if (timings) {
|
||||
ChatService.notifyTimings(timings, promptProgress, onTimings);
|
||||
lastTimings = timings;
|
||||
}
|
||||
|
||||
if (content) {
|
||||
finalizeOpenToolCallBatch();
|
||||
aggregatedContent += content;
|
||||
if (!abortSignal?.aborted) {
|
||||
onChunk?.(content);
|
||||
}
|
||||
}
|
||||
|
||||
if (reasoningContent) {
|
||||
finalizeOpenToolCallBatch();
|
||||
fullReasoningContent += reasoningContent;
|
||||
if (!abortSignal?.aborted) {
|
||||
onReasoningChunk?.(reasoningContent);
|
||||
}
|
||||
}
|
||||
|
||||
processToolCallDelta(toolCalls);
|
||||
} catch (e) {
|
||||
console.error('Error parsing JSON chunk:', e);
|
||||
}
|
||||
}
|
||||
|
||||
if (abortSignal?.aborted) break;
|
||||
if (streamFinished) break;
|
||||
}
|
||||
|
||||
// inner reader done, decide whether to try a resume
|
||||
if (abortSignal?.aborted) break;
|
||||
if (streamFinished) break;
|
||||
if (!conversationId) break;
|
||||
|
||||
if (!madeProgress) {
|
||||
onConnectionState?.(StreamConnectionState.LOST);
|
||||
onError?.(new Error('Stream resume produced no new bytes, giving up'));
|
||||
break;
|
||||
}
|
||||
|
||||
onConnectionState?.(StreamConnectionState.RESUMING);
|
||||
madeProgress = false;
|
||||
|
||||
// the server resends starting at bytesParsed, discard any partial line we held, it
|
||||
// will be retransmitted from a clean line boundary. reuse the frozen model, not the
|
||||
// live dropdown
|
||||
const resumeResp = await ChatService.resumeStream(
|
||||
conversationId,
|
||||
abortSignal,
|
||||
streamModel
|
||||
).catch(() => null);
|
||||
// an abort landing during the resume request is intentional, not a lost connection
|
||||
if (abortSignal?.aborted) break;
|
||||
if (!resumeResp || resumeResp.status !== 200) {
|
||||
onConnectionState?.(StreamConnectionState.LOST);
|
||||
onError?.(new Error('Stream connection lost and could not be resumed'));
|
||||
break;
|
||||
}
|
||||
const newReader = resumeResp.body?.getReader();
|
||||
if (!newReader) break;
|
||||
|
||||
try {
|
||||
reader.releaseLock();
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
reader = newReader;
|
||||
decoder = new TextDecoder();
|
||||
chunk = '';
|
||||
segmentStartOffset = bytesParsed;
|
||||
segmentBytesRead = 0;
|
||||
lastByteAt = Date.now();
|
||||
}
|
||||
|
||||
if (abortSignal?.aborted) return;
|
||||
@ -972,10 +718,6 @@ export class ChatService {
|
||||
if (streamFinished) {
|
||||
finalizeOpenToolCallBatch();
|
||||
|
||||
if (conversationId) {
|
||||
ChatService.clearStreamState(conversationId);
|
||||
}
|
||||
|
||||
const finalToolCalls =
|
||||
aggregatedToolCalls.length > 0 ? JSON.stringify(aggregatedToolCalls) : undefined;
|
||||
|
||||
@ -993,14 +735,7 @@ export class ChatService {
|
||||
|
||||
throw err;
|
||||
} finally {
|
||||
if (typeof document !== 'undefined') {
|
||||
document.removeEventListener('visibilitychange', onVisibilityChange);
|
||||
}
|
||||
try {
|
||||
reader.releaseLock();
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
reader.releaseLock();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -628,20 +628,19 @@ export class MCPService {
|
||||
);
|
||||
|
||||
const runtimeErrorHandler = (error: Error) => {
|
||||
// the SDK reports any post initialize error here, including the abort we trigger
|
||||
// ourselves on the next health check cycle, on tab unload, or on server teardown.
|
||||
// these are lifecycle aborts, not actionable errors, so we keep them out of the red console.
|
||||
// the SDK wraps the original AbortError in a generic Error like
|
||||
// "SSE stream disconnected: AbortError: The operation was aborted."
|
||||
// which isAbortError cannot recognize by name alone, so we also pattern match on the message
|
||||
if (isAbortError(error)) {
|
||||
return;
|
||||
}
|
||||
const msg = error?.message ?? '';
|
||||
// Ignore errors that are expected when the SDK's transport is closed,
|
||||
// or when connecting to servers that don't support SSE (stateless-only
|
||||
// endpoints returning 405). The SDK wraps the original AbortError in
|
||||
// a new Error with the message "SSE stream disconnected: AbortError",
|
||||
// and also produces "Cannot cancel a stream locked by a reader".
|
||||
// DOMException is thrown by the browser when aborting fetch requests.
|
||||
const msg = error.message || String(error);
|
||||
if (
|
||||
/SSE stream disconnected:.*AbortError/i.test(msg) ||
|
||||
/AbortError: .*aborted/i.test(msg) ||
|
||||
/stream locked by a reader/i.test(msg)
|
||||
error.name === 'AbortError' ||
|
||||
error instanceof DOMException ||
|
||||
msg.includes('SSE stream disconnected') ||
|
||||
msg.includes('stream locked by a reader') ||
|
||||
msg.includes('The operation was aborted')
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -614,7 +614,7 @@ class AgenticStore {
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
conversationId,
|
||||
undefined,
|
||||
signal
|
||||
);
|
||||
|
||||
|
||||
@ -11,11 +11,9 @@
|
||||
* @see ChatService in services/chat.service.ts for API operations
|
||||
*/
|
||||
|
||||
import { SvelteMap, SvelteSet } from 'svelte/reactivity';
|
||||
import { SvelteMap } from 'svelte/reactivity';
|
||||
import { DatabaseService } from '$lib/services/database.service';
|
||||
import { ChatService } from '$lib/services/chat.service';
|
||||
import { streamIdentity } from '$lib/utils/stream-identity';
|
||||
import { getAuthHeaders } from '$lib/utils/api-headers';
|
||||
import { conversationsStore } from '$lib/stores/conversations.svelte';
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
import { agenticStore } from '$lib/stores/agentic.svelte';
|
||||
@ -51,17 +49,10 @@ import type {
|
||||
import type {
|
||||
ApiChatMessageData,
|
||||
ApiProcessingState,
|
||||
ApiStreamSession,
|
||||
DatabaseMessage,
|
||||
DatabaseMessageExtra
|
||||
} from '$lib/types';
|
||||
import {
|
||||
ContinueIntentKind,
|
||||
ErrorDialogType,
|
||||
MessageRole,
|
||||
MessageType,
|
||||
StreamConnectionState
|
||||
} from '$lib/enums';
|
||||
import { ContinueIntentKind, ErrorDialogType, MessageRole, MessageType } from '$lib/enums';
|
||||
|
||||
interface ConversationStateEntry {
|
||||
lastAccessed: number;
|
||||
@ -74,25 +65,9 @@ class ChatStore {
|
||||
isLoading = $state(false);
|
||||
// true while the active conversation streams reasoning content but no visible content yet
|
||||
isReasoning = $state(false);
|
||||
// resumable stream connection state for the active conversation
|
||||
// streaming -> bytes flowing normally, resuming -> waiting on /v1/stream/:id reconnect, lost -> unrecoverable
|
||||
streamConnectionState = $state<StreamConnectionState>(StreamConnectionState.STREAMING);
|
||||
chatLoadingStates = new SvelteMap<string, boolean>();
|
||||
chatReasoningStates = new SvelteMap<string, boolean>();
|
||||
chatStreamingStates = new SvelteMap<
|
||||
string,
|
||||
{ response: string; messageId: string; model?: string | null }
|
||||
>();
|
||||
// convs that the backend reports as having a running session, populated by the global sync
|
||||
// at app mount and on visibilitychange. it does not overlap with chatLoadingStates which
|
||||
// tracks inferences driven by this browser, both are unioned to feed the sidebar spinners
|
||||
private remoteRunningConvs = new SvelteSet<string>();
|
||||
// per conv attach lifecycle, used to derive the global streaming flag without flipping it
|
||||
// off when one conv finishes while another is still streaming. mirrors chatLoadingStates
|
||||
// in scope but tracks the attach + tee replay path specifically
|
||||
private attachingConvs = new SvelteSet<string>();
|
||||
// in-flight discoverActiveStream guard, keyed by conv id
|
||||
private discoveringConvs = new SvelteSet<string>();
|
||||
chatStreamingStates = new SvelteMap<string, { response: string; messageId: string }>();
|
||||
private abortControllers = new SvelteMap<string, AbortController>();
|
||||
private preEncodeAbortController: AbortController | null = null;
|
||||
private processingStates = new SvelteMap<string, ApiProcessingState | null>();
|
||||
@ -123,11 +98,6 @@ class ChatStore {
|
||||
this.chatLoadingStates.delete(convId);
|
||||
if (convId === conversationsStore.activeConversation?.id) this.isLoading = false;
|
||||
this.setChatReasoning(convId, false);
|
||||
// the local pipe is the authoritative observer of session end: when it finishes (clean
|
||||
// onComplete or explicit Stop), the backend session is finalized too, so we drop the
|
||||
// sidebar hint for this conv right away instead of waiting for the next visibilitychange
|
||||
// snapshot. without this the spinner ghosts until the user toggles the tab
|
||||
this.remoteRunningConvs.delete(convId);
|
||||
}
|
||||
}
|
||||
|
||||
@ -140,18 +110,9 @@ class ChatStore {
|
||||
if (convId === conversationsStore.activeConversation?.id) this.isReasoning = false;
|
||||
}
|
||||
}
|
||||
private setChatStreaming(
|
||||
convId: string,
|
||||
response: string,
|
||||
messageId: string,
|
||||
model?: string | null
|
||||
): void {
|
||||
private setChatStreaming(convId: string, response: string, messageId: string): void {
|
||||
this.touchConversationState(convId);
|
||||
this.chatStreamingStates.set(convId, {
|
||||
response,
|
||||
messageId,
|
||||
model: model ?? this.chatStreamingStates.get(convId)?.model
|
||||
});
|
||||
this.chatStreamingStates.set(convId, { response, messageId });
|
||||
if (convId === conversationsStore.activeConversation?.id) this.currentResponse = response;
|
||||
}
|
||||
private clearChatStreaming(convId: string): void {
|
||||
@ -176,314 +137,6 @@ class ChatStore {
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Server side stream discovery, split in three pieces:
|
||||
*
|
||||
* probeServerStream(convId) -> hits POST /v1/streams/lookup with the conv id, returns the session to attach
|
||||
* to or null. Pure read, no side effect, no UI lock. Safe to fire in parallel with anything.
|
||||
*
|
||||
* attachServerStream(convId) -> flips the spinner immediately, fetches the replay stream
|
||||
* from byte 0, finds the assistant slot to splice into (creates a placeholder if the conv has
|
||||
* no assistant message yet, for cross device or fresh local DB cases), and pipes the SSE bytes
|
||||
* into the message via handleStreamResponse.
|
||||
*
|
||||
* discoverActiveStream(convId) -> probe + attach in one call. Used by callers that do not need
|
||||
* to overlap the probe with other async work.
|
||||
*
|
||||
* The mount of the chat page in +page.svelte calls probeServerStream in parallel with
|
||||
* loadConversation, then attachServerStream once both have settled. This gives the earliest
|
||||
* possible time to spinner and avoids racing against an empty activeMessages array.
|
||||
*/
|
||||
async probeServerStream(convId: string): Promise<ApiStreamSession | null> {
|
||||
if (!convId) return null;
|
||||
let listResp: Response;
|
||||
try {
|
||||
// POST the one conv id we are probing
|
||||
listResp = await fetch(`./v1/streams/lookup`, {
|
||||
method: 'POST',
|
||||
headers: { ...getAuthHeaders(), 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ conversation_ids: [convId] })
|
||||
});
|
||||
} catch (e) {
|
||||
console.warn('probeServerStream fetch failed:', e);
|
||||
return null;
|
||||
}
|
||||
if (!listResp.ok) {
|
||||
console.warn(`probeServerStream got HTTP ${listResp.status} for conv ${convId}`);
|
||||
return null;
|
||||
}
|
||||
let sessions: ApiStreamSession[];
|
||||
try {
|
||||
sessions = (await listResp.json()) as ApiStreamSession[];
|
||||
} catch (e) {
|
||||
console.warn('probeServerStream JSON parse failed:', e);
|
||||
return null;
|
||||
}
|
||||
return ChatService.selectActiveStream(sessions);
|
||||
}
|
||||
|
||||
async attachServerStream(convId: string, streamId?: string): Promise<void> {
|
||||
if (!convId) return;
|
||||
if (this.chatStreamingStates.has(convId)) return;
|
||||
|
||||
// flip the spinner immediately, the user sees activity as soon as the conv becomes active.
|
||||
// the global isStreamingActive flag is derived from attachingConvs.size, so adding here
|
||||
// turns it on, and removing in unlock only turns it off when this is the last attach
|
||||
this.setChatLoading(convId, true);
|
||||
this.attachingConvs.add(convId);
|
||||
this.setStreamingActive(true);
|
||||
// only set the active processing conv if we are looking at it, otherwise a background
|
||||
// attach would steal the indicator from the conv the user is currently viewing
|
||||
if (convId === conversationsStore.activeConversation?.id) {
|
||||
this.setActiveProcessingConversation(convId);
|
||||
}
|
||||
|
||||
const unlock = () => {
|
||||
this.attachingConvs.delete(convId);
|
||||
// flip the global flag off only when no other conv is still attaching
|
||||
if (this.attachingConvs.size === 0) {
|
||||
this.setStreamingActive(false);
|
||||
}
|
||||
this.setChatLoading(convId, false);
|
||||
this.clearChatStreaming(convId);
|
||||
};
|
||||
|
||||
// fetch the replay stream from byte 0, rebuild the assistant message from scratch.
|
||||
// resolve the server side identity, fall back to streamIdentity when the caller does not
|
||||
// pass a streamId. probeServerStream returns the full id (with ::model suffix when present)
|
||||
const id = streamId || streamIdentity(convId, selectedModelName());
|
||||
let response: Response;
|
||||
try {
|
||||
response = await fetch(`./v1/stream/${encodeURIComponent(id)}?from=0`, {
|
||||
headers: getAuthHeaders()
|
||||
});
|
||||
} catch (e) {
|
||||
console.error('attachServerStream replay fetch failed:', e);
|
||||
unlock();
|
||||
return;
|
||||
}
|
||||
if (!response.ok) {
|
||||
console.warn(`attachServerStream replay got HTTP ${response.status} for conv ${convId}`);
|
||||
unlock();
|
||||
return;
|
||||
}
|
||||
|
||||
// load the target conversation messages by id, not via the active store. when multiple
|
||||
// attaches run in parallel the active store may reflect another conv and writing through
|
||||
// its index mixes content across convs (CoT flicker, message bleed). by going through the
|
||||
// DB we stay isolated, and only mirror into the active store when the attached conv is
|
||||
// the one currently displayed
|
||||
let messages: DatabaseMessage[];
|
||||
try {
|
||||
messages = await DatabaseService.getConversationMessages(convId);
|
||||
} catch (e) {
|
||||
console.error('attachServerStream load messages failed:', e);
|
||||
unlock();
|
||||
return;
|
||||
}
|
||||
|
||||
// locate the slot to splice into, create a placeholder assistant message if there is none.
|
||||
// we use the conv-scoped findLastAssistantIdx helpers, they only depend on the array
|
||||
let targetIdx = this.findLastAssistantIdx(messages);
|
||||
if (targetIdx === -1) {
|
||||
const lastUserIdx = this.findLastUserIdx(messages);
|
||||
if (lastUserIdx === -1) {
|
||||
console.warn(
|
||||
`attachServerStream: conv ${convId} has no user or assistant message, cannot splice`
|
||||
);
|
||||
unlock();
|
||||
return;
|
||||
}
|
||||
try {
|
||||
const placeholder = await DatabaseService.createMessageBranch(
|
||||
{
|
||||
convId,
|
||||
role: MessageRole.ASSISTANT,
|
||||
content: '',
|
||||
type: MessageType.TEXT,
|
||||
timestamp: Date.now(),
|
||||
parent: messages[lastUserIdx].id,
|
||||
children: [],
|
||||
toolCalls: ''
|
||||
} as Omit<DatabaseMessage, 'id'>,
|
||||
messages[lastUserIdx].id
|
||||
);
|
||||
messages = [...messages, placeholder];
|
||||
targetIdx = messages.length - 1;
|
||||
// only push into the active store when this conv is the one displayed right now
|
||||
if (convId === conversationsStore.activeConversation?.id) {
|
||||
conversationsStore.addMessageToActive(placeholder);
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('attachServerStream placeholder creation failed:', e);
|
||||
unlock();
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (targetIdx === -1) {
|
||||
unlock();
|
||||
return;
|
||||
}
|
||||
const targetMessage = messages[targetIdx];
|
||||
const targetMessageId = targetMessage.id;
|
||||
// when the assistant slot already has content, the running session is a continue or
|
||||
// another append flow and its buffer holds only the appended deltas. preserve the prefix
|
||||
// and let the replay add to it. when the slot is empty the session buffer holds the whole
|
||||
// message so we wipe and rebuild from byte 0
|
||||
const existingContent = targetMessage.content ?? '';
|
||||
const existingReasoning = targetMessage.reasoningContent ?? '';
|
||||
const isAppendMode = existingContent.length > 0;
|
||||
|
||||
// helper: write to the active store only when the attached conv is currently displayed.
|
||||
// the lookup by message id is robust to reordering of activeMessages, two parallel attaches
|
||||
// can no longer step on each other's indices
|
||||
const writeActive = (updates: Partial<DatabaseMessage>) => {
|
||||
if (convId !== conversationsStore.activeConversation?.id) {
|
||||
return;
|
||||
}
|
||||
const liveIdx = conversationsStore.findMessageIndex(targetMessageId);
|
||||
if (liveIdx === -1) return;
|
||||
conversationsStore.updateMessageAtIndex(liveIdx, updates);
|
||||
};
|
||||
|
||||
if (!isAppendMode) {
|
||||
writeActive({ content: '', reasoningContent: undefined });
|
||||
}
|
||||
|
||||
// extract the model suffix, the resume calls in handleStreamResponse must reuse the model
|
||||
// the session was tagged with, not the live dropdown
|
||||
const sepIdx = id.indexOf('::');
|
||||
const attachedModel: string | null = sepIdx === -1 ? null : id.slice(sepIdx + 2);
|
||||
this.setChatStreaming(convId, existingContent, targetMessageId, attachedModel);
|
||||
const abortController = this.getOrCreateAbortController(convId);
|
||||
|
||||
let streamedContent = '';
|
||||
let streamedReasoningContent = '';
|
||||
|
||||
const cleanup = () => {
|
||||
unlock();
|
||||
this.setProcessingState(convId, null);
|
||||
};
|
||||
|
||||
try {
|
||||
await ChatService.handleStreamResponse(
|
||||
response,
|
||||
(chunk: string) => {
|
||||
streamedContent += chunk;
|
||||
const displayed = isAppendMode ? existingContent + streamedContent : streamedContent;
|
||||
writeActive({ content: displayed });
|
||||
this.setChatStreaming(convId, displayed, targetMessageId);
|
||||
},
|
||||
async (
|
||||
finalContent?: string,
|
||||
reasoningContent?: string,
|
||||
timings?: ChatMessageTimings,
|
||||
toolCalls?: string
|
||||
) => {
|
||||
const streamed = streamedContent || finalContent || '';
|
||||
const streamedR = streamedReasoningContent || reasoningContent || '';
|
||||
const content = isAppendMode ? existingContent + streamed : streamed;
|
||||
const reasoning = isAppendMode ? existingReasoning + streamedR : streamedR;
|
||||
// the DB write is the source of truth, mirror to the active store only when
|
||||
// the conv is currently displayed
|
||||
await DatabaseService.updateMessage(targetMessageId, {
|
||||
content,
|
||||
reasoningContent: reasoning || undefined,
|
||||
toolCalls: toolCalls || '',
|
||||
timings
|
||||
});
|
||||
writeActive({
|
||||
content,
|
||||
reasoningContent: reasoning || undefined,
|
||||
timings
|
||||
});
|
||||
cleanup();
|
||||
},
|
||||
(err: Error) => {
|
||||
console.error('attachServerStream pipe error:', err);
|
||||
cleanup();
|
||||
},
|
||||
(chunk: string) => {
|
||||
streamedReasoningContent += chunk;
|
||||
const displayed = isAppendMode
|
||||
? existingReasoning + streamedReasoningContent
|
||||
: streamedReasoningContent;
|
||||
writeActive({ reasoningContent: displayed });
|
||||
},
|
||||
undefined,
|
||||
undefined,
|
||||
undefined,
|
||||
undefined,
|
||||
convId,
|
||||
abortController.signal,
|
||||
(connState: StreamConnectionState) => {
|
||||
if (convId === conversationsStore.activeConversation?.id) {
|
||||
this.streamConnectionState = connState;
|
||||
}
|
||||
},
|
||||
attachedModel
|
||||
);
|
||||
} catch (e) {
|
||||
console.error('attachServerStream pipe crashed:', e);
|
||||
cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
async discoverActiveStream(convId: string): Promise<void> {
|
||||
if (!convId) return;
|
||||
if (this.chatStreamingStates.has(convId)) return;
|
||||
if (this.chatLoadingStates.get(convId)) return;
|
||||
// concurrency guard: another discover may already be running for this conv (typical race
|
||||
// between mount and visibilitychange on tab switch). a second concurrent fetch on the same
|
||||
// /v1/stream/<id> would duplicate every byte into the DB message, this guard bounces it
|
||||
if (this.discoveringConvs.has(convId)) return;
|
||||
this.discoveringConvs.add(convId);
|
||||
|
||||
try {
|
||||
// the model is frozen at POST time, rebuild the exact conv::model identity from the
|
||||
// persisted state so the lookup key matches what the server stored. null means a single
|
||||
// model conv with no ::suffix, only guess from the dropdown with no persisted state
|
||||
const localState = ChatService.getStreamState(convId);
|
||||
const streamId = ChatService.resumeStreamIdentity(convId, localState, selectedModelName());
|
||||
|
||||
// primary path: ask the server which sessions exist for this identity
|
||||
const serverTarget = await this.probeServerStream(streamId);
|
||||
if (serverTarget) {
|
||||
// pass the full server side identity (may carry a ::model suffix) so the GET routes
|
||||
// straight to the owning session, no probe or fan out
|
||||
await this.attachServerStream(convId, serverTarget.conversation_id);
|
||||
return;
|
||||
}
|
||||
|
||||
// fallback: local state remembers an interrupted byte offset for this conv, the server may
|
||||
// still have a live session matching that identity (we just lost the bytes mid stream). retry
|
||||
// with the frozen identity, the server probe inside attachServerStream tells us if it exists
|
||||
if (!localState) {
|
||||
return;
|
||||
}
|
||||
await this.attachServerStream(convId, streamId);
|
||||
// if attachServerStream failed (session gone, TTL expired), clear the local state to avoid retrying forever
|
||||
if (!this.chatStreamingStates.has(convId) && !this.chatLoadingStates.get(convId)) {
|
||||
ChatService.clearStreamState(convId);
|
||||
}
|
||||
} finally {
|
||||
this.discoveringConvs.delete(convId);
|
||||
}
|
||||
}
|
||||
|
||||
private findLastAssistantIdx(messages: DatabaseMessage[]): number {
|
||||
for (let i = messages.length - 1; i >= 0; i--) {
|
||||
if (messages[i].role === MessageRole.ASSISTANT) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private findLastUserIdx(messages: DatabaseMessage[]): number {
|
||||
for (let i = messages.length - 1; i >= 0; i--) {
|
||||
if (messages[i].role === MessageRole.USER) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
clearUIState(): void {
|
||||
this.isLoading = false;
|
||||
@ -612,83 +265,13 @@ class ChatStore {
|
||||
}
|
||||
|
||||
getAllLoadingChats(): string[] {
|
||||
// union of local (this browser is piping) and remote (backend reports a running session
|
||||
// for this conv but no local pipe yet) sources. the sidebar shows one spinner per entry
|
||||
const out = new SvelteSet<string>(this.chatLoadingStates.keys());
|
||||
for (const id of this.remoteRunningConvs) {
|
||||
out.add(id);
|
||||
}
|
||||
return Array.from(out);
|
||||
return Array.from(this.chatLoadingStates.keys());
|
||||
}
|
||||
|
||||
getAllStreamingChats(): string[] {
|
||||
return Array.from(this.chatStreamingStates.keys());
|
||||
}
|
||||
|
||||
/**
|
||||
* Resync the remote running convs set from the backend. Called by the layout at mount and on
|
||||
* visibilitychange, no polling. A snapshot semantic: the set is replaced wholesale, stale entries
|
||||
* for sessions that finalized while the browser was elsewhere are dropped naturally.
|
||||
*/
|
||||
async syncRemoteRunningStreams(): Promise<void> {
|
||||
// the conversations store loads from IndexedDB asynchronously, the +layout onMount caller
|
||||
// fires before that finishes. read ids straight from the DB so the result does not depend
|
||||
// on the store init race, and the sidebar spinners light up at first paint for every conv
|
||||
// the user owns even if it has not been hydrated into the store yet
|
||||
let ids: string[];
|
||||
try {
|
||||
const all = await DatabaseService.getAllConversations();
|
||||
ids = all.map((c) => c.id).filter((id) => !!id);
|
||||
} catch (e) {
|
||||
console.warn('syncRemoteRunningStreams DB read failed:', e);
|
||||
return;
|
||||
}
|
||||
// only ask about conv ids the user already owns
|
||||
if (ids.length === 0) {
|
||||
for (const id of Array.from(this.remoteRunningConvs)) {
|
||||
this.remoteRunningConvs.delete(id);
|
||||
}
|
||||
return;
|
||||
}
|
||||
// rebuild the frozen conv::model identity per conv so a session started with a model still
|
||||
// matches. the server response is mapped back to the bare id below for the sidebar set
|
||||
const lookupIds = ids.map((id) =>
|
||||
ChatService.resumeStreamIdentity(id, ChatService.getStreamState(id), null)
|
||||
);
|
||||
let sessions: ApiStreamSession[];
|
||||
try {
|
||||
const resp = await fetch('./v1/streams/lookup', {
|
||||
method: 'POST',
|
||||
headers: { ...getAuthHeaders(), 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ conversation_ids: lookupIds })
|
||||
});
|
||||
if (!resp.ok) return;
|
||||
const body = (await resp.json()) as unknown;
|
||||
if (!Array.isArray(body)) return;
|
||||
sessions = body as ApiStreamSession[];
|
||||
} catch (e) {
|
||||
console.warn('syncRemoteRunningStreams fetch failed:', e);
|
||||
return;
|
||||
}
|
||||
const running = new SvelteSet<string>();
|
||||
for (const s of sessions) {
|
||||
if (s && !s.is_done && typeof s.conversation_id === 'string' && s.conversation_id) {
|
||||
// strip the optional ::model suffix, the sidebar set is keyed by the bare conv id
|
||||
const sepIdx = s.conversation_id.indexOf('::');
|
||||
const bareId = sepIdx === -1 ? s.conversation_id : s.conversation_id.slice(0, sepIdx);
|
||||
running.add(bareId);
|
||||
}
|
||||
}
|
||||
for (const id of Array.from(this.remoteRunningConvs)) {
|
||||
if (!running.has(id)) {
|
||||
this.remoteRunningConvs.delete(id);
|
||||
}
|
||||
}
|
||||
for (const id of running) {
|
||||
this.remoteRunningConvs.add(id);
|
||||
}
|
||||
}
|
||||
|
||||
getChatStreamingPublic(convId: string): { response: string; messageId: string } | undefined {
|
||||
return this.getChatStreaming(convId);
|
||||
}
|
||||
@ -1339,11 +922,6 @@ class ChatStore {
|
||||
onModel: streamCallbacks.onModel,
|
||||
onCompletionId: streamCallbacks.onCompletionId,
|
||||
onTimings: streamCallbacks.onTimings,
|
||||
onConnectionState: (state: StreamConnectionState) => {
|
||||
if (convId === conversationsStore.activeConversation?.id) {
|
||||
this.streamConnectionState = state;
|
||||
}
|
||||
},
|
||||
onComplete: async (
|
||||
finalContent?: string,
|
||||
reasoningContent?: string,
|
||||
@ -1401,12 +979,6 @@ class ChatStore {
|
||||
async stopGenerationForChat(convId: string): Promise<void> {
|
||||
await this.savePartialResponseIfNeeded(convId);
|
||||
this.setStreamingActive(false);
|
||||
// tell the server to stop the generation, not just drop the HTTP socket. without this the
|
||||
// detached drain keeps producing tokens until eos or max_tokens. use the frozen identity
|
||||
// captured when the session started, not the live dropdown
|
||||
const streamStateForStop = this.chatStreamingStates.get(convId);
|
||||
const modelForStop = streamStateForStop?.model ?? selectedModelName();
|
||||
void ChatService.cancelServerStream(convId, modelForStop);
|
||||
this.abortRequest(convId);
|
||||
this.setChatLoading(convId, false);
|
||||
this.clearChatStreaming(convId);
|
||||
@ -1821,11 +1393,7 @@ class ChatStore {
|
||||
|
||||
const updateStreamingContent = (fullContent: string) => {
|
||||
this.setChatStreaming(msg.convId, fullContent, msg.id);
|
||||
// resolve the row by id on every write, switching to another conv mid continue makes
|
||||
// this a no op instead of writing positionally into the now displayed conversation
|
||||
conversationsStore.updateMessageAtIndex(conversationsStore.findMessageIndex(msg.id), {
|
||||
content: fullContent
|
||||
});
|
||||
conversationsStore.updateMessageAtIndex(idx, { content: fullContent });
|
||||
};
|
||||
|
||||
const abortController = this.getOrCreateAbortController(msg.convId);
|
||||
@ -1835,11 +1403,6 @@ class ChatStore {
|
||||
{
|
||||
...this.getApiOptions(),
|
||||
continueFinalMessage: true,
|
||||
onConnectionState: (state: StreamConnectionState) => {
|
||||
if (msg.convId === conversationsStore.activeConversation?.id) {
|
||||
this.streamConnectionState = state;
|
||||
}
|
||||
},
|
||||
onChunk: (chunk: string) => {
|
||||
appendedContent += chunk;
|
||||
hasReceivedContent = true;
|
||||
@ -1851,7 +1414,7 @@ class ChatStore {
|
||||
hasReceivedContent = true;
|
||||
// mark streaming state so a stop mid-thinking can persist the partial reasoning
|
||||
this.setChatStreaming(msg.convId, originalContent + appendedContent, msg.id);
|
||||
conversationsStore.updateMessageAtIndex(conversationsStore.findMessageIndex(msg.id), {
|
||||
conversationsStore.updateMessageAtIndex(idx, {
|
||||
reasoningContent: originalReasoning + appendedReasoning
|
||||
});
|
||||
this.setChatReasoning(msg.convId, true);
|
||||
@ -1892,7 +1455,7 @@ class ChatStore {
|
||||
timings
|
||||
});
|
||||
|
||||
conversationsStore.updateMessageAtIndex(conversationsStore.findMessageIndex(msg.id), {
|
||||
conversationsStore.updateMessageAtIndex(idx, {
|
||||
content: fullContent,
|
||||
reasoningContent: fullReasoning,
|
||||
timestamp: Date.now(),
|
||||
@ -1914,14 +1477,11 @@ class ChatStore {
|
||||
timestamp: Date.now()
|
||||
});
|
||||
|
||||
conversationsStore.updateMessageAtIndex(
|
||||
conversationsStore.findMessageIndex(msg.id),
|
||||
{
|
||||
content: originalContent + appendedContent,
|
||||
reasoningContent: originalReasoning + appendedReasoning || undefined,
|
||||
timestamp: Date.now()
|
||||
}
|
||||
);
|
||||
conversationsStore.updateMessageAtIndex(idx, {
|
||||
content: originalContent + appendedContent,
|
||||
reasoningContent: originalReasoning + appendedReasoning || undefined,
|
||||
timestamp: Date.now()
|
||||
});
|
||||
}
|
||||
|
||||
this.setChatLoading(msg.convId, false);
|
||||
@ -1938,7 +1498,7 @@ class ChatStore {
|
||||
reasoningContent: originalReasoning + appendedReasoning || undefined,
|
||||
timestamp: Date.now()
|
||||
});
|
||||
conversationsStore.updateMessageAtIndex(conversationsStore.findMessageIndex(msg.id), {
|
||||
conversationsStore.updateMessageAtIndex(idx, {
|
||||
content: originalContent + appendedContent,
|
||||
reasoningContent: originalReasoning + appendedReasoning || undefined,
|
||||
timestamp: Date.now()
|
||||
|
||||
15
tools/ui/src/lib/types/api.d.ts
vendored
15
tools/ui/src/lib/types/api.d.ts
vendored
@ -512,18 +512,3 @@ export interface ApiRouterModelsUnloadResponse {
|
||||
success: boolean;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Entry returned by POST /v1/streams/lookup. The client passes the conv ids it owns in the body
|
||||
* and the server returns one entry per matching live or recently completed background streaming
|
||||
* session, keyed by conversation_id. The WebUI uses this at mount and on visibilitychange to
|
||||
* populate sidebar spinners and to reattach to an ongoing inference for the active conversation.
|
||||
* The server never lists ids the client did not ask about, so foreign random UUIDs stay private.
|
||||
*/
|
||||
export interface ApiStreamSession {
|
||||
conversation_id: string;
|
||||
is_done: boolean;
|
||||
total_bytes: number;
|
||||
started_at: number;
|
||||
completed_at: number;
|
||||
}
|
||||
|
||||
@ -34,8 +34,7 @@ export type {
|
||||
ApiRouterModelsListResponse,
|
||||
ApiRouterModelsUnloadRequest,
|
||||
ApiRouterModelsUnloadResponse,
|
||||
AudioInputFormat,
|
||||
ApiStreamSession
|
||||
AudioInputFormat
|
||||
} from './api';
|
||||
|
||||
// Chat types
|
||||
|
||||
6
tools/ui/src/lib/types/settings.d.ts
vendored
6
tools/ui/src/lib/types/settings.d.ts
vendored
@ -4,10 +4,9 @@ import type { OpenAIToolDefinition } from './mcp';
|
||||
import type { DatabaseMessageExtra } from './database';
|
||||
import type {
|
||||
ParameterSource,
|
||||
ReasoningEffort,
|
||||
SyncableParameterType,
|
||||
SettingsFieldType,
|
||||
StreamConnectionState,
|
||||
ReasoningEffort
|
||||
SettingsFieldType
|
||||
} from '$lib/enums';
|
||||
import type { Icon } from '@lucide/svelte';
|
||||
import type { Component } from 'svelte';
|
||||
@ -120,7 +119,6 @@ export interface SettingsChatServiceOptions {
|
||||
toolCalls?: string
|
||||
) => void;
|
||||
onError?: (error: Error) => void;
|
||||
onConnectionState?: (state: StreamConnectionState) => void;
|
||||
}
|
||||
|
||||
export type SettingsConfigType = typeof SETTING_CONFIG_DEFAULT & {
|
||||
|
||||
@ -6,17 +6,6 @@
|
||||
* when needed (e.g., user stops generation, navigates away, etc.).
|
||||
*/
|
||||
|
||||
// the standard DOMException name for a cancelled operation
|
||||
const ABORT_ERROR_NAME = 'AbortError';
|
||||
|
||||
// browser specific TypeError messages emitted when a fetch reader is cut by page unload,
|
||||
// navigation, or a transient network drop. functionally aborts, not actionable errors
|
||||
const ABORT_LIKE_MESSAGE_PATTERNS = [
|
||||
/input stream/i, // Firefox: stream cut at unload
|
||||
/network connection was lost/i, // Safari: transient network drop
|
||||
/load failed/i // Safari: page navigation during fetch
|
||||
];
|
||||
|
||||
/**
|
||||
* Throws an AbortError if the signal is aborted.
|
||||
* Use this at the start of async operations to fail fast.
|
||||
@ -34,7 +23,7 @@ const ABORT_LIKE_MESSAGE_PATTERNS = [
|
||||
*/
|
||||
export function throwIfAborted(signal?: AbortSignal): void {
|
||||
if (signal?.aborted) {
|
||||
throw new DOMException('Operation was aborted', ABORT_ERROR_NAME);
|
||||
throw new DOMException('Operation was aborted', 'AbortError');
|
||||
}
|
||||
}
|
||||
|
||||
@ -59,18 +48,11 @@ export function throwIfAborted(signal?: AbortSignal): void {
|
||||
* ```
|
||||
*/
|
||||
export function isAbortError(error: unknown): boolean {
|
||||
if (error instanceof DOMException && error.name === ABORT_ERROR_NAME) {
|
||||
if (error instanceof DOMException && error.name === 'AbortError') {
|
||||
return true;
|
||||
}
|
||||
if (error instanceof Error) {
|
||||
if (error.name === ABORT_ERROR_NAME) {
|
||||
return true;
|
||||
}
|
||||
// these patterns are functionally aborts, keep them out of the red console
|
||||
if (error instanceof TypeError) {
|
||||
const msg = error.message ?? '';
|
||||
if (ABORT_LIKE_MESSAGE_PATTERNS.some((re) => re.test(msg))) return true;
|
||||
}
|
||||
if (error instanceof Error && error.name === 'AbortError') {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -151,7 +133,7 @@ export async function withAbortSignal<T>(promise: Promise<T>, signal?: AbortSign
|
||||
|
||||
return new Promise<T>((resolve, reject) => {
|
||||
const abortHandler = () => {
|
||||
reject(new DOMException('Operation was aborted', ABORT_ERROR_NAME));
|
||||
reject(new DOMException('Operation was aborted', 'AbortError'));
|
||||
};
|
||||
|
||||
signal.addEventListener('abort', abortHandler, { once: true });
|
||||
|
||||
@ -1,13 +0,0 @@
|
||||
/**
|
||||
* Build the conversation identity used by the server side replay buffer.
|
||||
*
|
||||
* The server identifies a stream session by a conversation id sent in the
|
||||
* X-Conversation-Id header. When the user has explicitly picked a model the
|
||||
* client appends ::modelName, so a per model session stays distinct and the
|
||||
* router resolves the owning child through its conv_id -> model map.
|
||||
*/
|
||||
export function streamIdentity(conversationId: string, model?: string | null): string {
|
||||
if (!conversationId) return '';
|
||||
if (!model) return conversationId;
|
||||
return `${conversationId}::${model}`;
|
||||
}
|
||||
@ -4,7 +4,7 @@
|
||||
import { afterNavigate } from '$app/navigation';
|
||||
import { DialogModelNotAvailable } from '$lib/components/app';
|
||||
import { APP_NAME, ROUTES } from '$lib/constants';
|
||||
import { chatStore } from '$lib/stores/chat.svelte';
|
||||
import { chatStore, isLoading } from '$lib/stores/chat.svelte';
|
||||
import { conversationsStore, activeConversation } from '$lib/stores/conversations.svelte';
|
||||
import { modelsStore, modelOptions } from '$lib/stores/models.svelte';
|
||||
|
||||
@ -83,7 +83,7 @@
|
||||
|
||||
// Skip loading if this conversation is already active (e.g., just created)
|
||||
if (activeConversation()?.id === chatId) {
|
||||
void chatStore.discoverActiveStream(chatId);
|
||||
// Still handle URL params even if conversation is active
|
||||
if ((qParam !== null || modelParam !== null) && !urlParamsProcessed) {
|
||||
handleUrlParams();
|
||||
}
|
||||
@ -92,33 +92,35 @@
|
||||
|
||||
(async () => {
|
||||
const success = await conversationsStore.loadConversation(chatId);
|
||||
if (!success) {
|
||||
await goto(ROUTES.START);
|
||||
return;
|
||||
}
|
||||
chatStore.syncLoadingStateForChat(chatId);
|
||||
// server probe (with localStorage fallback) and attach
|
||||
await chatStore.discoverActiveStream(chatId);
|
||||
if (success) {
|
||||
chatStore.syncLoadingStateForChat(chatId);
|
||||
|
||||
if ((qParam !== null || modelParam !== null) && !urlParamsProcessed) {
|
||||
await handleUrlParams();
|
||||
// Handle URL params after conversation is loaded
|
||||
if ((qParam !== null || modelParam !== null) && !urlParamsProcessed) {
|
||||
await handleUrlParams();
|
||||
}
|
||||
} else {
|
||||
await goto(ROUTES.START);
|
||||
}
|
||||
})();
|
||||
}
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
if (typeof window === 'undefined' || typeof document === 'undefined') return;
|
||||
if (typeof window !== 'undefined') {
|
||||
const handleBeforeUnload = () => {
|
||||
if (isLoading()) {
|
||||
console.log('Page unload detected while streaming - aborting stream');
|
||||
chatStore.stopGeneration();
|
||||
}
|
||||
};
|
||||
|
||||
// when the tab comes back to the foreground, re-run discovery to catch any race
|
||||
// where the initial mount probe missed an active session
|
||||
const onVisibility = () => {
|
||||
if (document.visibilityState !== 'visible') return;
|
||||
if (!chatId) return;
|
||||
void chatStore.discoverActiveStream(chatId);
|
||||
};
|
||||
document.addEventListener('visibilitychange', onVisibility);
|
||||
return () => document.removeEventListener('visibilitychange', onVisibility);
|
||||
window.addEventListener('beforeunload', handleBeforeUnload);
|
||||
|
||||
return () => {
|
||||
window.removeEventListener('beforeunload', handleBeforeUnload);
|
||||
};
|
||||
}
|
||||
});
|
||||
</script>
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user