mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
ci : reduce self-hosted server workflow jobs (#24012)
Reduce the number of parallel jobs in server-self-hosted.yml by stacking test configurations as sequential steps within a single job, following the pattern from #23927. - server-metal: 4 matrix jobs -> 1 job with 4 sequential test steps - server-cuda: 2 matrix jobs -> 1 job with 2 sequential test steps - server-kleidiai: removed unnecessary single-entry matrix - removed unused Setup Node.js step from server-metal Total: 7 parallel jobs -> 3 parallel jobs Assisted-by: llama.cpp:local pi
This commit is contained in:
parent
d5ab0834ab
commit
a468b89018
128
.github/workflows/server-self-hosted.yml
vendored
128
.github/workflows/server-self-hosted.yml
vendored
@ -42,23 +42,6 @@ jobs:
|
||||
server-metal:
|
||||
runs-on: [self-hosted, llama-server, macOS, ARM64]
|
||||
|
||||
name: server-metal (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
- build_type: Release
|
||||
extra_args: "GGML_METAL_DEVICES=2"
|
||||
wf_name: "GPUx2"
|
||||
- build_type: Release
|
||||
extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx2, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@ -67,44 +50,58 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
- name: Tests (GPUx1)
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx1, backend-sampling)
|
||||
id: server_integration_tests_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx2)
|
||||
id: server_integration_tests_gpu2
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export GGML_METAL_DEVICES=2
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx2, backend-sampling)
|
||||
id: server_integration_tests_gpu2_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@ -117,32 +114,36 @@ jobs:
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
- name: Tests (GPUx1)
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx1, backend-sampling)
|
||||
id: server_integration_tests_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
name: server-kleidiai (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
|
||||
extra_args: ""
|
||||
wf_name: "CPUx1, kleidiai"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@ -181,16 +182,21 @@ jobs:
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user