# Builds llama-server on self-hosted / dedicated runners and runs the server
# integration test suite (tools/server/tests) against each build.
name: Server (self-hosted)

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      # NOTE(review): no step in this workflow reads `slow_tests`; every pytest
      # invocation below uses -m "not slow". Confirm whether this input should
      # gate an additional slow-test step or be removed.
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths:
      - '.github/workflows/server-self-hosted.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'tools/server/**.*'

# Least privilege: every job only checks out the repository, builds and runs
# tests, so the GITHUB_TOKEN needs nothing beyond read access to contents.
permissions:
  contents: read

# Logging settings exported to every step. Quoted so the values are strings at
# the YAML level as well (environment variables are always strings).
env:
  LLAMA_ARG_LOG_COLORS: '1'
  LLAMA_ARG_LOG_PREFIX: '1'
  LLAMA_ARG_LOG_TIMESTAMPS: '1'
  LLAMA_ARG_LOG_VERBOSITY: '10'

# github.head_ref is only populated for pull-request events, so with the
# triggers above the group name ends in the run id.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Self-hosted macOS / ARM64 runner. The step names label the runs as
  # single-GPU and dual-GPU, each with and without backend sampling.
  server-metal:
    runs-on: [self-hosted, llama-server, macOS, ARM64]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          # The manually supplied SHA wins; otherwise fall back to the commit
          # or ref that triggered the run.
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server

      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests (GPUx1)
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"

      - name: Tests (GPUx1, backend-sampling)
        id: server_integration_tests_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

      - name: Tests (GPUx2)
        id: server_integration_tests_gpu2
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export GGML_METAL_DEVICES=2
          pytest -v -x -m "not slow"

      - name: Tests (GPUx2, backend-sampling)
        id: server_integration_tests_gpu2_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

  # Self-hosted Linux runner labelled NVIDIA; llama-server is built with
  # -DGGML_CUDA=ON.
  server-cuda:
    runs-on: [self-hosted, llama-server, Linux, NVIDIA]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests (GPUx1)
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"

      - name: Tests (GPUx1, backend-sampling)
        id: server_integration_tests_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

  # Runner `ah-ubuntu_22_04-c8g_8x`; llama-server is built with
  # -DGGML_CPU_KLEIDIAI=ON. Unlike the jobs above, this one installs its own
  # toolchain before building.
  server-kleidiai:
    runs-on: ah-ubuntu_22_04-c8g_8x

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Dependencies
        id: depends
        run: |
          set -euxo pipefail
          sudo apt-get update
          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
            apt-get install -y \
              build-essential \
              libssl-dev \
              python3-venv \
              gpg \
              wget \
              time \
              git-lfs

          git lfs install

          # install the latest cmake
          sudo install -d /usr/share/keyrings
          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
            | gpg --dearmor \
            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
            | sudo tee /etc/apt/sources.list.d/kitware.list
          sudo apt-get update
          sudo apt-get install -y cmake

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"