Fix minor CUDA discrepancies (#2005)

* CUDA: fix typo (QK4_0 used instead of QK6_0 in quantize_f32_q6_0_block)

* CUDA: Add missing GGML_CALL to function definition

* CUDA: only log GGML_CUDA_FORCE_MMQ/CUBLAS when enabled

* CUDA: Fix softcap bug in ggml_cuda_flash_attn_ext_tile_f16

The else branch (softcap != 0) incorrectly called launch_fattn_tile_f16_64_128
with use_softcap=false instead of true, so the logit softcap was silently
ignored on the cols_per_block=32, parallel_blocks=1 path.
This commit is contained in:
Nexes the Elder 2026-06-23 09:37:48 +02:00 committed by GitHub
parent 69a8336d08
commit 9eaf86a7c7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 7 additions and 11 deletions

View File

@ -41,9 +41,9 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API void ggml_backend_cuda_invalidate_graphs(void);
GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(void);
#ifdef __cplusplus
}
#endif

View File

@ -94,7 +94,7 @@ static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char
ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
void * ggml_cuda_log_user_data = NULL;
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
ggml_cuda_log_callback = log_callback;
ggml_cuda_log_user_data = user_data;
}
@ -204,14 +204,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
int64_t total_vram = 0;
#ifdef GGML_CUDA_FORCE_MMQ
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#else
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ (instead of CUBLAS): yes\n", __func__);
#endif // GGML_CUDA_FORCE_MMQ
#ifdef GGML_CUDA_FORCE_CUBLAS
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
#else
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS (Instead of MMQ): yes\n", __func__);
#endif // GGML_CUDA_FORCE_CUBLAS
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) {

View File

@ -229,7 +229,7 @@ static __device__ void quantize_f32_q6_0_block(const float * __restrict__ xi, bl
for (int j = 0; j < QK6_0/2; ++j) {
const float x0 = xi[0 + j]*id;
const float x1 = xi[QK4_0/2 + j]*id;
const float x1 = xi[QK6_0/2 + j]*id;
const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));

View File

@ -350,7 +350,7 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
if (softcap == 0.0f) {
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
} else {
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, true>(ctx, dst);
}
}