mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Fix minor CUDA discrepancies (#2005)
* CUDA: typo
* CUDA: Add missing GGML_CALL to function definition
* CUDA: only log GGML_CUDA_FORCE_MMQ/CUBLAS when enabled
* CUDA: Fix softcap bug in flash_attn_tile_ext_f16

The else branch (softcap != 0) incorrectly called launch_fattn_tile_f16_64_128 with use_softcap=false instead of true, causing logit softcap to be silently ignored for the col_per_block=32, parallel_blocks=1 path.
This commit is contained in:
parent
69a8336d08
commit
9eaf86a7c7
@ -41,9 +41,9 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
|
|||||||
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||||
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||||
|
|
||||||
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
GGML_API GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||||
|
|
||||||
GGML_API void ggml_backend_cuda_invalidate_graphs(void);
|
GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(void);
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -94,7 +94,7 @@ static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char
|
|||||||
ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
|
ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
|
||||||
void * ggml_cuda_log_user_data = NULL;
|
void * ggml_cuda_log_user_data = NULL;
|
||||||
|
|
||||||
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
||||||
ggml_cuda_log_callback = log_callback;
|
ggml_cuda_log_callback = log_callback;
|
||||||
ggml_cuda_log_user_data = user_data;
|
ggml_cuda_log_user_data = user_data;
|
||||||
}
|
}
|
||||||
@ -204,14 +204,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|||||||
|
|
||||||
int64_t total_vram = 0;
|
int64_t total_vram = 0;
|
||||||
#ifdef GGML_CUDA_FORCE_MMQ
|
#ifdef GGML_CUDA_FORCE_MMQ
|
||||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ (instead of CUBLAS): yes\n", __func__);
|
||||||
#else
|
|
||||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
|
||||||
#endif // GGML_CUDA_FORCE_MMQ
|
#endif // GGML_CUDA_FORCE_MMQ
|
||||||
#ifdef GGML_CUDA_FORCE_CUBLAS
|
#ifdef GGML_CUDA_FORCE_CUBLAS
|
||||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
|
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS (Instead of MMQ): yes\n", __func__);
|
||||||
#else
|
|
||||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
|
|
||||||
#endif // GGML_CUDA_FORCE_CUBLAS
|
#endif // GGML_CUDA_FORCE_CUBLAS
|
||||||
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
|
|||||||
@ -229,7 +229,7 @@ static __device__ void quantize_f32_q6_0_block(const float * __restrict__ xi, bl
|
|||||||
|
|
||||||
for (int j = 0; j < QK6_0/2; ++j) {
|
for (int j = 0; j < QK6_0/2; ++j) {
|
||||||
const float x0 = xi[0 + j]*id;
|
const float x0 = xi[0 + j]*id;
|
||||||
const float x1 = xi[QK4_0/2 + j]*id;
|
const float x1 = xi[QK6_0/2 + j]*id;
|
||||||
|
|
||||||
const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
|
const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
|
||||||
const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));
|
const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));
|
||||||
|
|||||||
@ -350,7 +350,7 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
|||||||
if (softcap == 0.0f) {
|
if (softcap == 0.0f) {
|
||||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
|
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
|
||||||
} else {
|
} else {
|
||||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
|
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, true>(ctx, dst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user