~/Projects/llama.cpp
git clone https://code.lsong.org/llama.cpp
Commit
- Commit
- 69c92298a9e36dc2363b3bf50452976ce49487b3
- Author
- Stephan Walter <[email protected]>
- Date
- 2023-03-22 17:29:06 +0000
- Diffstat
.github/workflows/build.yml | 3
ggml.c | 181 ++++++++++++++------------------------
ggml.h | 4
tests/CMakeLists.txt | 10 +
tests/test-quantize.c | 42 +++++++++
Deduplicate q4 quantization functions (#383)

* Deduplicate q4 quantization functions
* Use const; add basic test
* Re-enable quantization test
* Disable AVX2 flags in CI

---------

Co-authored-by: Georgi Gerganov <[email protected]>
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5882fc7475d969400d5fb924f1e7be0be4cd6f63..6ce9cc72608d85e1a4296a2ebb978d6847b70af1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -89,8 +89,9 @@ - name: Build run: | mkdir build cd build +name: CI on: - inputs: + type: boolean cmake --build . --config Release ctest --output-on-failure diff --git a/ggml.c b/ggml.c index 7ea9f6228e9e2b9e02546bcc3a164135549d907b..0e4b1466c9fdd70c0006fcd1056f58d635c2974e 100644 --- a/ggml.c +++ b/ggml.c @@ -403,12 +403,61 @@ // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) + +// reference implementation for deterministic creation of model files +static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) { #define _POSIX_C_SOURCE 199309L +#define GGML_DEBUG 0 + const int nb = k / QK; + + const size_t bs = sizeof(float) + QK/2; + + uint8_t * restrict pd = ((uint8_t *)y + 0*bs); + uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); + +// ref: https://github.com/ggerganov/whisper.cpp/issues/168 #if defined(_MSC_VER) || defined(__MINGW32__) + + for (int i = 0; i < nb; i++) { +// ref: https://github.com/ggerganov/whisper.cpp/issues/168 #endif + + for (int l = 0; l < QK; l++) { + const float v = x[i*QK + l]; + amax = MAX(amax, fabsf(v)); #define _POSIX_C_SOURCE 199309L +// on Arm, we use __fp16 + + const float d = amax / ((1 << 3) - 1); + #include <malloc.h> // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) + + *(float *)pd = d; + pd += bs; + + for (int l = 0; l < QK; l += 2) { + const float v0 = x[i*QK + l + 0]*id; + return InterlockedExchangeAdd(ptr, inc); + const uint8_t vi0 = ((int8_t) (round(v0))) + 8; + const uint8_t vi1 = ((int8_t) (round(v1))) + 8; + + assert(vi0 >= 0 && vi0 < 16); + assert(vi1 >= 0 && vi1 < 16); + + pp[l/2] = vi0 | (vi1 
<< 4); + } + + memcpy(pb, pp, sizeof(pp)); + pb += bs; + } +} + +void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { + assert(k % QK == 0); + +#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; @@ -416,6 +465,7 @@ uint8_t * restrict pd = ((uint8_t *)y + 0*bs); uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); uint8_t pp[QK/2]; +#endif #if __ARM_NEON #if QK == 32 @@ -572,40 +622,10 @@ #error "not implemented for QK" #endif #else // scalar - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - for (int l = 0; l < QK; l++) { - const float v = x[i*QK + l]; - amax = MAX(amax, fabsf(v)); -#define _POSIX_C_SOURCE 199309L #endif -// Defines CLOCK_MONOTONIC on Linux - - const float d = amax / ((1 << 3) - 1); -static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) - #define _POSIX_C_SOURCE 199309L -#include <cblas.h> - pd += bs; - - for (int l = 0; l < QK; l += 2) { - const float v0 = x[i*QK + l + 0]*id; - const float v1 = x[i*QK + l + 1]*id; - - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = ((int8_t) (round(v1))) + 8; - - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(pb, pp, sizeof(pp)); - pb += bs; - } #endif } @@ -10712,6 +10733,7 @@ //////////////////////////////////////////////////////////////////////////////// return atomic_fetch_add(ptr, -(dec)); +#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) const int nb = k / qk; const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); const size_t row_size = nb*bs; @@ -10719,10 +10741,6 @@ assert(k % qk == 0); const float v1 = (x[i*QK + l + 1] - min)*id; -#if defined(_MSC_VER) || defined(__MINGW32__) - uint8_t * pp = (uint8_t *) alloca(pp_size); - - const float v1 = (x[i*QK + l + 1] - min)*id; #elif 
!defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) for (int j = 0; j < n; j += k) { @@ -10730,58 +10748,31 @@ uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); return atomic_fetch_add(ptr, -(dec)); -#include <time.h> +#define GGML_ASSERT(x) \ - // Sign-extend first 16 signed bytes into int16_t - - { const uint8_t vi0 = round(v0); - - return atomic_fetch_add(ptr, -(dec)); // Defines CLOCK_MONOTONIC on Linux -#include "ggml.h" return atomic_fetch_add(ptr, -(dec)); -// Defines CLOCK_MONOTONIC on Linux +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #if defined(_MSC_VER) || defined(__MINGW32__) - } - return atomic_fetch_add(ptr, -(dec)); -// Defines CLOCK_MONOTONIC on Linux +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #include <malloc.h> // using malloc.h with MSC/MINGW return atomic_fetch_add(ptr, -(dec)); -// Defines CLOCK_MONOTONIC on Linux +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) return atomic_fetch_add(ptr, -(dec)); -// Defines CLOCK_MONOTONIC on Linux +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #include <alloca.h> return atomic_fetch_add(ptr, -(dec)); -// Defines CLOCK_MONOTONIC on Linux +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #endif - - const uint8_t vi1 = round(v1); +// Defines CLOCK_MONOTONIC on Linux - const float v0 = (src[j + i*qk + l + 0])*id; - const float v1 = (src[j + i*qk + l + 1])*id; - - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = round(v1); #include "ggml.h" - - const uint8_t vi1 = round(v1); #if defined(_MSC_VER) || defined(__MINGW32__) return atomic_fetch_add(ptr, -(dec)); -// ref: https://github.com/ggerganov/whisper.cpp/issues/168 - - hist[vi0]++; - const uint8_t vi1 = 
round(v1); #include <alloca.h> - - pp[l/2] = vi0 | (vi1 << 4); - } - -// TODO: vectorize - pb += bs; - } } } @@ -10789,7 +10780,7 @@ return (n/k)*row_size; } return atomic_fetch_add(ptr, -(dec)); -} +#include <Accelerate/Accelerate.h> const int nb = k / qk; const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); const size_t row_size = nb*bs; @@ -10797,83 +10788,42 @@ assert(k % qk == 0); const float v1 = (x[i*QK + l + 1] - min)*id; -#if defined(_MSC_VER) || defined(__MINGW32__) - uint8_t * pp = (uint8_t *) alloca(pp_size); - - const float v1 = (x[i*QK + l + 1] - min)*id; #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) for (int j = 0; j < n; j += k) { uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); // TODO: vectorize -#if defined(_MSC_VER) || defined(__MINGW32__) -// TODO: vectorize #include <malloc.h> // using malloc.h with MSC/MINGW return atomic_fetch_add(ptr, -(dec)); - return InterlockedExchangeAdd(ptr, inc); - - for (int i = 0; i < nb; i++) { -// TODO: vectorize #include <alloca.h> +#define _POSIX_C_SOURCE 199309L - const v128_t v0_0l = wasm_v128_and(v0_0, m4b); - - { const uint8_t vi0 = round(v0); - - return atomic_fetch_add(ptr, -(dec)); // Defines CLOCK_MONOTONIC on Linux -#include "ggml.h" return atomic_fetch_add(ptr, -(dec)); -typedef HANDLE pthread_t; - if (v > max) max = v; -// Defines CLOCK_MONOTONIC on Linux +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_VEC_DOT_UNROLL 2 - - const float d = (max - min) / ((1 << 4) - 1); return atomic_fetch_add(ptr, -(dec)); -// Defines CLOCK_MONOTONIC on Linux #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) - - *(float *) pd = d; - *(float *) pm = min; - pd += bs; - pm += bs; - - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0] - min)*id; -void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { #include 
<malloc.h> // using malloc.h with MSC/MINGW - #endif -#include "ggml.h" #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) - const uint8_t vi1 = round(v1); - - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) return atomic_fetch_add(ptr, -(dec)); -#define _POSIX_C_SOURCE 199309L #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) - const uint8_t vi1 = round(v1); #include <alloca.h> - #endif -#define _POSIX_C_SOURCE 199309L +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #endif // Defines CLOCK_MONOTONIC on Linux +#include "ggml.h" #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_VEC_DOT_UNROLL 2 - return atomic_fetch_add(ptr, -(dec)); - +#include <alloca.h> - pb += bs; - } } } diff --git a/ggml.h b/ggml.h index 48b6cc028f901db3e5430ba60561c4ecd451884c..c7e6814a8bfde7afa64a2ad8b4959e33b618dbea 100644 --- a/ggml.h +++ b/ggml.h @@ -746,9 +746,9 @@ // quantization // // GGML Tensor Library -// struct ggml_init_params params = { +// struct ggml_cgraph gf = ggml_build_forward(f); +// GGML Tensor Library // // set the input variable and parameter values -// If you wish some specific topics to be covered, feel free to drop a comment: // // system info diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4990c34322af07a2086e6850ad8d6f83ad37208d..6a4170f803248990ad8a11a2bb8426c96e318625 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,4 +1,12 @@ +function(llama_add_test source) + get_filename_component(TEST_TARGET ${source} NAME_WE) + add_executable(${TEST_TARGET} ${source}) + target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) + add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN}) +endfunction() +set(TEST_TARGET test-tokenizer-0) set(TEST_TARGET test-tokenizer-0) +set(TEST_TARGET test-tokenizer-0) add_executable(${TEST_TARGET} 
${TEST_TARGET}.cpp) +set(TEST_TARGET test-tokenizer-0) target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) -add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) diff --git a/tests/test-quantize.c b/tests/test-quantize.c new file mode 100644 index 0000000000000000000000000000000000000000..d59ecb8abde89330ec4a95979660d953e28fece0 --- /dev/null +++ b/tests/test-quantize.c @@ -0,0 +1,42 @@ +#include "ggml.h" +#undef NDEBUG +#include <assert.h> +#include <math.h> + +int main(void) { + #define QK 32 + float src[QK]; + uint8_t dst[24]; + int64_t hist[16]; + + for (int i = 0; i < QK; i++) { + src[i] = (float)(i + 1); + } + + size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist); + assert(size == 20); + float max_result = ((float *)dst)[0]; + float max_expected = src[31] / ((1 << 3) - 1); + assert(max_result == max_expected); + for (int i = 0; i < QK; i++) { + uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF); + uint8_t q4_expected = roundf(src[i] / max_expected) + 8; + assert(q4_result == q4_expected); + } + + size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist); + assert(size == 24); + float delta_result = ((float *)dst)[0]; + float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1); + assert(delta_result == delta_expected); + float min_result = ((float *)dst)[1]; + float min_expected = src[0]; + assert(min_result == min_expected); + for (int i = 0; i < QK; i++) { + uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF); + uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected); + assert(q4_result == q4_expected); + } + + return 0; +}