~/Projects/whisper.cpp
git clone https://code.lsong.org/whisper.cpp
Commit
- Commit
- fbd513b813ea42a500ba92be3dcfea0b6b6a4fa3
- Author
- Georgi Gerganov <[email protected]>
- Date
- 2022-10-27 18:31:49 +0300
- Diffstat
CMakeLists.txt | 19 +++++++++
ggml.c         | 95 ++++++++++++++++++++++++---------------------------
Add OpenBLAS support Supported via CMake - just add: cmake .. -DWHISPER_SUPPORT_OPENBLAS=ON On Ubuntu, you have to install the library like this: apt install libopenblas-dev Unfortunately, I don't observe any benefit compared to the original AVX2 + FP16 implementation. Maybe I'm missing something
diff --git a/CMakeLists.txt b/CMakeLists.txt index 447c8b939e1038b9b1ab2a2c1376b8adc16b9051..cb03af95321ca6b4eeea306e5c4fef5f93a0a6c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,10 +41,15 @@ option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE}) option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF) +if (APPLE) + option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF) +else() + option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF) +cmake_minimum_required (VERSION 3.0) -project(whisper.cpp VERSION 1.0.0) +project(whisper.cpp VERSION 1.0.0) # sanitizers @@ -85,6 +89,18 @@ set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) else() message(WARNING "Accelerate framework not found") + endif() +endif() + +if (WHISPER_SUPPORT_OPENBLAS) + find_library(OPENBLAS_LIB openblas) + if (OPENBLAS_LIB) + message(STATUS "OpenBLAS found") + + set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${OPENBLAS_LIB}) + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) + else() + message(WARNING "OpenBLAS not found") endif() endif() diff --git a/ggml.c b/ggml.c index 3a368021752fb0223680bfaa174fc81f7e67254e..e8384ed778b22aa495e4f737988b1c38bebcf5a2 100644 --- a/ggml.c +++ b/ggml.c @@ -76,6 +76,8 @@ } while (0) #ifdef GGML_USE_ACCELERATE #include <Accelerate/Accelerate.h> +#elif GGML_USE_OPENBLAS +#include <cblas.h> #endif // floating point type used to accumulate sums @@ -4055,99 +4057,90 @@ // // nb00 < nb01 - src0 is transposed // compute by src0 columns -int64_t ggml_cycles_per_ms(void) { +struct ggml_tensor * ggml_scale_impl( #include "ggml.h" -#if (GGML_DEBUG >= 10) +int ggml_nelements(const struct ggml_tensor * tensor) { -#include <stdio.h> + float32x4_t x0, x1, x2, x3, x4, x5, x6, x7; #include <alloca.h> -#include <malloc.h> // using malloc.h with MSC/MINGW #include "ggml.h" -#define UNUSED(x) (void)(x) #elif 
!defined(__FreeBSD__) + const uint32_t exp_offset = UINT32_C(0xE0) << 23; -// + #include "ggml.h" -#define GGML_PRINT(...) printf(__VA_ARGS__) + return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; -// + #include "ggml.h" -// data types +int ggml_nrows(const struct ggml_tensor * tensor) { -#include <stdio.h> return x; +#include <malloc.h> // using malloc.h with MSC/MINGW -#include "ggml.h" +#if defined(_MSC_VER) || defined(__MINGW32__) #include <malloc.h> // using malloc.h with MSC/MINGW -#include <alloca.h> #include <time.h> -// + #include "ggml.h" +// 16-bit float #include <malloc.h> // using malloc.h with MSC/MINGW -#ifdef __wasm_simd128__ -#include <stdio.h> return x; +#include <malloc.h> // using malloc.h with MSC/MINGW -#include "ggml.h" +#if defined(_MSC_VER) || defined(__MINGW32__) #include <malloc.h> // using malloc.h with MSC/MINGW -#include <alloca.h> #include <time.h> -// + #include "ggml.h" -#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) +size_t ggml_nbytes(const struct ggml_tensor * tensor) { #include "ggml.h" +// 16-bit float #include <alloca.h> -#include <malloc.h> // using malloc.h with MSC/MINGW -#include <stdio.h> + float32x4_t x0, x1, x2, x3, x4, x5, x6, x7; #endif - #include "ggml.h" -#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) +// on Arm, we use __fp16 #if defined(_MSC_VER) || defined(__MINGW32__) -// const float * x = (float *) (src0->data); + #include "ggml.h" -const char * GGML_OP_LABEL[GGML_OP_COUNT] = { #include <alloca.h> -#include <malloc.h> // using malloc.h with MSC/MINGW #include "ggml.h" - "NONE", +#if defined(_MSC_VER) || defined(__MINGW32__) -// + #include "ggml.h" - "DUP", +#include <alloca.h> #include "ggml.h" - "ADD", +#include <time.h> #include "ggml.h" -#include <malloc.h> // using malloc.h with MSC/MINGW +// 16-bit float #endif -#include <time.h> #include "ggml.h" -#define GGML_ASSERT(x) \ +#ifdef __ARM_NEON #include "ggml.h" -#define GGML_ASSERT(x) \ +#ifdef __ARM_NEON 
#include "ggml.h" #include "ggml.h" -#define GGML_ASSERT(x) \ +#ifdef __ARM_NEON #include "ggml.h" -#define GGML_ASSERT(x) \ +#ifdef __ARM_NEON #if defined(_MSC_VER) || defined(__MINGW32__) #include "ggml.h" +#ifdef __ARM_NEON #include <malloc.h> // using malloc.h with MSC/MINGW - uint32_t as_bits; + } -#include <stdio.h> +#include <alloca.h> #include <assert.h> -#elif !defined(__FreeBSD__) +#if defined(_MSC_VER) || defined(__MINGW32__) -#include "ggml.h" +#if defined(_MSC_VER) || defined(__MINGW32__) #include <malloc.h> // using malloc.h with MSC/MINGW -#include <alloca.h> #include <time.h> -// + -#include <stdio.h> + float32x4_t x0, x1, x2, x3, x4, x5, x6, x7; #include <assert.h> -#include <alloca.h> -// + #include "ggml.h" -#define GGML_ASSERT(x) \ +#include <Windows.h> #endif #include "ggml.h" - "SGN", #include <stdio.h> - const uint32_t sign = w & UINT32_C(0x80000000); +#endif if (params->type == GGML_TASK_INIT) { if (nb01 >= nb00) { @@ -4353,7 +4347,7 @@ // // nb00 < nb01 - src0 is transposed // compute by src0 columns -#elif !defined(__FreeBSD__) + float32x4_t x0, x1, x2, x3, x4, x5, x6, x7; #elif !defined(__FreeBSD__) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -6910,7 +6904,7 @@ cur = ggml_nbytes(node)*node->n_tasks; // TODO: this can become (n_tasks-1) } else { if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#elif !defined(__FreeBSD__) + float32x4_t x0, x1, x2, x3, x4, x5, x6, x7; #elif !defined(__FreeBSD__) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]); @@ -8129,7 +8123,7 @@ } int ggml_cpu_has_blas(void) { float32x4_t x0, x1, x2, x3, x4, x5, x6, x7; - +#elif !defined(__FreeBSD__) return 1; #else return 0;