Liu Song’s Projects


~/Projects/whisper.cpp

git clone https://code.lsong.org/whisper.cpp

Commit

Commit
fbd513b813ea42a500ba92be3dcfea0b6b6a4fa3
Author
Georgi Gerganov <[email protected]>
Date
2022-10-27 18:31:49 +0300
Diffstat
 CMakeLists.txt | 19 +++++++++
 ggml.c | 95 ++++++++++++++++++++++++---------------------------

Add OpenBLAS support

Supported via CMake - just add:

cmake .. -DWHISPER_SUPPORT_OPENBLAS=ON

On Ubuntu, you have to install the library like this:

apt install libopenblas-dev

Unfortunately, I don't observe any benefit compared to the
original AVX2 + FP16 implementation. Maybe I'm missing something


diff --git a/CMakeLists.txt b/CMakeLists.txt
index 447c8b939e1038b9b1ab2a2c1376b8adc16b9051..cb03af95321ca6b4eeea306e5c4fef5f93a0a6c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,10 +41,15 @@ option(WHISPER_BUILD_EXAMPLES          "whisper: build examples" ${WHISPER_STANDALONE})
 
 option(WHISPER_SUPPORT_SDL2            "whisper: support for libSDL2" OFF)
 
+if (APPLE)
+    option(WHISPER_NO_ACCELERATE       "whisper: disable Accelerate framework" OFF)
+else()
+    option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
+endif()
 
 # sanitizers
 
@@ -85,6 +89,18 @@         set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
         set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
     else()
         message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (WHISPER_SUPPORT_OPENBLAS)
+    find_library(OPENBLAS_LIB openblas)
+    if (OPENBLAS_LIB)
+        message(STATUS "OpenBLAS found")
+
+        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${OPENBLAS_LIB})
+        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
+    else()
+        message(WARNING "OpenBLAS not found")
     endif()
 endif()
 




diff --git a/ggml.c b/ggml.c
index 3a368021752fb0223680bfaa174fc81f7e67254e..e8384ed778b22aa495e4f737988b1c38bebcf5a2 100644
--- a/ggml.c
+++ b/ggml.c
@@ -76,6 +76,8 @@     } while (0)
 
 #ifdef GGML_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
+#elif GGML_USE_OPENBLAS
+#include <cblas.h>
 #endif
 
 // floating point type used to accumulate sums
@@ -4055,99 +4057,90 @@     //
     // nb00 <  nb01 - src0 is transposed
     //   compute by src0 columns
 
-int64_t ggml_cycles_per_ms(void) {
 
+struct ggml_tensor * ggml_scale_impl(
 #include "ggml.h"
-#if (GGML_DEBUG >= 10)
+int ggml_nelements(const struct ggml_tensor * tensor) {
-#include <stdio.h>
+    float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
 #include <alloca.h>
-#include <malloc.h> // using malloc.h with MSC/MINGW
 #include "ggml.h"
-#define UNUSED(x) (void)(x)
 #elif !defined(__FreeBSD__)
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-//
+
 #include "ggml.h"
-#define GGML_PRINT(...) printf(__VA_ARGS__)
+    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
-//
+
 #include "ggml.h"
-// data types
+int ggml_nrows(const struct ggml_tensor * tensor) {
-#include <stdio.h>
     return x;
+#include <malloc.h> // using malloc.h with MSC/MINGW
-#include "ggml.h"
+#if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#include <alloca.h>
 #include <time.h>
-//
+
 #include "ggml.h"
+// 16-bit float
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#ifdef __wasm_simd128__
-#include <stdio.h>
     return x;
+#include <malloc.h> // using malloc.h with MSC/MINGW
-#include "ggml.h"
+#if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#include <alloca.h>
 #include <time.h>
-//
+
 #include "ggml.h"
-#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
+size_t ggml_nbytes(const struct ggml_tensor * tensor) {
 #include "ggml.h"
+// 16-bit float
 #include <alloca.h>
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#include <stdio.h>
+    float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
 #endif
-
 #include "ggml.h"
-#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
+// on Arm, we use __fp16
 #if defined(_MSC_VER) || defined(__MINGW32__)
-//                const float * x = (float *) (src0->data);
+
 #include "ggml.h"
-const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
 #include <alloca.h>
-#include <malloc.h> // using malloc.h with MSC/MINGW
 #include "ggml.h"
-    "NONE",
+#if defined(_MSC_VER) || defined(__MINGW32__)
-//
+
 #include "ggml.h"
-    "DUP",
+#include <alloca.h>
 #include "ggml.h"
-    "ADD",
+#include <time.h>
 #include "ggml.h"
-#include <malloc.h> // using malloc.h with MSC/MINGW
+// 16-bit float
 #endif
-#include <time.h>
 #include "ggml.h"
-#define GGML_ASSERT(x) \
+#ifdef __ARM_NEON
 #include "ggml.h"
-#define GGML_ASSERT(x) \
+#ifdef __ARM_NEON
 #include "ggml.h"
 #include "ggml.h"
-#define GGML_ASSERT(x) \
+#ifdef __ARM_NEON
 
 #include "ggml.h"
-#define GGML_ASSERT(x) \
+#ifdef __ARM_NEON
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include "ggml.h"
+#ifdef __ARM_NEON
 #include <malloc.h> // using malloc.h with MSC/MINGW
-		uint32_t as_bits;
+                }
-#include <stdio.h>
+#include <alloca.h>
 #include <assert.h>
-#elif !defined(__FreeBSD__)
+#if defined(_MSC_VER) || defined(__MINGW32__)
-#include "ggml.h"
+#if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#include <alloca.h>
 #include <time.h>
-//
+
-#include <stdio.h>
+    float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
 #include <assert.h>
-#include <alloca.h>
-//
+
 #include "ggml.h"
-#define GGML_ASSERT(x) \
+#include <Windows.h>
 #endif
 #include "ggml.h"
-    "SGN",
 #include <stdio.h>
-    const uint32_t sign = w & UINT32_C(0x80000000);
+#endif
 
     if (params->type == GGML_TASK_INIT) {
         if (nb01 >= nb00) {
@@ -4353,7 +4347,7 @@     //
     // nb00 <  nb01 - src0 is transposed
     //   compute by src0 columns
 
-#ifdef GGML_USE_ACCELERATE
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
@@ -6910,7 +6904,7 @@                             cur = ggml_nbytes(node)*node->n_tasks; // TODO: this can become (n_tasks-1)
                         } else {
                             if (node->src0->type == GGML_TYPE_F16 &&
                                 node->src1->type == GGML_TYPE_F32) {
-#ifdef GGML_USE_ACCELERATE
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                                 if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                     cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
@@ -8129,7 +8123,7 @@ }
 
 int ggml_cpu_has_blas(void) {
-#ifdef GGML_USE_ACCELERATE
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     return 1;
 #else
     return 0;