Liu Song’s Projects

~/Projects/whisper.cpp

git clone https://code.lsong.org/whisper.cpp

Commit

Commit

00f46dbc1db35e98007bdfb4fc69f7777fe78a50

Author

Georgi Gerganov <[email protected]>

Date

2022-11-23 23:22:40 +0200

Diffstat

 models/convert-h5-to-ggml.py | 17 +++
 models/convert-pt-to-ggml.py | 203 +++++++++++++++++++------------------

models : add usage comments to the HF convert script (#157)

diff --git a/models/convert-h5-to-ggml.py b/models/convert-h5-to-ggml.py
index f236355a4d48a54a155e0497a8a86130b256bf54..b882c4d4ecc4a6edce472bbc8d12890bd7a26ce4 100644
--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@@ -1,3 +1,20 @@
+# Convert Hugging Face fine-tuned models to ggml format
+#
+# Usage:
+#
+#   git clone https://github.com/openai/whisper
+#   git clone https://github.com/ggerganov/whisper.cpp
+#   git clone https://huggingface.co/openai/whisper-medium
+#
+#   python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
+#
+# This script is similar to "convert-pt-to-ggml.py"
+#
+# For more info:
+#
+#   https://github.com/ggerganov/whisper.cpp/issues/157
+#
+
 import io
 import os
 import sys




diff --git a/models/convert-pt-to-ggml.py b/models/convert-pt-to-ggml.py
index ef4759f663b9adf2e9ee3974ec7efce79094698c..5cf9cf9541d03addd95075e07a4a674991cb3110 100644
--- a/models/convert-pt-to-ggml.py
+++ b/models/convert-pt-to-ggml.py
@@ -44,199 +44,200 @@ #from transformers import GPTJForCausalLM
 #from transformers import GPT2TokenizerFast
 
 # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
-LANGUAGES = {
-    "en": "english",
-    "zh": "chinese",
-# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#
 #  - tokenizer
+# Also, you need to have the original models in ~/.cache/whisper/
-# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#
 #  - mel filters
-    "ru": "russian",
-# You need to clone the original repo in ~/path/to/repo/whisper/
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 # Convert Whisper transformer model from PyTorch to ggml format
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 #
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 # You need to clone the original repo in ~/path/to/repo/whisper/
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 # It is used to various assets needed by the algorithm:
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 #  - tokenizer
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 #  - mel filters
-# You need to clone the original repo in ~/path/to/repo/whisper/
+import code
 # Also, you need to have the original models in ~/.cache/whisper/
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 # Convert Whisper transformer model from PyTorch to ggml format
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 #
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 # You need to clone the original repo in ~/path/to/repo/whisper/
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 # It is used to various assets needed by the algorithm:
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 #  - tokenizer
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 #  - mel filters
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+import torch
 # Also, you need to have the original models in ~/.cache/whisper/
-# It is used to various assets needed by the algorithm:
+import numpy as np
-# It is used to various assets needed by the algorithm:
+import numpy as np
 # Convert Whisper transformer model from PyTorch to ggml format
-# It is used to various assets needed by the algorithm:
+import numpy as np
 #
-# It is used to various assets needed by the algorithm:
+import numpy as np
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-# It is used to various assets needed by the algorithm:
+import numpy as np
 # You need to clone the original repo in ~/path/to/repo/whisper/
-# It is used to various assets needed by the algorithm:
+import numpy as np
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-# It is used to various assets needed by the algorithm:
+import numpy as np
 # It is used to various assets needed by the algorithm:
-# It is used to various assets needed by the algorithm:
+import numpy as np
 #  - tokenizer
-# It is used to various assets needed by the algorithm:
+import numpy as np
 #  - mel filters
-# It is used to various assets needed by the algorithm:
+import numpy as np
 # Also, you need to have the original models in ~/.cache/whisper/
-#  - tokenizer
+#from transformers import GPTJForCausalLM
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 # Convert Whisper transformer model from PyTorch to ggml format
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 #
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 # You need to clone the original repo in ~/path/to/repo/whisper/
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 # It is used to various assets needed by the algorithm:
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 #  - tokenizer
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 #  - mel filters
-#  - tokenizer
+#from transformers import GPTJForCausalLM
 # Also, you need to have the original models in ~/.cache/whisper/
-#  - mel filters
+#from transformers import GPT2TokenizerFast
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 # Convert Whisper transformer model from PyTorch to ggml format
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 #
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 # You need to clone the original repo in ~/path/to/repo/whisper/
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 # It is used to various assets needed by the algorithm:
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 #  - tokenizer
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 #  - mel filters
-#  - mel filters
+#from transformers import GPT2TokenizerFast
 # Also, you need to have the original models in ~/.cache/whisper/
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 # Convert Whisper transformer model from PyTorch to ggml format
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 #
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 # You need to clone the original repo in ~/path/to/repo/whisper/
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 # It is used to various assets needed by the algorithm:
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 #  - tokenizer
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 #  - mel filters
-# Also, you need to have the original models in ~/.cache/whisper/
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 # Also, you need to have the original models in ~/.cache/whisper/
-# See the original repo for more details.
+LANGUAGES = {
-# See the original repo for more details.
+LANGUAGES = {
 # Convert Whisper transformer model from PyTorch to ggml format
-# See the original repo for more details.
+LANGUAGES = {
 #
-# See the original repo for more details.
+LANGUAGES = {
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-# See the original repo for more details.
+LANGUAGES = {
 # You need to clone the original repo in ~/path/to/repo/whisper/
-# See the original repo for more details.
+LANGUAGES = {
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-# See the original repo for more details.
+LANGUAGES = {
 # It is used to various assets needed by the algorithm:
-# See the original repo for more details.
+LANGUAGES = {
 #  - tokenizer
-# See the original repo for more details.
+LANGUAGES = {
 #  - mel filters
-# See the original repo for more details.
+LANGUAGES = {
 # Also, you need to have the original models in ~/.cache/whisper/
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 # Convert Whisper transformer model from PyTorch to ggml format
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 #
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 # You need to clone the original repo in ~/path/to/repo/whisper/
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 # It is used to various assets needed by the algorithm:
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 #  - tokenizer
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 #  - mel filters
-# This script loads the specified model and whisper assets and saves them in ggml format.
+    "en": "english",
 # Also, you need to have the original models in ~/.cache/whisper/
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 # Convert Whisper transformer model from PyTorch to ggml format
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 #
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 # You need to clone the original repo in ~/path/to/repo/whisper/
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 #  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 # It is used to various assets needed by the algorithm:
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 #  - tokenizer
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 #  - mel filters
-# The output is a single binary file containing the following information:
+    "zh": "chinese",
 # Also, you need to have the original models in ~/.cache/whisper/
-# Convert Whisper transformer model from PyTorch to ggml format
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#  - tokenizer
-# Convert Whisper transformer model from PyTorch to ggml format
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#  - tokenizer
 # Convert Whisper transformer model from PyTorch to ggml format
-# Convert Whisper transformer model from PyTorch to ggml format
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#  - tokenizer
 #
-# Convert Whisper transformer model from PyTorch to ggml format
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#  - tokenizer
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-# Convert Whisper transformer model from PyTorch to ggml format
 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#  - tokenizer
 # You need to clone the original repo in ~/path/to/repo/whisper/
+#    "ha": "hausa",
+#    "ba": "bashkir",
+#    "jw": "javanese",
+#    "su": "sundanese",
+#}
 
 ## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
 #def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):