~/Projects/whisper.cpp
git clone https://code.lsong.org/whisper.cpp
Commit
- Commit
- 00f46dbc1db35e98007bdfb4fc69f7777fe78a50
- Author
- Georgi Gerganov <[email protected]>
- Date
- 2022-11-23 23:22:40 +0200
- Diffstat
models/convert-h5-to-ggml.py | 17 +++
models/convert-pt-to-ggml.py | 203 +++++++++++++++++++------------------
models : add usage comments to the HF convert script (#157)
diff --git a/models/convert-h5-to-ggml.py b/models/convert-h5-to-ggml.py index f236355a4d48a54a155e0497a8a86130b256bf54..b882c4d4ecc4a6edce472bbc8d12890bd7a26ce4 100644 --- a/models/convert-h5-to-ggml.py +++ b/models/convert-h5-to-ggml.py @@ -1,3 +1,20 @@ +# Convert Hugging Face fine-tuned models to ggml format +# +# Usage: +# +# git clone https://github.com/openai/whisper +# git clone https://github.com/ggerganov/whisper.cpp +# git clone https://huggingface.co/openai/whisper-medium +# +# python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper . +# +# This script is similar to "convert-pt-to-ggml.py" +# +# For more info: +# +# https://github.com/ggerganov/whisper.cpp/issues/157 +# + import io import os import sys diff --git a/models/convert-pt-to-ggml.py b/models/convert-pt-to-ggml.py index ef4759f663b9adf2e9ee3974ec7efce79094698c..5cf9cf9541d03addd95075e07a4a674991cb3110 100644 --- a/models/convert-pt-to-ggml.py +++ b/models/convert-pt-to-ggml.py @@ -44,199 +44,200 @@ #from transformers import GPTJForCausalLM #from transformers import GPT2TokenizerFast # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 -LANGUAGES = { - "en": "english", - "zh": "chinese", -# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium +# # - tokenizer +# Also, you need to have the original models in ~/.cache/whisper/ -# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium +# # - mel filters - "ru": "russian", -# You need to clone the original repo in ~/path/to/repo/whisper/ -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # Convert Whisper transformer model from PyTorch to ggml format -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # Usage: python 
convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # You need to clone the original repo in ~/path/to/repo/whisper/ -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # It is used to various assets needed by the algorithm: -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # - tokenizer -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # - mel filters -# You need to clone the original repo in ~/path/to/repo/whisper/ +import code # Also, you need to have the original models in ~/.cache/whisper/ -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch # Convert Whisper transformer model from PyTorch to ggml format -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch # -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch # You need to clone the original repo in ~/path/to/repo/whisper/ -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch # It is used to various assets needed by the algorithm: -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch # - tokenizer -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ 
+import torch # - mel filters -# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ +import torch # Also, you need to have the original models in ~/.cache/whisper/ -# It is used to various assets needed by the algorithm: +import numpy as np -# It is used to various assets needed by the algorithm: +import numpy as np # Convert Whisper transformer model from PyTorch to ggml format -# It is used to various assets needed by the algorithm: +import numpy as np # -# It is used to various assets needed by the algorithm: +import numpy as np # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# It is used to various assets needed by the algorithm: +import numpy as np # You need to clone the original repo in ~/path/to/repo/whisper/ -# It is used to various assets needed by the algorithm: +import numpy as np # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# It is used to various assets needed by the algorithm: +import numpy as np # It is used to various assets needed by the algorithm: -# It is used to various assets needed by the algorithm: +import numpy as np # - tokenizer -# It is used to various assets needed by the algorithm: +import numpy as np # - mel filters -# It is used to various assets needed by the algorithm: +import numpy as np # Also, you need to have the original models in ~/.cache/whisper/ -# - tokenizer +#from transformers import GPTJForCausalLM -# - tokenizer +#from transformers import GPTJForCausalLM # Convert Whisper transformer model from PyTorch to ggml format -# - tokenizer +#from transformers import GPTJForCausalLM # -# - tokenizer +#from transformers import GPTJForCausalLM # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# - tokenizer +#from transformers import GPTJForCausalLM # You need to clone the original repo in ~/path/to/repo/whisper/ -# - tokenizer +#from transformers import 
GPTJForCausalLM # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# - tokenizer +#from transformers import GPTJForCausalLM # It is used to various assets needed by the algorithm: -# - tokenizer +#from transformers import GPTJForCausalLM # - tokenizer -# - tokenizer +#from transformers import GPTJForCausalLM # - mel filters -# - tokenizer +#from transformers import GPTJForCausalLM # Also, you need to have the original models in ~/.cache/whisper/ -# - mel filters +#from transformers import GPT2TokenizerFast -# - mel filters +#from transformers import GPT2TokenizerFast # Convert Whisper transformer model from PyTorch to ggml format -# - mel filters +#from transformers import GPT2TokenizerFast # -# - mel filters +#from transformers import GPT2TokenizerFast # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# - mel filters +#from transformers import GPT2TokenizerFast # You need to clone the original repo in ~/path/to/repo/whisper/ -# - mel filters +#from transformers import GPT2TokenizerFast # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# - mel filters +#from transformers import GPT2TokenizerFast # It is used to various assets needed by the algorithm: -# - mel filters +#from transformers import GPT2TokenizerFast # - tokenizer -# - mel filters +#from transformers import GPT2TokenizerFast # - mel filters -# - mel filters +#from transformers import GPT2TokenizerFast # Also, you need to have the original models in ~/.cache/whisper/ -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # Convert Whisper transformer model from PyTorch to ggml format -# Also, 
you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # You need to clone the original repo in ~/path/to/repo/whisper/ -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # It is used to various assets needed by the algorithm: -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # - tokenizer -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # - mel filters -# Also, you need to have the original models in ~/.cache/whisper/ +# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 # Also, you need to have the original models in ~/.cache/whisper/ -# See the original repo for more details. +LANGUAGES = { -# See the original repo for more details. 
+LANGUAGES = { # Convert Whisper transformer model from PyTorch to ggml format -# See the original repo for more details. +LANGUAGES = { # -# See the original repo for more details. +LANGUAGES = { # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# See the original repo for more details. +LANGUAGES = { # You need to clone the original repo in ~/path/to/repo/whisper/ -# See the original repo for more details. +LANGUAGES = { # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# See the original repo for more details. +LANGUAGES = { # It is used to various assets needed by the algorithm: -# See the original repo for more details. +LANGUAGES = { # - tokenizer -# See the original repo for more details. +LANGUAGES = { # - mel filters -# See the original repo for more details. +LANGUAGES = { # Also, you need to have the original models in ~/.cache/whisper/ -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", # Convert Whisper transformer model from PyTorch to ggml format -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", # -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", # You need to clone the original repo in ~/path/to/repo/whisper/ -# This script loads the specified model and whisper assets and saves them in ggml format. 
+ "en": "english", # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", # It is used to various assets needed by the algorithm: -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", # - tokenizer -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", # - mel filters -# This script loads the specified model and whisper assets and saves them in ggml format. + "en": "english", # Also, you need to have the original models in ~/.cache/whisper/ -# The output is a single binary file containing the following information: + "zh": "chinese", -# The output is a single binary file containing the following information: + "zh": "chinese", # Convert Whisper transformer model from PyTorch to ggml format -# The output is a single binary file containing the following information: + "zh": "chinese", # -# The output is a single binary file containing the following information: + "zh": "chinese", # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# The output is a single binary file containing the following information: + "zh": "chinese", # You need to clone the original repo in ~/path/to/repo/whisper/ -# The output is a single binary file containing the following information: + "zh": "chinese", # git clone https://github.com/openai/whisper ~/path/to/repo/whisper/ -# The output is a single binary file containing the following information: + "zh": "chinese", # It is used to various assets needed by the algorithm: -# The output is a single binary file containing the following information: + "zh": "chinese", # - tokenizer -# The output is a single binary file containing the following information: + "zh": "chinese", # - mel filters -# The output is a single binary file containing the following 
information: + "zh": "chinese", # Also, you need to have the original models in ~/.cache/whisper/ -# Convert Whisper transformer model from PyTorch to ggml format # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium +# - tokenizer -# Convert Whisper transformer model from PyTorch to ggml format # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium +# - tokenizer # Convert Whisper transformer model from PyTorch to ggml format -# Convert Whisper transformer model from PyTorch to ggml format # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium +# - tokenizer # -# Convert Whisper transformer model from PyTorch to ggml format # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium +# - tokenizer # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium -# Convert Whisper transformer model from PyTorch to ggml format # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium +# - tokenizer # You need to clone the original repo in ~/path/to/repo/whisper/ +# "ha": "hausa", +# "ba": "bashkir", +# "jw": "javanese", +# "su": "sundanese", +#} ## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292 #def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):