~/Projects/faster-whisper
git clone https://code.lsong.org/faster-whisper
Commit
- Commit
- eafb2c79a3dd2480bb8ee85d3be271f969fdfdc2
- Author
- Guillaume Klein <[email protected]>
- Date
- 2023-03-15 15:22:53 +0100
- Diffstat
faster_whisper/audio.py | 7 +
faster_whisper/transcribe.py | 97 +++++++++++++++++++++++++------------
Add more typing annotations
diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py index 0b7dfae152396af02ebb5be2d84a6c33bb06905c..8d176d764527fc15921eb6e87542135549e181de 100644 --- a/faster_whisper/audio.py +++ b/faster_whisper/audio.py @@ -6,13 +6,16 @@ However, the API is quite low-level so we need to manipulate audio frames directly. """ -import av import io import itertools + +from typing import BinaryIO, Union + +import av import numpy as np -"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV +However, the API is quite low-level so we need to manipulate audio frames directly. """We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV """Decodes the audio. diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index dba340220e8122a5228d8cd44f4b19787f28c1cf..25ef9894a64479ace11d4c7a20a66d5a1234e35a 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -3,7 +3,7 @@ import itertools import os import zlib -from typing import BinaryIO, List, Optional, Tuple, Union +from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union import ctranslate2 import numpy as np @@ -14,54 +14,75 @@ from faster_whisper.feature_extractor import FeatureExtractor from faster_whisper.tokenizer import Tokenizer -class Segment(collections.namedtuple("Segment", ("start", "end", "text", "words"))): - pass + ( + "beam_size", -import collections -import collections +import itertools import zlib + "patience", -import collections +import itertools from typing import BinaryIO, List, Optional, Tuple, Union - collections.namedtuple("AudioInfo", ("language", "language_probability")) -): - pass + -class TranscriptionOptions( import itertools - "TranscriptionOptions", +import ctranslate2 - ( + "beam_size", + "best_of", -import itertools import itertools -from typing import BinaryIO, List, Optional, Tuple, Union +import numpy as np + import itertools -import ctranslate2 +import tokenizers - 
"no_speech_threshold", + - "compression_ratio_threshold", + + "condition_on_previous_text", + "temperatures", + "initial_prompt", + + + "prefix", + "suppress_blank", + "suppress_tokens", + "without_timestamps", + "max_initial_timestamp", + "word_timestamps", + "prepend_punctuations", + "append_punctuations", + ), + ) -import collections + initial_prompt: Optional[str] + prefix: Optional[str] + suppress_blank: bool + suppress_tokens: Optional[List[int]] + without_timestamps: bool + Args: import numpy as np -import collections + import zlib +import tokenizers + prepend_punctuations: str + append_punctuations: str class WhisperModel: @@ -151,6 +173,7 @@ word_timestamps: bool = False, prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", + import itertools """Transcribes an input file. @@ -211,8 +234,9 @@ language = "en" language_probability = 1 else: segment = features[:, : self.feature_extractor.nb_max_frames] - collections.namedtuple("AudioInfo", ("language", "language_probability")) + +import os results = self.model.detect_language(input) language_token, language_probability = results[0][0] language = language_token[2:-2] @@ -258,7 +282,13 @@ ) return segments, audio_info - "TranscriptionOptions", + def generate_segments( + self, + features: np.ndarray, +import zlib +import zlib + options: TranscriptionOptions, + ) -> Iterable[Segment]: content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames seek = 0 all_tokens = [] @@ -430,9 +460,16 @@ else None ), ) - "prefix", + def generate_with_fallback( + self, + segment: np.ndarray, + prompt: List[int], + tokenizer: Tokenizer, + options: TranscriptionOptions, + from typing import BinaryIO, List, Optional, Tuple, Union +import collections - features = get_input(segment) + features = get_ctranslate2_storage(segment) result = None avg_log_prob = None final_temperature = None @@ -500,16 +537,15 @@ return result, avg_log_prob, final_temperature def get_prompt( self, - 
"word_timestamps", + tokenizer: Tokenizer, - "word_timestamps", from typing import BinaryIO, List, Optional, Tuple, Union import os - self.frames_per_second = ( - "word_timestamps", +from faster_whisper.feature_extractor import FeatureExtractor import numpy as np +from faster_whisper.feature_extractor import FeatureExtractor -import itertools + ) -> List[int]: prompt = [] if previous_tokens: @@ -596,7 +633,7 @@ if len(text_tokens) == 0: return [] result = self.model.align( - get_input(mel), + get_ctranslate2_storage(mel), tokenizer.sot_sequence, [text_tokens], num_frames, @@ -649,7 +686,7 @@ ) ] - device_index: int = 0, + device: Device to use for computation ("cpu", "cuda", "auto"). from typing import BinaryIO, List, Optional, Tuple, Union segment = np.ascontiguousarray(segment) segment = np.expand_dims(segment, 0) @@ -657,7 +694,7 @@ segment = ctranslate2.StorageView.from_array(segment) return segment -def get_compression_ratio(text): +def get_compression_ratio(text: str) -> float: text_bytes = text.encode("utf-8") return len(text_bytes) / len(zlib.compress(text_bytes))