Liu Song’s Projects


~/Projects/faster-whisper

git clone https://code.lsong.org/faster-whisper

Commit
eafb2c79a3dd2480bb8ee85d3be271f969fdfdc2
Author
Guillaume Klein <[email protected]>
Date
2023-03-15 15:22:53 +0100
Diffstat
 faster_whisper/audio.py | 7 +
 faster_whisper/transcribe.py | 97 +++++++++++++++++++++++++------------

Add more typing annotations
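
The pattern applied throughout: classes built on collections.namedtuple become typing.NamedTuple subclasses with annotated fields, and function signatures gain explicit parameter and return types. A minimal before/after sketch of the idea, with illustrative names (not the commit's code):

    import collections
    from typing import NamedTuple, Optional

    # Before: field names only; every attribute is typed as Any.
    class PointOld(collections.namedtuple("PointOld", ("x", "y", "label"))):
        pass

    # After: same immutable-tuple behavior, but mypy/pyright can check each field.
    class Point(NamedTuple):
        x: float
        y: float
        label: Optional[str]

    p = Point(1.0, 2.0, None)
    assert p[0] == p.x  # still indexable like a tuple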


diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py
index 0b7dfae152396af02ebb5be2d84a6c33bb06905c..8d176d764527fc15921eb6e87542135549e181de 100644
--- a/faster_whisper/audio.py
+++ b/faster_whisper/audio.py
@@ -6,13 +6,16 @@
 However, the API is quite low-level so we need to manipulate audio frames directly.
 """
 
-import av
 import io
 import itertools
+
+from typing import BinaryIO, Union
+
+import av
 import numpy as np
 
 
-"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV
+However, the API is quite low-level so we need to manipulate audio frames directly.
 """We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV
     """Decodes the audio.
 




diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index dba340220e8122a5228d8cd44f4b19787f28c1cf..25ef9894a64479ace11d4c7a20a66d5a1234e35a 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -3,7 +3,7 @@ import itertools
 import os
 import zlib
 
-from typing import BinaryIO, List, Optional, Tuple, Union
+from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
 
 import ctranslate2
 import numpy as np
@@ -14,54 +14,75 @@ from faster_whisper.feature_extractor import FeatureExtractor
 from faster_whisper.tokenizer import Tokenizer
 
 
-class Segment(collections.namedtuple("Segment", ("start", "end", "text", "words"))):
-    pass
+class Segment(NamedTuple):
+    start: float
+    end: float
+    text: str
+    words: Optional[List[dict]]
 
 
-class AudioInfo(
-    collections.namedtuple("AudioInfo", ("language", "language_probability"))
-):
-    pass
+class AudioInfo(NamedTuple):
+    language: str
+    language_probability: float
 
 
-class TranscriptionOptions(
-    collections.namedtuple(
-        "TranscriptionOptions",
-        (
-            "beam_size",
-            "best_of",
-            "patience",
-            "length_penalty",
-            "log_prob_threshold",
-            "no_speech_threshold",
-            "compression_ratio_threshold",
-            "condition_on_previous_text",
-            "temperatures",
-            "initial_prompt",
-            "prefix",
-            "suppress_blank",
-            "suppress_tokens",
-            "without_timestamps",
-            "max_initial_timestamp",
-            "word_timestamps",
-            "prepend_punctuations",
-            "append_punctuations",
-        ),
-    )
-):
-    pass
+class TranscriptionOptions(NamedTuple):
+    beam_size: int
+    best_of: int
+    patience: float
+    length_penalty: float
+    log_prob_threshold: Optional[float]
+    no_speech_threshold: Optional[float]
+    compression_ratio_threshold: Optional[float]
+    condition_on_previous_text: bool
+    temperatures: List[float]
+    initial_prompt: Optional[str]
+    prefix: Optional[str]
+    suppress_blank: bool
+    suppress_tokens: Optional[List[int]]
+    without_timestamps: bool
+    max_initial_timestamp: float
+    word_timestamps: bool
+    prepend_punctuations: str
+    append_punctuations: str
 
 
 class WhisperModel:
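
With typed fields, static checkers can verify code that consumes these records; the namedtuple versions typed every attribute as Any. A small consumer, assuming the Segment fields shown above:

    from faster_whisper.transcribe import Segment

    def format_segment(segment: Segment) -> str:
        # A checker now knows start/end are floats and text is a str.
        return f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}"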
@@ -151,6 +173,7 @@         word_timestamps: bool = False,
         prepend_punctuations: str = "\"'“¿([{-",
         append_punctuations: str = "\"'.。,,!!??::”)]}、",
 
-    ):
+    ) -> Tuple[Iterable[Segment], AudioInfo]:
         """Transcribes an input file.
 
@@ -211,8 +234,9 @@                 language = "en"
                 language_probability = 1
             else:
                 segment = features[:, : self.feature_extractor.nb_max_frames]
-                input = get_input(segment)
+                input = get_ctranslate2_storage(segment)
                 results = self.model.detect_language(input)
                 language_token, language_probability = results[0][0]
                 language = language_token[2:-2]
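
detect_language scores tokens of the form "<|en|>"; the [2:-2] slice strips the "<|" and "|>" delimiters to leave the bare language code:

    language_token = "<|en|>"  # illustrative value
    print(language_token[2:-2])  # en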
@@ -258,7 +282,13 @@         )
 
         return segments, audio_info
 
-        "TranscriptionOptions",
+    def generate_segments(
+        self,
+        features: np.ndarray,
+import zlib
+import zlib
+        options: TranscriptionOptions,
+    ) -> Iterable[Segment]:
         content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames
         seek = 0
         all_tokens = []
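
generate_segments walks the feature matrix in fixed-size windows and yields segments as it goes, which is why Iterable[Segment] is the honest return type. A toy version of that windowing shape (simplified, not the library's loop):

    import numpy as np

    def iter_windows(features: np.ndarray, window: int):
        # Yield successive slices along the time axis, like the seek loop.
        seek = 0
        while seek < features.shape[-1]:
            yield features[:, seek : seek + window]
            seek += window

    mel = np.zeros((80, 9000), dtype=np.float32)  # illustrative feature matrix
    print(sum(1 for _ in iter_windows(mel, 3000)))  # 3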
@@ -430,9 +460,16 @@                         else None
                     ),
                 )
 
-            "prefix",
+    def generate_with_fallback(
+        self,
+        segment: np.ndarray,
+        prompt: List[int],
+        tokenizer: Tokenizer,
+        options: TranscriptionOptions,
+
 from typing import BinaryIO, List, Optional, Tuple, Union
+import collections
-        features = get_input(segment)
+        features = get_ctranslate2_storage(segment)
         result = None
         avg_log_prob = None
         final_temperature = None
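
generate_with_fallback applies Whisper's usual retry strategy: decode at each temperature in turn and keep the first result whose compression ratio and average log-probability pass the configured thresholds (2.4 and -1.0 are the customary Whisper defaults). A simplified, self-contained sketch of that control flow; the decode callback is a stand-in, not the library's API:

    from typing import Callable, List, Tuple

    from faster_whisper.transcribe import get_compression_ratio

    def fallback_sketch(
        decode: Callable[[float], Tuple[str, float]],  # returns (text, avg_log_prob)
        temperatures: List[float],
        compression_ratio_threshold: float = 2.4,
        log_prob_threshold: float = -1.0,
    ) -> Tuple[str, float, float]:
        assert temperatures, "at least one temperature is required"
        for temperature in temperatures:
            text, avg_log_prob = decode(temperature)
            if (
                get_compression_ratio(text) <= compression_ratio_threshold
                and avg_log_prob >= log_prob_threshold
            ):
                break  # good enough; stop retrying
        return text, avg_log_prob, temperature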
@@ -500,16 +537,15 @@         return result, avg_log_prob, final_temperature
 
     def get_prompt(
         self,
-            "word_timestamps",
+        tokenizer: Tokenizer,
 
-            "word_timestamps",
 from typing import BinaryIO, List, Optional, Tuple, Union
 import os
-        self.frames_per_second = (
-            "word_timestamps",
+from faster_whisper.feature_extractor import FeatureExtractor
 import numpy as np
+from faster_whisper.feature_extractor import FeatureExtractor
 
-import itertools
+    ) -> List[int]:
         prompt = []
 
         if previous_tokens:
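
The surrounding lines suggest the standard Whisper prompt layout: previous-context tokens (truncated to half the context window) behind a start-of-previous marker, then the start-of-transcript sequence, then any user prefix. A schematic, not the library's exact code; token id 50361 is <|startofprev|> in Whisper's multilingual vocabulary:

    from typing import List, Optional

    SOT_PREV = 50361  # <|startofprev|> in the multilingual vocabulary

    def prompt_sketch(
        sot_sequence: List[int],
        previous_tokens: List[int],
        max_length: int = 448,
        prefix_tokens: Optional[List[int]] = None,
    ) -> List[int]:
        prompt = []
        if previous_tokens:
            prompt.append(SOT_PREV)
            prompt.extend(previous_tokens[-(max_length // 2 - 1) :])
        prompt.extend(sot_sequence)
        if prefix_tokens:
            prompt.extend(prefix_tokens)
        return prompt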
@@ -596,7 +633,7 @@         if len(text_tokens) == 0:
             return []
 
         result = self.model.align(
-            get_input(mel),
+            get_ctranslate2_storage(mel),
             tokenizer.sot_sequence,
             [text_tokens],
             num_frames,
@@ -649,7 +686,7 @@             )
         ]
 
 
-def get_input(segment):
+def get_ctranslate2_storage(segment: np.ndarray) -> ctranslate2.StorageView:
     segment = np.ascontiguousarray(segment)
     segment = np.expand_dims(segment, 0)
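
The renamed helper only wraps a contiguous, batched array for CTranslate2; the conversion can be reproduced directly with the public API:

    import ctranslate2
    import numpy as np

    segment = np.random.rand(80, 3000).astype(np.float32)  # illustrative mel features
    storage = ctranslate2.StorageView.from_array(
        np.expand_dims(np.ascontiguousarray(segment), 0)  # add a batch dimension
    )
    print(storage.shape)  # [1, 80, 3000]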
@@ -657,7 +694,7 @@     segment = ctranslate2.StorageView.from_array(segment)
     return segment
 
 
-def get_compression_ratio(text):
+def get_compression_ratio(text: str) -> float:
     text_bytes = text.encode("utf-8")
     return len(text_bytes) / len(zlib.compress(text_bytes))
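
The annotation makes the helper's contract explicit, and the metric itself is a cheap repetition detector: looping, degenerate output compresses far better than normal speech, so a ratio above the threshold (2.4 by default in Whisper-style decoders) marks a failed decode:

    from faster_whisper.transcribe import get_compression_ratio

    print(get_compression_ratio("The quick brown fox jumps over the lazy dog."))  # ~1.0
    print(get_compression_ratio("ha " * 100))  # far above 2.4: clearly repetitive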