
feat: Free gpu space after each inference run #493


Open
wants to merge 24 commits into base: gpu
24 commits
ac641d0
new: gpu package publish workflow
joein May 1, 2024
9591396
fix: do not run windows and mac os tests on gpu branch
joein May 2, 2024
b3461c0
refactoring: alter workflow names
joein May 2, 2024
9b1af0c
alter workflow
joein May 2, 2024
ee5fa07
fix: workflow dispatch can only be triggered from the default branch
joein May 2, 2024
ad62c7d
sync publish with main
joein May 3, 2024
238e8ca
add eofl
joein May 3, 2024
0499ea5
new: gpu package (#224)
joein May 3, 2024
4c50015
fix: Minimize gpu memory fragmentation
hh-space-invader Feb 28, 2025
a18f735
nit
hh-space-invader Mar 4, 2025
b8d30b1
new: Add arena extend strategy
hh-space-invader Mar 4, 2025
a761dcf
change initial chunk size
hh-space-invader Mar 4, 2025
b82e4d0
a
hh-space-invader Mar 4, 2025
c212c1f
specify shrinkage as run options not session options
hh-space-invader Mar 4, 2025
13b7d6d
specify shrinkage as run options not session options
hh-space-invader Mar 4, 2025
f63333d
specify shrinkage as run options not session options
hh-space-invader Mar 4, 2025
5c46b17
specify shrinkage as run options not session options
hh-space-invader Mar 4, 2025
758d339
new: Shrink empty arena for multi gpu settings
hh-space-invader Mar 4, 2025
4037e14
chore: Remove print statement
hh-space-invader Mar 4, 2025
1c016a2
fix: Fix multi gpu settings
hh-space-invader Mar 5, 2025
5ea3bbc
docs: Add description for changes
hh-space-invader Mar 5, 2025
3d90072
new: Added experiment to benchmark fastembed on gpu
hh-space-invader Mar 7, 2025
4bc5fbe
fix: Fix passing cuda and providers in single gpu settings
hh-space-invader Mar 13, 2025
9e21de2
fix: Pass providers and cuda to multimodal models
hh-space-invader Mar 13, 2025
3 changes: 1 addition & 2 deletions .github/workflows/python-tests.yml
@@ -1,4 +1,5 @@
name: Tests
run-name: Tests (gpu)

on:
push:
@@ -21,8 +22,6 @@ jobs:
- '3.13.x'
os:
- ubuntu-latest
- macos-latest
- windows-latest

runs-on: ${{ matrix.os }}

493 changes: 493 additions & 0 deletions experiments/Throughput_Across_Models_GPU.ipynb

Large diffs are not rendered by default.

14 changes: 12 additions & 2 deletions fastembed/common/onnx_model.py
@@ -68,7 +68,15 @@ def _load_onnx_model(
if device_id is None:
onnx_providers = ["CUDAExecutionProvider"]
else:
onnx_providers = [("CUDAExecutionProvider", {"device_id": device_id})]
# kSameAsRequested: Allocates only the requested memory, avoiding over-allocation.
# more precise than 'kNextPowerOfTwo', which grows memory aggressively.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
onnx_providers = [
(
"CUDAExecutionProvider",
{"device_id": device_id, "arena_extend_strategy": "kSameAsRequested"},
)
]
else:
onnx_providers = ["CPUExecutionProvider"]

@@ -132,5 +140,7 @@ def __init__(
def start(cls, model_name: str, cache_dir: str, **kwargs: Any) -> "EmbeddingWorker[T]":
return cls(model_name=model_name, cache_dir=cache_dir, **kwargs)

def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
def process(
self, items: Iterable[tuple[int, Any]], **kwargs: Any
) -> Iterable[tuple[int, Any]]:
raise NotImplementedError("Subclasses must implement this method")
19 changes: 17 additions & 2 deletions fastembed/common/utils.py
@@ -5,12 +5,12 @@
import unicodedata
from pathlib import Path
from itertools import islice
from typing import Iterable, Optional, TypeVar
from typing import Iterable, Optional, TypeVar, Sequence

import numpy as np
from numpy.typing import NDArray

from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, OnnxProvider

T = TypeVar("T")

@@ -67,3 +67,18 @@ def get_all_punctuation() -> set[str]:

def remove_non_alphanumeric(text: str) -> str:
return re.sub(r"[^\w\s]", " ", text, flags=re.UNICODE)


def is_cuda_enabled(cuda: bool, providers: Optional[Sequence[OnnxProvider]]) -> bool:
"""
Check if CUDA is enabled based on the `cuda` and `providers` parameters
"""
if cuda:
return True
if not providers:
return False
if isinstance(providers, str):
return "CUDAExecutionProvider" in providers
return isinstance(providers, (list, tuple)) and any(
isinstance(p, str) and "CUDAExecutionProvider" in p for p in providers
)
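
The calls below are illustrative, not part of the diff; they assume the helper is importable from fastembed.common.utils as defined above:

from fastembed.common.utils import is_cuda_enabled

is_cuda_enabled(cuda=True, providers=None)                        # True
is_cuda_enabled(cuda=False, providers=None)                       # False
is_cuda_enabled(cuda=False, providers=["CPUExecutionProvider"])   # False
is_cuda_enabled(cuda=False, providers=["CUDAExecutionProvider"])  # True
# (name, options) tuples are not str instances, so a tuple entry alone
# does not enable CUDA for this check:
is_cuda_enabled(cuda=False, providers=[("CUDAExecutionProvider", {"device_id": 0})])  # False
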
29 changes: 24 additions & 5 deletions fastembed/image/onnx_image_model.py
@@ -6,13 +6,14 @@

import numpy as np
from PIL import Image
import onnxruntime as ort

from fastembed.image.transform.operators import Compose
from fastembed.common.types import NumpyArray
from fastembed.common import ImageInput, OnnxProvider
from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
from fastembed.common.preprocessor_utils import load_preprocessor
from fastembed.common.utils import iter_batch
from fastembed.common.utils import iter_batch, is_cuda_enabled
from fastembed.parallel_processor import ParallelWorkerPool

# Holds type of the embedding result
@@ -74,7 +75,21 @@ def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputConte
encoded = np.array(self.processor(image_files))
onnx_input = self._build_onnx_input(encoded)
onnx_input = self._preprocess_onnx_input(onnx_input)
model_output = self.model.run(None, onnx_input) # type: ignore[union-attr]

run_options = ort.RunOptions()
providers = kwargs.get("providers", None)
cuda = kwargs.get("cuda", False)
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# helps prevent excessive memory retention, especially for dynamic workloads.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)

model_output = self.model.run(None, onnx_input, run_options) # type: ignore[union-attr]
embeddings = model_output[0].reshape(len(images), -1)
return OnnxOutputContext(model_output=embeddings)

@@ -104,7 +119,9 @@ def _embed_images(
self.load_onnx_model()

for batch in iter_batch(images, batch_size):
yield from self._post_process_onnx_output(self.onnx_embed(batch))
yield from self._post_process_onnx_output(
self.onnx_embed(batch, cuda=cuda, providers=providers)
)
else:
if parallel == 0:
parallel = os.cpu_count()
@@ -129,7 +146,9 @@ def _embed_images(


class ImageEmbeddingWorker(EmbeddingWorker[T]):
def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
def process(
self, items: Iterable[tuple[int, Any]], **kwargs: Any
) -> Iterable[tuple[int, Any]]:
for idx, batch in items:
embeddings = self.model.onnx_embed(batch)
embeddings = self.model.onnx_embed(batch, **kwargs)
yield idx, embeddings
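
In isolation, the shrinkage mechanism used in onnx_embed above reduces to the plain onnxruntime pattern below; session, inputs, and the device id are stand-ins, not names from this repository:

import onnxruntime as ort
from typing import Any

def run_with_arena_shrinkage(
    session: ort.InferenceSession, inputs: dict[str, Any], device_id: int = 0
) -> list[Any]:
    run_options = ort.RunOptions()
    # Tell ORT to shrink the CUDA arena for this device after the run finishes,
    # returning unused chunks instead of keeping them cached for later calls.
    run_options.add_run_config_entry(
        "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
    )
    return session.run(None, inputs, run_options)
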
55 changes: 46 additions & 9 deletions fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -6,13 +6,14 @@

import numpy as np
from PIL import Image
import onnxruntime as ort
from tokenizers import Encoding, Tokenizer

from fastembed.common import OnnxProvider, ImageInput
from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
from fastembed.common.preprocessor_utils import load_tokenizer, load_preprocessor
from fastembed.common.types import NumpyArray
from fastembed.common.utils import iter_batch
from fastembed.common.utils import iter_batch, is_cuda_enabled
from fastembed.image.transform.operators import Compose
from fastembed.parallel_processor import ParallelWorkerPool

@@ -103,7 +104,21 @@ def onnx_embed_text(
)

onnx_input = self._preprocess_onnx_text_input(onnx_input, **kwargs)
model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) # type: ignore[union-attr]

run_options = ort.RunOptions()
providers = kwargs.get("providers", None)
cuda = kwargs.get("cuda", False)
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# helps prevent excessive memory retention, especially for dynamic workloads.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)

model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input, run_options) # type: ignore[union-attr]
return OnnxOutputContext(
model_output=model_output[0],
attention_mask=onnx_input.get("attention_mask", attention_mask),
@@ -136,7 +151,9 @@ def _embed_documents(
if not hasattr(self, "model") or self.model is None:
self.load_onnx_model()
for batch in iter_batch(documents, batch_size):
yield from self._post_process_onnx_text_output(self.onnx_embed_text(batch))
yield from self._post_process_onnx_text_output(
self.onnx_embed_text(batch, cuda=cuda, providers=providers)
)
else:
if parallel == 0:
parallel = os.cpu_count()
@@ -169,7 +186,21 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu
encoded = np.array(self.processor(image_files))
onnx_input = {"pixel_values": encoded}
onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs)
model_output = self.model.run(None, onnx_input) # type: ignore[union-attr]

run_options = ort.RunOptions()
providers = kwargs.get("providers", None)
cuda = kwargs.get("cuda", False)
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# helps prevent excessive memory retention, especially for dynamic workloads.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)

model_output = self.model.run(None, onnx_input, run_options) # type: ignore[union-attr]
embeddings = model_output[0].reshape(len(images), -1)
return OnnxOutputContext(model_output=embeddings)

@@ -199,7 +230,9 @@ def _embed_images(
self.load_onnx_model()

for batch in iter_batch(images, batch_size):
yield from self._post_process_onnx_image_output(self.onnx_embed_image(batch))
yield from self._post_process_onnx_image_output(
self.onnx_embed_image(batch, cuda=cuda, providers=providers)
)
else:
if parallel == 0:
parallel = os.cpu_count()
@@ -241,9 +274,11 @@ def init_embedding(
) -> OnnxMultimodalModel:
raise NotImplementedError()

def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
def process(
self, items: Iterable[tuple[int, Any]], **kwargs: Any
) -> Iterable[tuple[int, Any]]:
for idx, batch in items:
onnx_output = self.model.onnx_embed_text(batch)
onnx_output = self.model.onnx_embed_text(batch, **kwargs)
yield idx, onnx_output


@@ -265,7 +300,9 @@ def init_embedding(
) -> OnnxMultimodalModel:
raise NotImplementedError()

def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
def process(
self, items: Iterable[tuple[int, Any]], **kwargs: Any
) -> Iterable[tuple[int, Any]]:
for idx, batch in items:
embeddings = self.model.onnx_embed_image(batch)
embeddings = self.model.onnx_embed_image(batch, **kwargs)
yield idx, embeddings
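
The same guard is now repeated in onnx_embed, onnx_embed_text, onnx_embed_image, and onnx_embed_pairs. A sketch of how it could be pulled into one helper; make_run_options is hypothetical and not part of this PR:

import onnxruntime as ort
from typing import Optional, Sequence

from fastembed.common.types import OnnxProvider
from fastembed.common.utils import is_cuda_enabled

def make_run_options(
    cuda: bool,
    providers: Optional[Sequence[OnnxProvider]],
    device_id: Optional[int] = None,
) -> ort.RunOptions:
    # Build RunOptions with arena shrinkage enabled whenever the run will
    # execute on the CUDAExecutionProvider.
    run_options = ort.RunOptions()
    if is_cuda_enabled(cuda, providers):
        device = str(device_id if isinstance(device_id, int) else 0)
        run_options.add_run_config_entry(
            "memory.enable_memory_arena_shrinkage", f"gpu:{device}"
        )
    return run_options

Each onnx_embed* call site above could then shrink to a single make_run_options(...) call before self.model.run.
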
6 changes: 4 additions & 2 deletions fastembed/parallel_processor.py
@@ -28,7 +28,9 @@ class Worker:
def start(cls, *args: Any, **kwargs: Any) -> "Worker":
raise NotImplementedError()

def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
def process(
self, items: Iterable[tuple[int, Any]], **kwargs: Any
) -> Iterable[tuple[int, Any]]:
raise NotImplementedError()


@@ -63,7 +65,7 @@ def input_queue_iterable() -> Iterable[Any]:
break
yield item

for processed_item in worker.process(input_queue_iterable()):
for processed_item in worker.process(input_queue_iterable(), **kwargs):
output_queue.put(processed_item)
except Exception as e: # pylint: disable=broad-except
logging.exception(e)
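
The updated signature means every Worker subclass now receives the pool's keyword arguments. A minimal sketch with a hypothetical DoublingWorker, only to show the pass-through:

from typing import Any, Iterable

from fastembed.parallel_processor import Worker

class DoublingWorker(Worker):
    @classmethod
    def start(cls, *args: Any, **kwargs: Any) -> "DoublingWorker":
        return cls()

    def process(
        self, items: Iterable[tuple[int, Any]], **kwargs: Any
    ) -> Iterable[tuple[int, Any]]:
        # kwargs arrive from the pool the same way cuda/providers reach the
        # embedding workers in the changes above.
        scale = kwargs.get("scale", 2)
        for idx, batch in items:
            yield idx, [value * scale for value in batch]
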
29 changes: 24 additions & 5 deletions fastembed/rerank/cross_encoder/onnx_text_model.py
@@ -4,6 +4,7 @@
from typing import Any, Iterable, Optional, Sequence, Type

import numpy as np
import onnxruntime as ort
from tokenizers import Encoding

from fastembed.common.onnx_model import (
@@ -14,7 +15,7 @@
)
from fastembed.common.types import NumpyArray
from fastembed.common.preprocessor_utils import load_tokenizer
from fastembed.common.utils import iter_batch
from fastembed.common.utils import iter_batch, is_cuda_enabled
from fastembed.parallel_processor import ParallelWorkerPool


@@ -71,7 +72,21 @@ def onnx_embed_pairs(self, pairs: list[tuple[str, str]], **kwargs: Any) -> OnnxO
tokenized_input = self.tokenize(pairs, **kwargs)
inputs = self._build_onnx_input(tokenized_input)
onnx_input = self._preprocess_onnx_input(inputs, **kwargs)
outputs = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) # type: ignore[union-attr]

run_options = ort.RunOptions()
providers = kwargs.get("providers", None)
cuda = kwargs.get("cuda", False)
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# Enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# Helps prevent excessive memory retention, especially for dynamic workloads.
# Source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)

outputs = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input, run_options) # type: ignore[union-attr]
relevant_output = outputs[0]
scores: NumpyArray = relevant_output[:, 0]
return OnnxOutputContext(model_output=scores)
@@ -110,7 +125,9 @@ def _rerank_pairs(
if not hasattr(self, "model") or self.model is None:
self.load_onnx_model()
for batch in iter_batch(pairs, batch_size):
yield from self._post_process_onnx_output(self.onnx_embed_pairs(batch, **kwargs))
yield from self._post_process_onnx_output(
self.onnx_embed_pairs(batch, cuda=cuda, providers=providers, **kwargs)
)
else:
if parallel == 0:
parallel = os.cpu_count()
@@ -163,7 +180,9 @@ def init_embedding(
) -> OnnxCrossEncoderModel:
raise NotImplementedError()

def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
def process(
self, items: Iterable[tuple[int, Any]], **kwargs: Any
) -> Iterable[tuple[int, Any]]:
for idx, batch in items:
onnx_output = self.model.onnx_embed_pairs(batch)
onnx_output = self.model.onnx_embed_pairs(batch, **kwargs)
yield idx, onnx_output
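
End to end, the flag reaches the reranker through the public wrapper. A hedged sketch, assuming fastembed's TextCrossEncoder class, its rerank(query, documents) signature, and the model name shown; all three are assumptions for illustration, not values taken from this diff:

from fastembed.rerank.cross_encoder import TextCrossEncoder

# Illustrative model name; any supported cross-encoder model works.
encoder = TextCrossEncoder("Xenova/ms-marco-MiniLM-L-6-v2", cuda=True)
scores = list(encoder.rerank("what is vector search?", ["document one", "document two"]))

With cuda=True, onnx_embed_pairs above builds the shrinking RunOptions, so the arena is released after each rerank call rather than retained between batches.
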
2 changes: 1 addition & 1 deletion fastembed/sparse/bm25.py
@@ -344,7 +344,7 @@ def start(cls, model_name: str, cache_dir: str, **kwargs: Any) -> "Bm25Worker":
return cls(model_name=model_name, cache_dir=cache_dir, **kwargs)

def process(
self, items: Iterable[tuple[int, Any]]
self, items: Iterable[tuple[int, Any]], **kwargs: Any
) -> Iterable[tuple[int, list[SparseEmbedding]]]:
for idx, batch in items:
onnx_output = self.model.raw_embed(batch)