From 7e539a2281ad6e077e22cace6d7ae6cb6b007a92 Mon Sep 17 00:00:00 2001
From: Daniel van Strien
Date: Tue, 12 Aug 2025 22:28:37 +0100
Subject: [PATCH 1/2] Add batch size parameter to CLI for Sentence Transformers

- Add --batch-size option to CLI with default value of 32
- Pass batch_size parameter through to compute_text_projection()
- Update _projection_for_texts() to use SentenceTransformer's built-in batch_size parameter
- Include batch_size in cache key for proper caching
- Add documentation for the new parameter

This lets users trade memory for speed when computing embeddings: larger
batch sizes are typically faster but use more memory, while smaller batch
sizes reduce peak memory at the cost of throughput.
---
 packages/backend/embedding_atlas/cli.py        |  8 ++++++++
 packages/backend/embedding_atlas/projection.py | 10 ++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/packages/backend/embedding_atlas/cli.py b/packages/backend/embedding_atlas/cli.py
index e48c0e7..71d63a8 100644
--- a/packages/backend/embedding_atlas/cli.py
+++ b/packages/backend/embedding_atlas/cli.py
@@ -130,6 +130,12 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost
     default=False,
     help="Allow execution of remote code when loading models from Hugging Face Hub.",
 )
+@click.option(
+    "--batch-size",
+    type=int,
+    default=32,
+    help="Batch size for processing embeddings. Larger values use more memory but may be faster. Smaller values use less memory.",
+)
 @click.option(
     "--x",
     "x_column",
@@ -207,6 +213,7 @@ def main(
     enable_projection: bool,
     model: str | None,
     trust_remote_code: bool,
+    batch_size: int,
     x_column: str | None,
     y_column: str | None,
     neighbors_column: str | None,
@@ -280,6 +287,7 @@ def main(
             neighbors=new_neighbors_column,
             model=model,
             trust_remote_code=trust_remote_code,
+            batch_size=batch_size,
             umap_args=umap_args,
         )
     elif image is not None:
diff --git a/packages/backend/embedding_atlas/projection.py b/packages/backend/embedding_atlas/projection.py
index e018e61..5e030e8 100644
--- a/packages/backend/embedding_atlas/projection.py
+++ b/packages/backend/embedding_atlas/projection.py
@@ -92,6 +92,7 @@ def _projection_for_texts(
     texts: list[str],
     model: str | None = None,
     trust_remote_code: bool = False,
+    batch_size: int = 32,
     umap_args: dict = {},
 ) -> Projection:
     if model is None:
@@ -102,6 +103,7 @@ def _projection_for_texts(
             "version": 1,
             "texts": texts,
             "model": model,
+            "batch_size": batch_size,
             "umap_args": umap_args,
         }
     )
@@ -118,8 +120,8 @@
     logger.info("Loading model %s...", model)
     transformer = SentenceTransformer(model, trust_remote_code=trust_remote_code)
 
-    logger.info("Running embedding for %d texts...", len(texts))
-    hidden_vectors = transformer.encode(texts)
+    logger.info("Running embedding for %d texts with batch size %d...", len(texts), batch_size)
+    hidden_vectors = transformer.encode(texts, batch_size=batch_size)
 
     result = _run_umap(hidden_vectors, umap_args)
     Projection.save(cpath, result)
@@ -207,6 +209,7 @@ def compute_text_projection(
     neighbors: str | None = "neighbors",
     model: str | None = None,
     trust_remote_code: bool = False,
+    batch_size: int = 32,
     umap_args: dict = {},
 ):
     """
@@ -225,6 +228,8 @@
         model: str, name or path of the SentenceTransformer model to use for embedding.
         trust_remote_code: bool, whether to trust and execute remote code when
             loading the model from HuggingFace Hub. Default is False.
+        batch_size: int, batch size for processing embeddings. Larger values use more
+            memory but may be faster. Default is 32.
         umap_args: dict, additional keyword arguments to pass to the UMAP algorithm
             (e.g., n_neighbors, min_dist, metric).
 
@@ -237,6 +242,7 @@ def compute_text_projection(
         list(text_series),
         model=model,
         trust_remote_code=trust_remote_code,
+        batch_size=batch_size,
         umap_args=umap_args,
     )
     data_frame[x] = proj.projection[:, 0]

From dfb792639393076d756a21987d015805328a8d0a Mon Sep 17 00:00:00 2001
From: Daniel van Strien
Date: Wed, 13 Aug 2025 20:00:04 +0100
Subject: [PATCH 2/2] Add batch size support for image processing with dynamic defaults

- Add batch_size parameter to image processing functions
- Default the CLI option to None and resolve modality-specific defaults (32 for text, 16 for images)
- Log the chosen default so users know the option can be tuned
- Include batch_size in image cache keys
---
 packages/backend/embedding_atlas/cli.py        |  7 +++---
 .../backend/embedding_atlas/projection.py      | 23 +++++++++++++++----
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/packages/backend/embedding_atlas/cli.py b/packages/backend/embedding_atlas/cli.py
index 71d63a8..a32636e 100644
--- a/packages/backend/embedding_atlas/cli.py
+++ b/packages/backend/embedding_atlas/cli.py
@@ -133,8 +133,8 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost
 @click.option(
     "--batch-size",
     type=int,
-    default=32,
-    help="Batch size for processing embeddings. Larger values use more memory but may be faster. Smaller values use less memory.",
+    default=None,
+    help="Batch size for processing embeddings (default: 32 for text, 16 for images). Larger values use more memory but may be faster.",
 )
 @click.option(
     "--x",
@@ -213,7 +213,7 @@ def main(
     enable_projection: bool,
     model: str | None,
     trust_remote_code: bool,
-    batch_size: int,
+    batch_size: int | None,
     x_column: str | None,
     y_column: str | None,
     neighbors_column: str | None,
@@ -299,6 +299,7 @@ def main(
             neighbors=new_neighbors_column,
             model=model,
             trust_remote_code=trust_remote_code,
+            batch_size=batch_size,
             umap_args=umap_args,
         )
     else:
diff --git a/packages/backend/embedding_atlas/projection.py b/packages/backend/embedding_atlas/projection.py
index 5e030e8..6d5b849 100644
--- a/packages/backend/embedding_atlas/projection.py
+++ b/packages/backend/embedding_atlas/projection.py
@@ -92,7 +92,7 @@ def _projection_for_texts(
     texts: list[str],
     model: str | None = None,
     trust_remote_code: bool = False,
-    batch_size: int = 32,
+    batch_size: int | None = None,
     umap_args: dict = {},
 ) -> Projection:
     if model is None:
@@ -117,6 +117,11 @@
     # Import on demand.
     from sentence_transformers import SentenceTransformer
 
+    # Set default batch size if not provided
+    if batch_size is None:
+        batch_size = 32
+        logger.info("Using default batch size of %d for text. Adjust with --batch-size if you encounter memory issues or want to speed up processing.", batch_size)
+
     logger.info("Loading model %s...", model)
     transformer = SentenceTransformer(model, trust_remote_code=trust_remote_code)
 
@@ -132,6 +137,7 @@ def _projection_for_images(
     images: list,
     model: str | None = None,
     trust_remote_code: bool = False,
+    batch_size: int | None = None,
     umap_args: dict = {},
 ) -> Projection:
     if model is None:
@@ -142,6 +148,7 @@ def _projection_for_images(
             "version": 1,
             "images": images,
             "model": model,
+            "batch_size": batch_size,
             "umap_args": umap_args,
         }
     )
@@ -172,9 +179,13 @@ def load_image(value):
 
     pipe = pipeline("image-feature-extraction", model=model, device_map="auto")
 
-    logger.info("Running embedding for %d images...", len(images))
+    # Set default batch size if not provided
+    if batch_size is None:
+        batch_size = 16
+        logger.info("Using default batch size of %d for images. Adjust with --batch-size if you encounter memory issues or want to speed up processing.", batch_size)
+
+    logger.info("Running embedding for %d images with batch size %d...", len(images), batch_size)
     tensors = []
-    batch_size = 16
 
     current_batch = []
 
@@ -209,7 +220,7 @@ def compute_text_projection(
     neighbors: str | None = "neighbors",
     model: str | None = None,
     trust_remote_code: bool = False,
-    batch_size: int = 32,
+    batch_size: int | None = None,
     umap_args: dict = {},
 ):
     """
@@ -320,6 +331,7 @@ def compute_image_projection(
     neighbors: str | None = "neighbors",
     model: str | None = None,
     trust_remote_code: bool = False,
+    batch_size: int | None = None,
     umap_args: dict = {},
 ):
     """
@@ -338,6 +350,8 @@
         model: str, name or path of the model to use for embedding.
         trust_remote_code: bool, whether to trust and execute remote code when
             loading the model from HuggingFace Hub. Default is False.
+        batch_size: int, batch size for processing images. Larger values use more
+            memory but may be faster. Default is 16.
         umap_args: dict, additional keyword arguments to pass to the UMAP algorithm
             (e.g., n_neighbors, min_dist, metric).
 
@@ -350,6 +364,7 @@
         list(image_series),
         model=model,
         trust_remote_code=trust_remote_code,
+        batch_size=batch_size,
         umap_args=umap_args,
     )
     data_frame[x] = proj.projection[:, 0]
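
For reference, a minimal sketch of the new parameter in use. The keyword argument
and module path follow the diffs above, but the console command name, the input
file name, and the positional data_frame/text arguments of compute_text_projection
are assumptions rather than something these patches show.

    # CLI sketch (entry-point and file names assumed): cap peak memory with a
    # smaller batch size.
    #   embedding-atlas data.parquet --batch-size 8
    #
    # Python API sketch, assuming a pandas DataFrame with a "text" column and
    # the default output columns "projection_x"/"projection_y":
    import pandas as pd

    from embedding_atlas.projection import compute_text_projection

    df = pd.DataFrame({"text": ["first document", "second document", "third document"]})
    compute_text_projection(
        df,            # data frame, annotated in place (argument name assumed)
        "text",        # column containing the text to embed
        batch_size=8,  # new in these patches; omit to fall back to the default of 32
    )
    print(df[["projection_x", "projection_y"]].head())

Note that because batch_size is part of the cache key, changing it forces the
projection to be recomputed even for otherwise identical inputs.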
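
The dynamic-default pattern in the second patch is worth spelling out: a None
sentinel keeps a single --batch-size flag valid for both modalities, and each
embedding path resolves its own default. Below is a condensed sketch of that
pattern; resolve_batch_size and _DEFAULT_BATCH_SIZES are hypothetical names,
since the patches inline this logic in _projection_for_texts and
_projection_for_images rather than sharing a helper.

    import logging

    logger = logging.getLogger(__name__)

    # Modality-specific defaults, matching the second patch.
    _DEFAULT_BATCH_SIZES = {"text": 32, "images": 16}

    def resolve_batch_size(batch_size: int | None, modality: str) -> int:
        # None means the user did not pass --batch-size: fall back to the
        # modality default and log it so the knob stays discoverable.
        if batch_size is None:
            batch_size = _DEFAULT_BATCH_SIZES[modality]
            logger.info(
                "Using default batch size of %d for %s. Adjust with "
                "--batch-size if you encounter memory issues or want to "
                "speed up processing.",
                batch_size,
                modality,
            )
        return batch_size

One trade-off of the sentinel approach is that --help cannot display a single
resolved default, which is why the option's help string spells out both values.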