apple · donghaoren · Aug 11, 2025 · Aug 11, 2025
diff --git a/packages/backend/README.md b/packages/backend/README.md
@@ -47,6 +47,16 @@ embedding-atlas path_to_dataset.parquet --x projection_x --y projection_y
 
 You may use the [SentenceTransformers](https://sbert.net/) package to compute high-dimensional embeddings from text data, and then use the [UMAP](https://umap-learn.readthedocs.io/en/latest/index.html) package to compute 2D projections.
 
+### Using Pre-computed Vectors
+
+If you already have pre-computed embedding vectors (but not the 2D projections), you can specify the column containing the vectors with `--vector`:
+
+```bash
+embedding-atlas path_to_dataset.parquet --vector embedding_vectors
+```
+
+This will apply UMAP dimensionality reduction to your pre-existing vectors without recomputing embeddings. Each vector should be stored as a list or NumPy array in your dataset, and all vectors must have the same length.
+
 You may also specify a column for pre-computed nearest neighbors:
 
 ```bash

diff --git a/packages/backend/embedding_atlas/cli.py b/packages/backend/embedding_atlas/cli.py
@@ -104,6 +104,7 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost
 @click.argument("inputs", nargs=-1, required=True)
 @click.option("--text", default=None, help="Column containing text data.")
 @click.option("--image", default=None, help="Column containing image data.")
+@click.option("--vector", default=None, help="Column containing pre-computed vector embeddings.")
 @click.option(
     "--split",
     default=[],
@@ -199,6 +200,7 @@ def main(
     inputs,
     text: str | None,
     image: str | None,
+    vector: str | None,
     split: list[str] | None,
     enable_embedding: bool,
     model: str | None,
@@ -228,8 +230,8 @@ def main(
     print(df)
 
     if enable_embedding and (x_column is None or y_column is None):
-        # No x, y column selected, first see if text column is specified, if not, ask for it
-        if text is None and image is None:
+        # No x, y column selected; first check whether a text/image/vector column is specified, and if not, ask for one
+        if text is None and image is None and vector is None:
             text = prompt_for_column(
                 df, "Select a column you want to run the embedding on"
             )
@@ -243,8 +245,8 @@ def main(
         if umap_metric is not None:
             umap_args["metric"] = umap_metric
         # Run embedding and projection
-        if text is not None or image is not None:
-            from .projection import compute_image_projection, compute_text_projection
+        if text is not None or image is not None or vector is not None:
+            from .projection import compute_image_projection, compute_text_projection, compute_vector_projection
 
             x_column = find_column_name(df.columns, "projection_x")
             y_column = find_column_name(df.columns, "projection_y")
@@ -254,7 +256,16 @@ def main(
             else:
                 # If neighbors_column is already specified, don't overwrite it.
                 new_neighbors_column = None
-            if text is not None:
+            if vector is not None:
+                compute_vector_projection(
+                    df,
+                    vector,
+                    x=x_column,
+                    y=y_column,
+                    neighbors=new_neighbors_column,
+                    umap_args=umap_args,
+                )
+            elif text is not None:
                 compute_text_projection(
                     df,
                     text,

diff --git a/packages/backend/embedding_atlas/projection.py b/packages/backend/embedding_atlas/projection.py
@@ -248,6 +248,64 @@ def compute_text_projection(
         ]
 
 
+def compute_vector_projection(
+    data_frame: pd.DataFrame,
+    vector: str,
+    x: str = "projection_x",
+    y: str = "projection_y",
+    neighbors: str | None = "neighbors",
+    umap_args: dict = {},
+):
+    """
+    Generate 2D projections from pre-existing vector embeddings using UMAP.
+
+    This function takes pre-computed vector embeddings and reduces their dimensionality
+    to 2D coordinates using UMAP for visualization purposes.
+
+    Args:
+        data_frame: pandas DataFrame containing the vector data to process.
+        vector: str, column name containing the pre-computed vector embeddings.
+                Each entry should be a list or numpy array of numbers.
+        x: str, column name where the UMAP X coordinates will be stored.
+        y: str, column name where the UMAP Y coordinates will be stored.
+        neighbors: str, column name where the nearest neighbor indices will be stored.
+        umap_args: dict, additional keyword arguments to pass to the UMAP algorithm
+            (e.g., n_neighbors, min_dist, metric).
+
+    Returns:
+        The input DataFrame with added columns for X, Y coordinates and nearest neighbors.
+    """
+    # Convert vector column to numpy array
+    vector_series = data_frame[vector]
+
+    # Convert each vector entry to numpy array and stack them
+    vector_list = []
+    for vector in vector_series:
+        if isinstance(vector, list):
+            vector_array = np.array(vector)
+        elif isinstance(vector, np.ndarray):
+            vector_array = vector
+        else:
+            # Try to convert to numpy array
+            vector_array = np.array(vector)
+        vector_list.append(vector_array)
+
+    # Stack all vectors into a single numpy array
+    hidden_vectors = np.stack(vector_list)
+
+    # Run UMAP on the pre-existing vectors
+    proj = _run_umap(hidden_vectors, umap_args)
+
+    # Add projection results to dataframe
+    data_frame[x] = proj.projection[:, 0]
+    data_frame[y] = proj.projection[:, 1]
+    if neighbors is not None:
+        data_frame[neighbors] = [
+            {"distances": b, "ids": a}  # ID is always the same as the row index.
+            for a, b in zip(proj.knn_indices, proj.knn_distances)
+        ]
+
+
 def compute_image_projection(
     data_frame: pd.DataFrame,
     image: str,