diff --git a/packages/backend/README.md b/packages/backend/README.md index 386d5da..35d127b 100644 --- a/packages/backend/README.md +++ b/packages/backend/README.md @@ -47,6 +47,16 @@ embedding-atlas path_to_dataset.parquet --x projection_x --y projection_y You may use the [SentenceTransformers](https://sbert.net/) package to compute high-dimensional embeddings from text data, and then use the [UMAP](https://umap-learn.readthedocs.io/en/latest/index.html) package to compute 2D projections. +### Using Pre-computed Vectors + +If you already have pre-computed embedding vectors (but not the 2D projections), you can specify the column containing the vectors with `--vector`: + +```bash +embedding-atlas path_to_dataset.parquet --vector embedding_vectors +``` + +This will apply UMAP dimensionality reduction to your pre-existing vectors without recomputing embeddings. The vectors should be stored as lists or numpy arrays in your dataset. + You may also specify a column for pre-computed nearest neighbors: ```bash diff --git a/packages/backend/embedding_atlas/cli.py b/packages/backend/embedding_atlas/cli.py index 3df11b6..9c4c71e 100644 --- a/packages/backend/embedding_atlas/cli.py +++ b/packages/backend/embedding_atlas/cli.py @@ -104,6 +104,7 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost @click.argument("inputs", nargs=-1, required=True) @click.option("--text", default=None, help="Column containing text data.") @click.option("--image", default=None, help="Column containing image data.") +@click.option("--vector", default=None, help="Column containing pre-computed vector embeddings.") @click.option( "--split", default=[], @@ -199,6 +200,7 @@ def main( inputs, text: str | None, image: str | None, + vector: str | None, split: list[str] | None, enable_embedding: bool, model: str | None, @@ -228,8 +230,8 @@ def main( print(df) if enable_embedding and (x_column is None or y_column is None): - # No x, y column selected, first see if text column is specified, if not, ask for it - if text is None and image is None: + # No x, y column selected, first see if text/image/vectors column is specified, if not, ask for it + if text is None and image is None and vector is None: text = prompt_for_column( df, "Select a column you want to run the embedding on" ) @@ -243,8 +245,8 @@ def main( if umap_metric is not None: umap_args["metric"] = umap_metric # Run embedding and projection - if text is not None or image is not None: - from .projection import compute_image_projection, compute_text_projection + if text is not None or image is not None or vector is not None: + from .projection import compute_image_projection, compute_text_projection, compute_vector_projection x_column = find_column_name(df.columns, "projection_x") y_column = find_column_name(df.columns, "projection_y") @@ -254,7 +256,16 @@ def main( else: # If neighbors_column is already specified, don't overwrite it. new_neighbors_column = None - if text is not None: + if vector is not None: + compute_vector_projection( + df, + vector, + x=x_column, + y=y_column, + neighbors=new_neighbors_column, + umap_args=umap_args, + ) + elif text is not None: compute_text_projection( df, text, diff --git a/packages/backend/embedding_atlas/projection.py b/packages/backend/embedding_atlas/projection.py index 3de478c..e018e61 100644 --- a/packages/backend/embedding_atlas/projection.py +++ b/packages/backend/embedding_atlas/projection.py @@ -248,6 +248,64 @@ def compute_text_projection( ] +def compute_vector_projection( + data_frame: pd.DataFrame, + vector: str, + x: str = "projection_x", + y: str = "projection_y", + neighbors: str | None = "neighbors", + umap_args: dict = {}, +): + """ + Generate 2D projections from pre-existing vector embeddings using UMAP. + + This function takes pre-computed vector embeddings and reduces their dimensionality + to 2D coordinates using UMAP for visualization purposes. + + Args: + data_frame: pandas DataFrame containing the vector data to process. + vector: str, column name containing the pre-computed vector embeddings. + Each entry should be a list or numpy array of numbers. + x: str, column name where the UMAP X coordinates will be stored. + y: str, column name where the UMAP Y coordinates will be stored. + neighbors: str, column name where the nearest neighbor indices will be stored. + umap_args: dict, additional keyword arguments to pass to the UMAP algorithm + (e.g., n_neighbors, min_dist, metric). + + Returns: + The input DataFrame with added columns for X, Y coordinates and nearest neighbors. + """ + # Convert vector column to numpy array + vector_series = data_frame[vector] + + # Convert each vector entry to numpy array and stack them + vector_list = [] + for vector in vector_series: + if isinstance(vector, list): + vector_array = np.array(vector) + elif isinstance(vector, np.ndarray): + vector_array = vector + else: + # Try to convert to numpy array + vector_array = np.array(vector) + vector_list.append(vector_array) + + # Stack all vectors into a single numpy array + hidden_vectors = np.stack(vector_list) + + # Run UMAP on the pre-existing vectors + proj = _run_umap(hidden_vectors, umap_args) + + # Add projection results to dataframe + data_frame[x] = proj.projection[:, 0] + data_frame[y] = proj.projection[:, 1] + if neighbors is not None: + data_frame[neighbors] = [ + {"distances": b, "ids": a} # ID is always the same as the row index. + for a, b in zip(proj.knn_indices, proj.knn_distances) + ] + + def compute_image_projection( data_frame: pd.DataFrame, image: str,