Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions packages/backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ embedding-atlas path_to_dataset.parquet --x projection_x --y projection_y

You may use the [SentenceTransformers](https://sbert.net/) package to compute high-dimensional embeddings from text data, and then use the [UMAP](https://umap-learn.readthedocs.io/en/latest/index.html) package to compute 2D projections.

### Using Pre-computed Vectors

If you already have pre-computed embedding vectors (but not the 2D projections), you can specify the column containing the vectors with `--vector`:

```bash
embedding-atlas path_to_dataset.parquet --vector embedding_vectors
```

This applies UMAP dimensionality reduction directly to your existing vectors, without recomputing any embeddings. Each row of the column should hold one vector, stored as a list or NumPy array, and all vectors must have the same length.

You may also specify a column for pre-computed nearest neighbors:

```bash
Expand Down
21 changes: 16 additions & 5 deletions packages/backend/embedding_atlas/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost
@click.argument("inputs", nargs=-1, required=True)
@click.option("--text", default=None, help="Column containing text data.")
@click.option("--image", default=None, help="Column containing image data.")
@click.option("--vector", default=None, help="Column containing pre-computed vector embeddings.")
@click.option(
"--split",
default=[],
Expand Down Expand Up @@ -199,6 +200,7 @@ def main(
inputs,
text: str | None,
image: str | None,
vector: str | None,
split: list[str] | None,
enable_embedding: bool,
model: str | None,
Expand Down Expand Up @@ -228,8 +230,8 @@ def main(
print(df)

if enable_embedding and (x_column is None or y_column is None):
# No x, y column selected, first see if text column is specified, if not, ask for it
if text is None and image is None:
# No x/y columns selected: first check whether a text, image, or vector column was specified; if not, ask for one
if text is None and image is None and vector is None:
text = prompt_for_column(
df, "Select a column you want to run the embedding on"
)
Expand All @@ -243,8 +245,8 @@ def main(
if umap_metric is not None:
umap_args["metric"] = umap_metric
# Run embedding and projection
if text is not None or image is not None:
from .projection import compute_image_projection, compute_text_projection
if text is not None or image is not None or vector is not None:
from .projection import compute_image_projection, compute_text_projection, compute_vector_projection

x_column = find_column_name(df.columns, "projection_x")
y_column = find_column_name(df.columns, "projection_y")
Expand All @@ -254,7 +256,16 @@ def main(
else:
# If neighbors_column is already specified, don't overwrite it.
new_neighbors_column = None
if text is not None:
if vector is not None:
compute_vector_projection(
df,
vector,
x=x_column,
y=y_column,
neighbors=new_neighbors_column,
umap_args=umap_args,
)
elif text is not None:
compute_text_projection(
df,
text,
Expand Down
58 changes: 58 additions & 0 deletions packages/backend/embedding_atlas/projection.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,64 @@ def compute_text_projection(
]


def compute_vector_projection(
data_frame: pd.DataFrame,
vector: str,
x: str = "projection_x",
y: str = "projection_y",
neighbors: str | None = "neighbors",
umap_args: dict = {},
):
"""
Generate 2D projections from pre-existing vector embeddings using UMAP.

This function takes pre-computed vector embeddings and reduces their dimensionality
to 2D coordinates using UMAP for visualization purposes.

Args:
data_frame: pandas DataFrame containing the vector data to process.
vector: str, column name containing the pre-computed vector embeddings.
Each entry should be a list or numpy array of numbers.
x: str, column name where the UMAP X coordinates will be stored.
y: str, column name where the UMAP Y coordinates will be stored.
neighbors: str, column name where the nearest neighbor indices will be stored.
umap_args: dict, additional keyword arguments to pass to the UMAP algorithm
(e.g., n_neighbors, min_dist, metric).

Returns:
The input DataFrame with added columns for X, Y coordinates and nearest neighbors.
"""
# Convert vector column to numpy array
vector_series = data_frame[vector]

# Convert each vector entry to numpy array and stack them
vector_list = []
for vector in vector_series:
if isinstance(vector, list):
vector_array = np.array(vector)
elif isinstance(vector, np.ndarray):
vector_array = vector
else:
# Try to convert to numpy array
vector_array = np.array(vector)
vector_list.append(vector_array)

# Stack all vectors into a single numpy array
hidden_vectors = np.stack(vector_list)

# Run UMAP on the pre-existing vectors
proj = _run_umap(hidden_vectors, umap_args)

# Add projection results to dataframe
data_frame[x] = proj.projection[:, 0]
data_frame[y] = proj.projection[:, 1]
if neighbors is not None:
data_frame[neighbors] = [
{"distances": b, "ids": a} # ID is always the same as the row index.
for a, b in zip(proj.knn_indices, proj.knn_distances)
]


def compute_image_projection(
data_frame: pd.DataFrame,
image: str,
Expand Down