1
1
# /// script
2
2
# requires-python = ">=3.11"
3
- # dependencies = ["click", "datasets", "pandas", "sentence-transformers", "umap-learn"]
3
+ # dependencies = ["click", "datasets", "pandas", "sentence-transformers", "umap-learn", "duckdb" ]
4
4
# ///
5
5
6
6
import json
7
7
import os
8
8
import shutil
9
+ from pathlib import Path
9
10
10
11
import click
12
+ import duckdb
11
13
import numpy as np
12
14
import pandas as pd
13
15
from datasets import load_dataset
@@ -45,8 +47,9 @@ def add_embedding_projection(df: pd.DataFrame, text: str):
45
47
@click .command ()
46
48
@click .option ("--output" , default = "demo-data" )
47
49
def main (output : str ):
48
- shutil .rmtree (output , ignore_errors = True )
49
- os .makedirs (output , exist_ok = True )
50
+ output_path = Path (output )
51
+ shutil .rmtree (output_path , ignore_errors = True )
52
+ output_path .mkdir (parents = True , exist_ok = True )
50
53
51
54
name = "spawn99/wine-reviews"
52
55
columns = [
@@ -66,7 +69,26 @@ def main(output: str):
66
69
67
70
add_embedding_projection (df , text = "description" )
68
71
69
- df .to_parquet (os .path .join (output , "dataset.parquet" ), index = False )
72
+ # Set up DuckDB with Hilbert-curve support via the `lindel` community extension.
73
+ # See https://duckdb.org/2025/06/06/advanced-sorting-for-fast-selective-queries.html
74
+ conn = duckdb .connect ()
75
+
76
+ conn .execute ("INSTALL lindel FROM community;" )
77
+ conn .execute ("LOAD lindel;" )
78
+
79
+ conn .register ("wine_data" , df )
80
+
81
+ # Sort rows by the Hilbert-curve encoding of (projection_x, projection_y) and write them to dataset.parquet.
82
+ conn .execute (f"""
83
+ COPY (
84
+ SELECT *
85
+ FROM wine_data
86
+ ORDER BY hilbert_encode([
87
+ projection_x,
88
+ projection_y
89
+ ]::FLOAT[2])
90
+ ) TO '{ output_path / "dataset.parquet" } ' (FORMAT PARQUET)
91
+ """ )
70
92
71
93
metadata = {
72
94
"columns" : {
@@ -79,7 +101,7 @@ def main(output: str):
79
101
"database" : {"type" : "wasm" , "load" : True },
80
102
}
81
103
82
- with open (os . path . join ( output , "metadata.json" ) , "wb" ) as f :
104
+ with open (output_path / "metadata.json" , "wb" ) as f :
83
105
f .write (json .dumps (metadata ).encode ("utf-8" ))
84
106
85
107
0 commit comments