Skip to content

Commit ae4d0f0

Browse files
committed
Add: String distances to SQLite
1 parent 6334983 commit ae4d0f0

File tree

3 files changed

+412
-17
lines changed

3 files changed

+412
-17
lines changed

python/scripts/test_sqlite.py

Lines changed: 148 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,150 @@
1616
dimensions = [3, 97, 256]
1717

1818

19+
def test_sqlite_minimal_json_cosine_vector_search():
20+
"""Minimal test for searching JSON vectors in an SQLite database."""
21+
conn = sqlite3.connect(":memory:")
22+
conn.enable_load_extension(True)
23+
conn.load_extension(usearch.sqlite)
24+
25+
cursor = conn.cursor()
26+
27+
# Create a table with a JSON column for vectors
28+
cursor.executescript(
29+
"""
30+
CREATE TABLE vectors_table (
31+
id SERIAL PRIMARY KEY,
32+
vector JSON NOT NULL
33+
);
34+
INSERT INTO vectors_table (id, vector)
35+
VALUES
36+
(42, '[1.0, 2.0, 3.0]'),
37+
(43, '[4.0, 5.0, 6.0]');
38+
"""
39+
)
40+
# Compute the distances to [7.0, 8.0, 9.0] using
41+
# the `distance_cosine_f32` extension function
42+
cursor.execute(
43+
"""
44+
SELECT vt.id,
45+
distance_cosine_f32(vt.vector, '[7.0, 8.0, 9.0]') AS distance
46+
FROM vectors_table AS vt;
47+
"""
48+
)
49+
ids_and_distances = list(cursor.fetchall())
50+
assert [ids_and_distances[0][0], ids_and_distances[1][0]] == [42, 43]
51+
assert ids_and_distances[0][1] < 0.05 and ids_and_distances[1][1] < 0.002
52+
53+
54+
def test_sqlite_minimal_text_search():
55+
"""Minimal test for Unicode strings in an SQLite database."""
56+
conn = sqlite3.connect(":memory:")
57+
conn.enable_load_extension(True)
58+
conn.load_extension(usearch.sqlite)
59+
60+
cursor = conn.cursor()
61+
62+
# Create a table with a TEXT column for strings
63+
str42 = "école" # 6 codepoints (runes), 7 bytes
64+
str43 = "école" # 5 codepoints (runes), 6 bytes
65+
str44 = "écolé" # 5 codepoints (runes), 7 bytes
66+
assert (
67+
str42 != str43
68+
), "etter 'é' as a single character vs 'e' + '´' are not the same"
69+
70+
# Inject the different strings into the table
71+
cursor.executescript(
72+
f"""
73+
CREATE TABLE strings_table (
74+
id SERIAL PRIMARY KEY,
75+
word TEXT NOT NULL
76+
);
77+
INSERT INTO strings_table (id, word)
78+
VALUES
79+
(42, '{str42}'),
80+
(43, '{str43}');
81+
"""
82+
)
83+
cursor.execute(
84+
f"""
85+
SELECT st.id,
86+
87+
distance_levenshtein_bytes(st.word, '{str44}') AS levenshtein_bytes,
88+
distance_levenshtein_unicode(st.word, '{str44}') AS levenshtein_unicode,
89+
distance_hamming_bytes(st.word, '{str44}') AS hamming_bytes,
90+
distance_hamming_unicode(st.word, '{str44}') AS hamming_unicode,
91+
92+
distance_levenshtein_bytes(st.word, '{str44}', 2) AS levenshtein_bytes_bounded,
93+
distance_levenshtein_unicode(st.word, '{str44}', 2) AS levenshtein_unicode_bounded,
94+
distance_hamming_bytes(st.word, '{str44}', 2) AS hamming_bytes_bounded,
95+
distance_hamming_unicode(st.word, '{str44}', 2) AS hamming_unicode_bounded
96+
FROM strings_table AS st;
97+
"""
98+
)
99+
ids_and_distances = list(cursor.fetchall())
100+
assert ids_and_distances[0] == (42, 5, 3, 7, 6, 2, 2, 2, 2)
101+
assert ids_and_distances[1] == (43, 2, 1, 2, 1, 2, 1, 2, 1)
102+
103+
104+
def test_sqlite_blob_bits_vector_search():
105+
"""Minimal test for searching binary vectors in an SQLite database."""
106+
107+
conn = sqlite3.connect(":memory:")
108+
conn.enable_load_extension(True)
109+
conn.load_extension(usearch.sqlite)
110+
111+
cursor = conn.cursor()
112+
113+
# Create a table with a BLOB column for binary vectors
114+
cursor.executescript(
115+
"""
116+
CREATE TABLE binary_vectors (
117+
id SERIAL PRIMARY KEY,
118+
vector BLOB NOT NULL
119+
);
120+
INSERT INTO binary_vectors (id, vector)
121+
VALUES
122+
(42, X'FFFFFF'), -- 111111111111111111111111 in binary
123+
(43, X'000000'); -- 000000000000000000000000 in binary
124+
"""
125+
)
126+
127+
# Compute the distances between binary vectors and a sample vector using
128+
# the `distance_hamming_binary` and `distance_jaccard_binary` extension functions
129+
cursor.execute(
130+
"""
131+
SELECT bv.id,
132+
distance_hamming_binary(bv.vector, X'FFFF00') AS hamming_distance,
133+
distance_jaccard_binary(bv.vector, X'FFFF00') AS jaccard_distance
134+
FROM binary_vectors AS bv;
135+
"""
136+
)
137+
138+
ids_and_distances = list(cursor.fetchall())
139+
np.testing.assert_array_almost_equal(ids_and_distances[0], (42, 8.0, 1.0 / 3))
140+
np.testing.assert_array_almost_equal(ids_and_distances[1], (43, 16.0, 1.0))
141+
142+
19143
@pytest.mark.parametrize("num_vectors", batch_sizes)
20144
@pytest.mark.parametrize("ndim", dimensions)
21145
def test_sqlite_distances_in_high_dimensions(num_vectors: int, ndim: int):
146+
"""
147+
Test the computation of cosine distances in high-dimensional spaces with random vectors stored in an SQLite database.
148+
149+
This function tests the accuracy and consistency of cosine distance calculations between vectors in different formats:
150+
- distance_cosine_f32(JSON, JSON)
151+
- distance_cosine_f32(BLOB, BLOB)
152+
- distance_cosine_f16(BLOB, BLOB)
153+
154+
The vectors are stored and retrieved as JSON strings and as binary blobs (in both 32-bit and 16-bit precision formats).
155+
The function asserts that the cosine similarities computed from the different storage formats (JSON, f32 BLOB, f16 BLOB)
156+
are within a certain tolerance of each other, ensuring that the distance calculations are consistent across different data formats.
157+
158+
Parameters:
159+
num_vectors (int): The number of random vectors to generate and test.
160+
ndim (int): The dimensionality of each vector.
161+
"""
162+
22163
conn = sqlite3.connect(":memory:")
23164
conn.enable_load_extension(True)
24165
conn.load_extension(usearch.sqlite)
@@ -91,7 +232,7 @@ def test_sqlite_distances_in_low_dimensions(num_vectors: int):
91232
conn = sqlite3.connect(":memory:")
92233
conn.enable_load_extension(True)
93234
conn.load_extension(usearch.sqlite)
94-
235+
95236
cursor = conn.cursor()
96237

97238
# Create a table for storing vectors and their descriptions
@@ -139,8 +280,12 @@ def test_sqlite_distances_in_low_dimensions(num_vectors: int):
139280

140281
# Validate the results of the distance computations
141282
for id1, id2, similarity_f32, similarity_f16, haversine_meters in cursor.fetchall():
142-
assert 0 <= similarity_f32 <= 1, "Cosine similarity (f32) must be between 0 and 1"
143-
assert 0 <= similarity_f16 <= 1, "Cosine similarity (f16) must be between 0 and 1"
283+
assert (
284+
0 <= similarity_f32 <= 1
285+
), "Cosine similarity (f32) must be between 0 and 1"
286+
assert (
287+
0 <= similarity_f16 <= 1
288+
), "Cosine similarity (f16) must be between 0 and 1"
144289
assert haversine_meters >= 0, "Haversine distance must be non-negative"
145290

146291
# Clean up

sqlite/README.md

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
# USearch Extensions for SQLite
2+
3+
USearch exposes the SIMD-accelerated distance functions for SQLite databases aimed to accelerate vector search and fuzzy string matching operations.
4+
This includes:
5+
6+
- [x] Cosine and Euclidean distances for float vectors,
7+
- [x] Jaccard and Hamming distances for binary vectors,
8+
- [x] Levenshtein and Hamming distances for strings,
9+
- [x] Haversine distance for geographical coordinates.
10+
11+
The SIMD-acceleration covers AVX2, most subsets of AVX512, ARM NEON, and Arm SVE instruction sets, more than most BLAS libraries.
12+
The implementations are coming from [SimSIMD](https://github.com/ashvardanian/simsimd) and [StringZilla](https://github.com/ashvardanian/stringzilla).
13+
They are most efficient when vectors are stored as BLOBs, but for broader compatibility can also handle JSONs, and even separate columns containing vector elements.
14+
15+
## Installation
16+
17+
USearch currently ships the SQLite extensions as part of the Python wheels, so they can be obtained from PyPi.
18+
19+
```sh
20+
pip install usearch # brings sqlite extensions
21+
```
22+
23+
## Quickstart
24+
25+
To USearch extensions to your SQLite database, you can use the following commands:
26+
27+
```py
28+
import sqlite3
29+
import usearch
30+
31+
conn = sqlite3.connect(":memory:")
32+
conn.enable_load_extension(True)
33+
conn.load_extension(usearch.sqlite)
34+
```
35+
36+
Afterwards, the following script should work fine.
37+
38+
```sql
39+
-- Create a table with a JSON column for vectors
40+
CREATE TABLE vectors_table (
41+
id SERIAL PRIMARY KEY,
42+
vector JSON NOT NULL
43+
);
44+
45+
-- Insert two 3D vectors
46+
INSERT INTO vectors_table (id, vector)
47+
VALUES
48+
(42, '[1.0, 2.0, 3.0]'),
49+
(43, '[4.0, 5.0, 6.0]');
50+
51+
-- Compute the distances to [7.0, 8.0, 9.0] using
52+
-- the `distance_cosine_f32` extension function
53+
SELECT
54+
id,
55+
distance_cosine_f32(vt.vector, '[7.0, 8.0, 9.0]') AS distance
56+
FROM vectors_table AS vt;
57+
```
58+
59+
### Functionality
60+
61+
#### Strings
62+
63+
String functions accept BLOBs and TEXTs, and return the distance as an integer.
64+
This includes:
65+
66+
- `distance_levenshtein_bytes`,
67+
- `distance_levenshtein_unicode`,
68+
- `distance_hamming_bytes`,
69+
- `distance_hamming_unicode`.
70+
71+
The Levenshtein distance would report the number of insertions, deletions, and substitutions required to transform one string into another, while the Hamming distance would only report the substitutons.
72+
When applied to strings of different length, the Hamming distance would evalute the prefix of the longer string and will add the length difference to the result.
73+
The `_bytes` variants compute the distance in the number of bytes, while the `_unicode` variants compute the distance in the number of Unicode code points, assuming the inputs are UTF8 encoded.
74+
Take a look at the following three similar looking, but distinct strings:
75+
76+
- `école` - 6 codepoints (runes), 7 bytes.
77+
- `école` - 5 codepoints (runes), 6 bytes.
78+
- `écolé` - 5 codepoints (runes), 7 bytes.
79+
80+
Those have three different letter "e" variants, including etter "é" as a single character, etter as "e" and an accent "´", as well as the plain letter "e".
81+
This would result in different distances between those strings, depending on the chosen metric.
82+
83+
```sql
84+
CREATE TABLE strings_table (
85+
id SERIAL PRIMARY KEY,
86+
word TEXT NOT NULL
87+
);
88+
89+
INSERT INTO strings_table (id, word)
90+
VALUES
91+
(42, 'école'),
92+
(43, 'école');
93+
94+
SELECT
95+
st.id,
96+
97+
distance_levenshtein_bytes(st.word, 'écolé') AS lb,
98+
distance_levenshtein_unicode(st.word, 'écolé') AS lu,
99+
distance_hamming_bytes(st.word, 'écolé') AS hb,
100+
distance_hamming_unicode(st.word, 'écolé') AS hu,
101+
102+
distance_levenshtein_bytes(st.word, 'écolé', 2) AS lbb,
103+
distance_levenshtein_unicode(st.word, 'écolé', 2) AS lub,
104+
distance_hamming_bytes(st.word, 'écolé', 2) AS hbb,
105+
distance_hamming_unicode(st.word, 'écolé', 2) AS hub
106+
107+
FROM strings_table AS st;
108+
```
109+
110+
The last 4 columns will contain the bounded versions of the distance, which are faster to compute, thanks to the early stopping condition.
111+
They might be handy, if you want to accelerate search for something like an "autocomplete" feature in a search bar.
112+
The output will look like this:
113+
114+
| id | lb | lu | hb | hu | lbb | lub | hbb | hub |
115+
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
116+
| 42 | 5 | 3 | 7 | 6 | 2 | 2 | 2 | 2 |
117+
| 43 | 2 | 1 | 2 | 1 | 2 | 1 | 2 | 1 |
118+
119+
#### Bit Vectors
120+
121+
If using SQLite as a feature store, you might be dealing with single-bit vectors.
122+
123+
- `distance_hamming_binary` - the Hamming distance, meaning number of bits that differ between two vectors,
124+
- `distance_jaccard_binary` - the Jaccard distance, meaning the number of bits that differ between two vectors divided by the number of bits that are set in at least one of the vectors.
125+
126+
Here is an example:
127+
128+
```sql
129+
CREATE TABLE binary_vectors (
130+
id SERIAL PRIMARY KEY,
131+
vector BLOB NOT NULL
132+
);
133+
INSERT INTO binary_vectors (id, vector)
134+
VALUES
135+
(42, X'FFFFFF'), -- 111111111111111111111111 in binary
136+
(43, X'000000'); -- 000000000000000000000000 in binary
137+
SELECT bv.id,
138+
distance_hamming_binary(bv.vector, X'FFFF00') AS hamming_distance,
139+
distance_jaccard_binary(bv.vector, X'FFFF00') AS jaccard_distance
140+
FROM binary_vectors AS bv;
141+
```
142+
143+
#### Dense Vectors
144+
145+
Distance functions for desnse vectors can be used on both BLOBs and JSONs.
146+
Every name is structured as `distance_<metric>_<type>`, where
147+
148+
- `<metric>` is the name of the metric, like `cosine`, `inner`, `sqeuclidean`, and `divergence`,
149+
- `<type>` is the type of the vector elements, like `f64`, `f32`, `f16`, and `i8`.
150+
151+
The `cosine` metric is the cosine similarity, the `inner` metric is the inner (dot) product, the `sqeuclidean` metric is the squared Euclidean distance, and the `divergence` metric is the Jensen-Shannon divergence - the symmetric variant of the Kullback-Leibler divergence.
152+
153+
- `distance_sqeuclidean_f64`
154+
- `distance_cosine_f64`
155+
- `distance_inner_f64`
156+
- `distance_divergence_f64`
157+
- `distance_sqeuclidean_f32`
158+
- `distance_cosine_f32`
159+
- `distance_inner_f32`
160+
- `distance_divergence_f32`
161+
- `distance_sqeuclidean_f16`
162+
- `distance_cosine_f16`
163+
- `distance_inner_f16`
164+
- `distance_divergence_f16`
165+
- `distance_sqeuclidean_i8`
166+
- `distance_cosine_i8`
167+
- `distance_inner_i8`
168+
- `distance_divergence_i8`
169+
170+
#### Geographical Coordinates
171+
172+
When dealing with geographical or other spherical coordinates, you can use:
173+
174+
- `distance_haversine_meters` - the Haversine distance on the sphere multiplied by the Earth's radius in meters.
175+
176+
---
177+
178+
Feel free to contribute more examples to this file 🤗

0 commit comments

Comments
 (0)