|
16 | 16 | dimensions = [3, 97, 256]
|
17 | 17 |
|
18 | 18 |
|
def test_sqlite_minimal_json_cosine_vector_search():
    """Minimal test for searching JSON vectors in an SQLite database.

    Stores two 3-dimensional vectors as JSON text, asks the
    `distance_cosine_f32` extension function for their distance to
    `[7.0, 8.0, 9.0]`, and checks the returned ids together with an upper
    bound on each distance.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.enable_load_extension(True)
        conn.load_extension(usearch.sqlite)

        cursor = conn.cursor()

        # Create a table with a JSON column for vectors
        cursor.executescript(
            """
            CREATE TABLE vectors_table (
                id SERIAL PRIMARY KEY,
                vector JSON NOT NULL
            );
            INSERT INTO vectors_table (id, vector)
            VALUES
                (42, '[1.0, 2.0, 3.0]'),
                (43, '[4.0, 5.0, 6.0]');
            """
        )

        # Compute the distances to [7.0, 8.0, 9.0] using
        # the `distance_cosine_f32` extension function.
        # The explicit ORDER BY keeps the positional checks below
        # independent of whatever scan order SQLite happens to pick.
        cursor.execute(
            """
            SELECT vt.id,
                   distance_cosine_f32(vt.vector, '[7.0, 8.0, 9.0]') AS distance
            FROM vectors_table AS vt
            ORDER BY vt.id;
            """
        )
        ids_and_distances = cursor.fetchall()

        assert [ids_and_distances[0][0], ids_and_distances[1][0]] == [42, 43]
        # All three vectors point in nearly the same direction, so both
        # distances are small; the second stored vector is the closer one.
        assert ids_and_distances[0][1] < 0.05
        assert ids_and_distances[1][1] < 0.002
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
| 52 | + |
| 53 | + |
def test_sqlite_minimal_text_search():
    """Minimal test for Unicode strings in an SQLite database.

    Inserts two renderings of the same word that differ only in Unicode
    normalization and compares each of them against a third string using
    the byte-level and codepoint-level Levenshtein and Hamming extension
    functions, both unbounded and with a bound of 2.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.enable_load_extension(True)
        conn.load_extension(usearch.sqlite)

        cursor = conn.cursor()

        # The strings are spelled with explicit escapes: the decomposed and
        # precomposed forms look identical on screen and an editor or tool
        # that normalizes Unicode would otherwise silently merge them.
        str42 = "e\u0301cole"  # 'e' + combining acute: 6 codepoints (runes), 7 bytes
        str43 = "\u00e9cole"  # precomposed 'é': 5 codepoints (runes), 6 bytes
        str44 = "\u00e9col\u00e9"  # two precomposed 'é': 5 codepoints (runes), 7 bytes
        assert (
            str42 != str43
        ), "Letter 'é' as a single character vs 'e' + '´' are not the same"

        # Inject the different strings into the table
        cursor.executescript(
            f"""
            CREATE TABLE strings_table (
                id SERIAL PRIMARY KEY,
                word TEXT NOT NULL
            );
            INSERT INTO strings_table (id, word)
            VALUES
                (42, '{str42}'),
                (43, '{str43}');
            """
        )

        # First four distance columns are unbounded, the last four pass 2 as
        # the optional third argument. ORDER BY makes the row order below
        # deterministic instead of relying on SQLite's scan order.
        cursor.execute(
            f"""
            SELECT st.id,

                   distance_levenshtein_bytes(st.word, '{str44}') AS levenshtein_bytes,
                   distance_levenshtein_unicode(st.word, '{str44}') AS levenshtein_unicode,
                   distance_hamming_bytes(st.word, '{str44}') AS hamming_bytes,
                   distance_hamming_unicode(st.word, '{str44}') AS hamming_unicode,

                   distance_levenshtein_bytes(st.word, '{str44}', 2) AS levenshtein_bytes_bounded,
                   distance_levenshtein_unicode(st.word, '{str44}', 2) AS levenshtein_unicode_bounded,
                   distance_hamming_bytes(st.word, '{str44}', 2) AS hamming_bytes_bounded,
                   distance_hamming_unicode(st.word, '{str44}', 2) AS hamming_unicode_bounded
            FROM strings_table AS st
            ORDER BY st.id;
            """
        )
        ids_and_distances = cursor.fetchall()

        # Row layout: id, then the four unbounded and four bounded distances.
        assert ids_and_distances[0] == (42, 5, 3, 7, 6, 2, 2, 2, 2)
        assert ids_and_distances[1] == (43, 2, 1, 2, 1, 2, 1, 2, 1)
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
| 102 | + |
| 103 | + |
def test_sqlite_blob_bits_vector_search():
    """Minimal test for searching binary vectors in an SQLite database.

    Stores two 24-bit vectors as BLOBs (all ones and all zeros) and checks
    the Hamming and Jaccard distances of each to the sample `X'FFFF00'`.
    """
    conn = sqlite3.connect(":memory:")
    try:
        conn.enable_load_extension(True)
        conn.load_extension(usearch.sqlite)

        cursor = conn.cursor()

        # Create a table with a BLOB column for binary vectors
        cursor.executescript(
            """
            CREATE TABLE binary_vectors (
                id SERIAL PRIMARY KEY,
                vector BLOB NOT NULL
            );
            INSERT INTO binary_vectors (id, vector)
            VALUES
                (42, X'FFFFFF'), -- 111111111111111111111111 in binary
                (43, X'000000'); -- 000000000000000000000000 in binary
            """
        )

        # Compute the distances between binary vectors and a sample vector
        # using the `distance_hamming_binary` and `distance_jaccard_binary`
        # extension functions. ORDER BY keeps the row order deterministic
        # for the positional checks below.
        cursor.execute(
            """
            SELECT bv.id,
                   distance_hamming_binary(bv.vector, X'FFFF00') AS hamming_distance,
                   distance_jaccard_binary(bv.vector, X'FFFF00') AS jaccard_distance
            FROM binary_vectors AS bv
            ORDER BY bv.id;
            """
        )
        ids_and_distances = cursor.fetchall()

        # All-ones vs the sample: 8 differing bits, 16 shared of 24 set bits.
        np.testing.assert_array_almost_equal(
            ids_and_distances[0], (42, 8.0, 1.0 / 3)
        )
        # All-zeros vs the sample: 16 differing bits and no shared set bits.
        np.testing.assert_array_almost_equal(ids_and_distances[1], (43, 16.0, 1.0))
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
| 141 | + |
| 142 | + |
19 | 143 | @pytest.mark.parametrize("num_vectors", batch_sizes)
|
20 | 144 | @pytest.mark.parametrize("ndim", dimensions)
|
21 | 145 | def test_sqlite_distances_in_high_dimensions(num_vectors: int, ndim: int):
|
| 146 | + """ |
| 147 | + Test the computation of cosine distances in high-dimensional spaces with random vectors stored in an SQLite database. |
| 148 | +
|
| 149 | + This function tests the accuracy and consistency of cosine distance calculations between vectors in different formats: |
| 150 | + - distance_cosine_f32(JSON, JSON) |
| 151 | + - distance_cosine_f32(BLOB, BLOB) |
| 152 | + - distance_cosine_f16(BLOB, BLOB) |
| 153 | +
|
| 154 | + The vectors are stored and retrieved as JSON strings and as binary blobs (in both 32-bit and 16-bit precision formats). |
| 155 | + The function asserts that the cosine similarities computed from the different storage formats (JSON, f32 BLOB, f16 BLOB) |
| 156 | + are within a certain tolerance of each other, ensuring that the distance calculations are consistent across different data formats. |
| 157 | +
|
| 158 | + Parameters: |
| 159 | + num_vectors (int): The number of random vectors to generate and test. |
| 160 | + ndim (int): The dimensionality of each vector. |
| 161 | + """ |
| 162 | + |
22 | 163 | conn = sqlite3.connect(":memory:")
|
23 | 164 | conn.enable_load_extension(True)
|
24 | 165 | conn.load_extension(usearch.sqlite)
|
@@ -91,7 +232,7 @@ def test_sqlite_distances_in_low_dimensions(num_vectors: int):
|
91 | 232 | conn = sqlite3.connect(":memory:")
|
92 | 233 | conn.enable_load_extension(True)
|
93 | 234 | conn.load_extension(usearch.sqlite)
|
94 |
| - |
| 235 | + |
95 | 236 | cursor = conn.cursor()
|
96 | 237 |
|
97 | 238 | # Create a table for storing vectors and their descriptions
|
@@ -139,8 +280,12 @@ def test_sqlite_distances_in_low_dimensions(num_vectors: int):
|
139 | 280 |
|
140 | 281 | # Validate the results of the distance computations
|
141 | 282 | for id1, id2, similarity_f32, similarity_f16, haversine_meters in cursor.fetchall():
|
142 |
| - assert 0 <= similarity_f32 <= 1, "Cosine similarity (f32) must be between 0 and 1" |
143 |
| - assert 0 <= similarity_f16 <= 1, "Cosine similarity (f16) must be between 0 and 1" |
| 283 | + assert ( |
| 284 | + 0 <= similarity_f32 <= 1 |
| 285 | + ), "Cosine similarity (f32) must be between 0 and 1" |
| 286 | + assert ( |
| 287 | + 0 <= similarity_f16 <= 1 |
| 288 | + ), "Cosine similarity (f16) must be between 0 and 1" |
144 | 289 | assert haversine_meters >= 0, "Haversine distance must be non-negative"
|
145 | 290 |
|
146 | 291 | # Clean up
|
|
0 commit comments