@@ -229,6 +229,8 @@ SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state, sz_cptr_t tex
229
229
/** @copydoc sz_hash_state_fold */
230
230
SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial (sz_hash_state_t const *state);
231
231
232
+ #if SZ_USE_HASWELL
233
+
232
234
/** @copydoc sz_bytesum */
233
235
SZ_PUBLIC sz_u64_t sz_bytesum_haswell (sz_cptr_t text, sz_size_t length);
234
236
@@ -247,6 +249,10 @@ SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t te
247
249
/** @copydoc sz_hash_state_fold */
248
250
SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell (sz_hash_state_t const *state);
249
251
252
+ #endif
253
+
254
+ #if SZ_USE_SKYLAKE
255
+
250
256
/** @copydoc sz_bytesum */
251
257
SZ_PUBLIC sz_u64_t sz_bytesum_skylake (sz_cptr_t text, sz_size_t length);
252
258
@@ -265,6 +271,10 @@ SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t te
265
271
/** @copydoc sz_hash_state_fold */
266
272
SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake (sz_hash_state_t const *state);
267
273
274
+ #endif
275
+
276
+ #if SZ_USE_ICE
277
+
268
278
/** @copydoc sz_bytesum */
269
279
SZ_PUBLIC sz_u64_t sz_bytesum_ice (sz_cptr_t text, sz_size_t length);
270
280
@@ -283,9 +293,17 @@ SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text,
283
293
/** @copydoc sz_hash_state_fold */
284
294
SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice (sz_hash_state_t const *state);
285
295
296
+ #endif
297
+
298
+ #if SZ_USE_NEON
299
+
286
300
/** @copydoc sz_bytesum */
287
301
SZ_PUBLIC sz_u64_t sz_bytesum_neon (sz_cptr_t text, sz_size_t length);
288
302
303
+ #endif
304
+
305
+ #if SZ_USE_NEON_AES
306
+
289
307
/** @copydoc sz_hash */
290
308
SZ_PUBLIC sz_u64_t sz_hash_neon (sz_cptr_t text, sz_size_t length, sz_u64_t seed);
291
309
@@ -301,6 +319,41 @@ SZ_PUBLIC void sz_hash_state_stream_neon(sz_hash_state_t *state, sz_cptr_t text,
301
319
/** @copydoc sz_hash_state_fold */
302
320
SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon (sz_hash_state_t const *state);
303
321
322
+ #endif
323
+
324
+ #if SZ_USE_SVE
325
+
326
+ /** @copydoc sz_bytesum */
327
+ SZ_PUBLIC sz_u64_t sz_bytesum_sve (sz_cptr_t text, sz_size_t length);
328
+
329
+ #endif
330
+
331
+ #if SZ_USE_SVE2
332
+
333
+ /** @copydoc sz_bytesum */
334
+ SZ_PUBLIC sz_u64_t sz_bytesum_sve2 (sz_cptr_t text, sz_size_t length);
335
+
336
+ #endif
337
+
338
+ #if SZ_USE_SVE2_AES
339
+
340
+ /** @copydoc sz_hash */
341
+ SZ_PUBLIC sz_u64_t sz_hash_sve2 (sz_cptr_t text, sz_size_t length, sz_u64_t seed);
342
+
343
+ /** @copydoc sz_fill_random */
344
+ SZ_PUBLIC void sz_fill_random_sve2 (sz_ptr_t text, sz_size_t length, sz_u64_t nonce);
345
+
346
+ /** @copydoc sz_hash_state_init */
347
+ SZ_PUBLIC void sz_hash_state_init_sve2 (sz_hash_state_t *state, sz_u64_t seed);
348
+
349
+ /** @copydoc sz_hash_state_stream */
350
+ SZ_PUBLIC void sz_hash_state_stream_sve2 (sz_hash_state_t *state, sz_cptr_t text, sz_size_t length);
351
+
352
+ /** @copydoc sz_hash_state_fold */
353
+ SZ_PUBLIC sz_u64_t sz_hash_state_fold_sve2 (sz_hash_state_t const *state);
354
+
355
+ #endif
356
+
304
357
#pragma endregion // Core API
305
358
306
359
#pragma region Helper Methods
@@ -1922,8 +1975,8 @@ SZ_INTERNAL void sz_hash_minimal_x4_update_ice_(sz_hash_minimal_x4_t_ *state, __
1922
1975
#pragma region NEON Implementation
1923
1976
#if SZ_USE_NEON
1924
1977
#pragma GCC push_options
1925
- #pragma GCC target("arch=armv8.2-a+simd+crypto ")
1926
- #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+crypto "))), apply_to = function)
1978
+ #pragma GCC target("arch=armv8.2-a+simd")
1979
+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
1927
1980
1928
1981
SZ_PUBLIC sz_u64_t sz_bytesum_neon (sz_cptr_t text, sz_size_t length) {
1929
1982
uint64x2_t sum_vec = vdupq_n_u64 (0 );
@@ -1943,6 +1996,17 @@ SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length) {
1943
1996
return sum;
1944
1997
}
1945
1998
1999
+ #pragma clang attribute pop
2000
+ #pragma GCC pop_options
2001
+ #endif // SZ_USE_NEON
2002
+ #pragma endregion // NEON Implementation
2003
+
2004
+ #pragma region NEON AES Implementation
2005
+ #if SZ_USE_NEON_AES
2006
+ #pragma GCC push_options
2007
+ #pragma GCC target("arch=armv8.2-a+simd+crypto+aes")
2008
+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+crypto+aes"))), apply_to = function)
2009
+
1946
2010
/**
1947
2011
* @brief Emulates the Intel's AES-NI `AESENC` instruction on Arm NEON.
1948
2012
* @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase:
@@ -2303,7 +2367,7 @@ SZ_PUBLIC void sz_fill_random_neon(sz_ptr_t text, sz_size_t length, sz_u64_t non
2303
2367
#pragma clang attribute pop
2304
2368
#pragma GCC pop_options
2305
2369
#endif // SZ_USE_NEON_AES
2306
- #pragma endregion // NEON Implementation
2370
+ #pragma endregion // NEON AES Implementation
2307
2371
2308
2372
/* Implementation of the string search algorithms using the Arm SVE variable-length registers,
2309
2373
* available in Arm v9 processors, like in Apple M4+ and Graviton 3+ CPUs.
@@ -2340,11 +2404,11 @@ SZ_PUBLIC sz_u64_t sz_bytesum_sve(sz_cptr_t text, sz_size_t length) {
2340
2404
*
2341
2405
* @see https://stackoverflow.com/a/73218637/2766161
2342
2406
*/
2343
- #pragma region SVE Implementation
2407
+ #pragma region SVE2 Implementation
2344
2408
#if SZ_USE_SVE2
2345
2409
#pragma GCC push_options
2346
- #pragma GCC target("arch=armv8.2-a+sve+sve2+sve2-aes ")
2347
- #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2+sve2-aes "))), apply_to = function)
2410
+ #pragma GCC target("arch=armv8.2-a+sve+sve2")
2411
+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2"))), apply_to = function)
2348
2412
2349
2413
SZ_PUBLIC sz_u64_t sz_bytesum_sve2 (sz_cptr_t text, sz_size_t length) {
2350
2414
sz_u64_t sum = 0 ;
@@ -2371,6 +2435,17 @@ SZ_PUBLIC sz_u64_t sz_bytesum_sve2(sz_cptr_t text, sz_size_t length) {
2371
2435
return sum;
2372
2436
}
2373
2437
2438
+ #pragma clang attribute pop
2439
+ #pragma GCC pop_options
2440
+ #endif // SZ_USE_SVE2
2441
+ #pragma endregion // SVE2 Implementation
2442
+
2443
+ #pragma region SVE2 AES Implementation
2444
+ #if SZ_USE_SVE2_AES
2445
+ #pragma GCC push_options
2446
+ #pragma GCC target("arch=armv8.2-a+sve+sve2+sve2-aes")
2447
+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2+sve2-aes"))), apply_to = function)
2448
+
2374
2449
/**
2375
2450
* @brief Emulates the Intel's AES-NI `AESENC` instruction with Arm SVE2.
2376
2451
* @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase:
@@ -2504,7 +2579,7 @@ SZ_PUBLIC void sz_hash_sve2_upto16x16_(char texts[16][16], sz_size_t length[16],
2504
2579
#pragma clang attribute pop
2505
2580
#pragma GCC pop_options
2506
2581
#endif // SZ_USE_SVE2_AES
2507
- #pragma endregion // SVE Implementation
2582
+ #pragma endregion // SVE2 AES Implementation
2508
2583
2509
2584
/* Pick the right implementation for the string search algorithms.
2510
2585
* To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1.
@@ -2537,9 +2612,9 @@ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed) {
2537
2612
return sz_hash_skylake (text, length, seed);
2538
2613
#elif SZ_USE_HASWELL
2539
2614
return sz_hash_haswell (text, length, seed);
2540
- #elif SZ_USE_SVE2
2615
+ #elif SZ_USE_SVE2_AES
2541
2616
return sz_hash_sve2 (text, length, seed);
2542
- #elif SZ_USE_NEON
2617
+ #elif SZ_USE_NEON_AES
2543
2618
return sz_hash_neon (text, length, seed);
2544
2619
#else
2545
2620
return sz_hash_serial (text, length, seed);
@@ -2553,9 +2628,9 @@ SZ_DYNAMIC void sz_fill_random(sz_ptr_t text, sz_size_t length, sz_u64_t nonce)
2553
2628
sz_fill_random_skylake (text, length, nonce);
2554
2629
#elif SZ_USE_HASWELL
2555
2630
sz_fill_random_haswell (text, length, nonce);
2556
- #elif SZ_USE_SVE2
2631
+ #elif SZ_USE_SVE2_AES
2557
2632
sz_fill_random_sve2 (text, length, nonce);
2558
- #elif SZ_USE_NEON
2633
+ #elif SZ_USE_NEON_AES
2559
2634
sz_fill_random_neon (text, length, nonce);
2560
2635
#else
2561
2636
sz_fill_random_serial (text, length, nonce);
@@ -2569,9 +2644,9 @@ SZ_DYNAMIC void sz_hash_state_init(sz_hash_state_t *state, sz_u64_t seed) {
2569
2644
sz_hash_state_init_skylake (state, seed);
2570
2645
#elif SZ_USE_HASWELL
2571
2646
sz_hash_state_init_haswell (state, seed);
2572
- #elif SZ_USE_SVE2
2647
+ #elif SZ_USE_SVE2_AES
2573
2648
sz_hash_state_init_sve2 (state, seed);
2574
- #elif SZ_USE_NEON
2649
+ #elif SZ_USE_NEON_AES
2575
2650
sz_hash_state_init_neon (state, seed);
2576
2651
#else
2577
2652
sz_hash_state_init_serial (state, seed);
@@ -2585,9 +2660,9 @@ SZ_DYNAMIC void sz_hash_state_stream(sz_hash_state_t *state, sz_cptr_t text, sz_
2585
2660
sz_hash_state_stream_skylake (state, text, length);
2586
2661
#elif SZ_USE_HASWELL
2587
2662
sz_hash_state_stream_haswell (state, text, length);
2588
- #elif SZ_USE_SVE2
2663
+ #elif SZ_USE_SVE2_AES
2589
2664
sz_hash_state_stream_sve2 (state, text, length);
2590
- #elif SZ_USE_NEON
2665
+ #elif SZ_USE_NEON_AES
2591
2666
sz_hash_state_stream_neon (state, text, length);
2592
2667
#else
2593
2668
sz_hash_state_stream_serial (state, text, length);
@@ -2601,9 +2676,9 @@ SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state) {
2601
2676
return sz_hash_state_fold_skylake (state);
2602
2677
#elif SZ_USE_HASWELL
2603
2678
return sz_hash_state_fold_haswell (state);
2604
- #elif SZ_USE_SVE2
2679
+ #elif SZ_USE_SVE2_AES
2605
2680
return sz_hash_state_fold_sve2 (state);
2606
- #elif SZ_USE_NEON
2681
+ #elif SZ_USE_NEON_AES
2607
2682
return sz_hash_state_fold_neon (state);
2608
2683
#else
2609
2684
return sz_hash_state_fold_serial (state);
0 commit comments