Skip to content

Commit ae74d44

Browse files
committed
Make: Detect CPU AES support on Arm
1 parent 6431900 commit ae74d44

File tree

8 files changed

+170
-37
lines changed

8 files changed

+170
-37
lines changed

c/stringzilla.c

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -187,11 +187,6 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) {
187187
impl->lookup = sz_lookup_neon;
188188

189189
impl->bytesum = sz_bytesum_neon;
190-
impl->hash = sz_hash_neon;
191-
impl->hash_state_init = sz_hash_state_init_neon;
192-
impl->hash_state_stream = sz_hash_state_stream_neon;
193-
impl->hash_state_fold = sz_hash_state_fold_neon;
194-
impl->fill_random = sz_fill_random_neon;
195190

196191
impl->find = sz_find_neon;
197192
impl->rfind = sz_rfind_neon;
@@ -202,13 +197,51 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) {
202197
}
203198
#endif
204199

200+
#if SZ_USE_NEON_AES
201+
if (caps & sz_cap_neon_aes_k) {
202+
impl->hash = sz_hash_neon;
203+
impl->hash_state_init = sz_hash_state_init_neon;
204+
impl->hash_state_stream = sz_hash_state_stream_neon;
205+
impl->hash_state_fold = sz_hash_state_fold_neon;
206+
impl->fill_random = sz_fill_random_neon;
207+
}
208+
#endif
209+
205210
#if SZ_USE_SVE
206211
if (caps & sz_cap_sve_k) {
212+
impl->equal = sz_equal_sve;
213+
impl->order = sz_order_sve;
214+
215+
impl->copy = sz_copy_sve;
216+
impl->move = sz_move_sve;
217+
impl->fill = sz_fill_sve;
218+
219+
impl->find = sz_find_sve;
220+
// TODO: impl->rfind = sz_rfind_sve;
221+
impl->find_byte = sz_find_byte_sve;
222+
impl->rfind_byte = sz_rfind_byte_sve;
223+
224+
impl->bytesum = sz_bytesum_sve;
225+
207226
impl->sequence_argsort = sz_sequence_argsort_sve;
208227
impl->sequence_intersect = sz_sequence_intersect_sve;
209228
impl->pgrams_sort = sz_pgrams_sort_sve;
210229
}
211230
#endif
231+
232+
#if SZ_USE_SVE2
233+
if (caps & sz_cap_sve2_k) { impl->bytesum = sz_bytesum_sve2; }
234+
#endif
235+
236+
#if SZ_USE_SVE2_AES
237+
if (caps & sz_cap_sve2_aes_k) {
238+
impl->hash = sz_hash_sve2;
239+
impl->hash_state_init = sz_hash_state_init_sve2;
240+
impl->hash_state_stream = sz_hash_state_stream_sve2;
241+
impl->hash_state_fold = sz_hash_state_fold_sve2;
242+
impl->fill_random = sz_fill_random_sve2;
243+
}
244+
#endif
212245
}
213246

214247
#if defined(_MSC_VER)

include/stringzilla/hash.h

Lines changed: 92 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state, sz_cptr_t tex
229229
/** @copydoc sz_hash_state_fold */
230230
SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state);
231231

232+
#if SZ_USE_HASWELL
233+
232234
/** @copydoc sz_bytesum */
233235
SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length);
234236

@@ -247,6 +249,10 @@ SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t te
247249
/** @copydoc sz_hash_state_fold */
248250
SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state);
249251

252+
#endif
253+
254+
#if SZ_USE_SKYLAKE
255+
250256
/** @copydoc sz_bytesum */
251257
SZ_PUBLIC sz_u64_t sz_bytesum_skylake(sz_cptr_t text, sz_size_t length);
252258

@@ -265,6 +271,10 @@ SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t te
265271
/** @copydoc sz_hash_state_fold */
266272
SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake(sz_hash_state_t const *state);
267273

274+
#endif
275+
276+
#if SZ_USE_ICE
277+
268278
/** @copydoc sz_bytesum */
269279
SZ_PUBLIC sz_u64_t sz_bytesum_ice(sz_cptr_t text, sz_size_t length);
270280

@@ -283,9 +293,17 @@ SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text,
283293
/** @copydoc sz_hash_state_fold */
284294
SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice(sz_hash_state_t const *state);
285295

296+
#endif
297+
298+
#if SZ_USE_NEON
299+
286300
/** @copydoc sz_bytesum */
287301
SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length);
288302

303+
#endif
304+
305+
#if SZ_USE_NEON_AES
306+
289307
/** @copydoc sz_hash */
290308
SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
291309

@@ -301,6 +319,41 @@ SZ_PUBLIC void sz_hash_state_stream_neon(sz_hash_state_t *state, sz_cptr_t text,
301319
/** @copydoc sz_hash_state_fold */
302320
SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon(sz_hash_state_t const *state);
303321

322+
#endif
323+
324+
#if SZ_USE_SVE
325+
326+
/** @copydoc sz_bytesum */
327+
SZ_PUBLIC sz_u64_t sz_bytesum_sve(sz_cptr_t text, sz_size_t length);
328+
329+
#endif
330+
331+
#if SZ_USE_SVE2
332+
333+
/** @copydoc sz_bytesum */
334+
SZ_PUBLIC sz_u64_t sz_bytesum_sve2(sz_cptr_t text, sz_size_t length);
335+
336+
#endif
337+
338+
#if SZ_USE_SVE2_AES
339+
340+
/** @copydoc sz_hash */
341+
SZ_PUBLIC sz_u64_t sz_hash_sve2(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
342+
343+
/** @copydoc sz_fill_random */
344+
SZ_PUBLIC void sz_fill_random_sve2(sz_ptr_t text, sz_size_t length, sz_u64_t nonce);
345+
346+
/** @copydoc sz_hash_state_init */
347+
SZ_PUBLIC void sz_hash_state_init_sve2(sz_hash_state_t *state, sz_u64_t seed);
348+
349+
/** @copydoc sz_hash_state_stream */
350+
SZ_PUBLIC void sz_hash_state_stream_sve2(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length);
351+
352+
/** @copydoc sz_hash_state_fold */
353+
SZ_PUBLIC sz_u64_t sz_hash_state_fold_sve2(sz_hash_state_t const *state);
354+
355+
#endif
356+
304357
#pragma endregion // Core API
305358

306359
#pragma region Helper Methods
@@ -1922,8 +1975,8 @@ SZ_INTERNAL void sz_hash_minimal_x4_update_ice_(sz_hash_minimal_x4_t_ *state, __
19221975
#pragma region NEON Implementation
19231976
#if SZ_USE_NEON
19241977
#pragma GCC push_options
1925-
#pragma GCC target("arch=armv8.2-a+simd+crypto")
1926-
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+crypto"))), apply_to = function)
1978+
#pragma GCC target("arch=armv8.2-a+simd")
1979+
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
19271980

19281981
SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length) {
19291982
uint64x2_t sum_vec = vdupq_n_u64(0);
@@ -1943,6 +1996,17 @@ SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length) {
19431996
return sum;
19441997
}
19451998

1999+
#pragma clang attribute pop
2000+
#pragma GCC pop_options
2001+
#endif // SZ_USE_NEON
2002+
#pragma endregion // NEON Implementation
2003+
2004+
#pragma region NEON AES Implementation
2005+
#if SZ_USE_NEON_AES
2006+
#pragma GCC push_options
2007+
#pragma GCC target("arch=armv8.2-a+simd+crypto+aes")
2008+
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+crypto+aes"))), apply_to = function)
2009+
19462010
/**
19472011
* @brief Emulates Intel's AES-NI `AESENC` instruction on Arm NEON.
19482012
* @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase:
@@ -2303,7 +2367,7 @@ SZ_PUBLIC void sz_fill_random_neon(sz_ptr_t text, sz_size_t length, sz_u64_t non
23032367
#pragma clang attribute pop
23042368
#pragma GCC pop_options
23052369
#endif // SZ_USE_NEON_AES
2306-
#pragma endregion // NEON Implementation
2370+
#pragma endregion // NEON AES Implementation
23072371

23082372
/* Implementation of the string search algorithms using the Arm SVE variable-length registers,
23092373
* available in Arm v9 processors, like in Apple M4+ and Graviton 3+ CPUs.
@@ -2340,11 +2404,11 @@ SZ_PUBLIC sz_u64_t sz_bytesum_sve(sz_cptr_t text, sz_size_t length) {
23402404
*
23412405
* @see https://stackoverflow.com/a/73218637/2766161
23422406
*/
2343-
#pragma region SVE Implementation
2407+
#pragma region SVE2 Implementation
23442408
#if SZ_USE_SVE2
23452409
#pragma GCC push_options
2346-
#pragma GCC target("arch=armv8.2-a+sve+sve2+sve2-aes")
2347-
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2+sve2-aes"))), apply_to = function)
2410+
#pragma GCC target("arch=armv8.2-a+sve+sve2")
2411+
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2"))), apply_to = function)
23482412

23492413
SZ_PUBLIC sz_u64_t sz_bytesum_sve2(sz_cptr_t text, sz_size_t length) {
23502414
sz_u64_t sum = 0;
@@ -2371,6 +2435,17 @@ SZ_PUBLIC sz_u64_t sz_bytesum_sve2(sz_cptr_t text, sz_size_t length) {
23712435
return sum;
23722436
}
23732437

2438+
#pragma clang attribute pop
2439+
#pragma GCC pop_options
2440+
#endif // SZ_USE_SVE2
2441+
#pragma endregion // SVE2 Implementation
2442+
2443+
#pragma region SVE2 AES Implementation
2444+
#if SZ_USE_SVE2_AES
2445+
#pragma GCC push_options
2446+
#pragma GCC target("arch=armv8.2-a+sve+sve2+sve2-aes")
2447+
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2+sve2-aes"))), apply_to = function)
2448+
23742449
/**
23752450
* @brief Emulates Intel's AES-NI `AESENC` instruction with Arm SVE2.
23762451
* @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase:
@@ -2504,7 +2579,7 @@ SZ_PUBLIC void sz_hash_sve2_upto16x16_(char texts[16][16], sz_size_t length[16],
25042579
#pragma clang attribute pop
25052580
#pragma GCC pop_options
25062581
#endif // SZ_USE_SVE2_AES
2507-
#pragma endregion // SVE Implementation
2582+
#pragma endregion // SVE2 AES Implementation
25082583

25092584
/* Pick the right implementation for the string search algorithms.
25102585
* To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1.
@@ -2537,9 +2612,9 @@ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed) {
25372612
return sz_hash_skylake(text, length, seed);
25382613
#elif SZ_USE_HASWELL
25392614
return sz_hash_haswell(text, length, seed);
2540-
#elif SZ_USE_SVE2
2615+
#elif SZ_USE_SVE2_AES
25412616
return sz_hash_sve2(text, length, seed);
2542-
#elif SZ_USE_NEON
2617+
#elif SZ_USE_NEON_AES
25432618
return sz_hash_neon(text, length, seed);
25442619
#else
25452620
return sz_hash_serial(text, length, seed);
@@ -2553,9 +2628,9 @@ SZ_DYNAMIC void sz_fill_random(sz_ptr_t text, sz_size_t length, sz_u64_t nonce)
25532628
sz_fill_random_skylake(text, length, nonce);
25542629
#elif SZ_USE_HASWELL
25552630
sz_fill_random_haswell(text, length, nonce);
2556-
#elif SZ_USE_SVE2
2631+
#elif SZ_USE_SVE2_AES
25572632
sz_fill_random_sve2(text, length, nonce);
2558-
#elif SZ_USE_NEON
2633+
#elif SZ_USE_NEON_AES
25592634
sz_fill_random_neon(text, length, nonce);
25602635
#else
25612636
sz_fill_random_serial(text, length, nonce);
@@ -2569,9 +2644,9 @@ SZ_DYNAMIC void sz_hash_state_init(sz_hash_state_t *state, sz_u64_t seed) {
25692644
sz_hash_state_init_skylake(state, seed);
25702645
#elif SZ_USE_HASWELL
25712646
sz_hash_state_init_haswell(state, seed);
2572-
#elif SZ_USE_SVE2
2647+
#elif SZ_USE_SVE2_AES
25732648
sz_hash_state_init_sve2(state, seed);
2574-
#elif SZ_USE_NEON
2649+
#elif SZ_USE_NEON_AES
25752650
sz_hash_state_init_neon(state, seed);
25762651
#else
25772652
sz_hash_state_init_serial(state, seed);
@@ -2585,9 +2660,9 @@ SZ_DYNAMIC void sz_hash_state_stream(sz_hash_state_t *state, sz_cptr_t text, sz_
25852660
sz_hash_state_stream_skylake(state, text, length);
25862661
#elif SZ_USE_HASWELL
25872662
sz_hash_state_stream_haswell(state, text, length);
2588-
#elif SZ_USE_SVE2
2663+
#elif SZ_USE_SVE2_AES
25892664
sz_hash_state_stream_sve2(state, text, length);
2590-
#elif SZ_USE_NEON
2665+
#elif SZ_USE_NEON_AES
25912666
sz_hash_state_stream_neon(state, text, length);
25922667
#else
25932668
sz_hash_state_stream_serial(state, text, length);
@@ -2601,9 +2676,9 @@ SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state) {
26012676
return sz_hash_state_fold_skylake(state);
26022677
#elif SZ_USE_HASWELL
26032678
return sz_hash_state_fold_haswell(state);
2604-
#elif SZ_USE_SVE2
2679+
#elif SZ_USE_SVE2_AES
26052680
return sz_hash_state_fold_sve2(state);
2606-
#elif SZ_USE_NEON
2681+
#elif SZ_USE_NEON_AES
26072682
return sz_hash_state_fold_neon(state);
26082683
#else
26092684
return sz_hash_state_fold_serial(state);

include/stringzilla/stringzilla.h

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,24 +192,27 @@ SZ_PUBLIC sz_capability_t sz_capabilities_implementation_arm_(void) {
192192

193193
// On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API.
194194
uint32_t supports_neon = 0;
195+
uint32_t supports_neon_aes = 0;
195196
size_t size = sizeof(supports_neon);
196197
if (sysctlbyname("hw.optional.neon", &supports_neon, &size, NULL, 0) != 0) supports_neon = 0;
198+
if (sysctlbyname("hw.optional.arm.FEAT_AES", &supports_neon_aes, &size, NULL, 0) != 0) supports_neon_aes = 0;
197199

198-
return (sz_capability_t)( //
199-
(sz_cap_neon_k * (supports_neon)) | //
200+
return (sz_capability_t)( //
201+
(sz_cap_neon_k * (supports_neon)) | //
202+
(sz_cap_neon_aes_k * (supports_neon_aes)) | //
200203
(sz_cap_serial_k));
201204

202205
#elif defined(SZ_IS_LINUX_)
203206

204207
// Read the AArch64 feature ID registers (the Arm analogue of x86 CPUID) directly
205208
unsigned long id_aa64isar0_el1 = 0, id_aa64isar1_el1 = 0, id_aa64pfr0_el1 = 0, id_aa64zfr0_el1 = 0;
206-
unsigned supports_neon = 0, supports_sve = 0, supports_sve2 = 0;
209+
unsigned supports_neon = 0, supports_neon_aes = 0, supports_sve = 0, supports_sve2 = 0, supports_sve2_aes = 0;
207210
sz_unused_(id_aa64isar0_el1);
208211
sz_unused_(id_aa64isar1_el1);
209212
sz_unused_(id_aa64pfr0_el1);
210213
sz_unused_(id_aa64zfr0_el1);
211214

212-
#if SZ_USE_NEON || SZ_USE_SVE || SZ_USE_SVE2
215+
#if SZ_USE_NEON || SZ_USE_SVE || SZ_USE_SVE2 || SZ_USE_NEON_AES || SZ_USE_SVE2_AES
213216
// Now let's unpack the status flags from ID_AA64ISAR0_EL1
214217
// https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64ISAR0-EL1--AArch64-Instruction-Set-Attribute-Register-0?lang=en
215218
__asm__ __volatile__("mrs %0, ID_AA64ISAR0_EL1" : "=r"(id_aa64isar0_el1));
@@ -228,8 +231,9 @@ SZ_PUBLIC sz_capability_t sz_capabilities_implementation_arm_(void) {
228231
// That's a really weird way to encode lack of NEON support, but it's important to
229232
// check in case we are running on R-profile CPUs.
230233
supports_neon = ((id_aa64pfr0_el1 >> 20) & 0xF) != 0xF;
234+
supports_neon_aes = ((id_aa64isar0_el1 >> 4) & 0xF) >= 1;
231235

232-
#if SZ_USE_SVE || SZ_USE_SVE2
236+
#if SZ_USE_SVE || SZ_USE_SVE2 || SZ_USE_SVE2_AES
233237
// SVE, bits [35:32] of ID_AA64PFR0_EL1
234238
supports_sve = ((id_aa64pfr0_el1 >> 32) & 0xF) >= 1;
235239
// Now let's unpack the status flags from ID_AA64ZFR0_EL1
@@ -241,12 +245,15 @@ SZ_PUBLIC sz_capability_t sz_capabilities_implementation_arm_(void) {
241245
// - 0b0010: SVE2.1 is implemented
242246
// This value must match the existing indicator obtained from ID_AA64PFR0_EL1:
243247
supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 1;
248+
supports_sve2_aes = ((id_aa64zfr0_el1 >> 4) & 0xF) >= 1;
244249
#endif // SZ_USE_SVE || SZ_USE_SVE2 || SZ_USE_SVE2_AES
245250

246-
return (sz_capability_t)( //
247-
(sz_cap_neon_k * (supports_neon)) | //
248-
(sz_cap_sve_k * (supports_sve)) | //
249-
(sz_cap_sve2_k * (supports_sve2)) | //
251+
return (sz_capability_t)( //
252+
(sz_cap_neon_k * (supports_neon)) | //
253+
(sz_cap_neon_aes_k * (supports_neon_aes)) | //
254+
(sz_cap_sve_k * (supports_sve)) | //
255+
(sz_cap_sve2_k * (supports_sve2)) | //
256+
(sz_cap_sve2_aes_k * (supports_sve2_aes)) | //
250257
(sz_cap_serial_k));
251258

252259
#else // if !defined(SZ_IS_APPLE_) && !defined(SZ_IS_LINUX_)

include/stringzilla/types.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,22 @@
260260
#endif
261261
#endif
262262

263+
#if !defined(SZ_USE_NEON_AES)
264+
#ifdef __ARM_FEATURE_AES
265+
#define SZ_USE_NEON_AES (1)
266+
#else
267+
#define SZ_USE_NEON_AES (0)
268+
#endif
269+
#endif
270+
271+
#if !defined(SZ_USE_SVE2_AES)
272+
#ifdef __ARM_FEATURE_SVE2_AES
273+
#define SZ_USE_SVE2_AES (1)
274+
#else
275+
#define SZ_USE_SVE2_AES (0)
276+
#endif
277+
#endif
278+
263279
#if !defined(SZ_USE_CUDA)
264280
#ifdef __NVCC__
265281
#define SZ_USE_CUDA (1)

scripts/bench_container.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ void bench_associative_lookups_with_different_simd_backends(environment_t const
148148
.log(base_umap);
149149
}
150150
#endif
151-
#if SZ_USE_NEON
151+
#if SZ_USE_NEON_AES
152152
{
153153
auto callable_map =
154154
callable_for_associative_lookups<std::map<std::string_view, unsigned, less_from_sz<sz_order_neon>>>(env);

scripts/bench_memory.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,8 @@ void bench_fill(environment_t const &env) {
305305
#endif
306306
#if SZ_USE_NEON
307307
bench_unary(env, "sz_fill_neon", fill_from_sz<sz_fill_neon> {env, o}).log(zeros);
308+
#endif
309+
#if SZ_USE_NEON_AES
308310
bench_unary(env, "sz_fill_random_neon", random_call, fill_random_from_sz<sz_fill_random_neon> {env, o})
309311
.log(zeros, random);
310312
#endif

0 commit comments

Comments
 (0)