@@ -1170,9 +1170,27 @@ FORCE_INLINE_TEMPLATE seq_t
1170
1170
ZSTD_decodeSequence (seqState_t * seqState , const ZSTD_longOffset_e longOffsets )
1171
1171
{
1172
1172
seq_t seq ;
1173
+ /*
1174
+ * ZSTD_seqSymbol is a structure that is 64 bits wide in total, so it can be
1175
+ * loaded in one operation and its fields extracted by simply shifting or
1176
+ * bit-extracting on aarch64.
1177
+ * GCC doesn't recognize this and generates additional, unnecessary ldr/ldrb/ldrh
1178
+ * operations that cause a performance drop. This can be avoided by using this
1179
+ * ZSTD_memcpy hack.
1180
+ */
1181
+ #if defined(__aarch64__ ) && (defined(__GNUC__ ) && !defined(__clang__ ))
1182
+ ZSTD_seqSymbol llDInfoS , mlDInfoS , ofDInfoS ;
1183
+ ZSTD_seqSymbol * const llDInfo = & llDInfoS ;
1184
+ ZSTD_seqSymbol * const mlDInfo = & mlDInfoS ;
1185
+ ZSTD_seqSymbol * const ofDInfo = & ofDInfoS ;
1186
+ ZSTD_memcpy (llDInfo , seqState -> stateLL .table + seqState -> stateLL .state , sizeof (ZSTD_seqSymbol ));
1187
+ ZSTD_memcpy (mlDInfo , seqState -> stateML .table + seqState -> stateML .state , sizeof (ZSTD_seqSymbol ));
1188
+ ZSTD_memcpy (ofDInfo , seqState -> stateOffb .table + seqState -> stateOffb .state , sizeof (ZSTD_seqSymbol ));
1189
+ #else
1173
1190
const ZSTD_seqSymbol * const llDInfo = seqState -> stateLL .table + seqState -> stateLL .state ;
1174
1191
const ZSTD_seqSymbol * const mlDInfo = seqState -> stateML .table + seqState -> stateML .state ;
1175
1192
const ZSTD_seqSymbol * const ofDInfo = seqState -> stateOffb .table + seqState -> stateOffb .state ;
1193
+ #endif
1176
1194
seq .matchLength = mlDInfo -> baseValue ;
1177
1195
seq .litLength = llDInfo -> baseValue ;
1178
1196
{ U32 const ofBase = ofDInfo -> baseValue ;
0 commit comments