@@ -7395,56 +7395,65 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx,
     return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0);
 }
 
-#if 0 && defined(__AVX2__)
+#if defined(__AVX2__)
 
 /* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */
 #if defined(__GNUC__)
 #  define ALIGNED32 __attribute__((aligned(32)))
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11 */
+#  define ALIGNED32 _Alignas(32)  /* keyword form: alignas requires <stdalign.h> before C23 */
 #else
+  /* this compiler will require its own alignment instruction */
 #  define ALIGNED32
 #endif
 
 BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
 {
     size_t i;
-    __m256i sumVec;        /* accumulates match+lit in 32-bit lanes */
-    __m256i mask;          /* shuffling control */
-    ALIGNED32 int tmp[8];  /* temporary buffer for reduction */
-    uint64_t sum;
-    int k;
-
-    sumVec = _mm256_setzero_si256();
-    mask = _mm256_setr_epi32(
-            1,5,  /* match(0), match(1) */
-            2,6,  /* lit(0), lit(1) */
-            1,5,  /* match(0), match(1) */
-            2,6   /* lit(0), lit(1) */
-        );
+    __m256i const zeroVec = _mm256_setzero_si256();
+    __m256i sumVec = zeroVec;  /* accumulates match+lit in 32-bit lanes */
+    ALIGNED32 U32 tmp[8];      /* temporary buffer for reduction */
+    size_t mSum = 0, lSum = 0;
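+    /* layout, per the ZSTD_Sequence definition in zstd.h: each sequence is
+     * 4×U32 { offset, litLength, matchLength, rep } = 16 bytes, so one
+     * 256-bit load covers two sequences, with litLength in 32-bit lanes
+     * 1 and 5, and matchLength in lanes 2 and 6 */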
 
     /* Process 2 structs (32 bytes) at a time */
-    for (i = 0; i + 2 <= count; i += 2) {
-        /* Load two consecutive MyStructs (8×4 = 32 bytes) */
-        __m256i data = _mm256_loadu_si256((const __m256i*)&arr[i]);
-        /* Shuffle out lanes 1,2,5,6 => match(0), match(1), lit(0), lit(1), repeated */
-        __m256i selected = _mm256_permutevar8x32_epi32(data, mask);
+    for (i = 0; i + 2 <= nbSeqs; i += 2) {
+        /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */
+        __m256i data = _mm256_loadu_si256((const __m256i*)&seqs[i]);
+        /* check end-of-block signal */
+        __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec);
+        int cmp_res = _mm256_movemask_epi8(cmp);
+        /* indices for match lengths correspond to bits [8..11], [24..27]
+         * => combined mask = 0x0F000F00 */
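+        /* note: the cmpeq above compares whole 32-bit lanes, so these byte
+         * bits are set only when an entire matchLength field equals 0,
+         * which is how the block-delimiter sequence is marked */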
+        if (cmp_res & 0x0F000F00) break;
         /* Accumulate in sumVec */
-        sumVec = _mm256_add_epi32(sumVec, selected);
+        sumVec = _mm256_add_epi32(sumVec, data);
     }
 
-    /* Horizontal reduction of sumVec */
+    /* Horizontal reduction */
     _mm256_store_si256((__m256i*)tmp, sumVec);
-    sum = 0;
-    for (k = 0; k < 8; k++) {
-        sum += (uint64_t)tmp[k];  /* each lane is match+lit from pairs, repeated twice */
-    }
+    lSum = tmp[1] + tmp[5];
+    mSum = tmp[2] + tmp[6];
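+    /* lanes 0,3,4,7 hold the accumulated offset and rep fields and are
+     * simply ignored */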
 
-    /* Handle the leftover (if count is odd) */
-    for (; i < count; i++) {
-        sum += arr[i].matchLength;
-        sum += arr[i].litLength;
+    /* Handle the leftover */
+    for (; i < nbSeqs; i++) {
+        lSum += seqs[i].litLength;
+        mSum += seqs[i].matchLength;
+        if (seqs[i].matchLength == 0) break;  /* end of block */
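+        /* the delimiter's litLength is still counted: it carries the
+         * block's trailing literals */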
     }
 
-    return sum;
+    if (i == nbSeqs) {
+        /* reached the end of the sequences: the end-of-block signal
+         * was not present */
+        BlockSummary bs;
+        bs.nbSequences = ERROR(externalSequences_invalid);
+        return bs;
+    }
+    {   BlockSummary bs;
+        bs.nbSequences = i + 1;
+        bs.blockSize = lSum + mSum;
+        bs.litSize = lSum;
+        return bs;
+    }
 }
 
#else
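
For reference, a scalar routine with the same contract can be sketched as below. This is a minimal illustration, not the actual code behind the #else branch; it assumes only what the diff shows: the BlockSummary fields (nbSequences, blockSize, litSize), the ERROR(externalSequences_invalid) convention, and matchLength == 0 as the end-of-block delimiter. The name ZSTD_get1BlockSummary_scalar is hypothetical.

BlockSummary ZSTD_get1BlockSummary_scalar(const ZSTD_Sequence* seqs, size_t nbSeqs)
{
    size_t lSum = 0, mSum = 0;
    size_t i;
    for (i = 0; i < nbSeqs; i++) {
        /* accumulate literal and match lengths sequence by sequence */
        lSum += seqs[i].litLength;
        mSum += seqs[i].matchLength;
        if (seqs[i].matchLength == 0) {
            /* end-of-block delimiter: its litLength covers the trailing literals */
            BlockSummary bs;
            bs.nbSequences = i + 1;
            bs.blockSize = lSum + mSum;
            bs.litSize = lSum;
            return bs;
        }
    }
    /* no delimiter found: the sequence list does not describe a full block */
    {   BlockSummary bs;
        bs.nbSequences = ERROR(externalSequences_invalid);
        return bs;
    }
}

The AVX2 path above must produce the same answer: its vector loop stops before accumulating any pair that contains the delimiter, and the scalar leftover loop then finishes exactly like this sketch.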