Skip to content

Commit bde3e5f

Browse files
committed
AVX2 version of ZSTD_get1BlockSummary()
1 parent c101f12 commit bde3e5f

File tree

1 file changed

+39
-30
lines changed

1 file changed

+39
-30
lines changed

lib/compress/zstd_compress.c

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7395,56 +7395,65 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx,
73957395
return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0);
73967396
}
73977397

7398-
#if 0 && defined(__AVX2__)
7398+
#if defined(__AVX2__)
73997399

74007400
/* 32-byte alignment macro. GCC/Clang (any compiler defining __GNUC__) use the `aligned`
 * attribute; other compilers reporting C11 use alignas; anything else gets an empty
 * definition, i.e. no alignment is requested. Adjust for other compilers if needed. */
74017401
#if defined(__GNUC__)
74027402
# define ALIGNED32 __attribute__((aligned(32)))
7403+
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
7404+
/* NOTE(review): in C11 `alignas` is a macro supplied by <stdalign.h> (the keyword itself is
 * _Alignas); that include is not visible in this excerpt -- confirm it is present. */
# define ALIGNED32 alignas(32)
74037405
#else
7406+
/* this compiler will require its own alignment instruction;
 * until one is added, ALIGNED32 expands to nothing and requests no alignment */
74047407
# define ALIGNED32
74057408
#endif
74067409

74077410
/* ZSTD_get1BlockSummary(), AVX2 variant.
 * Scans `seqs` for the first sequence whose matchLength is 0 (used below as the end-of-block
 * signal) and returns:
 *   - nbSequences : number of sequences up to and including that terminator,
 *   - litSize     : sum of the literal lengths of those sequences,
 *   - blockSize   : sum of their literal lengths and match lengths.
 * If no terminator is found within `nbSeqs` entries, nbSequences is set to
 * ERROR(externalSequences_invalid) and the other two fields are left unset.
 * Assumes ZSTD_Sequence is four 32-bit fields with litLength at index 1 and matchLength at
 * index 2 (definition not visible here -- TODO confirm).
 * Reading note: in this diff view, a line following an "NNNN-" marker belongs to the removed
 * implementation, and a line following an "NNNN+" marker belongs to the new one. */
BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
74087411
{
74097412
size_t i;
7410-
__m256i sumVec; /* accumulates match+lit in 32-bit lanes */
7411-
__m256i mask; /* shuffling control */
7412-
ALIGNED32 int tmp[8]; /* temporary buffer for reduction */
7413-
uint64_t sum;
7414-
int k;
7415-
7416-
sumVec = _mm256_setzero_si256();
7417-
mask = _mm256_setr_epi32(
7418-
1,5, /* match(0), match(1) */
7419-
2,6, /* lit(0), lit(1) */
7420-
1,5, /* match(0), match(1) */
7421-
2,6 /* lit(0), lit(1) */
7422-
);
7413+
__m256i const zeroVec = _mm256_setzero_si256();
7414+
__m256i sumVec = zeroVec; /* per-lane 32-bit running sums of every loaded field; only lanes 1,2,5,6 are read back below */
7415+
__m256i shuffle32; /* shuffling control. NOTE(review): not referenced by any added line of this function; looks like a leftover of the removed permute */
7416+
ALIGNED32 U32 tmp[8]; /* receives sumVec for the scalar reduction; written with an aligned store below.
                       * NOTE(review): when ALIGNED32 expands to nothing, 32-byte alignment is not guaranteed -- confirm */
7417+
size_t mSum = 0, lSum = 0; /* running totals of match lengths / literal lengths */
74237418

74247419
/* Process 2 structs (32 bytes) at a time */
7425-
for (i = 0; i + 2 <= count; i += 2) {
7426-
/* Load two consecutive MyStructs (8×4 = 32 bytes) */
7427-
__m256i data = _mm256_loadu_si256((const __m256i*)&arr[i]);
7428-
/* Shuffle out lanes 1,2,5,6 => match(0), match(1), lit(0), lit(1), repeated */
7429-
__m256i selected = _mm256_permutevar8x32_epi32(data, mask);
7420+
for (i = 0; i + 2 <= nbSeqs; i += 2) {
7421+
/* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */
7422+
__m256i data = _mm256_loadu_si256((const __m256i*)&seqs[i]);
7423+
/* check end of block signal: every 32-bit lane equal to zero becomes all-ones in cmp */
7424+
__m256i cmp = _mm256_cmpeq_epi32(data, zeroVec);
7425+
int cmp_res = _mm256_movemask_epi8(cmp); /* one bit per byte of cmp */
7426+
/* the match lengths sit in 32-bit lanes 2 and 6, i.e. bytes [8..11] and [24..27] of the
 * load, so with one mask bit per byte they correspond to bits [8..11], [24..27]
7427+
* => combined mask = 0x0F000F00 */
7428+
if (cmp_res & 0x0F000F00) break; /* this pair is NOT accumulated; the scalar loop below re-reads it starting at i */
74307429
/* Accumulate in sumVec */
7431-
sumVec = _mm256_add_epi32(sumVec, selected);
7430+
sumVec = _mm256_add_epi32(sumVec, data);
74327431
}
74337432

7434-
/* Horizontal reduction of sumVec */
7433+
/* Horizontal reduction: lanes 1 and 5 are summed as literal lengths, lanes 2 and 6 as
 * match lengths; the remaining lanes are ignored.
 * NOTE(review): the lanes are 32-bit and the two additions below operate on U32 values
 * before the result is stored into a size_t, so totals presumably must fit in 32 bits --
 * confirm against the maximum block size. */
74357434
_mm256_store_si256((__m256i*)tmp, sumVec);
7436-
sum = 0;
7437-
for (k = 0; k < 8; k++) {
7438-
sum += (uint64_t)tmp[k]; /* each lane is match+lit from pairs, repeated twice */
7439-
}
7435+
lSum = tmp[1] + tmp[5];
7436+
mSum = tmp[2] + tmp[6];
74407437

7441-
/* Handle the leftover (if count is odd) */
7442-
for (; i < count; i++) {
7443-
sum += arr[i].matchLength;
7444-
sum += arr[i].litLength;
7438+
/* Handle the leftover: the pair that triggered the break above and/or a trailing odd sequence */
7439+
for (; i < nbSeqs; i++) {
7440+
lSum += seqs[i].litLength;
7441+
mSum += seqs[i].matchLength;
7442+
if (seqs[i].matchLength == 0) break; /* end of block; its litLength has already been counted above */
74457443
}
74467444

7447-
return sum;
7445+
if (i==nbSeqs) {
7446+
/* reaching end of sequences: end of block signal was not present.
 * Only nbSequences is assigned on this path; blockSize and litSize stay unset. */
7447+
BlockSummary bs;
7448+
bs.nbSequences = ERROR(externalSequences_invalid);
7449+
return bs;
7450+
}
7451+
{ BlockSummary bs;
7452+
bs.nbSequences = i+1; /* i is the index of the terminating sequence, so the count includes it */
7453+
bs.blockSize = lSum + mSum;
7454+
bs.litSize = lSum;
7455+
return bs;
7456+
}
74487457
}
74497458

74507459
#else

0 commit comments

Comments
 (0)