@@ -587,11 +587,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
587
587
U32 * const hashTable = ms -> hashTable ;
588
588
U32 const hlog = cParams -> hashLog ;
589
589
/* support stepSize of 0 */
590
- U32 const stepSize = cParams -> targetLength + !(cParams -> targetLength );
590
+ size_t const stepSize = cParams -> targetLength + !(cParams -> targetLength ) + 1 ;
591
591
const BYTE * const base = ms -> window .base ;
592
592
const BYTE * const dictBase = ms -> window .dictBase ;
593
593
const BYTE * const istart = (const BYTE * )src ;
594
- const BYTE * ip = istart ;
595
594
const BYTE * anchor = istart ;
596
595
const U32 endIndex = (U32 )((size_t )(istart - base ) + srcSize );
597
596
const U32 lowLimit = ZSTD_getLowestMatchIndex (ms , endIndex , cParams -> windowLog );
@@ -605,6 +604,27 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
605
604
const BYTE * const ilimit = iend - 8 ;
606
605
U32 offset_1 = rep [0 ], offset_2 = rep [1 ];
607
606
607
+ const BYTE * ip0 = istart ;
608
+ const BYTE * ip1 ;
609
+ const BYTE * ip2 ;
610
+ const BYTE * ip3 ;
611
+ U32 current0 ;
612
+
613
+
614
+ size_t hash0 ; /* hash for ip0 */
615
+ size_t hash1 ; /* hash for ip1 */
616
+ U32 idx ; /* match idx for ip0 */
617
+ const BYTE * idxBase ; /* base pointer for idx */
618
+
619
+ U32 offcode ;
620
+ const BYTE * match0 ;
621
+ size_t mLength ;
622
+ const BYTE * matchEnd = 0 ; /* initialize to avoid warning, assert != 0 later */
623
+
624
+ size_t step ;
625
+ const BYTE * nextStep ;
626
+ const size_t kStepIncr = (1 << (kSearchStrength - 1 ));
627
+
608
628
(void )hasStep ; /* not currently specialized on whether it's accelerated */
609
629
610
630
DEBUGLOG (5 , "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)" , offset_1 );
@@ -613,75 +633,198 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
613
633
if (prefixStartIndex == dictStartIndex )
614
634
return ZSTD_compressBlock_fast (ms , seqStore , rep , src , srcSize );
615
635
616
- /* Search Loop */
617
- while (ip < ilimit ) { /* < instead of <=, because (ip+1) */
618
- const size_t h = ZSTD_hashPtr (ip , hlog , mls );
619
- const U32 matchIndex = hashTable [h ];
620
- const BYTE * const matchBase = matchIndex < prefixStartIndex ? dictBase : base ;
621
- const BYTE * match = matchBase + matchIndex ;
622
- const U32 curr = (U32 )(ip - base );
623
- const U32 repIndex = curr + 1 - offset_1 ;
624
- const BYTE * const repBase = repIndex < prefixStartIndex ? dictBase : base ;
625
- const BYTE * const repMatch = repBase + repIndex ;
626
- hashTable [h ] = curr ; /* update hash table */
627
- DEBUGLOG (7 , "offset_1 = %u , curr = %u" , offset_1 , curr );
628
-
629
- if ( ( ((U32 )((prefixStartIndex - 1 ) - repIndex ) >= 3 ) /* intentional underflow */
630
- & (offset_1 <= curr + 1 - dictStartIndex ) ) /* note: we are searching at curr+1 */
631
- && (MEM_read32 (repMatch ) == MEM_read32 (ip + 1 )) ) {
632
- const BYTE * const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend ;
633
- size_t const rLength = ZSTD_count_2segments (ip + 1 + 4 , repMatch + 4 , iend , repMatchEnd , prefixStart ) + 4 ;
634
- ip ++ ;
635
- ZSTD_storeSeq (seqStore , (size_t )(ip - anchor ), anchor , iend , REPCODE1_TO_OFFBASE , rLength );
636
- ip += rLength ;
637
- anchor = ip ;
638
- } else {
639
- if ( (matchIndex < dictStartIndex ) ||
640
- (MEM_read32 (match ) != MEM_read32 (ip )) ) {
641
- assert (stepSize >= 1 );
642
- ip += ((ip - anchor ) >> kSearchStrength ) + stepSize ;
643
- continue ;
636
+ { U32 const curr = (U32 )(ip0 - base );
637
+ U32 const maxRep = curr - dictStartIndex ;
638
+ if (offset_2 >= maxRep ) offset_2 = 0 ;
639
+ if (offset_1 >= maxRep ) offset_1 = 0 ;
640
+ }
641
+
642
+ /* start each op */
643
+ _start : /* Requires: ip0 */
644
+
645
+ step = stepSize ;
646
+ nextStep = ip0 + kStepIncr ;
647
+
648
+ /* calculate positions, ip0 - anchor == 0, so we skip step calc */
649
+ ip1 = ip0 + 1 ;
650
+ ip2 = ip0 + step ;
651
+ ip3 = ip2 + 1 ;
652
+
653
+ if (ip3 >= ilimit ) {
654
+ goto _cleanup ;
655
+ }
656
+
657
+ hash0 = ZSTD_hashPtr (ip0 , hlog , mls );
658
+ hash1 = ZSTD_hashPtr (ip1 , hlog , mls );
659
+
660
+ idx = hashTable [hash0 ];
661
+ idxBase = idx < prefixStartIndex ? dictBase : base ;
662
+
663
+ do {
664
+ { /* load repcode match for ip[2] */
665
+ U32 const current2 = (U32 )(ip2 - base );
666
+ U32 const repIndex = current2 - offset_1 ;
667
+ const BYTE * const repBase = repIndex < prefixStartIndex ? dictBase : base ;
668
+ U32 rval ;
669
+ if ( ((U32 )(prefixStartIndex - repIndex ) >= 4 ) /* intentional underflow */
670
+ & (offset_1 > 0 ) ) {
671
+ rval = MEM_read32 (repBase + repIndex );
672
+ } else {
673
+ rval = MEM_read32 (ip2 ) ^ 1 ; /* guaranteed to not match. */
644
674
}
645
- { const BYTE * const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend ;
646
- const BYTE * const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart ;
647
- U32 const offset = curr - matchIndex ;
648
- size_t mLength = ZSTD_count_2segments (ip + 4 , match + 4 , iend , matchEnd , prefixStart ) + 4 ;
649
- while (((ip > anchor ) & (match > lowMatchPtr )) && (ip [-1 ] == match [-1 ])) { ip -- ; match -- ; mLength ++ ; } /* catch up */
650
- offset_2 = offset_1 ; offset_1 = offset ; /* update offset history */
651
- ZSTD_storeSeq (seqStore , (size_t )(ip - anchor ), anchor , iend , OFFSET_TO_OFFBASE (offset ), mLength );
652
- ip += mLength ;
653
- anchor = ip ;
675
+
676
+ /* write back hash table entry */
677
+ current0 = (U32 )(ip0 - base );
678
+ hashTable [hash0 ] = current0 ;
679
+
680
+ /* check repcode at ip[2] */
681
+ if (MEM_read32 (ip2 ) == rval ) {
682
+ ip0 = ip2 ;
683
+ match0 = repBase + repIndex ;
684
+ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend ;
685
+ assert ((match0 != prefixStart ) & (match0 != dictStart ));
686
+ mLength = ip0 [-1 ] == match0 [-1 ];
687
+ ip0 -= mLength ;
688
+ match0 -= mLength ;
689
+ offcode = REPCODE1_TO_OFFBASE ;
690
+ mLength += 4 ;
691
+ goto _match ;
654
692
} }
655
693
656
- if (ip <= ilimit ) {
657
- /* Fill Table */
658
- hashTable [ZSTD_hashPtr (base + curr + 2 , hlog , mls )] = curr + 2 ;
659
- hashTable [ZSTD_hashPtr (ip - 2 , hlog , mls )] = (U32 )(ip - 2 - base );
660
- /* check immediate repcode */
661
- while (ip <= ilimit ) {
662
- U32 const current2 = (U32 )(ip - base );
663
- U32 const repIndex2 = current2 - offset_2 ;
664
- const BYTE * const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2 ;
665
- if ( (((U32 )((prefixStartIndex - 1 ) - repIndex2 ) >= 3 ) & (offset_2 <= curr - dictStartIndex )) /* intentional overflow */
666
- && (MEM_read32 (repMatch2 ) == MEM_read32 (ip )) ) {
667
- const BYTE * const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend ;
668
- size_t const repLength2 = ZSTD_count_2segments (ip + 4 , repMatch2 + 4 , iend , repEnd2 , prefixStart ) + 4 ;
669
- { U32 const tmpOffset = offset_2 ; offset_2 = offset_1 ; offset_1 = tmpOffset ; } /* swap offset_2 <=> offset_1 */
670
- ZSTD_storeSeq (seqStore , 0 /*litlen*/ , anchor , iend , REPCODE1_TO_OFFBASE , repLength2 );
671
- hashTable [ZSTD_hashPtr (ip , hlog , mls )] = current2 ;
672
- ip += repLength2 ;
673
- anchor = ip ;
674
- continue ;
675
- }
676
- break ;
677
- } } }
694
+ { /* load match for ip[0] */
695
+ U32 const mval = idx >= dictStartIndex ?
696
+ MEM_read32 (idxBase + idx ) :
697
+ MEM_read32 (ip0 ) ^ 1 ; /* guaranteed not to match */
698
+
699
+ /* check match at ip[0] */
700
+ if (MEM_read32 (ip0 ) == mval ) {
701
+ /* found a match! */
702
+ goto _offset ;
703
+ } }
704
+
705
+ /* lookup ip[1] */
706
+ idx = hashTable [hash1 ];
707
+ idxBase = idx < prefixStartIndex ? dictBase : base ;
708
+
709
+ /* hash ip[2] */
710
+ hash0 = hash1 ;
711
+ hash1 = ZSTD_hashPtr (ip2 , hlog , mls );
712
+
713
+ /* advance to next positions */
714
+ ip0 = ip1 ;
715
+ ip1 = ip2 ;
716
+ ip2 = ip3 ;
717
+
718
+ /* write back hash table entry */
719
+ current0 = (U32 )(ip0 - base );
720
+ hashTable [hash0 ] = current0 ;
721
+
722
+ { /* load match for ip[0] */
723
+ U32 const mval = idx >= dictStartIndex ?
724
+ MEM_read32 (idxBase + idx ) :
725
+ MEM_read32 (ip0 ) ^ 1 ; /* guaranteed not to match */
726
+
727
+ /* check match at ip[0] */
728
+ if (MEM_read32 (ip0 ) == mval ) {
729
+ /* found a match! */
730
+ goto _offset ;
731
+ } }
732
+
733
+ /* lookup ip[1] */
734
+ idx = hashTable [hash1 ];
735
+ idxBase = idx < prefixStartIndex ? dictBase : base ;
736
+
737
+ /* hash ip[2] */
738
+ hash0 = hash1 ;
739
+ hash1 = ZSTD_hashPtr (ip2 , hlog , mls );
740
+
741
+ /* advance to next positions */
742
+ ip0 = ip1 ;
743
+ ip1 = ip2 ;
744
+ ip2 = ip0 + step ;
745
+ ip3 = ip1 + step ;
746
+
747
+ /* calculate step */
748
+ if (ip2 >= nextStep ) {
749
+ step ++ ;
750
+ PREFETCH_L1 (ip1 + 64 );
751
+ PREFETCH_L1 (ip1 + 128 );
752
+ nextStep += kStepIncr ;
753
+ }
754
+ } while (ip3 < ilimit );
755
+
756
+ _cleanup :
757
+ /* Note that there are probably still a couple positions we could search.
758
+ * However, it seems to be a meaningful performance hit to try to search
759
+ * them. So let's not. */
678
760
679
761
/* save reps for next block */
680
- rep [0 ] = offset_1 ;
681
- rep [1 ] = offset_2 ;
762
+ rep [0 ] = offset_1 ? offset_1 : rep [ 0 ] ;
763
+ rep [1 ] = offset_2 ? offset_2 : rep [ 1 ] ;
682
764
683
765
/* Return the last literals size */
684
766
return (size_t )(iend - anchor );
767
+
768
+ _offset : /* Requires: ip0, idx, idxBase */
769
+
770
+ /* Compute the offset code. */
771
+ { U32 const offset = current0 - idx ;
772
+ const BYTE * const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart ;
773
+ matchEnd = idx < prefixStartIndex ? dictEnd : iend ;
774
+ match0 = idxBase + idx ;
775
+ offset_2 = offset_1 ;
776
+ offset_1 = offset ;
777
+ offcode = OFFSET_TO_OFFBASE (offset );
778
+ mLength = 4 ;
779
+
780
+ /* Count the backwards match length. */
781
+ while (((ip0 > anchor ) & (match0 > lowMatchPtr )) && (ip0 [-1 ] == match0 [-1 ])) {
782
+ ip0 -- ;
783
+ match0 -- ;
784
+ mLength ++ ;
785
+ } }
786
+
787
+ _match : /* Requires: ip0, match0, offcode, matchEnd */
788
+
789
+ /* Count the forward length. */
790
+ assert (matchEnd != 0 );
791
+ mLength += ZSTD_count_2segments (ip0 + mLength , match0 + mLength , iend , matchEnd , prefixStart );
792
+
793
+ ZSTD_storeSeq (seqStore , (size_t )(ip0 - anchor ), anchor , iend , offcode , mLength );
794
+
795
+ ip0 += mLength ;
796
+ anchor = ip0 ;
797
+
798
+ /* write next hash table entry */
799
+ if (ip1 < ip0 ) {
800
+ hashTable [hash1 ] = (U32 )(ip1 - base );
801
+ }
802
+
803
+ /* Fill table and check for immediate repcode. */
804
+ if (ip0 <= ilimit ) {
805
+ /* Fill Table */
806
+ assert (base + current0 + 2 > istart ); /* check base overflow */
807
+ hashTable [ZSTD_hashPtr (base + current0 + 2 , hlog , mls )] = current0 + 2 ; /* here because current+2 could be > iend-8 */
808
+ hashTable [ZSTD_hashPtr (ip0 - 2 , hlog , mls )] = (U32 )(ip0 - 2 - base );
809
+
810
+ while (ip0 <= ilimit ) {
811
+ U32 const repIndex2 = (U32 )(ip0 - base ) - offset_2 ;
812
+ const BYTE * const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2 ;
813
+ if ( (((U32 )((prefixStartIndex - 1 ) - repIndex2 ) >= 3 ) & (offset_2 > 0 )) /* intentional underflow */
814
+ && (MEM_read32 (repMatch2 ) == MEM_read32 (ip0 )) ) {
815
+ const BYTE * const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend ;
816
+ size_t const repLength2 = ZSTD_count_2segments (ip0 + 4 , repMatch2 + 4 , iend , repEnd2 , prefixStart ) + 4 ;
817
+ { U32 const tmpOffset = offset_2 ; offset_2 = offset_1 ; offset_1 = tmpOffset ; } /* swap offset_2 <=> offset_1 */
818
+ ZSTD_storeSeq (seqStore , 0 /*litlen*/ , anchor , iend , REPCODE1_TO_OFFBASE , repLength2 );
819
+ hashTable [ZSTD_hashPtr (ip0 , hlog , mls )] = (U32 )(ip0 - base );
820
+ ip0 += repLength2 ;
821
+ anchor = ip0 ;
822
+ continue ;
823
+ }
824
+ break ;
825
+ } }
826
+
827
+ goto _start ;
685
828
}
686
829
687
830
ZSTD_GEN_FAST_FN (extDict , 4 , 0 )
@@ -694,6 +837,7 @@ size_t ZSTD_compressBlock_fast_extDict(
694
837
void const * src , size_t srcSize )
695
838
{
696
839
U32 const mls = ms -> cParams .minMatch ;
840
+ assert (ms -> dictMatchState == NULL );
697
841
switch (mls )
698
842
{
699
843
default : /* includes case 3 */