@@ -6529,7 +6529,6 @@ PP(pp_unshift)
     return NORMAL;
 }
 
-
 PP_wrapped(pp_reverse, 0, 1)
 {
     dSP; dMARK;
@@ -6679,10 +6678,50 @@ PP_wrapped(pp_reverse, 0, 1)
             }
         }
     } else {
+        STRLEN i = 0;
+        STRLEN j = len;
         char *outp = SvPVX(TARG);
-        const char *p = src + len;
-        while (p != src)
-            *outp++ = *--p;
+        /* Take a chunk of bytes from the front and from the
+         * back, reverse the bytes in each and swap the
+         * chunks over. This should have generally good
+         * performance but also is likely to be optimised
+         * into bswap instructions by the compiler.
+         */
+#ifdef HAS_QUAD
+        while (j - i >= 16) {
+            *(U64 *)(outp + i) = _swab_64_(*(U64 *)(src + j - 8));
+            *(U64 *)(outp + j - 8) = _swab_64_(*(U64 *)(src + i));
+            i += 8;
+            j -= 8;
+        }
+
+        if (j - i >= 8) {
+            *(U32 *)(outp + i) = _swab_32_(*(U32 *)(src + j - 4));
+            *(U32 *)(outp + j - 4) = _swab_32_(*(U32 *)(src + i));
+            i += 4;
+            j -= 4;
+        }
+#else
+        while (j - i >= 8) {
+            *(U32 *)(outp + i) = _swab_32_(*(U32 *)(src + j - 4));
+            *(U32 *)(outp + j - 4) = _swab_32_(*(U32 *)(src + i));
+            i += 4;
+            j -= 4;
+        }
+#endif
+        if (j - i >= 4) {
+            *(U16 *)(outp + i) = _swab_16_(*(U16 *)(src + j - 2));
+            *(U16 *)(outp + j - 2) = _swab_16_(*(U16 *)(src + i));
+            i += 2;
+            j -= 2;
+        }
+
+        /* Swap any remaining bytes one by one. */
+        while (i < j) {
+            outp[i] = src[j - 1];
+            outp[j - 1] = src[i];
+            i++; j--;
+        }
     }
     RETURN;
 }
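The new code in this hunk copies the string into TARG back-to-front by working inward from both ends: an 8-byte chunk is read from each end, byte-swapped, and written to the opposite end, with 4- and 2-byte steps and a final byte loop mopping up the remainder. Below is a minimal standalone sketch of that technique, not Perl's actual code: `reverse_copy` and `swab64` are hypothetical names, `swab64` stands in for the `_swab_64_` macro, the 32- and 16-bit intermediate steps and the non-HAS_QUAD path are omitted, and `memcpy` is used for the loads and stores to sidestep the alignment and strict-aliasing questions raised by the raw pointer casts in the patch.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for Perl's _swab_64_: byte-swap a 64-bit value.
 * Compilers recognise this pattern and emit a single bswap instruction. */
static uint64_t swab64(uint64_t v)
{
    v = ((v & 0x00FF00FF00FF00FFULL) << 8)  | ((v >> 8)  & 0x00FF00FF00FF00FFULL);
    v = ((v & 0x0000FFFF0000FFFFULL) << 16) | ((v >> 16) & 0x0000FFFF0000FFFFULL);
    return (v << 32) | (v >> 32);
}

/* Copy len bytes from src to dst in reverse order: take an 8-byte chunk
 * from each end, byte-swap both, and store each at the opposite end,
 * then finish the middle one byte at a time. */
static void reverse_copy(char *dst, const char *src, size_t len)
{
    size_t i = 0, j = len;
    while (j - i >= 16) {
        uint64_t lo, hi;
        memcpy(&lo, src + i, 8);
        memcpy(&hi, src + j - 8, 8);
        lo = swab64(lo);
        hi = swab64(hi);
        memcpy(dst + i, &hi, 8);
        memcpy(dst + j - 8, &lo, 8);
        i += 8;
        j -= 8;
    }
    while (i < j) {             /* remaining middle bytes */
        dst[i] = src[j - 1];
        dst[j - 1] = src[i];
        i++;
        j--;
    }
}

int main(void)
{
    const char *src = "The quick brown fox jumps over the lazy dog";
    size_t len = strlen(src);
    char dst[64];
    reverse_copy(dst, src, len);
    dst[len] = '\0';
    puts(dst);                  /* god yzal eht revo spmuj xof nworb kciuq ehT */
    return 0;
}
```

The shift-and-mask pattern in `swab64` is the shape compilers recognise and collapse into a single bswap instruction, which is what the patch comment is counting on.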
@@ -6695,8 +6734,8 @@ PP_wrapped(pp_reverse, 0, 1)
 
     if (len > 1) {
         /* The traditional way, operate on the current byte buffer */
-        char *down;
         if (DO_UTF8(TARG)) { /* first reverse each character */
+            char *down;
             U8 *s = (U8 *)SvPVX(TARG);
             const U8 *send = (U8 *)(s + len);
             while (s < send) {
@@ -6720,11 +6759,53 @@ PP_wrapped(pp_reverse, 0, 1)
             }
             up = SvPVX(TARG);
         }
-        down = SvPVX(TARG) + len - 1;
-        while (down > up) {
-            const char tmp = *up;
-            *up++ = *down;
-            *down-- = tmp;
+        STRLEN i = 0;
+        STRLEN j = len;
+        /* Reverse the buffer in place, in chunks where possible */
+#ifdef HAS_QUAD
+        while (j - i >= 16) {
+            U64 lchunk = _swab_64_(*(U64 *)(up + j - 8));
+            U64 rchunk = _swab_64_(*(U64 *)(up + i));
+            *(U64 *)(up + i) = lchunk;
+            *(U64 *)(up + j - 8) = rchunk;
+            i += 8;
+            j -= 8;
+        }
+
+        if (j - i >= 8) {
+            U32 lchunk = _swab_32_(*(U32 *)(up + j - 4));
+            U32 rchunk = _swab_32_(*(U32 *)(up + i));
+            *(U32 *)(up + i) = lchunk;
+            *(U32 *)(up + j - 4) = rchunk;
+            i += 4;
+            j -= 4;
+        }
+#else
+        while (j - i >= 8) {
+            U32 lchunk = _swab_32_(*(U32 *)(up + j - 4));
+            U32 rchunk = _swab_32_(*(U32 *)(up + i));
+            *(U32 *)(up + i) = lchunk;
+            *(U32 *)(up + j - 4) = rchunk;
+            i += 4;
+            j -= 4;
+        }
+#endif
+        if (j - i >= 4) {
+            U16 lchunk = _swab_16_(*(U16 *)(up + j - 2));
+            U16 rchunk = _swab_16_(*(U16 *)(up + i));
+            *(U16 *)(up + i) = lchunk;
+            *(U16 *)(up + j - 2) = rchunk;
+            i += 2;
+            j -= 2;
+        }
+
+        /* Finally, swap any remaining bytes one-by-one. */
+        while (i < j) {
+            unsigned char tmp = up[i];
+            up[i] = up[j - 1];
+            up[j - 1] = tmp;
+            i++;
+            j--;
         }
     }
     (void)SvPOK_only_UTF8(TARG);
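The in-place variant in this final hunk differs from the copy loop in the earlier hunk in one important way: each iteration loads both chunks into temporaries (lchunk/rchunk) before storing either, because the source and destination are the same buffer. A standalone sketch of that in-place scheme, under the same caveats as before (hypothetical names, 64-bit path only, GCC/Clang's `__builtin_bswap64` standing in for `_swab_64_`):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Reverse buf[0..len) in place. Both 8-byte windows are loaded before
 * either is stored, mirroring the lchunk/rchunk temporaries in the
 * patch, so writing back cannot clobber bytes still to be read. */
static void reverse_inplace(char *buf, size_t len)
{
    size_t i = 0, j = len;
    while (j - i >= 16) {
        uint64_t l, r;
        memcpy(&l, buf + j - 8, 8);
        memcpy(&r, buf + i, 8);
        l = __builtin_bswap64(l);   /* GCC/Clang builtin, for brevity */
        r = __builtin_bswap64(r);
        memcpy(buf + i, &l, 8);
        memcpy(buf + j - 8, &r, 8);
        i += 8;
        j -= 8;
    }
    while (i < j) {                 /* middle remainder, byte by byte */
        char tmp = buf[i];
        buf[i] = buf[j - 1];
        buf[j - 1] = tmp;
        i++;
        j--;
    }
}

int main(void)
{
    char s[] = "The quick brown fox jumps over the lazy dog";
    reverse_inplace(s, strlen(s));
    puts(s);                        /* god yzal eht revo spmuj xof nworb kciuq ehT */
    return 0;
}
```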