@@ -867,10 +867,10 @@ SZ_PUBLIC void sz_hashes_neon_readahead(sz_cptr_t start, sz_size_t length, sz_si
867
867
chars_readahead_vec [3 ].u8x16 = vld1q_u8 (text + window_length * 3 + i );
868
868
869
869
for (; i != window_length ; ++ i ) {
870
- chars_incoming_low_vec .u32s [0 ] = chars_readahead_vec [0 ].u8x16 [i ];
871
- chars_incoming_low_vec .u32s [1 ] = chars_readahead_vec [1 ].u8x16 [i ];
872
- chars_incoming_low_vec .u32s [2 ] = chars_readahead_vec [2 ].u8x16 [i ];
873
- chars_incoming_low_vec .u32s [3 ] = chars_readahead_vec [3 ].u8x16 [i ];
870
+ chars_incoming_low_vec .u32s [0 ] = chars_readahead_vec [0 ].u16s [i ];
871
+ chars_incoming_low_vec .u32s [1 ] = chars_readahead_vec [1 ].u16s [i ];
872
+ chars_incoming_low_vec .u32s [2 ] = chars_readahead_vec [2 ].u16s [i ];
873
+ chars_incoming_low_vec .u32s [3 ] = chars_readahead_vec [3 ].u16s [i ];
874
874
chars_incoming_high_vec .u8x16 = vaddq_u8 (chars_incoming_low_vec .u8x16 , vld1q_dup_u8 (& high_shift ));
875
875
876
876
// Append new data.
@@ -906,11 +906,11 @@ SZ_PUBLIC void sz_hashes_neon_readahead(sz_cptr_t start, sz_size_t length, sz_si
906
906
907
907
for (; i + 1 < window_length ; ++ i ) {
908
908
// Transpose
909
- chars_outgoing_low_vec .u32s [0 ] = chars_readahead_vec [0 ].u8x16 [i ];
910
- chars_outgoing_low_vec .u32s [1 ] = chars_incoming_low_vec .u32s [0 ] = chars_readahead_vec [1 ].u8x16 [i ];
911
- chars_outgoing_low_vec .u32s [2 ] = chars_incoming_low_vec .u32s [1 ] = chars_readahead_vec [2 ].u8x16 [i ];
912
- chars_outgoing_low_vec .u32s [3 ] = chars_incoming_low_vec .u32s [2 ] = chars_readahead_vec [3 ].u8x16 [i ];
913
- chars_incoming_low_vec .u32s [3 ] = chars_readahead_vec [4 ].u8x16 [i ];
909
+ chars_outgoing_low_vec .u32s [0 ] = chars_readahead_vec [0 ].u16s [i ];
910
+ chars_outgoing_low_vec .u32s [1 ] = chars_incoming_low_vec .u32s [0 ] = chars_readahead_vec [1 ].u16s [i ];
911
+ chars_outgoing_low_vec .u32s [2 ] = chars_incoming_low_vec .u32s [1 ] = chars_readahead_vec [2 ].u16s [i ];
912
+ chars_outgoing_low_vec .u32s [3 ] = chars_incoming_low_vec .u32s [2 ] = chars_readahead_vec [3 ].u16s [i ];
913
+ chars_incoming_low_vec .u32s [3 ] = chars_readahead_vec [4 ].u16s [i ];
914
914
915
915
chars_outgoing_high_vec .u8x16 = vaddq_u8 (chars_outgoing_low_vec .u8x16 , vld1q_dup_u8 (& high_shift ));
916
916
chars_incoming_high_vec .u8x16 = vaddq_u8 (chars_incoming_low_vec .u8x16 , vld1q_dup_u8 (& high_shift ));
0 commit comments