Skip to content

Commit 56451d8

Browse files
Various fixes to byte / bytearray search (#54579)
This was originally intended as a targeted fix to #54578, but I ran into a bunch of smaller issues with this code that also needed to be solved and it turned out to be difficult to fix them with small, trivial PRs. I would also like to refactor this whole file, but I want these correctness fixes to be merged first, because a larger refactoring has higher risk of getting stuck without getting reviewed and merged. ## Larger things that needs decisions * The internal union `Base.ByteArray` has been deleted. Instead, the unions `DenseInt8` and `DenseUInt8` have been added. These more comprehensively cover the types that was meant, e.g. `Memory{UInt8}` was incorrectly not covered by the former. As stated in the TODO, the concept of a "memory backed dense byte array" is needed throughout Julia, so this ideally needs to be implemented as a single type and used throughout Base. The fix here is a decent temporary solution. See #53178 #54581 * The `findall` docstring between two arrays was incorrectly not attached to the method - now it is. **Note that this change _changes_ the documentation** since it includes a docstring that was previously missed. Hence, it's an API addition. * Added a new minimal `testhelpers/OffsetDenseArrays.jl` which provide a `DenseVector` with offset axes for testing purposes. ## Trivial fixes * `findfirst(==(Int8(-1)), [0xff])` and similar findlast, findnext and findprev is no longer buggy, see #54578 * `findfirst([0x0ff], Int8[-1])` is similarly no longer buggy, see #54578 * `findnext(==('\xa6'), "æ", 1)` and `findprev(==('\xa6'), "æa", 2)` no longer incorrectly throws an error * The byte-oriented find* functions now work correctly with offset arrays * Fixed incorrect use of `GC.@preserve`, where the pointer was taken before the preserve block. * More of the optimised string methods now also apply to `SubString{String}` Closes #54578 Co-authored-by: Martin Holters <martin.holters@hsu-hh.de>
1 parent a7c9235 commit 56451d8

File tree

4 files changed

+170
-47
lines changed

4 files changed

+170
-47
lines changed

base/char.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ hash(x::Char, h::UInt) =
223223
hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) UInt64(h))
224224

225225
first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8
226+
first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char)
226227

227228
# fallbacks:
228229
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))

base/strings/search.jl

Lines changed: 89 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,29 @@ match strings with [`match`](@ref).
1010
"""
1111
abstract type AbstractPattern end
1212

13-
nothing_sentinel(i) = i == 0 ? nothing : i
13+
# TODO: These unions represent bytes in memory that can be accessed via a pointer.
14+
# this property is used throughout Julia, e.g. also in IO code.
15+
# This deserves a better solution - see #53178.
16+
# If such a better solution comes in place, these unions should be replaced.
17+
const DenseInt8 = Union{
18+
DenseArray{Int8},
19+
FastContiguousSubArray{Int8,N,<:DenseArray} where N
20+
}
21+
22+
# Note: This union is different from that above in that it includes CodeUnits.
23+
# Currently, this is redundant as CodeUnits <: DenseVector, but this subtyping
24+
# is buggy and may be removed in the future, see #54002
25+
const DenseUInt8 = Union{
26+
DenseArray{UInt8},
27+
FastContiguousSubArray{UInt8,N,<:DenseArray} where N,
28+
CodeUnits{UInt8, <:Union{String, SubString{String}}},
29+
FastContiguousSubArray{UInt8,N,<:CodeUnits{UInt8, <:Union{String, SubString{String}}}} where N,
30+
}
31+
32+
const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}
33+
34+
last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
35+
last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
1436

1537
function last_utf8_byte(c::Char)
1638
u = reinterpret(UInt32, c)
@@ -30,11 +52,11 @@ function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}
3052
end
3153
@inbounds isvalid(s, i) || string_index_err(s, i)
3254
c = pred.x
33-
c '\x7f' && return nothing_sentinel(_search(s, c % UInt8, i))
55+
c '\x7f' && return _search(s, first_utf8_byte(c), i)
3456
while true
3557
i = _search(s, first_utf8_byte(c), i)
36-
i == 0 && return nothing
37-
pred(s[i]) && return i
58+
i === nothing && return nothing
59+
isvalid(s, i) && pred(s[i]) && return i
3860
i = nextind(s, i)
3961
end
4062
end
@@ -47,31 +69,41 @@ const DenseBytes = Union{
4769
CodeUnits{UInt8, <:Union{String, SubString{String}}},
4870
}
4971

50-
const ByteArray = Union{DenseBytes, DenseArrayType{Int8}}
72+
function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8})
73+
findnext(pred, a, firstindex(a))
74+
end
5175

52-
findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) =
53-
nothing_sentinel(_search(a, pred.x))
76+
function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
77+
_search(a, pred.x, i)
78+
end
5479

55-
findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) =
56-
nothing_sentinel(_search(a, pred.x, i))
80+
function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
81+
_search(a, pred.x, i)
82+
end
5783

58-
findfirst(::typeof(iszero), a::ByteArray) = nothing_sentinel(_search(a, zero(UInt8)))
59-
findnext(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_search(a, zero(UInt8), i))
84+
# iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
85+
# so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
86+
findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8))
87+
findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i)
6088

61-
function _search(a::Union{String,SubString{String},<:ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
62-
if i < 1
89+
function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a))
90+
fst = firstindex(a)
91+
lst = last_byteindex(a)
92+
if i < fst
6393
throw(BoundsError(a, i))
6494
end
65-
n = sizeof(a)
66-
if i > n
67-
return i == n+1 ? 0 : throw(BoundsError(a, i))
95+
n_bytes = lst - i + 1
96+
if i > lst
97+
return i == lst+1 ? nothing : throw(BoundsError(a, i))
6898
end
69-
p = pointer(a)
70-
q = GC.@preserve a ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
71-
return q == C_NULL ? 0 : Int(q-p+1)
99+
GC.@preserve a begin
100+
p = pointer(a)
101+
q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes)
102+
end
103+
return q == C_NULL ? nothing : (q-p+fst) % Int
72104
end
73105

74-
function _search(a::ByteArray, b::AbstractChar, i::Integer = 1)
106+
function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
75107
if isascii(b)
76108
_search(a,UInt8(b),i)
77109
else
@@ -80,41 +112,51 @@ function _search(a::ByteArray, b::AbstractChar, i::Integer = 1)
80112
end
81113

82114
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
83-
s::String, i::Integer)
115+
s::Union{String, SubString{String}}, i::Integer)
84116
c = pred.x
85-
c '\x7f' && return nothing_sentinel(_rsearch(s, c % UInt8, i))
117+
c '\x7f' && return _rsearch(s, first_utf8_byte(c), i)
86118
b = first_utf8_byte(c)
87119
while true
88120
i = _rsearch(s, b, i)
89-
i == 0 && return nothing
90-
pred(s[i]) && return i
121+
i == nothing && return nothing
122+
isvalid(s, i) && pred(s[i]) && return i
91123
i = prevind(s, i)
92124
end
93125
end
94126

95-
findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) =
96-
nothing_sentinel(_rsearch(a, pred.x))
127+
function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8)
128+
findprev(pred, a, lastindex(a))
129+
end
97130

98-
findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) =
99-
nothing_sentinel(_rsearch(a, pred.x, i))
131+
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
132+
_rsearch(a, pred.x, i)
133+
end
100134

101-
findlast(::typeof(iszero), a::ByteArray) = nothing_sentinel(_rsearch(a, zero(UInt8)))
102-
findprev(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_rsearch(a, zero(UInt8), i))
135+
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
136+
_rsearch(a, pred.x, i)
137+
end
103138

104-
function _rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
105-
if i < 1
106-
return i == 0 ? 0 : throw(BoundsError(a, i))
139+
# See comments above for findfirst(::typeof(iszero)) methods
140+
findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8))
141+
findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i)
142+
143+
function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a))
144+
fst = firstindex(a)
145+
lst = last_byteindex(a)
146+
if i < fst
147+
return i == fst - 1 ? nothing : throw(BoundsError(a, i))
148+
end
149+
if i > lst
150+
return i == lst+1 ? nothing : throw(BoundsError(a, i))
107151
end
108-
n = sizeof(a)
109-
if i > n
110-
return i == n+1 ? 0 : throw(BoundsError(a, i))
152+
GC.@preserve a begin
153+
p = pointer(a)
154+
q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1)
111155
end
112-
p = pointer(a)
113-
q = GC.@preserve a ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i)
114-
return q == C_NULL ? 0 : Int(q-p+1)
156+
return q == C_NULL ? nothing : (q-p+fst) % Int
115157
end
116158

117-
function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a))
159+
function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a))
118160
if isascii(b)
119161
_rsearch(a,UInt8(b),i)
120162
else
@@ -224,18 +266,19 @@ end
224266

225267
in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing)
226268

227-
function _searchindex(s::Union{AbstractString,ByteArray},
269+
function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8},
228270
t::Union{AbstractString,AbstractChar,Int8,UInt8},
229271
i::Integer)
272+
sentinel = firstindex(s) - 1
230273
x = Iterators.peel(t)
231274
if isnothing(x)
232-
return 1 <= i <= nextind(s,lastindex(s))::Int ? i :
275+
return firstindex(s) <= i <= nextind(s,lastindex(s))::Int ? i :
233276
throw(BoundsError(s, i))
234277
end
235278
t1, trest = x
236279
while true
237280
i = findnext(isequal(t1),s,i)
238-
if i === nothing return 0 end
281+
if i === nothing return sentinel end
239282
ii = nextind(s, i)::Int
240283
a = Iterators.Stateful(trest)
241284
matched = all(splat(==), zip(SubString(s, ii), a))
@@ -509,9 +552,8 @@ julia> findall(UInt8[1,2], UInt8[1,2,3,1,2])
509552
!!! compat "Julia 1.3"
510553
This method requires at least Julia 1.3.
511554
"""
512-
513-
function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
514-
s::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
555+
function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{UInt8}},
556+
s::Union{AbstractString, AbstractPattern, AbstractVector{UInt8}},
515557
; overlap::Bool=false)
516558
found = UnitRange{Int}[]
517559
i, e = firstindex(s), lastindex(s)
@@ -564,7 +606,7 @@ function _rsearchindex(s::AbstractString,
564606
end
565607
end
566608

567-
function _rsearchindex(s::String, t::String, i::Integer)
609+
function _rsearchindex(s::Union{String, SubString{String}}, t::Union{String, SubString{String}}, i::Integer)
568610
# Check for fast case of a single byte
569611
if lastindex(t) == 1
570612
return something(findprev(isequal(t[1]), s, i), 0)

test/strings/search.jl

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,16 @@ for str in [u8str]
155155
@test findprev(isequal('ε'), str, 4) === nothing
156156
end
157157

158+
# See the comments in #54579
159+
@testset "Search for invalid chars" begin
160+
@test findfirst(==('\xff'), "abc\xffde") == 4
161+
@test findprev(isequal('\xa6'), "abc\xa69", 5) == 4
162+
@test isnothing(findfirst(==('\xff'), "abcdeæd"))
163+
164+
@test isnothing(findnext(==('\xa6'), "æ", 1))
165+
@test isnothing(findprev(==('\xa6'), "æa", 2))
166+
end
167+
158168
# string forward search with a single-char string
159169
@test findfirst("x", astr) === nothing
160170
@test findfirst("H", astr) == 1:1
@@ -445,6 +455,45 @@ end
445455
@test_throws BoundsError findprev(pattern, A, -3)
446456
end
447457
end
458+
459+
@test findall([0x01, 0x02], [0x03, 0x01, 0x02, 0x01, 0x02, 0x06]) == [2:3, 4:5]
460+
@test isempty(findall([0x04, 0x05], [0x03, 0x04, 0x06]))
461+
end
462+
463+
# Issue 54578
464+
@testset "No conflation of Int8 and UInt8" begin
465+
# Work for mixed types if the values are the same
466+
@test findfirst(==(Int8(1)), [0x01]) == 1
467+
@test findnext(iszero, Int8[0, -2, 0, -3], 2) == 3
468+
@test findfirst(Int8[1,4], UInt8[0, 2, 4, 1, 8, 1, 4, 2]) == 6:7
469+
@test findprev(UInt8[5, 6], Int8[1, 9, 2, 5, 6, 3], 6) == 4:5
470+
471+
# Returns nothing for the same methods if the values are different,
472+
# even if the bitpatterns are the same
473+
@test isnothing(findfirst(==(Int8(-1)), [0xff]))
474+
@test isnothing(findnext(isequal(0xff), Int8[-1, -2, -1], 2))
475+
@test isnothing(findfirst(UInt8[0xff, 0xfe], Int8[0, -1, -2, 1, 8, 1, 4, 2]))
476+
@test isnothing(findprev(UInt8[0xff, 0xfe], Int8[1, 9, 2, -1, -2, 3], 6))
477+
end
478+
479+
@testset "DenseArray with offsets" begin
480+
isdefined(Main, :OffsetDenseArrays) || @eval Main include("../testhelpers/OffsetDenseArrays.jl")
481+
OffsetDenseArrays = Main.OffsetDenseArrays
482+
483+
A = OffsetDenseArrays.OffsetDenseArray(collect(0x61:0x69), 100)
484+
@test findfirst(==(0x61), A) == 101
485+
@test findlast(==(0x61), A) == 101
486+
@test findfirst(==(0x00), A) === nothing
487+
488+
@test findfirst([0x62, 0x63, 0x64], A) == 102:104
489+
@test findlast([0x63, 0x64], A) == 103:104
490+
@test findall([0x62, 0x63], A) == [102:103]
491+
492+
@test findfirst(iszero, A) === nothing
493+
A = OffsetDenseArrays.OffsetDenseArray([0x01, 0x02, 0x00, 0x03], -100)
494+
@test findfirst(iszero, A) == -97
495+
@test findnext(==(0x02), A, -99) == -98
496+
@test findnext(==(0x02), A, -97) === nothing
448497
end
449498

450499
# issue 32568

test/testhelpers/OffsetDenseArrays.jl

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""
2+
module OffsetDenseArrays
3+
4+
A minimal implementation of an offset array which is also <: DenseArray.
5+
"""
6+
module OffsetDenseArrays
7+
8+
struct OffsetDenseArray{A <: DenseVector, T} <: DenseVector{T}
9+
x::A
10+
offset::Int
11+
end
12+
OffsetDenseArray(x::AbstractVector{T}, i::Integer) where {T} = OffsetDenseArray{typeof(x), T}(x, Int(i))
13+
14+
Base.size(x::OffsetDenseArray) = size(x.x)
15+
Base.pointer(x::OffsetDenseArray) = pointer(x.x)
16+
17+
function Base.getindex(x::OffsetDenseArray, i::Integer)
18+
@boundscheck checkbounds(x.x, i - x.offset)
19+
x.x[i - x.offset]
20+
end
21+
22+
function Base.setindex(x::OffsetDenseArray, v, i::Integer)
23+
@boundscheck checkbounds(x.x, i - x.offset)
24+
x.x[i - x.offset] = v
25+
end
26+
27+
IndexStyle(::Type{<:OffsetDenseArray}) = Base.IndexLinear()
28+
Base.axes(x::OffsetDenseArray) = (x.offset + 1 : x.offset + length(x.x),)
29+
Base.keys(x::OffsetDenseArray) = only(axes(x))
30+
31+
end # module

0 commit comments

Comments
 (0)