Skip to content

Commit dbf063f

Browse files
committed
optimize SubString{StringView}
1 parent 54e2bc7 commit dbf063f

File tree

3 files changed

+65
-37
lines changed

3 files changed

+65
-37
lines changed

src/StringViews.jl

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,35 @@ struct StringView{T} <: AbstractString where {T<:AbstractVector{UInt8}}
2424
end
2525

2626
const DenseStringView = StringView{<:Union{DenseVector{UInt8},<:Base.FastContiguousSubArray{UInt8,1,<:DenseVector{UInt8}}}}
27+
const StringAndSub = Union{String,SubString{String}}
28+
const StringViewAndSub = Union{StringView,SubString{<:StringView}}
29+
const DenseStringViewAndSub = Union{DenseStringView,SubString{<:DenseStringView}}
2730

2831
Base.Vector{UInt8}(s::StringView{Vector{UInt8}}) = s.data
29-
Base.Vector{UInt8}(s::StringView) = Vector{UInt8}(s.data)
30-
Base.Array{UInt8}(s::StringView) = Vector{UInt8}(s)
31-
Base.String(s::StringView) = String(copyto!(Base.StringVector(length(s.data)), s.data))
32+
Base.Vector{UInt8}(s::StringViewAndSub) = Vector{UInt8}(codeunits(s))
33+
Base.Array{UInt8}(s::StringViewAndSub) = Vector{UInt8}(s)
34+
Base.String(s::StringViewAndSub) = String(copyto!(Base.StringVector(ncodeunits(s)), codeunits(s)))
3235
StringView(s::StringView) = s
3336
StringView(s::String) = StringView(codeunits(s))
3437

35-
Base.Symbol(s::DenseStringView) =
38+
Base.Symbol(s::DenseStringViewAndSub) =
3639
return ccall(:jl_symbol_n, Ref{Symbol}, (Ptr{UInt8}, Int), s, ncodeunits(s))
3740

3841
Base.pointer(s::DenseStringView) = pointer(s.data)
3942
Base.pointer(s::DenseStringView, i::Integer) = pointer(s.data, i)
40-
Base.unsafe_convert(::Type{Ptr{UInt8}}, s::DenseStringView) = pointer(s.data)
41-
Base.unsafe_convert(::Type{Ptr{Int8}}, s::DenseStringView) = convert(Ptr{Int8}, pointer(s.data))
43+
Base.pointer(x::SubString{<:DenseStringView}) = pointer(x.string) + x.offset
44+
Base.pointer(x::SubString{<:DenseStringView}, i::Integer) = pointer(x.string) + x.offset + (i-1)
45+
Base.unsafe_convert(::Type{Ptr{UInt8}}, s::DenseStringViewAndSub) = pointer(s)
46+
Base.unsafe_convert(::Type{Ptr{Int8}}, s::DenseStringViewAndSub) = convert(Ptr{Int8}, pointer(s))
47+
Base.cconvert(::Type{Ptr{UInt8}}, s::SubString{<:DenseStringView}) = s
48+
Base.cconvert(::Type{Ptr{Int8}}, s::SubString{<:DenseStringView}) = s
4249

4350
Base.sizeof(s::StringView) = sizeof(s.data)
4451
Base.ncodeunits(s::StringView) = length(s.data)
4552
Base.codeunit(s::StringView) = UInt8
4653
Base.@propagate_inbounds Base.codeunit(s::StringView, i::Integer) = s.data[i]
4754
Base.codeunits(s::StringView) = s.data
55+
Base.codeunits(s::SubString{<:StringView}) = @view s.string.data[1+s.offset:s.offset+s.ncodeunits]
4856

4957
_memcmp(a, b, len) =
5058
ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), a, b, len % Csize_t) % Int
@@ -53,40 +61,41 @@ function _cmp(a, b)
5361
c = _memcmp(a, b, min(al,bl))
5462
return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
5563
end
56-
Base.cmp(a::DenseStringView, b::DenseStringView) = _cmp(a, b)
57-
Base.cmp(a::DenseStringView, b::String) = _cmp(a, b)
58-
Base.cmp(a::String, b::DenseStringView) = _cmp(a, b)
59-
Base.:(==)(s1::StringView, s2::StringView) = s1.data == s2.data
60-
function Base.:(==)(a::String, b::StringView)
64+
Base.cmp(a::DenseStringViewAndSub, b::DenseStringViewAndSub) = _cmp(a, b)
65+
Base.cmp(a::DenseStringViewAndSub, b::StringAndSub) = _cmp(a, b)
66+
Base.cmp(a::StringAndSub, b::DenseStringViewAndSub) = _cmp(a, b)
67+
Base.:(==)(s1::StringViewAndSub, s2::StringViewAndSub) = codeunits(s1) == codeunits(s2)
68+
Base.:(==)(s1::StringAndSub, s2::StringViewAndSub) = codeunits(s1) == codeunits(s2)
69+
function Base.:(==)(a::StringAndSub, b::DenseStringViewAndSub)
6170
al = sizeof(a)
6271
return al == sizeof(b) && 0 == _memcmp(a, b, al)
6372
end
64-
Base.:(==)(s1::StringView, s2::String) = s2 == s1
73+
Base.:(==)(s1::StringViewAndSub, s2::StringAndSub) = s2 == s1
6574

6675
Base.typemin(::Type{StringView{Vector{UInt8}}}) = StringView(Vector{UInt8}(undef,0))
6776
Base.typemin(::T) where {T<:StringView} = typemin(T)
6877

69-
Base.isvalid(s::DenseStringView) = ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) ≠ 0
70-
Base.isvalid(s::StringView) = all(isvalid, s)
71-
Base.isvalid(::Type{String}, s::StringView) = isvalid(s)
78+
Base.isvalid(s::DenseStringViewAndSub) = ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) ≠ 0
79+
Base.isvalid(s::StringViewAndSub) = all(isvalid, s)
80+
Base.isvalid(::Type{String}, s::StringViewAndSub) = isvalid(s)
7281

73-
function Base.isascii(s::StringView)
82+
function Base.isascii(s::StringViewAndSub)
7483
@inbounds for i = 1:ncodeunits(s)
7584
codeunit(s, i) >= 0x80 && return false
7685
end
7786
return true
7887
end
7988

80-
write(io::IO, s::StringView) = write(io, s.data)
81-
print(io::IO, s::StringView) = (write(io, s); nothing)
89+
write(io::IO, s::StringViewAndSub) = write(io, codeunits(s))
90+
print(io::IO, s::StringViewAndSub) = (write(io, s); nothing)
8291

83-
Base.@propagate_inbounds Base.thisind(s::StringView, i::Int) = Base._thisind_str(s, i)
84-
Base.@propagate_inbounds Base.nextind(s::StringView, i::Int) = Base._nextind_str(s, i)
85-
Base.isvalid(s::StringView, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
92+
Base.@propagate_inbounds Base.thisind(s::StringViewAndSub, i::Int) = Base._thisind_str(s, i)
93+
Base.@propagate_inbounds Base.nextind(s::StringViewAndSub, i::Int) = Base._nextind_str(s, i)
94+
Base.isvalid(s::StringViewAndSub, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
8695

87-
function Base.hash(s::DenseStringView, h::UInt)
96+
function Base.hash(s::DenseStringViewAndSub, h::UInt)
8897
h += Base.memhash_seed
89-
ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s.data, length(s.data), h % UInt32) + h
98+
ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s, ncodeunits(s), h % UInt32) + h
9099
end
91100

92101
include("decoding.jl")

src/regex.jl

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,22 @@
33

44
import Base.PCRE
55

6-
function Base.occursin(r::Regex, s::DenseStringView; offset::Integer=0)
6+
function Base.occursin(r::Regex, s::DenseStringViewAndSub; offset::Integer=0)
77
Base.compile(r)
88
return PCRE.exec_r(r.regex, s, offset, r.match_options)
99
end
1010

11-
function Base.startswith(s::DenseStringView, r::Regex)
11+
function Base.startswith(s::DenseStringViewAndSub, r::Regex)
1212
Base.compile(r)
1313
return PCRE.exec_r(r.regex, s, 0, r.match_options | PCRE.ANCHORED)
1414
end
1515

16-
function Base.endswith(s::DenseStringView, r::Regex)
16+
function Base.endswith(s::DenseStringViewAndSub, r::Regex)
1717
Base.compile(r)
1818
return PCRE.exec_r(r.regex, s, 0, r.match_options | PCRE.ENDANCHORED)
1919
end
2020

21-
function Base.match(re::Regex, str::DenseStringView, idx::Integer, add_opts::UInt32=UInt32(0))
21+
function Base.match(re::Regex, str::DenseStringViewAndSub, idx::Integer, add_opts::UInt32=UInt32(0))
2222
Base.compile(re)
2323
opts = re.match_options | add_opts
2424
matched, data = PCRE.exec_r_data(re.regex, str, idx-1, opts)
@@ -38,9 +38,9 @@ function Base.match(re::Regex, str::DenseStringView, idx::Integer, add_opts::UIn
3838
return result
3939
end
4040

41-
Base.findnext(re::Regex, str::DenseStringView, idx::Integer) = _findnext_re(re, str, idx, C_NULL)
41+
Base.findnext(re::Regex, str::DenseStringViewAndSub, idx::Integer) = _findnext_re(re, str, idx, C_NULL)
4242

43-
function _findnext_re(re::Regex, str::DenseStringView, idx::Integer, match_data::Ptr{Cvoid})
43+
function _findnext_re(re::Regex, str::DenseStringViewAndSub, idx::Integer, match_data::Ptr{Cvoid})
4444
if idx > nextind(str,lastindex(str))
4545
throw(BoundsError())
4646
end
@@ -64,7 +64,7 @@ function _findnext_re(re::Regex, str::DenseStringView, idx::Integer, match_data:
6464
end
6565

6666
# copied from Base.RegexMatchIterator
67-
struct RegexMatchIterator{T<:DenseStringView}
67+
struct RegexMatchIterator{T<:DenseStringViewAndSub}
6868
regex::Regex
6969
string::T
7070
overlap::Bool
@@ -103,11 +103,11 @@ function Base.iterate(itr::RegexMatchIterator, (offset,prevempty)=(1,false))
103103
nothing
104104
end
105105

106-
Base.eachmatch(re::Regex, str::DenseStringView; overlap = false) =
106+
Base.eachmatch(re::Regex, str::DenseStringViewAndSub; overlap = false) =
107107
RegexMatchIterator(re, str, overlap)
108108

109109
# copied from julia/base/pcre.jl:
110-
function PCRE.exec(re, subject::DenseStringView, offset, options, match_data)
110+
function PCRE.exec(re, subject::DenseStringViewAndSub, offset, options, match_data)
111111
rc = ccall((:pcre2_match_8, PCRE.PCRE_LIB), Cint,
112112
(Ptr{Cvoid}, Ptr{UInt8}, Csize_t, Csize_t, UInt32, Ptr{Cvoid}, Ptr{Cvoid}),
113113
re, subject, ncodeunits(subject), offset, options, match_data, PCRE.get_local_match_context())

test/runtests.jl

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@ using StringViews, Test
22

33
b = Vector{UInt8}("foobar")
44
s = StringView(b)
5+
ss = SubString(s, 2, 5) # "ooba"
56
abc = StringView(0x61:0x63)
67
invalid = StringView([0x8b, 0x52, 0x9b, 0x8d])
78

8-
@testset "Construction/conversion" begin
9+
@testset "construction/conversion" begin
910
@test StringView(s) === s
1011
@test Vector{UInt8}(s) === Array{UInt8}(s) === codeunits(s) === b
1112
@test Vector{UInt8}(StringView(@view b[1:3])) == b[1:3]
@@ -34,8 +35,26 @@ invalid = StringView([0x8b, 0x52, 0x9b, 0x8d])
3435
@test Base.print_to_string(abc) == "abc"
3536
end
3637

38+
@testset "substrings" begin
39+
@test Vector{UInt8}(ss) == Array{UInt8}(ss) == codeunits(ss) == b[2:5]
40+
@test codeunits(ss) isa Base.FastContiguousSubArray
41+
@test Symbol(ss) == :ooba
42+
43+
@test pointer(ss) == pointer(b) + 1 == Base.unsafe_convert(Ptr{UInt8}, ss)
44+
@test ncodeunits(ss) == sizeof(ss) == length(b)-2
45+
@test codeunit(ss) == UInt8
46+
@test codeunit(ss,3) == b[4]
47+
48+
@test Base.print_to_string(ss) == "ooba"
49+
50+
@test cmp("foobar","bar") == cmp(ss,"bar") == -cmp("bar",ss) == cmp(ss,StringView("bar"))
51+
@test ss == StringView("ooba") == "ooba" == ss == "ooba"
52+
@test isvalid(ss)
53+
end
54+
3755
@testset "regular expressions" begin
38-
@test [m.match for m in collect(eachmatch(r"[aeiou]+", s))] == ["oo", "a"]
56+
@test [m.match for m in collect(eachmatch(r"[aeiou]+", s))] == ["oo", "a"] ==
57+
[m.match for m in collect(eachmatch(r"[aeiou]+", ss))]
3958
end
4059

4160
@testset "miscellaneous" begin
@@ -54,7 +73,7 @@ end
5473
@test !isvalid(invalid)
5574
@test !invoke(isvalid, Tuple{StringView}, invalid)
5675

57-
for str in (s, abc, invalid)
58-
@test hash(s) == hash(String(s))
76+
for str in (s, abc, invalid, ss)
77+
@test hash(str) == hash(String(str))
5978
end
60-
end
79+
end

0 commit comments

Comments
 (0)