Skip to content

Commit 745b1d8

Browse files
authored
Merge pull request #3 from JuliaStrings/searching
add searching utils
2 parents 7e94dc5 + fcfd75c commit 745b1d8

File tree

6 files changed

+208
-6
lines changed

6 files changed

+208
-6
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,5 @@ julia> abc = StringView(0x61:0x63) # and for other array types
3737
```
3838

3939
Other optimized (copy-free) operations include I/O, hashing, iteration/indexing,
40-
comparisons, parsing, and validation. Working with a `SubString` of a `StringView` is
41-
similarly efficient.
40+
comparisons, parsing, searching, and validation. Working with a `SubString` of
41+
a `StringView` is similarly efficient.

src/StringViews.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ function Base.isascii(s::StringViewAndSub)
8686
return true
8787
end
8888

89-
write(io::IO, s::StringViewAndSub) = write(io, codeunits(s))
90-
print(io::IO, s::StringViewAndSub) = (write(io, s); nothing)
89+
Base.write(io::IO, s::StringViewAndSub) = write(io, codeunits(s))
90+
Base.print(io::IO, s::StringViewAndSub) = (write(io, s); nothing)
9191

9292
Base.@propagate_inbounds Base.thisind(s::StringViewAndSub, i::Int) = Base._thisind_str(s, i)
9393
Base.@propagate_inbounds Base.nextind(s::StringViewAndSub, i::Int) = Base._nextind_str(s, i)
@@ -114,5 +114,7 @@ end
114114
include("decoding.jl")
115115
include("regex.jl")
116116
include("parse.jl")
117+
include("util.jl")
118+
include("search.jl")
117119

118120
end # module

src/parse.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ function tryparse_internal(::Type{Bool}, sbuff::DenseStringViewAndSub,
108108
len = endpos - startpos + 1
109109
p = pointer(sbuff) + startpos - 1
110110
GC.@preserve sbuff begin
111-
(len == 4) && (0 == Base._memcmp(p, "true", 4)) && (return true)
112-
(len == 5) && (0 == Base._memcmp(p, "false", 5)) && (return false)
111+
(len == 4) && (0 == _memcmp(p, "true", 4)) && (return true)
112+
(len == 5) && (0 == _memcmp(p, "false", 5)) && (return false)
113113
end
114114

115115
if raise

src/search.jl

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# optimized string routines copied from julia/base/strings/search.jl
2+
3+
# Optimized forward search for a single character in a `StringView` (or a
# `SubString` of one), mirroring the `String` method in Base.
#
# `pred` is `==(c)` or `isequal(c)` for some character `c`; `i` is the index at
# which the search starts. Returns the index of the first match at or after `i`,
# or `nothing` when there is none (including `i == sizeof(s) + 1`).
# Throws `BoundsError` for any other out-of-range `i`, and a string index error
# when `i` is not a valid character index of `s`.
function Base.findnext(pred::Base.Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
                       s::StringViewAndSub, i::Integer)
    if i < 1 || i > sizeof(s)
        i == sizeof(s) + 1 && return nothing
        throw(BoundsError(s, i))
    end
    @inbounds isvalid(s, i) || Base.string_index_err(s, i)
    c = pred.x
    # A character up to '\x7f' is a single byte, so a plain byte search suffices;
    # `nothing_sentinel` turns the 0 "not found" result into `nothing`.
    c <= '\x7f' && return Base.nothing_sentinel(Base._search(s, c % UInt8, i))
    # Multi-byte character: look for its leading byte, then confirm the whole
    # character with `pred` before accepting the position.
    b = Base.first_utf8_byte(c)
    while true
        i = Base._search(s, b, i)
        i == 0 && return nothing
        pred(s[i]) && return i
        i = nextind(s, i)
    end
end
19+
20+
# Forward byte search: return the index of the first code unit equal to `b` at
# or after position `i` in `a`, or 0 when there is none.
#
# `i == sizeof(a) + 1` is accepted and yields 0; any other out-of-range `i`
# throws `BoundsError`.
function Base._search(a::StringViewAndSub, b::Union{Int8,UInt8}, i::Integer = 1)
    i < 1 && throw(BoundsError(a, i))
    n = sizeof(a)
    if i > n
        i == n + 1 && return 0
        throw(BoundsError(a, i))
    end
    if a isa DenseStringViewAndSub
        # Dense storage: hand the remaining `n - i + 1` bytes to C `memchr`.
        base = pointer(a)
        hit = GC.@preserve a ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t),
                                   base + i - 1, b, n - i + 1)
        return hit == C_NULL ? 0 : Int(hit - base + 1)
    end
    # Otherwise scan the code units one at a time.
    for k in Int(i):n
        codeunit(a, k) == b && return k
    end
    return 0
end
41+
42+
# Optimized backward search for a single character in a `StringView` (or a
# `SubString` of one), mirroring the `String` method in Base.
#
# `pred` is `==(c)` or `isequal(c)` for some character `c`; `i` is the index at
# which the search starts. Returns the index of the last match at or before `i`,
# or `nothing` when there is none. Range checking of `i` is delegated to
# `Base._rsearch`.
function Base.findprev(pred::Base.Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
                       s::StringViewAndSub, i::Integer)
    c = pred.x
    # A character up to '\x7f' is a single byte, so a plain byte search suffices;
    # `nothing_sentinel` turns the 0 "not found" result into `nothing`.
    c <= '\x7f' && return Base.nothing_sentinel(Base._rsearch(s, c % UInt8, i))
    # Multi-byte character: look for its leading byte, then confirm the whole
    # character with `pred` before accepting the position.
    b = Base.first_utf8_byte(c)
    while true
        i = Base._rsearch(s, b, i)
        i == 0 && return nothing
        pred(s[i]) && return i
        i = prevind(s, i)
    end
end
54+
55+
# Backward byte search: return the index of the last code unit equal to `b` at
# or before position `i` in `a`, or 0 when there is none.
#
# `i == 0` and `i == sizeof(a) + 1` are accepted and yield 0; any other
# out-of-range `i` throws `BoundsError`.
function Base._rsearch(a::StringViewAndSub, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
    if i < 1
        i == 0 && return 0
        throw(BoundsError(a, i))
    end
    n = sizeof(a)
    if i > n
        i == n + 1 && return 0
        throw(BoundsError(a, i))
    end
    if a isa DenseStringViewAndSub
        # Dense storage: hand the first `i` bytes to C `memrchr`.
        base = pointer(a)
        hit = GC.@preserve a ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t),
                                   base, b, i)
        return hit == C_NULL ? 0 : Int(hit - base + 1)
    end
    # Otherwise scan the code units one at a time, from `i` down to 1.
    for k in Int(i):-1:1
        codeunit(a, k) == b && return k
    end
    return 0
end
76+
77+
# The following functions require julia#37283 in Julia 1.6, which
# allows us to search byte arrays (applied to codeunits(s)).
@static if VERSION >= v"1.6.0-DEV.1341"
    # Forward substring search: index of the first occurrence of `t` in `s`
    # starting at `i`, or 0 when not found.
    function Base._searchindex(s::Union{StringViewAndSub,StringAndSub},
                               t::Union{StringViewAndSub,StringAndSub}, i::Integer)
        # Check for fast case of a single byte
        lastindex(t) == 1 && return something(findnext(isequal(t[1]), s, i), 0)
        return Base._searchindex(codeunits(s), codeunits(t), i)
    end

    # Backward substring search: index of the last occurrence of `t` in `s`
    # starting at or before `i`, or 0 when not found.
    function Base._rsearchindex(s::Union{StringViewAndSub,StringAndSub},
                                t::Union{StringViewAndSub,StringAndSub}, i::Integer)
        # Check for fast case of a single byte
        if lastindex(t) == 1
            return something(findprev(isequal(t[1]), s, i), 0)
        elseif lastindex(t) != 0
            # Move the start to the last byte of the character at `i` before
            # searching the raw code units.
            j = i <= ncodeunits(s) ? nextind(s, i) - 1 : i
            return Base._rsearchindex(codeunits(s), codeunits(t), j)
        elseif i > sizeof(s)
            # Empty pattern, start past the end: no match.
            return 0
        elseif i == 0
            # Empty pattern, start at 0: reported as index 1.
            return 1
        else
            # Empty pattern matches at the starting index itself.
            return i
        end
    end
end

src/util.jl

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# other optimized string routines copied from julia/base/strings/util.jl
2+
3+
# Byte-wise `startswith` for dense string views and ordinary strings.
#
# Returns `true` when the code units of `b` are a prefix of those of `a` and the
# prefix ends on a character boundary of `a`.
function Base.startswith(a::Union{DenseStringViewAndSub,StringAndSub},
                         b::Union{DenseStringViewAndSub,StringAndSub})
    nb = ncodeunits(b)
    # `b` cannot be a prefix if it is longer than `a`.
    ncodeunits(a) < nb && return false
    # Compare the leading bytes of `a` against all of `b`.
    _memcmp(a, b, sizeof(b)) == 0 || return false
    # The match only counts if the byte after the prefix starts a new character.
    return nextind(a, nb) == nb + 1
end
14+
15+
# Byte-wise `endswith` for dense string views and ordinary strings.
#
# Returns `true` when the code units of `b` are a suffix of those of `a` and the
# suffix begins on a character boundary of `a`.
function Base.endswith(a::Union{DenseStringViewAndSub,StringAndSub},
                       b::Union{DenseStringViewAndSub,StringAndSub})
    nb = ncodeunits(b)
    # Index in `a` of the first byte of the candidate suffix.
    astart = ncodeunits(a) - nb + 1
    # `b` cannot be a suffix if it is longer than `a`.
    astart < 1 && return false
    # Compare the trailing bytes of `a` against all of `b`; `a` must stay rooted
    # while its raw pointer is in use.
    GC.@preserve(a, _memcmp(pointer(a, astart), b, sizeof(b))) == 0 || return false
    # The match only counts if the suffix starts at the beginning of a character.
    return thisind(a, astart) == astart
end
27+
28+
# Remove a single trailing newline ("\n" or "\r\n") from `s`, returning a
# `SubString` of `s` in every case.
function Base.chomp(s::StringViewAndSub)
    stop = lastindex(s)
    # Empty string, or the last character is not "\n": keep everything.
    if stop < 1 || codeunit(s, stop) != 0x0a
        return @inbounds SubString(s, 1, stop)
    end
    # Trailing "\n" that is not preceded by "\r": drop just the "\n".
    if stop < 2 || codeunit(s, stop - 1) != 0x0d
        return @inbounds SubString(s, 1, prevind(s, stop))
    end
    # Trailing "\r\n": drop both bytes.
    return @inbounds SubString(s, 1, prevind(s, stop - 1))
end
38+
39+
# Replace occurrences of a single character: the character on the left of the
# pair is turned into an `isequal` predicate and forwarded to the general method.
function Base.replace(str::DenseStringViewAndSub, pat_repl::Pair{<:AbstractChar};
                      count::Integer=typemax(Int))
    return replace(str, isequal(first(pat_repl)) => last(pat_repl); count=count)
end
41+
42+
# Replace occurrences of any character from a collection (tuple, vector or set
# of characters): the collection is turned into an `in` predicate and forwarded
# to the general method.
#
# `Tuple{Vararg{AbstractChar}}` is used instead of `Tuple{Vararg{<:AbstractChar}}`:
# tuples are covariant, so it matches the same arguments without wrapping
# `Vararg` directly in a `UnionAll` (deprecated in newer Julia versions).
function Base.replace(str::DenseStringViewAndSub,
                      pat_repl::Pair{<:Union{Tuple{Vararg{AbstractChar}},
                                             AbstractVector{<:AbstractChar},
                                             Set{<:AbstractChar}}};
                      count::Integer=typemax(Int))
    return replace(str, in(first(pat_repl)) => last(pat_repl); count=count)
end
46+
47+
# Hooks called by `replace` before and after searching with a pattern.
# These generic fallbacks pass the pattern through unchanged and release nothing.
function _pat_replacer(x)
    return x
end

function _free_pat_replacer(x)
    return nothing
end
49+
50+
# General `replace` for dense string views: replace up to `count` matches of
# `first(pat_repl)` in `str` with `last(pat_repl)`.
#
# Returns a new `String` when at least one replacement is made. When `count == 0`
# or the pattern is not found, `str` itself is returned unchanged (so the result
# is then the view, not a `String`). Throws `DomainError` for a negative `count`.
function Base.replace(str::DenseStringViewAndSub, pat_repl::Pair; count::Integer=typemax(Int))
    pattern, repl = pat_repl
    if count == 0
        return str
    elseif count < 0
        throw(DomainError(count, "`count` must be non-negative."))
    end
    nrep = 1                          # number of matches handled so far
    stop = lastindex(str)
    pos = start = firstindex(str)     # `pos`: first index not yet copied to `out`
    pattern = _pat_replacer(pattern)
    m = something(findnext(pattern, str, pos), 0)
    mfirst, mlast = first(m), last(m)
    if mfirst == 0
        # No match at all: nothing to rewrite.
        _free_pat_replacer(pattern)
        return str
    end
    out = IOBuffer(sizehint=floor(Int, 1.2sizeof(str)))
    while mfirst != 0
        if pos == start || pos <= mlast
            # Copy the untouched bytes before the match, then emit the replacement.
            GC.@preserve str unsafe_write(out, pointer(str, pos), UInt(mfirst - pos))
            Base._replace(out, repl, str, m, pattern)
        end
        if mlast < mfirst
            # Empty match: step forward one character so the search advances.
            pos = mfirst
            mfirst > stop && break
            mlast = nextind(str, mfirst)
        else
            pos = mlast = nextind(str, mlast)
        end
        m = something(findnext(pattern, str, mlast), 0)
        m === 0:-1 || nrep == count && break
        mfirst, mlast = first(m), last(m)
        nrep += 1
    end
    _free_pat_replacer(pattern)
    # Append whatever follows the last handled match.
    write(out, SubString(str, pos))
    return String(take!(out))
end

test/runtests.jl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,21 @@ end
8989
end
9090
end
9191

92+
@testset "searching" begin
    for str in (s, ss, abc)
        # Every result is compared against the same operation on a `String` copy.
        ref = String(str)
        stop = lastindex(str)
        @test startswith(str, "foo") == startswith(ref, "foo")
        @test endswith(str, "bar") == endswith(ref, "bar")
        @test replace(str, 'o'=>"xy") == replace(ref, 'o'=>"xy")
        @test replace(str, ('o','a')=>'x') == replace(ref, ('o','a')=>'x')
        @test findnext(==('b'), str, 1) === findnext(==('b'), ref, 1)
        @test findprev(==('b'), str, stop) === findprev(==('b'), ref, stop)
        # NOTE(review): `==("ba")` is a per-character predicate, and a character
        # never equals a string, so both sides here are `nothing`. A substring
        # search such as `findnext("ba", str, 1)` may have been intended — confirm.
        @test findnext(==("ba"), str, 1) === findnext(==("ba"), ref, 1)
        @test findprev(==("ba"), str, stop) === findprev(==("ba"), ref, stop)
    end
    @test chomp(StringView("foo\n")) == "foo"
end
106+
92107
@testset "miscellaneous" begin
93108
@test cmp("foobar","bar") == cmp(s,"bar") == -cmp("bar",s) == cmp(s,StringView("bar"))
94109
@test s == StringView("foobar") == "foobar" == s == "foobar" != StringView("bar")

0 commit comments

Comments
 (0)