Skip to content

Commit 68c1537

Browse files
committed
initial commit
0 parents  commit 68c1537

File tree

7 files changed

+427
-0
lines changed

7 files changed

+427
-0
lines changed

LICENSE.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
The StringViews.jl package is licensed under the MIT "Expat" License:
2+
3+
> Copyright (c) 2020: Steven G. Johnson.
4+
>
5+
> Permission is hereby granted, free of charge, to any person obtaining a copy
6+
> of this software and associated documentation files (the "Software"), to deal
7+
> in the Software without restriction, including without limitation the rights
8+
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
> copies of the Software, and to permit persons to whom the Software is
10+
> furnished to do so, subject to the following conditions:
11+
>
12+
> The above copyright notice and this permission notice shall be included in all
13+
> copies or substantial portions of the Software.
14+
>
15+
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
> SOFTWARE.
22+
>

Project.toml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
name = "StringViews"
2+
uuid = "354b36f9-a18e-4713-926e-db85100087ba"
3+
authors = ["Steven G. Johnson <stevenj@alum.mit.edu>"]
4+
version = "0.1.0"
5+
6+
[compat]
7+
julia = "1"
8+
9+
[extras]
10+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
11+
12+
[targets]
13+
test = ["Test"]

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# StringViews
2+
3+
This Julia package implements a new type of `AbstractString`, a `StringView`,
4+
that provides a string representation of any underlying array of bytes
5+
(any `AbstractVector{UInt8}`), interpreted as UTF-8 encoded Unicode data.
6+
7+
Unlike Julia's built-in `String` type (which also wraps UTF-8 data), the
8+
`StringView` type is a copy-free wrap of *any* `AbstractVector{UInt8}`
9+
instance, and does not take "ownership" or modify the array. Otherwise,
10+
a `StringView` is intended to be usable in any context where you might
11+
have otherwise used `String`.
12+
13+
(In particular, as much as possible we try to implement efficient copy-free
14+
`String`-like operations on `StringView`, such as iteration and regular-expression
15+
searching, as long as the underlying `UInt8` array is a contiguous dense array.)

src/StringViews.jl

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""
2+
This module implements a new type of `AbstractString`, a `StringView`,
3+
that provides a string representation of any underlying array of bytes
4+
(any `AbstractVector{UInt8}`), interpreted as UTF-8 encoded Unicode data.
5+
6+
Unlike Julia's built-in `String` type (which also wraps UTF-8 data), the
7+
`StringView` type is a copy-free wrap of *any* `AbstractVector{UInt8}`
8+
instance, and does not take "ownership" or modify the array. Otherwise,
9+
a `StringView` is intended to be usable in any context where you might
10+
have otherwise used `String`.
11+
"""
12+
module StringViews
13+
export StringView
14+
15+
"""
    StringView{T<:AbstractVector{UInt8}} <: AbstractString

`StringView(array)` creates an `AbstractString` representation of
any `array` of `UInt8` data, interpreted as UTF-8 encoded Unicode.
It does *not* make a copy of `array`.
"""
struct StringView{T<:AbstractVector{UInt8}} <: AbstractString
    # The bound must sit on the type parameter itself: a `where` clause written
    # after the supertype attaches to `AbstractString` and leaves `T` unconstrained.
    # The wrapped byte array, stored as given (not copied).
    data::T
end
25+
26+
# Alias for `StringView`s whose `data` is either a `DenseVector{UInt8}` or a
# `Base.FastContiguousSubArray` of one. The `pointer`-based methods in this
# module dispatch on this alias.
const DenseStringView = StringView{<:Union{DenseVector{UInt8},<:Base.FastContiguousSubArray{UInt8,1,<:DenseVector{UInt8}}}}
27+
28+
# Conversions between `StringView` and the standard byte-array / string types.

# A view that already wraps a `Vector{UInt8}` hands back that same vector (no copy).
function Base.Vector{UInt8}(sv::StringView{Vector{UInt8}})
    return sv.data
end

# Any other backing array goes through the `Vector{UInt8}` constructor.
function Base.Vector{UInt8}(sv::StringView)
    return Vector{UInt8}(sv.data)
end

function Base.Array{UInt8}(sv::StringView)
    return Vector{UInt8}(sv)
end

# Copy the bytes into a fresh `Base.StringVector` buffer and turn it into a `String`.
function Base.String(sv::StringView)
    bytes = sv.data
    buffer = Base.StringVector(length(bytes))
    copyto!(buffer, bytes)
    return String(buffer)
end

# Wrapping a `StringView` again is the identity.
function StringView(sv::StringView)
    return sv
end

# A `String` is wrapped through its `codeunits`.
function StringView(str::String)
    return StringView(codeunits(str))
end
34+
35+
# Build a `Symbol` by passing the view and its code-unit count to the runtime's
# `jl_symbol_n`.
function Base.Symbol(s::DenseStringView)
    nbytes = ncodeunits(s)
    return ccall(:jl_symbol_n, Ref{Symbol}, (Ptr{UInt8}, Int), s, nbytes)
end
37+
38+
# Raw-pointer access to the bytes of a view over dense, contiguous storage.
Base.pointer(s::DenseStringView) = pointer(s.data)
Base.pointer(s::DenseStringView, i::Integer) = pointer(s.data, i)
Base.unsafe_convert(::Type{Ptr{UInt8}}, s::DenseStringView) = pointer(s.data)
Base.unsafe_convert(::Type{Ptr{Int8}}, s::DenseStringView) = convert(Ptr{Int8}, pointer(s.data))

# `ccall` applies `cconvert` before `unsafe_convert`. Base's `cconvert` method for an
# `AbstractString` with these pointer types converts the argument to a `String`
# (a copy), so without the methods below the `unsafe_convert` methods above would
# never be reached. Returning the view itself keeps `ccall` copy-free.
Base.cconvert(::Type{Ptr{UInt8}}, s::DenseStringView) = s
Base.cconvert(::Type{Ptr{Int8}}, s::DenseStringView) = s
42+
43+
# Code-unit interface: a `StringView` exposes the bytes of its `data` array.

function Base.sizeof(s::StringView)
    return sizeof(s.data)
end

function Base.ncodeunits(s::StringView)
    return length(s.data)
end

# The code-unit type is always `UInt8`.
function Base.codeunit(::StringView)
    return UInt8
end

Base.@propagate_inbounds function Base.codeunit(s::StringView, i::Integer)
    return s.data[i]
end

# Returns the wrapped array itself, not a copy.
function Base.codeunits(s::StringView)
    return s.data
end
48+
49+
# Call C `memcmp` on the first `len` bytes behind `a` and `b`, returning the
# C result converted to `Int`.
function _memcmp(a, b, len)
    nbytes = len % Csize_t
    result = ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), a, b, nbytes)
    return result % Int
end

# Three-way comparison returning -1, 0 or +1: the common prefix is compared
# bytewise; if it is equal, the sizes decide.
function _cmp(a, b)
    na = sizeof(a)
    nb = sizeof(b)
    order = _memcmp(a, b, min(na, nb))
    if order < 0
        return -1
    elseif order > 0
        return +1
    else
        return cmp(na, nb)
    end
end
56+
# Ordering between dense views and `String`s is delegated to `_cmp`.
function Base.cmp(lhs::DenseStringView, rhs::DenseStringView)
    return _cmp(lhs, rhs)
end

function Base.cmp(lhs::DenseStringView, rhs::String)
    return _cmp(lhs, rhs)
end

function Base.cmp(lhs::String, rhs::DenseStringView)
    return _cmp(lhs, rhs)
end

# Two views are equal exactly when their underlying arrays compare equal.
function Base.:(==)(lhs::StringView, rhs::StringView)
    return lhs.data == rhs.data
end

# `String` vs. view: the sizes must agree before the bytes are compared.
function Base.:(==)(a::String, b::StringView)
    nbytes = sizeof(a)
    nbytes == sizeof(b) || return false
    return 0 == _memcmp(a, b, nbytes)
end

# The view-on-the-left case reuses the method above with the arguments swapped.
function Base.:(==)(s1::StringView, s2::String)
    return s2 == s1
end
65+
66+
# The minimal `StringView{Vector{UInt8}}` wraps a zero-length byte vector.
function Base.typemin(::Type{StringView{Vector{UInt8}}})
    return StringView(Vector{UInt8}(undef, 0))
end

# For an instance, defer to the method for its type.
function Base.typemin(s::StringView)
    return typemin(typeof(s))
end
68+
69+
# Whole-string validity check through the runtime's `u8_isvalid`; the view is
# reported valid when that call returns a nonzero value. (The comparison operator
# had been lost from this line, leaving `ccall(...) 0`, which does not parse.)
Base.isvalid(s::DenseStringView) =
    ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s)) != 0
Base.isvalid(::Type{String}, s::StringView) = isvalid(s)
71+
72+
# A view is ASCII when every code unit is below 0x80.
function Base.isascii(s::StringView)
    @inbounds for k in 1:ncodeunits(s)
        codeunit(s, k) < 0x80 || return false
    end
    return true
end
78+
79+
# Extend Base's `write` and `print`. Without the `Base.` qualification these
# definitions create new module-local functions instead of adding methods to the
# Base ones, so `write(io, s.data)` would then find no applicable method.
Base.write(io::IO, s::StringView) = write(io, s.data)
Base.print(io::IO, s::StringView) = (write(io, s); nothing)
81+
82+
# Character-index navigation, reusing Base's generic helpers for UTF-8 strings.
Base.@propagate_inbounds Base.thisind(s::StringView, i::Int) = Base._thisind_str(s, i)
# Defined for `StringView`: the previous `s::String` signature overwrote Base's own
# `String` method (type piracy) and left `StringView` without this method.
Base.@propagate_inbounds Base.nextind(s::StringView, i::Int) = Base._nextind_str(s, i)
# An index is valid when it is in bounds and is the start of a character.
Base.isvalid(s::StringView, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
85+
86+
# Hash the bytes of a dense view with `Base.memhash`, seeded with
# `Base.memhash_seed` added to the incoming hash `h`.
function Base.hash(s::DenseStringView, h::UInt)
    seeded = h + Base.memhash_seed
    bytes = s.data
    digest = ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32),
                   bytes, length(bytes), seeded % UInt32)
    return digest + seeded
end
90+
91+
include("decoding.jl")
92+
include("regex.jl")
93+
94+
end # module

src/decoding.jl

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Unicode iteration and decoding for StringView, copied from the corresponding String functions in Base
2+
3+
# True when `lo <= b <= hi`; the two tests are combined with non-short-circuiting `&`.
# (The comparison operators had been lost from this line, leaving `(lo b) & (b hi)`.)
@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo <= b) & (b <= hi)
4+
5+
# Decode the character starting at code unit `i`; returns `(char, next_index)`,
# or `nothing` when `i` is outside `1:ncodeunits(s)`.
@inline function Base.iterate(s::StringView, i::Int=firstindex(s))
    # unsigned wrap-around turns this into a single test for 1 <= i <= ncodeunits(s)
    (i % UInt) - 1 < ncodeunits(s) || return nothing
    lead = @inbounds codeunit(s, i)
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # lead byte in 0x80:0xf7 is handled by the slow path
        return iterate_continued(s, i, bits)
    end
    return reinterpret(Char, bits), i + 1
end
12+
13+
# Slow path of `iterate`: `u` holds the lead byte in its top 8 bits. Up to three
# following continuation bytes (of the form 0b10xxxxxx) are merged into `u`;
# decoding stops at the first byte that is not a continuation byte, at the end of
# the data, or once the lead byte's range says no more bytes are expected.
function iterate_continued(s::StringView, i::Int, u::UInt32)
    if u < 0xc0000000
        # lead byte below 0xc0: consume just this one byte
        i += 1
        @goto done
    end
    nunits = ncodeunits(s)
    # first continuation byte
    (i += 1) > nunits && @goto done
    @inbounds byte = codeunit(s, i)
    byte & 0xc0 == 0x80 || @goto done
    u |= UInt32(byte) << 16
    # second continuation byte
    ((i += 1) > nunits) | (u < 0xe0000000) && @goto done
    @inbounds byte = codeunit(s, i)
    byte & 0xc0 == 0x80 || @goto done
    u |= UInt32(byte) << 8
    # third continuation byte
    ((i += 1) > nunits) | (u < 0xf0000000) && @goto done
    @inbounds byte = codeunit(s, i)
    byte & 0xc0 == 0x80 || @goto done
    u |= UInt32(byte)
    i += 1
    @label done
    return reinterpret(Char, u), i
end
34+
35+
# Return the character that starts at code unit `i`.
Base.@propagate_inbounds function Base.getindex(s::StringView, i::Int)
    lead = codeunit(s, i)
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # lead byte in 0x80:0xf7 is handled by the slow path
        return getindex_continued(s, i, bits)
    end
    return reinterpret(Char, bits)
end
41+
42+
# Slow path of `getindex`: `u` holds the lead byte in its top 8 bits. Throws a
# string index error when the lead byte is below 0xc0 and `i` is not a valid
# character index; otherwise merges up to three continuation bytes into `u`.
function getindex_continued(s::StringView, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        @inbounds isvalid(s, i) && @goto done
        Base.string_index_err(s, i)
    end
    nunits = ncodeunits(s)

    # continuation byte 1
    (i += 1) > nunits && @goto done
    @inbounds byte = codeunit(s, i)
    byte & 0xc0 == 0x80 || @goto done
    u |= UInt32(byte) << 16

    # continuation byte 2
    ((i += 1) > nunits) | (u < 0xe0000000) && @goto done
    @inbounds byte = codeunit(s, i)
    byte & 0xc0 == 0x80 || @goto done
    u |= UInt32(byte) << 8

    # continuation byte 3
    ((i += 1) > nunits) | (u < 0xf0000000) && @goto done
    @inbounds byte = codeunit(s, i)
    byte & 0xc0 == 0x80 || @goto done
    u |= UInt32(byte)
    @label done
    return reinterpret(Char, u)
end
67+
68+
# Ranges with a non-`Int` integer type are converted and forwarded to the
# `UnitRange{Int}` method below.
Base.getindex(s::StringView, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]

# Return the substring covering the characters at indices `r` as a new `String`.
# Both endpoints must be valid character indices, otherwise a string index error
# is thrown. An empty range yields `""`.
@inline function Base.getindex(s::StringView, r::UnitRange{Int})
    isempty(r) && return ""
    i, j = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        # `string_index_err` is not exported from Base, so it must be qualified
        @inbounds isvalid(s, i) || Base.string_index_err(s, i)
        @inbounds isvalid(s, j) || Base.string_index_err(s, j)
    end
    # extend `j` to the last code unit of the character that starts at `j`
    j = nextind(s, j) - 1
    n = j - i + 1
    # Copy through the array interface rather than `pointer(s, i)`, which is only
    # defined for dense views; this mirrors the `String(::StringView)` conversion.
    return String(copyto!(Base.StringVector(n), 1, s.data, i, n))
end
84+
85+
# Number of characters in the whole view.
Base.length(s::StringView) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))

# Number of characters between code-unit indices `i` and `j`.
# Throws `BoundsError` unless `0 < i <= ncodeunits(s)+1` and `0 <= j < ncodeunits(s)+1`.
# (The `<=` operators had been lost from the two bounds checks.)
@inline function Base.length(s::StringView, i::Int, j::Int)
    @boundscheck begin
        0 < i <= ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 <= j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    # move `i` back to the start of its character; `k` remembers the original index
    @inbounds i, k = thisind(s, i), i
    c = j - i + (i == k)
    return length_continued(s, i, j, c)
end
97+
98+
# Character-count kernel. `c` starts as a count of code units and is decremented
# once for every continuation byte that follows a multi-byte lead byte within
# code units `i:n`. (The `<=` / `>=` operators had been lost from this function.)
@inline function length_continued(s::StringView, i::Int, n::Int, c::Int)
    i < n || return c
    @inbounds b = codeunit(s, i)
    @inbounds while true
        # scan forward until `b` is a multi-byte lead byte (0xc0:0xf7)
        while true
            (i += 1) <= n || return c
            0xc0 <= b <= 0xf7 && break
            b = codeunit(s, i)
        end
        l = b
        b = codeunit(s, i) # cont byte 1
        c -= (x = b & 0xc0 == 0x80)
        # keep going only if that was a continuation byte and the lead byte
        # announces at least a 3-byte sequence
        x & (l >= 0xe0) || continue

        (i += 1) <= n || return c
        b = codeunit(s, i) # cont byte 2
        c -= (x = b & 0xc0 == 0x80)
        # likewise, a lead byte >= 0xf0 announces a 4-byte sequence
        x & (l >= 0xf0) || continue

        (i += 1) <= n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end

src/regex.jl

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# copy-free regular-expression searches on DenseStringViews, implemented
2+
# by copying the low-level PCRE calls from julia/base/regex.jl
3+
4+
import Base.PCRE
5+
6+
# Regex predicates on dense views. Each compiles the regex and then runs
# `PCRE.exec_r` directly on the view.

# True when `r` matches somewhere in `s`, searching from byte offset `offset`.
function Base.occursin(r::Regex, s::DenseStringView; offset::Integer=0)
    Base.compile(r)
    opts = r.match_options
    return PCRE.exec_r(r.regex, s, offset, opts)
end

# Same call with `PCRE.ANCHORED` added to the match options.
function Base.startswith(s::DenseStringView, r::Regex)
    Base.compile(r)
    opts = r.match_options | PCRE.ANCHORED
    return PCRE.exec_r(r.regex, s, 0, opts)
end

# Same call with `PCRE.ENDANCHORED` added to the match options.
function Base.endswith(s::DenseStringView, r::Regex)
    Base.compile(r)
    opts = r.match_options | PCRE.ENDANCHORED
    return PCRE.exec_r(r.regex, s, 0, opts)
end
20+
21+
# Search `str` for `re` starting at index `idx`, with `add_opts` OR-ed into the
# regex's match options. Returns a `RegexMatch`, or `nothing` if there is no match.
function Base.match(re::Regex, str::DenseStringView, idx::Integer, add_opts::UInt32=UInt32(0))
    Base.compile(re)
    options = re.match_options | add_opts
    matched, data = PCRE.exec_r_data(re.regex, str, idx-1, options)
    if !matched
        PCRE.free_match_data(data)
        return nothing
    end
    # the output vector holds (start, end) offset pairs: whole match first, then groups
    ngroups = div(PCRE.ovec_length(data), 2) - 1
    ovec = PCRE.ovec_ptr(data)
    start = unsafe_load(ovec, 1) + 1
    whole = SubString(str, start, prevind(str, unsafe_load(ovec, 2)+1))
    captures = Union{Nothing,SubString{String}}[
        unsafe_load(ovec, 2k+1) == PCRE.UNSET ? nothing :
            SubString(str, unsafe_load(ovec, 2k+1)+1,
                      prevind(str, unsafe_load(ovec, 2k+2)+1))
        for k = 1:ngroups]
    offsets = Int[ unsafe_load(ovec, 2k+1)+1 for k = 1:ngroups ]
    result = RegexMatch(whole, captures, start, offsets, re)
    PCRE.free_match_data(data)
    return result
end
40+
41+
# `findnext` lets `_findnext_re` allocate its own match data by passing `C_NULL`.
Base.findnext(re::Regex, str::DenseStringView, idx::Integer) = _findnext_re(re, str, idx, C_NULL)

# Find the next match of `re` in `str` at or after `idx`. Returns the index range
# of the match, or `nothing`. When `match_data` is `C_NULL`, match data is
# allocated for this call and freed before returning; otherwise the supplied
# match data is used and left alone.
function _findnext_re(re::Regex, str::DenseStringView, idx::Integer, match_data::Ptr{Cvoid})
    idx > nextind(str, lastindex(str)) && throw(BoundsError())
    opts = re.match_options
    Base.compile(re)
    owns_data = match_data == C_NULL
    if owns_data
        matched, data = PCRE.exec_r_data(re.regex, str, idx-1, opts)
    else
        matched = PCRE.exec(re.regex, str, idx-1, opts, match_data)
        data = match_data
    end
    found = nothing
    if matched
        ovec = PCRE.ovec_ptr(data)
        found = (Int(unsafe_load(ovec,1))+1):prevind(str,Int(unsafe_load(ovec,2))+1)
    end
    owns_data && PCRE.free_match_data(data)
    return found
end
65+
66+
# copied from Base.RegexMatchIterator
67+
# Iterator over the matches of `regex` in `string`; `overlap` selects whether
# successive matches may overlap.
struct RegexMatchIterator{T<:DenseStringView}
    regex::Regex
    string::T
    overlap::Bool
end

# Two-argument convenience constructor (non-overlapping matches). The three-argument
# form is the struct's default constructor; redefining it with a default argument
# would overwrite that method with an identical signature.
RegexMatchIterator(regex::Regex, string::DenseStringView) =
    RegexMatchIterator(regex, string, false)

# `compile` is not exported from Base, so the inner call must be qualified.
Base.compile(itr::RegexMatchIterator) = (Base.compile(itr.regex); itr)

# `Type{<:RegexMatchIterator}` is needed for these traits to apply to the concrete
# `RegexMatchIterator{T}` types; `Type{RegexMatchIterator}` matches only the
# unparameterized type.
Base.eltype(::Type{<:RegexMatchIterator}) = RegexMatch
Base.IteratorSize(::Type{<:RegexMatchIterator}) = Base.SizeUnknown()
76+
77+
# Produce the next match. The state is `(offset, prevempty)`: where to resume the
# search, and whether the previous match was empty.
function Base.iterate(itr::RegexMatchIterator, (offset,prevempty)=(1,false))
    opts_nonempty = UInt32(PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART)
    while true
        extra = prevempty ? opts_nonempty : UInt32(0)
        mat = match(itr.regex, itr.string, offset, extra)

        if mat !== nothing
            if !itr.overlap
                # resume right after the match
                offset = mat.offset + ncodeunits(mat.match)
            elseif isempty(mat.match)
                offset = mat.offset
            else
                offset = nextind(itr.string, mat.offset)
            end
            return (mat, (offset, isempty(mat.match)))
        end

        # No match here. After an empty match, step one character forward and
        # retry without the extra options; otherwise the iteration is finished.
        (prevempty && offset <= sizeof(itr.string)) || break
        offset = nextind(itr.string, offset)
        prevempty = false
    end
    return nothing
end
106+
107+
# Return an iterator over the matches of `re` in `str`.
function Base.eachmatch(re::Regex, str::DenseStringView; overlap = false)
    return RegexMatchIterator(re, str, overlap)
end
109+
110+
# copied from julia/base/pcre.jl:
111+
# Run `pcre2_match_8` directly on the bytes of a dense view. Returns `true` when
# the call reports a match (`rc >= 0`) and raises an error for return codes
# below -2.
# `PCRE_LIB`, `get_local_match_context` and `err_message` live in `Base.PCRE` and
# are not in scope in this module, so they are referenced through `PCRE.`.
function PCRE.exec(re, subject::DenseStringView, offset, options, match_data)
    rc = ccall((:pcre2_match_8, PCRE.PCRE_LIB), Cint,
               (Ptr{Cvoid}, Ptr{UInt8}, Csize_t, Csize_t, UInt32, Ptr{Cvoid}, Ptr{Cvoid}),
               re, subject, ncodeunits(subject), offset, options, match_data,
               PCRE.get_local_match_context())
    # rc == -1 means no match, -2 means partial match.
    rc < -2 && error("PCRE.exec error: $(PCRE.err_message(rc))")
    return rc >= 0
end

0 commit comments

Comments
 (0)