Skip to content

Commit 7a44c04

Browse files
committed
deprecate Vector<->String conversion in favor of something safer
add `CodeUnits` and `codeunits`; fixes #24388
1 parent 41697f9 commit 7a44c04

File tree

22 files changed

+116
-41
lines changed

22 files changed

+116
-41
lines changed

base/c.jl

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ cconvert(::Type{Cstring}, s::AbstractString) =
127127
cconvert(Cstring, String(s)::String)
128128

129129
function cconvert(::Type{Cwstring}, s::AbstractString)
130-
v = transcode(Cwchar_t, Vector{UInt8}(String(s)))
130+
v = transcode(Cwchar_t, String(s))
131131
!isempty(v) && v[end] == 0 || push!(v, 0)
132132
return v
133133
end
@@ -140,7 +140,7 @@ containsnul(p::Ptr, len) =
140140
containsnul(s::String) = containsnul(unsafe_convert(Ptr{Cchar}, s), sizeof(s))
141141
containsnul(s::AbstractString) = '\0' in s
142142

143-
function unsafe_convert(::Type{Cstring}, s::Union{String,Vector{UInt8}})
143+
function unsafe_convert(::Type{Cstring}, s::Union{String,AbstractVector{UInt8}})
144144
p = unsafe_convert(Ptr{Cchar}, s)
145145
containsnul(p, sizeof(s)) &&
146146
throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))"))
@@ -174,7 +174,7 @@ same argument.
174174
This is only available on Windows.
175175
"""
176176
function cwstring(s::AbstractString)
177-
bytes = Vector{UInt8}(String(s))
177+
bytes = codeunits(String(s))
178178
0 in bytes && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))"))
179179
return push!(transcode(UInt16, bytes), 0)
180180
end
@@ -202,19 +202,20 @@ Only conversion to/from UTF-8 is currently supported.
202202
"""
203203
function transcode end
204204

205-
transcode(::Type{T}, src::Vector{T}) where {T<:Union{UInt8,UInt16,UInt32,Int32}} = src
205+
transcode(::Type{T}, src::AbstractVector{T}) where {T<:Union{UInt8,UInt16,UInt32,Int32}} = src
206206
transcode(::Type{T}, src::String) where {T<:Union{Int32,UInt32}} = T[T(c) for c in src]
207-
transcode(::Type{T}, src::Vector{UInt8}) where {T<:Union{Int32,UInt32}} = transcode(T, String(src))
207+
transcode(::Type{T}, src::Union{Vector{UInt8},CodeUnits{UInt8,String}}) where {T<:Union{Int32,UInt32}} =
208+
transcode(T, String(src))
208209
function transcode(::Type{UInt8}, src::Vector{<:Union{Int32,UInt32}})
209210
buf = IOBuffer()
210211
for c in src; print(buf, Char(c)); end
211212
take!(buf)
212213
end
213214
transcode(::Type{String}, src::String) = src
214-
transcode(T, src::String) = transcode(T, Vector{UInt8}(src))
215+
transcode(T, src::String) = transcode(T, codeunits(src))
215216
transcode(::Type{String}, src) = String(transcode(UInt8, src))
216217

217-
function transcode(::Type{UInt16}, src::Vector{UInt8})
218+
function transcode(::Type{UInt16}, src::Union{Vector{UInt8},CodeUnits{UInt8,String}})
218219
dst = UInt16[]
219220
i, n = 1, length(src)
220221
n > 0 || return dst

base/deprecated.jl

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1555,8 +1555,19 @@ export hex2num
15551555
@deprecate ctranspose adjoint
15561556
@deprecate ctranspose! adjoint!
15571557

1558-
@deprecate convert(::Type{Vector{UInt8}}, s::AbstractString) Vector{UInt8}(s)
1559-
@deprecate convert(::Type{Array{UInt8}}, s::AbstractString) Vector{UInt8}(s)
1558+
function convert(::Union{Type{Vector{UInt8}}, Type{Array{UInt8}}}, s::AbstractString)
1559+
depwarn("Strings can no longer be `convert`ed to byte arrays. Use `unsafe_wrap` or `codeunits` instead.", :Type)
1560+
unsafe_wrap(Vector{UInt8}, String(s))
1561+
end
1562+
function (::Type{Vector{UInt8}})(s::String)
1563+
depwarn("Vector{UInt8}(s::String) will copy data in the future. To avoid copying, use `unsafe_wrap` or `codeunits` instead.", :Type)
1564+
unsafe_wrap(Vector{UInt8}, s)
1565+
end
1566+
function (::Type{Array{UInt8}})(s::String)
1567+
depwarn("Array{UInt8}(s::String) will copy data in the future. To avoid copying, use `unsafe_wrap` or `codeunits` instead.", :Type)
1568+
unsafe_wrap(Vector{UInt8}, s)
1569+
end
1570+
15601571
@deprecate convert(::Type{Vector{Char}}, s::AbstractString) Vector{Char}(s)
15611572
@deprecate convert(::Type{Symbol}, s::AbstractString) Symbol(s)
15621573
@deprecate convert(::Type{String}, s::Symbol) String(s)

base/exports.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,7 @@ export
686686
chomp,
687687
chop,
688688
codeunit,
689+
codeunits,
689690
dec,
690691
digits,
691692
digits!,

base/io.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,7 @@ function readuntil(io::IO, target::AbstractString)
706706
# decide how we can index target
707707
if target isa String
708708
# convert String to a utf8-byte-iterator
709-
target = Vector{UInt8}(target)
709+
target = codeunits(target)
710710
#elseif applicable(codeunit, target)
711711
# TODO: a more general version of above optimization
712712
# would be to permit accessing any string via codeunit

base/iobuffer.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ function GenericIOBuffer(data::T, readable::Bool, writable::Bool, seekable::Bool
2727
end
2828

2929
# allocate Vector{UInt8}s for IOBuffer storage that can efficiently become Strings
30-
StringVector(n::Integer) = Vector{UInt8}(_string_n(n))
30+
StringVector(n::Integer) = unsafe_wrap(Vector{UInt8}, _string_n(n))
3131

3232
# IOBuffers behave like Files. They are typically readable and writable. They are seekable. (They can be appendable).
3333

base/loading.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,12 @@ elseif Sys.isapple()
6464
break
6565
end
6666
# Hack to compensate for inability to create a string from a subarray with no allocations.
67-
Vector{UInt8}(path_basename) == casepreserved_basename && return true
67+
codeunits(path_basename) == casepreserved_basename && return true
6868

6969
# If there is no match, it's possible that the file does exist but HFS+
7070
# performed unicode normalization. See https://developer.apple.com/library/mac/qa/qa1235/_index.html.
7171
isascii(path_basename) && return false
72-
Vector{UInt8}(Unicode.normalize(path_basename, :NFD)) == casepreserved_basename
72+
codeunits(Unicode.normalize(path_basename, :NFD)) == casepreserved_basename
7373
end
7474
else
7575
# Generic fallback that performs a slow directory listing.

base/repl/LineEdit.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -657,7 +657,7 @@ function edit_splice!(s, r::Region=region(s), ins::AbstractString = ""; rigid_ma
657657
elseif buf.mark >= B
658658
buf.mark += sizeof(ins) - B + A
659659
end
660-
ret = splice!(buf.data, A+1:B, Vector{UInt8}(ins)) # position(), etc, are 0-indexed
660+
ret = splice!(buf.data, A+1:B, codeunits(String(ins))) # position(), etc, are 0-indexed
661661
buf.size = buf.size + sizeof(ins) - B + A
662662
adjust_pos && seek(buf, position(buf) + sizeof(ins))
663663
String(ret)

base/replutil.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ function showerror(io::IO, ex::ErrorException)
283283
print(io, ex.msg)
284284
if ex.msg == "type String has no field data"
285285
println(io)
286-
print(io, "Use `Vector{UInt8}(str)` instead.")
286+
print(io, "Use `codeunits(str)` instead.")
287287
end
288288
end
289289
showerror(io::IO, ex::KeyError) = print(io, "KeyError: key $(repr(ex.key)) not found")

base/strings/basic.jl

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,6 @@ end
152152
getindex(s::AbstractString, i::Colon) = s
153153
# TODO: handle other ranges with stride ±1 specially?
154154
# TODO: add more @propagate_inbounds annotations?
155-
getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r)
156155
getindex(s::AbstractString, v::AbstractVector{<:Integer}) =
157156
sprint(io->(for i in v; write(io, s[i]) end), sizehint=length(v))
158157
getindex(s::AbstractString, v::AbstractVector{Bool}) =
@@ -185,8 +184,8 @@ checkbounds(s::AbstractString, I::Union{Integer,AbstractArray}) =
185184
string() = ""
186185
string(s::AbstractString) = s
187186

188-
(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s))
189-
(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s)
187+
(::Type{Vector{UInt8}})(s::AbstractString) = unsafe_wrap(Vector{UInt8}, String(s))
188+
(::Type{Array{UInt8}})(s::AbstractString) = unsafe_wrap(Vector{UInt8}, String(s))
190189
(::Type{Vector{Char}})(s::AbstractString) = collect(s)
191190

192191
Symbol(s::AbstractString) = Symbol(String(s))
@@ -629,3 +628,40 @@ next(r::Iterators.Reverse{<:AbstractString}, i) = (r.itr[i], prevind(r.itr, i))
629628
start(r::Iterators.Reverse{<:EachStringIndex}) = endof(r.itr.s)
630629
done(r::Iterators.Reverse{<:EachStringIndex}, i) = i < start(r.itr.s)
631630
next(r::Iterators.Reverse{<:EachStringIndex}, i) = (i, prevind(r.itr.s, i))
631+
632+
## code unit access ##
633+
634+
"""
635+
CodeUnits(s::AbstractString)
636+
637+
Wrap a string (without copying) in an immutable vector-like object that accesses the code units
638+
of the string's representation.
639+
"""
640+
struct CodeUnits{T,S<:AbstractString} <: DenseVector{T}
641+
s::S
642+
CodeUnits(s::S) where {S<:AbstractString} = new{codeunit(s),S}(s)
643+
end
644+
645+
length(s::CodeUnits) = ncodeunits(s.s)
646+
sizeof(s::CodeUnits{T}) where {T} = ncodeunits(s.s) * sizeof(T)
647+
size(s::CodeUnits) = (length(s),)
648+
strides(s::CodeUnits) = (1,)
649+
@propagate_inbounds getindex(s::CodeUnits, i::Int) = codeunit(s.s, i)
650+
IndexStyle(::Type{<:CodeUnits}) = IndexLinear()
651+
start(s::CodeUnits) = 1
652+
next(s::CodeUnits, i) = (@_propagate_inbounds_meta; (s[i], i+1))
653+
done(s::CodeUnits, i) = (@_inline_meta; i == length(s)+1)
654+
655+
write(io::IO, s::CodeUnits) = write(io, s.s)
656+
657+
unsafe_convert(::Type{Ptr{T}}, s::CodeUnits{T}) where {T} = unsafe_convert(Ptr{T}, s.s)
658+
unsafe_convert(::Type{Ptr{Int8}}, s::CodeUnits{UInt8}) = unsafe_convert(Ptr{Int8}, s.s)
659+
660+
"""
661+
codeunits(s::AbstractString)
662+
663+
Obtain a vector-like object containing the code units of a string.
664+
Returns a `CodeUnits` wrapper by default, but `codeunits` may optionally be defined
665+
for new string types if necessary.
666+
"""
667+
codeunits(s::AbstractString) = CodeUnits(s)

base/strings/io.jl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ julia> String(take!(io))
188188
"Haho"
189189
```
190190
"""
191-
IOBuffer(str::String) = IOBuffer(Vector{UInt8}(str))
192-
IOBuffer(s::SubString{String}) = IOBuffer(view(Vector{UInt8}(s.string), s.offset + 1 : s.offset + sizeof(s)))
191+
IOBuffer(str::String) = IOBuffer(unsafe_wrap(Vector{UInt8}, str))
192+
IOBuffer(s::SubString{String}) = IOBuffer(view(unsafe_wrap(Vector{UInt8}, s.string), s.offset + 1 : s.offset + sizeof(s)))
193193

194194
# join is implemented using IO
195195

@@ -373,7 +373,10 @@ function unescape_string(io, s::AbstractString)
373373
end
374374
end
375375

376-
macro b_str(s); :(Vector{UInt8}($(unescape_string(s)))); end
376+
macro b_str(s)
377+
v = Vector{UInt8}(codeunits(unescape_string(s)))
378+
QuoteNode(v)
379+
end
377380

378381
"""
379382
@raw_str -> String

0 commit comments

Comments
 (0)