Skip to content

Commit d52a878

Browse files
authored
feat: allows ZendStr to contain null bytes (#202)
Closes #200 ## Rationale In PHP zend_strings are binary strings with no encoding information. They can contain any byte at any position. The current implementation use `CString` to transfer zend_strings between Rust and PHP, which prevents zend_strings containing null-bytes to roundtrip through the ffi layer. Moreover, `ZendStr::new()` accepts only a `&str`, which is incorrect since a zend_string is not required to be valid UTF8. When reading the PHP source code, it is apparent that most functions marked with `ZEND_API` that accept a `const *char` are convenience wrappers that convert the `const *char` to a zend_string and delegate to another function. For example [zend_throw_exception()](https://github.com/php/php-src/blob/eb83e0206c9b9261d786943bf2c5ad61dca287e2/Zend/zend_exceptions.c#L823) takes a `const *char message`, and just converts it to a zend_string before delegating to [zend_throw_exception_zstr()](https://github.com/php/php-src/blob/eb83e0206c9b9261d786943bf2c5ad61dca287e2/Zend/zend_exceptions.c#L795). I kept this PR focused around `ZendStr` and it's usages in the library, but it should be seen as the first step of a more global effort to remove usages of `CString` everywhere possible. Also, I didn't change the return type of the string related methods of `Zval` (e.g. I could have made `Zval::set_string()` accept an `impl AsRef<[u8]>` instead of `&str` and return `()` instead of `Result<()>`). If I get feedback that it should be done in this PR, I'll do it. ## Summary of the changes: ### ZendStr * [BC break]: `ZendStr::new()` and `ZendStr::new_interned()` now accept an `impl AsRef<[u8]>` instead of just `&str`, and are therefore infaillible (outside of the cases where we panic, e.g. when allocation fails). This is a BC break, but it's impact shouldn't be huge (users will most likely just have to remove a bunch of `?` or add a few `Ok()`). * [BC break]: Conversely, `ZendStr::as_c_str()` now returns a `Result<&CStr>` since it can fail on strings containing null bytes. * [BC break]: `ZensStr::as_str()` now returns a `Result<&str>` instead of an `Option<&str>` since we have to return an error in case of invalid UTF8. * adds method `ZendStr::as_bytes()` to return the underlying byte slice. * adds convenience methods `ZendStr::as_ptr()` and `ZendStr::as_mut_ptr()` to return raw pointers to the zend_string. ### ZendStr conversion traits * adds `impl AsRef<[u8]> for ZendStr` * [BC break]: replaces `impl TryFrom<String> for ZBox<ZendStr>` by `impl From<String> for ZBox<ZendStr>`. * [BC break]: replaces `impl TryFrom<&str> for ZBox<ZendStr>` by `impl From<&str> for ZBox<ZendStr>`. * [BC break]: replaces `impl From<&ZendStr> for &CStr` by `impl TryFrom<&ZendStr> for &CStr`. ### Error * adds new enum member `Error::InvalidUtf8` used when converting a `ZendStr` to `String` or `&str`
1 parent 9e08e25 commit d52a878

File tree

12 files changed

+135
-89
lines changed

12 files changed

+135
-89
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
Cargo.lock
33
/.vscode
44
/.idea
5-
expand.rs
5+
/tmp
6+
expand.rs

allowed_bindings.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ bind! {
4343
// ext_php_rs_zend_object_release,
4444
// ext_php_rs_zend_string_init,
4545
// ext_php_rs_zend_string_release,
46+
// ext_php_rs_is_kown_valid_utf8,
47+
// ext_php_rs_set_kown_valid_utf8,
4648
object_properties_init,
4749
php_info_print_table_end,
4850
php_info_print_table_header,

src/builders/class.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ impl ClassBuilder {
226226
///
227227
/// Returns an [`Error`] variant if the class could not be registered.
228228
pub fn build(mut self) -> Result<&'static mut ClassEntry> {
229-
self.ce.name = ZendStr::new_interned(&self.name, true)?.into_raw();
229+
self.ce.name = ZendStr::new_interned(&self.name, true).into_raw();
230230

231231
self.methods.push(FunctionEntry::end());
232232
let func = Box::into_raw(self.methods.into_boxed_slice()) as *const FunctionEntry;

src/error.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ pub enum Error {
4848
/// The string could not be converted into a C-string due to the presence of
4949
/// a NUL character.
5050
InvalidCString,
51+
/// The string could not be converted into a valid Utf8 string
52+
InvalidUtf8,
5153
/// Could not call the given function.
5254
Callable,
5355
/// An invalid exception type was thrown.
@@ -82,6 +84,7 @@ impl Display for Error {
8284
f,
8385
"String given contains NUL-bytes which cannot be present in a C string."
8486
),
87+
Error::InvalidUtf8 => write!(f, "Invalid Utf8 byte sequence."),
8588
Error::Callable => write!(f, "Could not call given function."),
8689
Error::InvalidException(flags) => {
8790
write!(f, "Invalid exception type was thrown: {:?}", flags)

src/ffi.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ extern "C" {
1919
persistent: bool,
2020
) -> *mut zend_string;
2121
pub fn ext_php_rs_zend_string_release(zs: *mut zend_string);
22+
pub fn ext_php_rs_is_known_valid_utf8(zs: *const zend_string) -> bool;
23+
pub fn ext_php_rs_set_known_valid_utf8(zs: *mut zend_string);
24+
2225
pub fn ext_php_rs_php_build_id() -> *const c_char;
2326
pub fn ext_php_rs_zend_object_alloc(obj_size: usize, ce: *mut zend_class_entry) -> *mut c_void;
2427
pub fn ext_php_rs_zend_object_release(obj: *mut zend_object);

src/types/object.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ impl ZendObject {
137137
return Err(Error::InvalidProperty);
138138
}
139139

140-
let mut name = ZendStr::new(name, false)?;
140+
let mut name = ZendStr::new(name, false);
141141
let mut rv = Zval::new();
142142

143143
let zv = unsafe {
@@ -162,7 +162,7 @@ impl ZendObject {
162162
/// * `name` - The name of the property.
163163
/// * `value` - The value to set the property to.
164164
pub fn set_property(&mut self, name: &str, value: impl IntoZval) -> Result<()> {
165-
let mut name = ZendStr::new(name, false)?;
165+
let mut name = ZendStr::new(name, false);
166166
let mut value = value.into_zval(false)?;
167167

168168
unsafe {
@@ -187,7 +187,7 @@ impl ZendObject {
187187
/// * `name` - The name of the property.
188188
/// * `query` - The 'query' to classify if a property exists.
189189
pub fn has_property(&self, name: &str, query: PropertyQuery) -> Result<bool> {
190-
let mut name = ZendStr::new(name, false)?;
190+
let mut name = ZendStr::new(name, false);
191191

192192
Ok(unsafe {
193193
self.handlers()?.has_property.ok_or(Error::InvalidScope)?(

src/types/string.rs

Lines changed: 98 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use crate::{
1616
convert::{FromZval, IntoZval},
1717
error::{Error, Result},
1818
ffi::{
19+
ext_php_rs_is_known_valid_utf8, ext_php_rs_set_known_valid_utf8,
1920
ext_php_rs_zend_string_init, ext_php_rs_zend_string_release, zend_string,
2021
zend_string_init_interned,
2122
},
@@ -30,7 +31,7 @@ use crate::{
3031
/// cannot represent unsized types, an array of size 1 is used at the end of the
3132
/// type to represent the contents of the string, therefore this type is
3233
/// actually unsized. All constructors return [`ZBox<ZendStr>`], the owned
33-
/// varaint.
34+
/// variant.
3435
///
3536
/// Once the `ptr_metadata` feature lands in stable rust, this type can
3637
/// potentially be changed to a DST using slices and metadata. See the tracking issue here: <https://github.com/rust-lang/rust/issues/81513>
@@ -46,20 +47,14 @@ static INTERNED_LOCK: Mutex<()> = const_mutex(());
4647
// on the alias `ZendStr` :( <https://github.com/rust-lang/rust-clippy/issues/7702>
4748
#[allow(clippy::len_without_is_empty)]
4849
impl ZendStr {
49-
/// Creates a new Zend string from a [`str`].
50+
/// Creates a new Zend string from a slice of bytes.
5051
///
5152
/// # Parameters
5253
///
5354
/// * `str` - String content.
5455
/// * `persistent` - Whether the string should persist through the request
5556
/// boundary.
5657
///
57-
/// # Returns
58-
///
59-
/// Returns a result containing the Zend string if successful. Returns an
60-
/// error if the given string contains NUL bytes, which cannot be
61-
/// contained inside a C string.
62-
///
6358
/// # Panics
6459
///
6560
/// Panics if the function was unable to allocate memory for the Zend
@@ -78,10 +73,19 @@ impl ZendStr {
7873
/// ```no_run
7974
/// use ext_php_rs::types::ZendStr;
8075
///
81-
/// let s = ZendStr::new("Hello, world!", false).unwrap();
76+
/// let s = ZendStr::new("Hello, world!", false);
77+
/// let php = ZendStr::new([80, 72, 80], false);
8278
/// ```
83-
pub fn new(str: &str, persistent: bool) -> Result<ZBox<Self>> {
84-
Ok(Self::from_c_str(&CString::new(str)?, persistent))
79+
pub fn new(str: impl AsRef<[u8]>, persistent: bool) -> ZBox<Self> {
80+
let s = str.as_ref();
81+
// TODO: we should handle the special cases when length is either 0 or 1
82+
// see `zend_string_init_fast()` in `zend_string.h`
83+
unsafe {
84+
let ptr = ext_php_rs_zend_string_init(s.as_ptr().cast(), s.len(), persistent)
85+
.as_mut()
86+
.expect("Failed to allocate memory for new Zend string");
87+
ZBox::from_raw(ptr)
88+
}
8589
}
8690

8791
/// Creates a new Zend string from a [`CStr`].
@@ -126,7 +130,7 @@ impl ZendStr {
126130
}
127131
}
128132

129-
/// Creates a new interned Zend string from a [`str`].
133+
/// Creates a new interned Zend string from a slice of bytes.
130134
///
131135
/// An interned string is only ever stored once and is immutable. PHP stores
132136
/// the string in an internal hashtable which stores the interned
@@ -145,16 +149,12 @@ impl ZendStr {
145149
/// * `persistent` - Whether the string should persist through the request
146150
/// boundary.
147151
///
148-
/// # Returns
149-
///
150-
/// Returns a result containing the Zend string if successful. Returns an
151-
/// error if the given string contains NUL bytes, which cannot be
152-
/// contained inside a C string.
153-
///
154152
/// # Panics
155153
///
156-
/// Panics if the function was unable to allocate memory for the Zend
157-
/// string.
154+
/// Panics under the following circumstances:
155+
///
156+
/// * The function used to create interned strings has not been set.
157+
/// * The function could not allocate enough memory for the Zend string.
158158
///
159159
/// # Safety
160160
///
@@ -171,8 +171,16 @@ impl ZendStr {
171171
///
172172
/// let s = ZendStr::new_interned("PHP", true);
173173
/// ```
174-
pub fn new_interned(str: &str, persistent: bool) -> Result<ZBox<Self>> {
175-
Ok(Self::interned_from_c_str(&CString::new(str)?, persistent))
174+
pub fn new_interned(str: impl AsRef<[u8]>, persistent: bool) -> ZBox<Self> {
175+
let _lock = INTERNED_LOCK.lock();
176+
let s = str.as_ref();
177+
unsafe {
178+
let init = zend_string_init_interned.expect("`zend_string_init_interned` not ready");
179+
let ptr = init(s.as_ptr().cast(), s.len() as _, persistent)
180+
.as_mut()
181+
.expect("Failed to allocate memory for new Zend string");
182+
ZBox::from_raw(ptr)
183+
}
176184
}
177185

178186
/// Creates a new interned Zend string from a [`CStr`].
@@ -222,11 +230,8 @@ impl ZendStr {
222230
let _lock = INTERNED_LOCK.lock();
223231

224232
unsafe {
225-
let ptr = zend_string_init_interned.expect("`zend_string_init_interned` not ready")(
226-
str.as_ptr(),
227-
str.to_bytes().len() as _,
228-
persistent,
229-
);
233+
let init = zend_string_init_interned.expect("`zend_string_init_interned` not ready");
234+
let ptr = init(str.as_ptr(), str.to_bytes().len() as _, persistent);
230235

231236
ZBox::from_raw(
232237
ptr.as_mut()
@@ -242,7 +247,7 @@ impl ZendStr {
242247
/// ```no_run
243248
/// use ext_php_rs::types::ZendStr;
244249
///
245-
/// let s = ZendStr::new("hello, world!", false).unwrap();
250+
/// let s = ZendStr::new("hello, world!", false);
246251
/// assert_eq!(s.len(), 13);
247252
/// ```
248253
pub fn len(&self) -> usize {
@@ -256,39 +261,61 @@ impl ZendStr {
256261
/// ```no_run
257262
/// use ext_php_rs::types::ZendStr;
258263
///
259-
/// let s = ZendStr::new("hello, world!", false).unwrap();
264+
/// let s = ZendStr::new("hello, world!", false);
260265
/// assert_eq!(s.is_empty(), false);
261266
/// ```
262267
pub fn is_empty(&self) -> bool {
263268
self.len() == 0
264269
}
265270

266-
/// Returns a reference to the underlying [`CStr`] inside the Zend string.
267-
pub fn as_c_str(&self) -> &CStr {
268-
// SAFETY: Zend strings store their readable length in a fat pointer.
269-
unsafe {
270-
let slice = slice::from_raw_parts(self.val.as_ptr() as *const u8, self.len() + 1);
271-
CStr::from_bytes_with_nul_unchecked(slice)
272-
}
271+
/// Attempts to return a reference to the underlying bytes inside the Zend
272+
/// string as a [`CStr`].
273+
///
274+
/// Returns an [Error::InvalidCString] variant if the string contains null
275+
/// bytes.
276+
pub fn as_c_str(&self) -> Result<&CStr> {
277+
let bytes_with_null =
278+
unsafe { slice::from_raw_parts(self.val.as_ptr().cast(), self.len() + 1) };
279+
CStr::from_bytes_with_nul(bytes_with_null).map_err(|_| Error::InvalidCString)
273280
}
274281

275-
/// Attempts to return a reference to the underlying [`str`] inside the Zend
282+
/// Attempts to return a reference to the underlying bytes inside the Zend
276283
/// string.
277284
///
278-
/// Returns the [`None`] variant if the [`CStr`] contains non-UTF-8
279-
/// characters.
285+
/// Returns an [Error::InvalidUtf8] variant if the [`str`] contains
286+
/// non-UTF-8 characters.
280287
///
281288
/// # Example
282289
///
283290
/// ```no_run
284291
/// use ext_php_rs::types::ZendStr;
285292
///
286-
/// let s = ZendStr::new("hello, world!", false).unwrap();
287-
/// let as_str = s.as_str();
288-
/// assert_eq!(as_str, Some("hello, world!"));
293+
/// let s = ZendStr::new("hello, world!", false);
294+
/// assert!(s.as_str().is_ok());
289295
/// ```
290-
pub fn as_str(&self) -> Option<&str> {
291-
self.as_c_str().to_str().ok()
296+
pub fn as_str(&self) -> Result<&str> {
297+
if unsafe { ext_php_rs_is_known_valid_utf8(self.as_ptr()) } {
298+
let str = unsafe { std::str::from_utf8_unchecked(self.as_bytes()) };
299+
return Ok(str);
300+
}
301+
let str = std::str::from_utf8(self.as_bytes()).map_err(|_| Error::InvalidUtf8)?;
302+
unsafe { ext_php_rs_set_known_valid_utf8(self.as_ptr() as *mut _) };
303+
Ok(str)
304+
}
305+
306+
/// Returns a reference to the underlying bytes inside the Zend string.
307+
pub fn as_bytes(&self) -> &[u8] {
308+
unsafe { slice::from_raw_parts(self.val.as_ptr().cast(), self.len()) }
309+
}
310+
311+
/// Returns a raw pointer to this object
312+
pub fn as_ptr(&self) -> *const ZendStr {
313+
self as *const _
314+
}
315+
316+
/// Returns a mutable pointer to this object
317+
pub fn as_mut_ptr(&mut self) -> *mut ZendStr {
318+
self as *mut _
292319
}
293320
}
294321

@@ -300,27 +327,37 @@ unsafe impl ZBoxable for ZendStr {
300327

301328
impl Debug for ZendStr {
302329
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
303-
self.as_c_str().fmt(f)
330+
self.as_str().fmt(f)
331+
}
332+
}
333+
334+
impl AsRef<[u8]> for ZendStr {
335+
fn as_ref(&self) -> &[u8] {
336+
self.as_bytes()
337+
}
338+
}
339+
340+
impl<T> PartialEq<T> for ZendStr
341+
where
342+
T: AsRef<[u8]>,
343+
{
344+
fn eq(&self, other: &T) -> bool {
345+
self.as_ref() == other.as_ref()
304346
}
305347
}
306348

307349
impl ToOwned for ZendStr {
308350
type Owned = ZBox<ZendStr>;
309351

310352
fn to_owned(&self) -> Self::Owned {
311-
Self::from_c_str(self.as_c_str(), false)
353+
Self::new(self.as_bytes(), false)
312354
}
313355
}
314356

315-
impl PartialEq for ZendStr {
316-
#[inline]
317-
fn eq(&self, other: &Self) -> bool {
318-
self.as_c_str().eq(other.as_c_str())
319-
}
320-
}
357+
impl<'a> TryFrom<&'a ZendStr> for &'a CStr {
358+
type Error = Error;
321359

322-
impl<'a> From<&'a ZendStr> for &'a CStr {
323-
fn from(value: &'a ZendStr) -> Self {
360+
fn try_from(value: &'a ZendStr) -> Result<Self> {
324361
value.as_c_str()
325362
}
326363
}
@@ -329,18 +366,15 @@ impl<'a> TryFrom<&'a ZendStr> for &'a str {
329366
type Error = Error;
330367

331368
fn try_from(value: &'a ZendStr) -> Result<Self> {
332-
value.as_str().ok_or(Error::InvalidCString)
369+
value.as_str()
333370
}
334371
}
335372

336373
impl TryFrom<&ZendStr> for String {
337374
type Error = Error;
338375

339376
fn try_from(value: &ZendStr) -> Result<Self> {
340-
value
341-
.as_str()
342-
.map(|s| s.to_string())
343-
.ok_or(Error::InvalidCString)
377+
value.as_str().map(ToString::to_string)
344378
}
345379
}
346380

@@ -362,18 +396,14 @@ impl From<CString> for ZBox<ZendStr> {
362396
}
363397
}
364398

365-
impl TryFrom<&str> for ZBox<ZendStr> {
366-
type Error = Error;
367-
368-
fn try_from(value: &str) -> Result<Self> {
369-
ZendStr::new(value, false)
399+
impl From<&str> for ZBox<ZendStr> {
400+
fn from(value: &str) -> Self {
401+
ZendStr::new(value.as_bytes(), false)
370402
}
371403
}
372404

373-
impl TryFrom<String> for ZBox<ZendStr> {
374-
type Error = Error;
375-
376-
fn try_from(value: String) -> Result<Self> {
405+
impl From<String> for ZBox<ZendStr> {
406+
fn from(value: String) -> Self {
377407
ZendStr::new(value.as_str(), false)
378408
}
379409
}

0 commit comments

Comments
 (0)