Skip to content

Commit 35d12bb

Browse files
committed
Add support for binary strings and utf8 strings
1 parent a574932 commit 35d12bb

File tree

2 files changed

+116
-13
lines changed

2 files changed

+116
-13
lines changed

src/Encoder/StringEncoder.php

Lines changed: 88 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ class StringEncoder implements Encoder
1313
/** @var array Default values for options in the encoder */
1414
private static $defaultOptions = [
    // Write strings containing bytes outside printable ASCII as escaped
    // double-quoted literals instead of raw single-quoted ones
    'string.escape' => true,

    // Write strings that contain bytes outside printable ASCII and are not
    // valid UTF-8 as base64_decode() calls
    'string.binary' => false,

    // Write multibyte UTF-8 sequences as \u{...} escapes whenever the string
    // is written as a double-quoted literal
    'string.utf8' => false,
];
1719

1820
public function getDefaultOptions()
@@ -27,11 +29,44 @@ public function supports($value)
2729

2830
/**
 * Encodes the value as PHP code for a string.
 *
 * Strings made only of printable ASCII (0x20-0x7E) are always single quoted.
 * Other strings are written as a base64_decode() call when they are detected
 * as binary, as an escaped double-quoted string when 'string.escape' is
 * enabled, and as a single-quoted string otherwise.
 *
 * @param mixed $value The value to encode; it is cast to string first
 * @param int $depth Not used by this encoder
 * @param array $options The encoder options
 * @param callable $encode Not used by this encoder
 * @return string The PHP code representing the string
 */
public function encode($value, $depth, array $options, callable $encode)
{
    $string = (string) $value;
    $hasSpecialBytes = preg_match('/[^\x20-\x7E]/', $string);

    if (!$hasSpecialBytes) {
        return $this->getSingleQuotedString($string);
    }

    // The binary check takes precedence over escaping
    if ($this->isBinaryString($string, $options)) {
        return $this->encodeBinaryString($string);
    }

    if ($options['string.escape']) {
        return $this->getDoubleQuotedString($string, $options);
    }

    return $this->getSingleQuotedString($string);
}
44+
45+
/**
 * Tells whether the string should be treated as binary data.
 *
 * The result is always false unless the 'string.binary' option is enabled.
 * When it is enabled, the string counts as binary if it is not well-formed
 * UTF-8.
 *
 * @param string $string The string to test
 * @param array $options The encoder options; only 'string.binary' is read
 * @return bool True if the string should be encoded as binary data
 */
private function isBinaryString($string, $options)
{
    if (!$options['string.binary']) {
        return false;
    }

    // With the "u" modifier PCRE validates the whole subject as UTF-8 before
    // matching, so the empty pattern returns 1 only for well-formed input and
    // fails (returns false) otherwise. This needs no mbstring extension and,
    // unlike an alternation group repeated once per character, it cannot run
    // into the PCRE backtrack, recursion or JIT stack limits on long strings,
    // which would make preg_match() fail and a valid string look binary.
    return preg_match('//u', $string) !== 1;
}
66+
67+
/**
 * Writes the string as a base64_decode() call on its base64 representation.
 *
 * @param string $string The string to encode
 * @return string PHP code of the form base64_decode('...')
 */
private function encodeBinaryString($string)
{
    $payload = base64_encode($string);

    return sprintf("base64_decode('%s')", $payload);
}
3671

3772
/**
@@ -49,21 +84,62 @@ private function getSingleQuotedString($string)
4984
* @param string $string String to wrap and escape
5085
* @return string The string wrapped in double quotes and escape correctly
5186
*/
52-
private function getDoubleQuotedString($string, $options)
{
    // Characters that have a short escape or a special meaning inside a
    // double-quoted PHP string
    $shortEscapes = [
        "\n" => '\n',
        "\r" => '\r',
        "\t" => '\t',
        '$' => '\$',
        '"' => '\"',
        '\\' => '\\\\',
    ];

    $escaped = strtr($string, $shortEscapes);

    // Optionally turn multibyte UTF-8 sequences into \u{...} escapes before
    // the remaining bytes are hex escaped one by one
    if ($options['string.utf8']) {
        $escaped = $this->encodeUtf8($escaped);
    }

    // Whatever is still outside printable ASCII becomes a \xNN escape
    $toHexEscape = function ($matches) {
        return sprintf('\x%02x', ord($matches[0]));
    };
    $escaped = preg_replace_callback('/[^\x20-\x7E]/', $toHexEscape, $escaped);

    return sprintf('"%s"', $escaped);
}
110+
111+
/**
 * Replaces every multibyte UTF-8 sequence in the string with a \u{...} escape.
 *
 * Only two, three and four byte sequences matched by the pattern are
 * replaced; every other byte is left untouched.
 *
 * @param string $string The string to process
 * @return string The string with multibyte sequences replaced by escapes
 */
private function encodeUtf8($string)
{
    $multibyteSequence =
        '/ [\xC2-\xDF][\x80-\xBF]
         | \xE0[\xA0-\xBF][\x80-\xBF]
         | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
         | \xED[\x80-\x9F][\x80-\xBF]
         | \xF0[\x90-\xBF][\x80-\xBF]{2}
         | [\xF1-\xF3][\x80-\xBF]{3}
         | \xF4[\x80-\x8F][\x80-\xBF]{2}/x';

    $toEscape = function ($match) {
        $codePoint = $this->getCodePoint($match[0]);

        return sprintf('\u{%s}', dechex($codePoint));
    };

    return preg_replace_callback($multibyteSequence, $toEscape, $string);
}
126+
127+
/**
 * Computes the code point number from the bytes of one UTF-8 sequence.
 *
 * Two and three byte sequences are decoded according to their length; any
 * other length is decoded as a four byte sequence.
 *
 * @param string $bytes The bytes of a single multibyte sequence
 * @return int The decoded code point
 */
private function getCodePoint($bytes)
{
    $size = strlen($bytes);
    $lead = ord($bytes[0]);

    if ($size === 2) {
        // 110xxxxx 10xxxxxx
        return (($lead & 0x1F) << 6)
            | (ord($bytes[1]) & 0x3F);
    }

    if ($size === 3) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return (($lead & 0x0F) << 12)
            | ((ord($bytes[1]) & 0x3F) << 6)
            | (ord($bytes[2]) & 0x3F);
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return (($lead & 0x07) << 18)
        | ((ord($bytes[1]) & 0x3F) << 12)
        | ((ord($bytes[2]) & 0x3F) << 6)
        | (ord($bytes[3]) & 0x3F);
}
69145
}

tests/tests/EncodingTest.php

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,10 +223,37 @@ public function testStringEncoding()
223223
$this->assertEncode('"\t\$foo"', "\t\$foo");
224224
$this->assertEncode('"\t{\$foo}"', "\t{\$foo}");
225225
$this->assertEncode('"\x00"', "\x00");
226-
227226
$this->assertEncode("'\r'", "\r", ['string.escape' => false]);
228227
}
229228

229+
/**
 * Checks the output for different strings with 'string.binary' enabled and
 * 'string.escape' disabled.
 */
public function testBinaryStrings()
{
    $options = ['string.binary' => true, 'string.escape' => false];
    $encoder = new PHPEncoder($options);

    // Pairs of expected code and input value
    $cases = [
        // Bytes that are not valid UTF-8 are written as a base64_decode() call
        ["base64_decode('AP8Q')", "\x00\xFF\x10"],
        // Printable ASCII stays a single-quoted string
        ["'ABC'", 'ABC'],
        // Non-ASCII text is expected to stay single quoted, unescaped
        ["'åäöÅÄÖ'", 'åäöÅÄÖ'],
    ];

    foreach ($cases as $case) {
        $this->assertEncode($case[0], $case[1], $encoder);
    }
}
237+
238+
/**
 * Checks that 'string.utf8' writes multibyte characters as \u{...} escapes.
 */
public function testUtf8String()
{
    $encoder = new PHPEncoder(['string.utf8' => true]);

    // A string without multibyte characters is expected to keep its short escapes
    $this->assertEncode('"\nA"', "\nA", $encoder);

    if (version_compare(PHP_VERSION, '7', '>=')) {
        $this->assertEncode('"\u{a2}"', "\u{a2}", $encoder);
        $this->assertEncode('"\u{20ac}"', "\u{20ac}", $encoder);
        $this->assertEncode('"\u{10348}"', "\u{10348}", $encoder);
        $this->assertEncode('"\u{e5}\u{e4}\u{f6}\u{c5}\u{c4}\u{d6}"', 'åäöÅÄÖ', $encoder);

        return;
    }

    // The \u{...} escape syntax is only understood from PHP 7 on, so on older
    // versions only the generated code is compared, with the inputs given as
    // raw UTF-8 bytes
    $this->assertSame('"\u{a2}"', $encoder->encode("\xC2\xA2"));
    $this->assertSame('"\u{20ac}"', $encoder->encode("\xE2\x82\xAC"));
    $this->assertSame('"\u{10348}"', $encoder->encode("\xF0\x90\x8D\x88"));
    $this->assertSame('"\u{e5}\u{e4}\u{f6}\u{c5}\u{c4}\u{d6}"', $encoder->encode('åäöÅÄÖ'));
}
256+
230257
public function testGMPEncoding()
231258
{
232259
if (!function_exists('gmp_init')) {

0 commit comments

Comments
 (0)