Skip to content

[RFC 9651] Add support for Display String type to RawStructuredFieldValues #43

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Sources/RawStructuredFieldValues/ComponentTypes.swift
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ extension BareItem {

case .date:
throw StructuredHeaderError.invalidItem
case .displayString:
throw StructuredHeaderError.invalidItem
}
}
}
Expand Down Expand Up @@ -141,6 +143,9 @@ public enum RFC9651BareItem: Sendable {

/// A date item.
case date(Int)

/// A display string item.
case displayString(String)
}

extension RFC9651BareItem: ExpressibleByBooleanLiteral {
Expand Down
2 changes: 2 additions & 0 deletions Sources/RawStructuredFieldValues/Errors.swift
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public struct StructuredHeaderError: Error, Sendable {
case invalidBoolean
case invalidToken
case invalidDate
case invalidDisplayString
case invalidList
case invalidDictionary
case missingKey
Expand All @@ -53,6 +54,7 @@ extension StructuredHeaderError {
public static let invalidBoolean = StructuredHeaderError(.invalidBoolean)
public static let invalidToken = StructuredHeaderError(.invalidToken)
public static let invalidDate = StructuredHeaderError(.invalidDate)
public static let invalidDisplayString = StructuredHeaderError(.invalidDisplayString)
public static let invalidList = StructuredHeaderError(.invalidList)
public static let invalidDictionary = StructuredHeaderError(.invalidDictionary)
public static let missingKey = StructuredHeaderError(.missingKey)
Expand Down
97 changes: 97 additions & 0 deletions Sources/RawStructuredFieldValues/FieldParser.swift
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ extension StructuredFieldValueParser {
return try self._parseAToken()
case asciiAt:
return try self._parseADate()
case asciiPercent:
return try self._parseADisplayString()
default:
throw StructuredHeaderError.invalidItem
}
Expand Down Expand Up @@ -491,6 +493,68 @@ extension StructuredFieldValueParser {
return try self._parseAnIntegerOrDecimal(isDate: true)
}

private mutating func _parseADisplayString() throws -> RFC9651BareItem {
assert(self.underlyingData.first == asciiPercent)
self.underlyingData.consumeFirst()

guard self.underlyingData.first == asciiDquote else {
throw StructuredHeaderError.invalidDisplayString
}

self.underlyingData.consumeFirst()

var byteArray = [UInt8]()

while let char = self.underlyingData.first {
self.underlyingData.consumeFirst()

switch char {
case 0x00...0x1F, 0x7F...:
throw StructuredHeaderError.invalidDisplayString
case asciiPercent:
if self.underlyingData.count < 2 {
throw StructuredHeaderError.invalidDisplayString
}

let startIndex = self.underlyingData.startIndex
let secondIndex = self.underlyingData.index(after: startIndex)
let octetHex = self.underlyingData[...secondIndex]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can skip the index math and do let octetHex = self.underlyingData.prefix(2)


self.underlyingData = self.underlyingData.dropFirst(2)

guard
octetHex.allSatisfy({ asciiDigits.contains($0) || asciiLowercases.contains($0) }),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be somewhere I'd write a little micro-extension method for this purpose, as we know that there are exactly two values and so we don't strictly need a loop.

let octet = UInt8.decodeHex(octetHex)
else {
throw StructuredHeaderError.invalidDisplayString
}

byteArray.append(octet)
case asciiDquote:
let unicodeSequence = try byteArray.withUnsafeBytes {
try $0.withMemoryRebound(to: CChar.self) {
guard let baseAddress = $0.baseAddress else {
throw StructuredHeaderError.invalidDisplayString
}

return String(validatingUTF8: baseAddress)
}
}

guard let unicodeSequence else {
throw StructuredHeaderError.invalidDisplayString
}

return .displayString(unicodeSequence)
default:
byteArray.append(char)
}
}

// Fail parsing — reached the end of the string without finding a closing DQUOTE.
throw StructuredHeaderError.invalidDisplayString
}

private mutating func _parseParameters() throws -> OrderedMap<Key, RFC9651BareItem> {
var parameters = OrderedMap<Key, RFC9651BareItem>()

Expand Down Expand Up @@ -643,3 +707,36 @@ extension StrippingStringEscapesCollection.Index: Comparable {
lhs._baseIndex < rhs._baseIndex
}
}

extension UInt8 {
/// Converts a hex value given in UTF8 to base 10.
fileprivate static func decodeHex<Bytes: RandomAccessCollection>(_ bytes: Bytes) -> Self?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is also excessively generic I think. We only ever accept a single data type, and it's always two bytes long. It might actually be useful to express that as its own type, e.g:

struct EncodedHex {
    var first: UInt8
    var second: UInt8

    init(_ slice: ArraySlice<UInt8>) {
        precondition(slice.count == 2)
        self.first = slice[slice.startIndex]
        self.second = slice[slice.index(after: slice.startIndex)]
    }
}

That lets us express the rest of this math in a much less branchy way, and with an avoidance of preconditions elsewhere.

where Bytes.Element == Self {
var result = Self(0)
var power = Self(bytes.count)

for byte in bytes {
power -= 1

guard let integer = Self.htoi(byte) else { return nil }
result += integer << (power * 4)
}

return result
}

/// Converts a hex character given in UTF8 to its integer value.
private static func htoi(_ value: Self) -> Self? {
let charA = Self(UnicodeScalar("a").value)
let char0 = Self(UnicodeScalar("0").value)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prefer the UInt8(ascii:) initializer.


switch value {
case char0...char0 + 9:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are a touch nicer using named values at both ends of the range.

return value - char0
case charA...charA + 5:
return value - charA + 10
default:
return nil
}
}
}
20 changes: 20 additions & 0 deletions Sources/RawStructuredFieldValues/FieldSerializer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,26 @@ extension StructuredFieldValueSerializer {
}

self.data.append(contentsOf: String(date, radix: 10).utf8)
case .displayString(let displayString):
let bytes = displayString.utf8

self.data.append(asciiPercent)
self.data.append(asciiDquote)

for byte in bytes {
if byte == asciiPercent
|| byte == asciiDquote
|| (0x00...0x1F).contains(byte)
|| (0x7F...).contains(byte)
{
self.data.append(asciiPercent)
self.data.append(contentsOf: String(byte, radix: 16, uppercase: false).utf8)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can avoid constructing a String here, if we write the code out.

} else {
self.data.append(byte)
}
}

self.data.append(asciiDquote)
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions Sources/sh-parser/main.swift
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ extension RFC9651BareItem {
return "decimal \(d)"
case .date(let date):
return "date \(date)"
case .displayString(let displayString):
return "display string \(displayString)"
}
}
}
Expand Down
16 changes: 16 additions & 0 deletions Tests/StructuredFieldValuesTests/StructuredFieldParserTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,22 @@ final class StructuredFieldParserTests: XCTestCase {

XCTAssertEqual(typeName, "date", "\(fixtureName): Expected type date, got type \(typeName)")
XCTAssertEqual(typeValue, baseDate, "\(fixtureName): Got \(baseDate), expected \(typeValue)")
case (.displayString(let baseDisplayString), .dictionary(let typeDictionary)):
guard typeDictionary.count == 2, case .string(let typeName) = typeDictionary["__type"],
case .string(let typeValue) = typeDictionary["value"]
else {
XCTFail("\(fixtureName): Unexpected type dict \(typeDictionary)")
return
}

XCTAssertEqual(
typeName,
"displaystring",
"\(fixtureName): Expected type displaystring, got type \(typeName)")
XCTAssertEqual(
typeValue,
baseDisplayString,
"\(fixtureName): Got \(baseDisplayString), expected \(typeValue)")
default:
XCTFail("\(fixtureName): Got \(bareItem), expected \(schema)")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,9 @@ extension RFC9651BareItem {
case (.some(.string("date")), .some(.integer(let value))):
self = .date(value)

case (.some(.string("displaystring")), .some(.string(let value))):
self = .displayString(value)

default:
preconditionFailure("Unexpected type object \(typeObject)")
}
Expand Down
111 changes: 111 additions & 0 deletions Tests/TestFixtures/display-string.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
[
{
"name": "basic display string (ascii content)",
"raw": ["%\"foo bar\""],
"header_type": "item",
"expected": [{"__type": "displaystring", "value": "foo bar"}, {}]
},
{
"name": "all printable ascii",
"raw": ["%\" !%22#$%25&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\""],
"header_type": "item",
"expected": [{"__type": "displaystring", "value": " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"}, {}]
},
{
"name": "non-ascii display string (uppercase escaping)",
"raw": ["%\"f%C3%BC%C3%BC\""],
"canonical": ["%\"f%c3%bc%c3%bc\""],
"header_type": "item",
"must_fail": true
},
{
"name": "non-ascii display string (lowercase escaping)",
"raw": ["%\"f%c3%bc%c3%bc\""],
"header_type": "item",
"expected": [{"__type": "displaystring", "value": "füü"}, {}]
},
{
"name": "tab in display string",
"raw": ["%\"\t\""],
"header_type": "item",
"must_fail": true
},
{
"name": "newline in display string",
"raw": ["%\"\n\""],
"header_type": "item",
"must_fail": true
},
{
"name": "single quoted display string",
"raw": ["%'foo'"],
"header_type": "item",
"must_fail": true
},
{
"name": "unquoted display string",
"raw": ["%foo"],
"header_type": "item",
"must_fail": true
},
{
"name": "display string missing initial quote",
"raw": ["%foo\""],
"header_type": "item",
"must_fail": true
},
{
"name": "unbalanced display string",
"raw": ["%\"foo"],
"header_type": "item",
"must_fail": true
},
{
"name": "display string quoting",
"raw": ["%\"foo %22bar%22 \\ baz\""],
"header_type": "item",
"expected": [{"__type": "displaystring", "value": "foo \"bar\" \\ baz"}, {}]
},
{
"name": "bad display string escaping",
"raw": ["%\"foo %a"],
"header_type": "item",
"must_fail": true
},
{
"name": "bad display string utf-8 (invalid 2-byte seq)",
"raw": ["%\"%c3%28\""],
"header_type": "item",
"must_fail": true
},
{
"name": "bad display string utf-8 (invalid sequence id)",
"raw": ["%\"%a0%a1\""],
"header_type": "item",
"must_fail": true
},
{
"name": "bad display string utf-8 (invalid hex)",
"raw": ["%\"%g0%1w\""],
"header_type": "item",
"must_fail": true
},
{
"name": "bad display string utf-8 (invalid 3-byte seq)",
"raw": ["%\"%e2%28%a1\""],
"header_type": "item",
"must_fail": true
},
{
"name": "bad display string utf-8 (invalid 4-byte seq)",
"raw": ["%\"%f0%28%8c%28\""],
"header_type": "item",
"must_fail": true
},
{
"name": "BOM in display string",
"raw": ["%\"BOM: %ef%bb%bf\""],
"header_type": "item",
"expected": [{"__type": "displaystring", "value": "BOM: \uFEFF"}, {}]
}
]
Loading