Skip to content

Commit de816e3

Browse files
committed
Add Punycode helpers
1 parent dbc9c4e commit de816e3

File tree

3 files changed

+306
-0
lines changed

3 files changed

+306
-0
lines changed

Sources/ATSyntaxTools/Documentation.docc/Overview.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ ATSyntaxTools is fully open source under the [Apache 2.0 license](https://github
122122
- ``InvalidRecordKeyError``
123123
- ``InvalidTIDError``
124124
- ``InvalidATURIError``
125+
- ``PunycodeError``
125126

126127
### Canonicalizable
127128

@@ -132,3 +133,4 @@ ATSyntaxTools is fully open source under the [Apache 2.0 license](https://github
132133
### Utilities
133134

134135
- ``RegexMatch``
136+
- ``Punycode``

Sources/ATSyntaxTools/Errors.swift

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,3 +367,48 @@ public enum InvalidATURIError: Error, LocalizedError, CustomStringConvertible {
367367
return errorDescription ?? String(describing: self)
368368
}
369369
}
370+
371+
/// Errors that can occur during Punycode encoding/decoding.
///
/// Conforms to `LocalizedError` so that `localizedDescription` surfaces the
/// same human-readable text as `description`.
public enum PunycodeError: Error, LocalizedError, CustomStringConvertible {

    /// Input contains non-ASCII characters.
    case nonASCIIInput

    /// Encountered an invalid digit in the punycode string.
    ///
    /// - Parameter invalidCharacter: The invalid character value.
    case invalidDigit(invalidCharacter: Character)

    /// Arithmetic overflow detected.
    case overflow

    /// An invalid Unicode scalar was found.
    ///
    /// - Parameter scalar: An integer value that represents a scalar.
    case invalidUnicodeScalar(scalar: UInt32)

    /// The input is empty.
    case emptyInput

    /// No Unicode scalar for a character.
    ///
    /// - Parameter scalar: The character that has no Unicode scalar.
    case missingUnicodeScalar(scalar: Character)

    /// The localized description of the error; identical to ``description``.
    public var errorDescription: String? {
        return description
    }

    /// A human-readable description of the error.
    public var description: String {
        switch self {
            case .nonASCIIInput:
                return "Input contains non-ASCII characters."
            case .invalidDigit(let character):
                return "Invalid digit '\(character)' encountered in Punycode string."
            case .overflow:
                return "Arithmetic overflow during encoding/decoding."
            case .invalidUnicodeScalar(let codepoint):
                return "Invalid Unicode scalar value: \(codepoint)."
            case .emptyInput:
                return "No encodable data."
            case .missingUnicodeScalar(let character):
                return "Character '\(character)' does not have a unicode scalar."
        }
    }
}
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
//
2+
// Punycode.swift
3+
// ATSyntaxTools
4+
//
5+
// Created by Christopher Jr Riley on 2025-05-18.
6+
//
7+
8+
import Foundation
9+
10+
/// A namespace for helpers to encode and decode Unicode strings using the
/// Punycode algorithm.
public enum Punycode {

    /// The base used in Punycode for digit values.
    private static let punycodeBase: UInt32 = 36

    /// Minimum threshold value for bias adaptation.
    private static let thresholdMin: UInt32 = 1

    /// Maximum threshold value for bias adaptation.
    private static let thresholdMax: UInt32 = 26

    /// Skew factor used in bias adaptation.
    private static let skewValue: UInt32 = 38

    /// Damping factor used for bias adaptation.
    private static let dampFactor: UInt32 = 700

    /// Initial bias value for the algorithm.
    private static let initialBiasValue: UInt32 = 72

    /// Starting code point for processing non-ASCII input.
    private static let initialCodePoint: UInt32 = 128

    /// Delimiter used in Punycode to separate basic and encoded code points.
    private static let labelDelimiter: Character = "-"

    /// Decodes a Punycode string into a Unicode string.
    ///
    /// Everything before the last delimiter is copied verbatim as basic code
    /// points; everything after it is decoded as a sequence of
    /// variable-length integers, each describing where to insert the next
    /// non-basic code point.
    ///
    /// - Parameter punycode: The Punycode-encoded string (must be ASCII only).
    /// - Returns: The decoded Unicode string.
    ///
    /// - Throws: `PunycodeError` if the input is invalid or decoding fails.
    public static func decode(_ punycode: String) throws -> String {
        guard punycode.allSatisfy({ $0.isASCII }) else { throw PunycodeError.nonASCIIInput }

        var nextCodePoint = initialCodePoint
        var insertionIndex: UInt32 = 0
        var bias = initialBiasValue

        // Split basic code points and encoded part.
        let basicScalars: [Unicode.Scalar]
        let encodedSequence: Substring
        if let delimiterIndex = punycode.lastIndex(of: labelDelimiter) {
            // Take the scalars directly so that a multi-scalar ASCII grapheme
            // (i.e. "\r\n") is preserved rather than silently dropped.
            basicScalars = Array(punycode[..<delimiterIndex].unicodeScalars)
            encodedSequence = punycode[punycode.index(after: delimiterIndex)...]
        } else {
            basicScalars = []
            encodedSequence = punycode[...]
        }

        var decodedScalars = basicScalars
        var remainingInput = encodedSequence

        while !remainingInput.isEmpty {
            let previousIndex = insertionIndex
            var weight: UInt32 = 1
            var currentThreshold: UInt32 = punycodeBase

            // Decode one generalized variable-length integer into `insertionIndex`.
            while true {
                guard let char = remainingInput.popFirst() else {
                    // The input ended in the middle of a variable-length integer.
                    throw PunycodeError.invalidDigit(invalidCharacter: "?")
                }
                let digit = decodePunycodeDigit(char)
                guard digit < punycodeBase else {
                    throw PunycodeError.invalidDigit(invalidCharacter: char)
                }
                guard digit <= (UInt32.max - insertionIndex) / weight else {
                    throw PunycodeError.overflow
                }
                insertionIndex += digit * weight

                let threshold = calculateThreshold(min: thresholdMin, k: currentThreshold, bias: bias, max: thresholdMax)
                if digit < threshold { break }

                // Guard the multiplication below: weight * factor must fit in UInt32.
                // `factor` is at least `punycodeBase - thresholdMax`, so it is never zero.
                let factor = punycodeBase - threshold
                guard weight <= UInt32.max / factor else { throw PunycodeError.overflow }
                weight *= factor
                currentThreshold += punycodeBase
            }

            let outputLength = UInt32(decodedScalars.count) + 1
            bias = adaptBias(delta: insertionIndex - previousIndex, codePointCount: outputLength, isFirst: previousIndex == 0)

            // `insertionIndex` wraps around the output length; each wrap
            // advances the code point to insert by one.
            let codePointOffset = insertionIndex / outputLength
            guard nextCodePoint <= UInt32.max - codePointOffset else { throw PunycodeError.overflow }
            nextCodePoint += codePointOffset
            insertionIndex = insertionIndex % outputLength

            guard let unicodeScalar = UnicodeScalar(nextCodePoint) else {
                throw PunycodeError.invalidUnicodeScalar(scalar: nextCodePoint)
            }
            decodedScalars.insert(unicodeScalar, at: Int(insertionIndex))
            insertionIndex += 1
        }

        // Return the decoded string by reconstructing from scalars.
        return String(String.UnicodeScalarView(decodedScalars))
    }

    /// Encodes a Unicode string as a Punycode string.
    ///
    /// - Parameter unicode: The Unicode string to encode.
    /// - Returns: The encoded Punycode string.
    ///
    /// - Throws: `PunycodeError` if the input is invalid or encoding fails.
    public static func encode(_ unicode: String) throws -> String {
        let scalars = Array(unicode.unicodeScalars)
        return try encode(scalars)
    }

    /// Internal encoder working directly with an array of Unicode scalars.
    ///
    /// - Parameter scalars: The input scalars to encode.
    /// - Returns: The encoded Punycode string.
    ///
    /// - Throws: `PunycodeError.emptyInput` if `scalars` is empty, or
    ///   `PunycodeError.overflow` if an intermediate value does not fit in
    ///   32 bits.
    private static func encode(_ scalars: [Unicode.Scalar]) throws -> String {
        guard !scalars.isEmpty else {
            throw PunycodeError.emptyInput
        }

        // All counters are 32-bit, so the input length must be representable too.
        guard let totalCodePoints = UInt32(exactly: scalars.count) else {
            throw PunycodeError.overflow
        }

        var nextCodePoint = initialCodePoint
        var delta: UInt32 = 0
        var bias = initialBiasValue

        // Output all basic (ASCII) code points first, in order.
        var encodedOutput = scalars.filter { $0.isASCII }.map { Character($0) }
        let numBasicCodePoints = UInt32(encodedOutput.count)
        var numHandledCodePoints = numBasicCodePoints

        if numBasicCodePoints > 0 {
            encodedOutput.append(labelDelimiter)
        }

        // Work with scalars' UInt32 values.
        let codePoints = scalars.map { $0.value }

        while numHandledCodePoints < totalCodePoints {
            // Find the smallest code point >= current nextCodePoint.
            guard let minCodePoint = codePoints.lazy.filter({ $0 >= nextCodePoint }).min() else {
                throw PunycodeError.overflow
            }

            let step = minCodePoint - nextCodePoint
            let multiplier = numHandledCodePoints + 1
            guard step <= (UInt32.max - delta) / multiplier else {
                throw PunycodeError.overflow
            }

            delta += step * multiplier
            nextCodePoint = minCodePoint

            for codePoint in codePoints {
                if codePoint < nextCodePoint {
                    // Swift's `+=` traps on overflow, so detect it explicitly
                    // and report it as an error instead of crashing.
                    let (incremented, didOverflow) = delta.addingReportingOverflow(1)
                    guard !didOverflow else { throw PunycodeError.overflow }
                    delta = incremented
                } else if codePoint == nextCodePoint {
                    // Emit `delta` as a generalized variable-length integer.
                    var remainingDelta = delta
                    var k: UInt32 = punycodeBase
                    while true {
                        let threshold = calculateThreshold(min: thresholdMin, k: k, bias: bias, max: thresholdMax)
                        if remainingDelta < threshold {
                            encodedOutput.append(encodePunycodeDigit(remainingDelta))
                            break
                        }
                        let span = punycodeBase - threshold
                        encodedOutput.append(encodePunycodeDigit(threshold + (remainingDelta - threshold) % span))
                        remainingDelta = (remainingDelta - threshold) / span
                        k += punycodeBase
                    }
                    bias = adaptBias(delta: delta, codePointCount: numHandledCodePoints + 1, isFirst: numHandledCodePoints == numBasicCodePoints)
                    delta = 0
                    numHandledCodePoints += 1
                }
            }

            // `delta` was reset when `minCodePoint` was emitted and has been
            // incremented at most once per input scalar since, so this cannot
            // overflow for an input whose length fits in UInt32.
            delta += 1
            nextCodePoint += 1
        }

        return String(encodedOutput)
    }

    /// Adapt the bias according to Punycode specifications.
    ///
    /// - Parameters:
    ///   - delta: The difference in code points since the last adaptation.
    ///   - codePointCount: Number of code points processed so far. Must be non-zero.
    ///   - isFirst: Whether this is the first bias adaptation.
    /// - Returns: The new bias value.
    private static func adaptBias(delta: UInt32, codePointCount: UInt32, isFirst: Bool) -> UInt32 {
        var adjustedDelta = isFirst ? delta / dampFactor : delta / 2
        adjustedDelta += adjustedDelta / codePointCount
        var k: UInt32 = 0
        while adjustedDelta > (punycodeBase - thresholdMin) * thresholdMax / 2 {
            adjustedDelta /= punycodeBase - thresholdMin
            k += punycodeBase
        }
        return k + (punycodeBase - thresholdMin + 1) * adjustedDelta / (adjustedDelta + skewValue)
    }

    /// Calculates the digit threshold for position `k`, i.e. `k - bias`
    /// clamped to the range `min...max`.
    ///
    /// - Parameters:
    ///   - min: Minimum threshold.
    ///   - k: Current bias step value.
    ///   - bias: Current bias.
    ///   - max: Maximum threshold.
    /// - Returns: The calculated threshold value.
    private static func calculateThreshold(min: UInt32, k: UInt32, bias: UInt32, max: UInt32) -> UInt32 {
        // Comparisons are arranged as additions so the unsigned `k - bias`
        // is only evaluated when it cannot underflow.
        if min + bias >= k {
            return min
        } else if max + bias <= k {
            return max
        } else {
            return k - bias
        }
    }

    /// Decodes a single Punycode digit character into its value.
    ///
    /// Letters (either case) map to 0...25 and decimal digits map to 26...35.
    ///
    /// - Parameter character: The character to decode.
    /// - Returns: The digit value, or `punycodeBase` if invalid.
    private static func decodePunycodeDigit(_ character: Character) -> UInt32 {
        guard let asciiValue = character.asciiValue else { return punycodeBase }
        switch asciiValue {
            case UInt8(ascii: "0")...UInt8(ascii: "9"):
                return UInt32(asciiValue - UInt8(ascii: "0")) + 26
            case UInt8(ascii: "A")...UInt8(ascii: "Z"):
                return UInt32(asciiValue - UInt8(ascii: "A"))
            case UInt8(ascii: "a")...UInt8(ascii: "z"):
                return UInt32(asciiValue - UInt8(ascii: "a"))
            default:
                return punycodeBase
        }
    }

    /// Encodes a digit value as a Punycode character.
    ///
    /// - Parameter digitValue: The digit value to encode (0-35).
    /// - Returns: The corresponding character.
    private static func encodePunycodeDigit(_ digitValue: UInt32) -> Character {
        // 'a'..'z' = 0..25, '0'..'9' = 26..35
        let asciiValue = digitValue + 22 + (digitValue < 26 ? 75 : 0)
        if let unicodeScalar = UnicodeScalar(asciiValue) {
            return Character(unicodeScalar)
        } else {
            assertionFailure("Invalid digit to encode: \(digitValue)")
            return "?"
        }
    }
}

0 commit comments

Comments
 (0)