|
| 1 | +// |
| 2 | +// Punycode.swift |
| 3 | +// ATSyntaxTools |
| 4 | +// |
| 5 | +// Created by Christopher Jr Riley on 2025-05-18. |
| 6 | +// |
| 7 | + |
| 8 | +import Foundation |
| 9 | + |
/// A namespace for helpers to encode and decode Unicode strings using the
/// Punycode algorithm (RFC 3492).
///
/// All arithmetic is performed on `UInt32`. Every operation that could exceed
/// `UInt32.max` is checked explicitly so that malformed or adversarial input
/// results in a thrown `PunycodeError.overflow` rather than a runtime trap.
public enum Punycode {

    // MARK: - Algorithm parameters

    /// The base used in Punycode for digit values (`base` in RFC 3492).
    private static let punycodeBase: UInt32 = 36

    /// Minimum threshold value for a digit position (`tmin` in RFC 3492).
    private static let thresholdMin: UInt32 = 1

    /// Maximum threshold value for a digit position (`tmax` in RFC 3492).
    private static let thresholdMax: UInt32 = 26

    /// Skew factor used in bias adaptation (`skew` in RFC 3492).
    private static let skewValue: UInt32 = 38

    /// Damping factor applied to the very first delta (`damp` in RFC 3492).
    private static let dampFactor: UInt32 = 700

    /// Initial bias value for the algorithm (`initial_bias` in RFC 3492).
    private static let initialBiasValue: UInt32 = 72

    /// Starting code point for processing non-ASCII input (`initial_n` in RFC 3492).
    private static let initialCodePoint: UInt32 = 128

    /// Delimiter used in Punycode to separate basic and encoded code points.
    private static let labelDelimiter: Character = "-"

    // MARK: - Decoding

    /// Decodes a Punycode string into a Unicode string.
    ///
    /// Everything before the last `-` is copied to the output verbatim; the
    /// remainder is read as a sequence of generalized variable-length integers,
    /// each of which yields one code point and the position to insert it at.
    /// An empty input decodes to an empty string.
    ///
    /// - Parameter punycode: The Punycode-encoded string (must be ASCII only).
    /// - Returns: The decoded Unicode string.
    ///
    /// - Throws: `PunycodeError.nonASCIIInput` if the input contains a non-ASCII
    ///   scalar, `PunycodeError.invalidDigit` if the encoded part contains a
    ///   character that is not a Punycode digit or ends in the middle of an
    ///   integer, `PunycodeError.overflow` if a value does not fit in `UInt32`,
    ///   and `PunycodeError.invalidUnicodeScalar` if a decoded value is not a
    ///   valid Unicode scalar.
    public static func decode(_ punycode: String) throws -> String {
        guard punycode.unicodeScalars.allSatisfy({ $0.isASCII }) else {
            throw PunycodeError.nonASCIIInput
        }

        var nextCodePoint = initialCodePoint
        var insertionIndex: UInt32 = 0
        var bias = initialBiasValue

        // Split the basic code points from the encoded part at the last delimiter.
        // The basic part is read scalar by scalar: a "\r\n" pair forms a single
        // ASCII `Character`, which cannot be converted to one scalar and would
        // otherwise be dropped from the output.
        let basicScalars: [Unicode.Scalar]
        let encodedSequence: Substring
        if let delimiterIndex = punycode.lastIndex(of: labelDelimiter) {
            basicScalars = Array(punycode[..<delimiterIndex].unicodeScalars)
            encodedSequence = punycode[punycode.index(after: delimiterIndex)...]
        } else {
            basicScalars = []
            encodedSequence = punycode[...]
        }

        var decodedScalars = basicScalars
        var inputIterator = encodedSequence.makeIterator()

        while peekNext(&inputIterator) != nil {
            let previousIndex = insertionIndex
            var weight: UInt32 = 1
            var k = punycodeBase

            // Decode one generalized variable-length integer into `insertionIndex`.
            while true {
                guard let char = inputIterator.next() else {
                    // The input ended before the integer was terminated.
                    throw PunycodeError.invalidDigit(invalidCharacter: "?")
                }

                let digit = decodePunycodeDigit(char)
                if digit == punycodeBase {
                    throw PunycodeError.invalidDigit(invalidCharacter: char)
                }

                // Guard `insertionIndex + digit * weight` against overflow.
                if digit > (UInt32.max - insertionIndex) / weight {
                    throw PunycodeError.overflow
                }
                insertionIndex += digit * weight

                let threshold = calculateThreshold(min: thresholdMin, k: k, bias: bias, max: thresholdMax)
                if digit < threshold { break }

                // Guard `weight * (base - threshold)` against overflow; this is
                // the RFC 3492 condition `w > maxint / (base - t)`.
                let (scaledWeight, weightOverflowed) = weight.multipliedReportingOverflow(by: punycodeBase - threshold)
                if weightOverflowed {
                    throw PunycodeError.overflow
                }
                weight = scaledWeight
                k += punycodeBase
            }

            let outputLength = UInt32(decodedScalars.count) + 1
            bias = adaptBias(
                delta: insertionIndex - previousIndex,
                codePointCount: outputLength,
                isFirst: previousIndex == 0
            )

            // The decoded integer carries both the code point increment
            // (quotient) and the insertion position (remainder).
            let codePointOffset = insertionIndex / outputLength
            if nextCodePoint > UInt32.max - codePointOffset {
                throw PunycodeError.overflow
            }
            nextCodePoint += codePointOffset
            insertionIndex %= outputLength

            guard let unicodeScalar = UnicodeScalar(nextCodePoint) else {
                throw PunycodeError.invalidUnicodeScalar(scalar: nextCodePoint)
            }
            decodedScalars.insert(unicodeScalar, at: Int(insertionIndex))
            insertionIndex += 1
        }

        // Rebuild the string from the decoded scalars.
        return String(String.UnicodeScalarView(decodedScalars))
    }

    // MARK: - Encoding

    /// Encodes a Unicode string as a Punycode string.
    ///
    /// - Parameter unicode: The Unicode string to encode.
    /// - Returns: The encoded Punycode string.
    ///
    /// - Throws: `PunycodeError.emptyInput` if `unicode` is empty, or
    ///   `PunycodeError.overflow` if an intermediate value does not fit in
    ///   `UInt32`.
    public static func encode(_ unicode: String) throws -> String {
        let scalars = Array(unicode.unicodeScalars)
        return try encode(scalars)
    }

    /// Internal encoder working directly with an array of Unicode scalars.
    ///
    /// - Parameter scalars: The Unicode scalars to encode.
    /// - Returns: The encoded Punycode string.
    ///
    /// - Throws: `PunycodeError.emptyInput` if `scalars` is empty, or
    ///   `PunycodeError.overflow` if an intermediate value does not fit in
    ///   `UInt32`.
    private static func encode(_ scalars: [Unicode.Scalar]) throws -> String {
        guard !scalars.isEmpty else {
            throw PunycodeError.emptyInput
        }

        var nextCodePoint = initialCodePoint
        var delta: UInt32 = 0
        var bias = initialBiasValue

        // Output all basic (ASCII) code points first, in order.
        var encodedOutput = scalars.filter { $0.isASCII }.map { Character($0) }
        let numBasicCodePoints = UInt32(encodedOutput.count)
        var numHandledCodePoints = numBasicCodePoints

        // The delimiter is only written when at least one basic code point precedes it.
        if numBasicCodePoints > 0 {
            encodedOutput.append(labelDelimiter)
        }

        // Work with the scalars' UInt32 values.
        let codePoints = scalars.map { $0.value }

        while numHandledCodePoints < scalars.count {
            // Find the smallest code point >= nextCodePoint that is still unhandled.
            guard let minCodePoint = codePoints.lazy.filter({ $0 >= nextCodePoint }).min() else {
                throw PunycodeError.overflow
            }

            // Guard `delta + (minCodePoint - nextCodePoint) * (handled + 1)` against overflow.
            if minCodePoint - nextCodePoint > (UInt32.max - delta) / (numHandledCodePoints + 1) {
                throw PunycodeError.overflow
            }
            delta += (minCodePoint - nextCodePoint) * (numHandledCodePoints + 1)
            nextCodePoint = minCodePoint

            for codePoint in codePoints {
                if codePoint < nextCodePoint {
                    // Swift's `+=` traps on overflow instead of wrapping, so the
                    // increment is checked and reported as a thrown error.
                    delta = try incrementedOrOverflow(delta)
                } else if codePoint == nextCodePoint {
                    // Emit `delta` as a generalized variable-length integer.
                    var remainingDelta = delta
                    var k = punycodeBase
                    while true {
                        let threshold = calculateThreshold(min: thresholdMin, k: k, bias: bias, max: thresholdMax)
                        if remainingDelta < threshold {
                            encodedOutput.append(encodePunycodeDigit(remainingDelta))
                            break
                        }
                        let digitRange = punycodeBase - threshold
                        encodedOutput.append(encodePunycodeDigit(threshold + (remainingDelta - threshold) % digitRange))
                        remainingDelta = (remainingDelta - threshold) / digitRange
                        k += punycodeBase
                    }
                    bias = adaptBias(
                        delta: delta,
                        codePointCount: numHandledCodePoints + 1,
                        isFirst: numHandledCodePoints == numBasicCodePoints
                    )
                    delta = 0
                    numHandledCodePoints += 1
                }
            }

            delta = try incrementedOrOverflow(delta)
            nextCodePoint += 1
        }

        return String(encodedOutput)
    }

    // MARK: - Helpers

    /// Returns `value + 1`, throwing instead of trapping when the result does
    /// not fit in `UInt32`.
    ///
    /// - Parameter value: The value to increment.
    /// - Returns: `value + 1`.
    ///
    /// - Throws: `PunycodeError.overflow` if `value` is `UInt32.max`.
    private static func incrementedOrOverflow(_ value: UInt32) throws -> UInt32 {
        let (result, didOverflow) = value.addingReportingOverflow(1)
        if didOverflow {
            throw PunycodeError.overflow
        }
        return result
    }

    /// Adapts the bias according to the Punycode specification.
    ///
    /// - Parameters:
    ///   - delta: The delta that was just encoded or decoded.
    ///   - codePointCount: Number of code points handled so far, including the current one.
    ///   - isFirst: Whether this is the first bias adaptation.
    /// - Returns: The new bias value.
    private static func adaptBias(delta: UInt32, codePointCount: UInt32, isFirst: Bool) -> UInt32 {
        // The first delta is damped by `dampFactor`; later deltas are halved.
        var adjustedDelta = isFirst ? delta / dampFactor : delta / 2
        adjustedDelta += adjustedDelta / codePointCount

        var k: UInt32 = 0
        let digitSpan = punycodeBase - thresholdMin
        while adjustedDelta > digitSpan * thresholdMax / 2 {
            adjustedDelta /= digitSpan
            k += punycodeBase
        }
        return k + (digitSpan + 1) * adjustedDelta / (adjustedDelta + skewValue)
    }

    /// Calculates the threshold for the digit at position `k`.
    ///
    /// The result is `k - bias` clamped to the range `min...max`; the
    /// comparisons are arranged so that the unsigned subtraction is only
    /// evaluated when `k` is greater than `bias`.
    ///
    /// - Parameters:
    ///   - min: Minimum threshold.
    ///   - k: Current digit position value (a multiple of the base).
    ///   - bias: Current bias.
    ///   - max: Maximum threshold.
    /// - Returns: The calculated threshold value.
    private static func calculateThreshold(min: UInt32, k: UInt32, bias: UInt32, max: UInt32) -> UInt32 {
        if min + bias >= k {
            return min
        } else if max + bias <= k {
            return max
        } else {
            return k - bias
        }
    }

    /// Decodes a single Punycode digit character into its value.
    ///
    /// Letters map to 0...25 (case-insensitively) and `0`...`9` map to 26...35.
    ///
    /// - Parameter character: The character to decode.
    /// - Returns: The digit value, or `punycodeBase` if invalid.
    private static func decodePunycodeDigit(_ character: Character) -> UInt32 {
        guard let asciiValue = character.asciiValue else { return punycodeBase }
        switch asciiValue {
        case UInt8(ascii: "0")...UInt8(ascii: "9"):
            return UInt32(asciiValue - UInt8(ascii: "0")) + 26
        case UInt8(ascii: "A")...UInt8(ascii: "Z"):
            return UInt32(asciiValue - UInt8(ascii: "A"))
        case UInt8(ascii: "a")...UInt8(ascii: "z"):
            return UInt32(asciiValue - UInt8(ascii: "a"))
        default:
            return punycodeBase
        }
    }

    /// Encodes a digit value as a Punycode character.
    ///
    /// - Parameter digitValue: The digit value to encode (0-35).
    /// - Returns: The corresponding character. If the computed value is not a
    ///   valid Unicode scalar, an assertion failure is raised and `"?"` is
    ///   returned.
    private static func encodePunycodeDigit(_ digitValue: UInt32) -> Character {
        // 'a'..'z' = 0..25 (offset 97), '0'..'9' = 26..35 (offset 22).
        let asciiValue = digitValue + 22 + (digitValue < 26 ? 75 : 0)
        guard let unicodeScalar = UnicodeScalar(asciiValue) else {
            assertionFailure("Invalid digit to encode: \(digitValue)")
            return "?"
        }
        return Character(unicodeScalar)
    }

    /// Peeks at the next element of an iterator without advancing it.
    ///
    /// The iterator is copied and the copy is advanced. This leaves the
    /// original untouched for iterators with value semantics, such as the
    /// `Substring` iterator used by `decode(_:)`.
    ///
    /// - Parameter iterator: The iterator to peek into.
    /// - Returns: The next element if present; otherwise, `nil`.
    private static func peekNext<T: IteratorProtocol>(_ iterator: inout T) -> T.Element? {
        var copy = iterator
        return copy.next()
    }
}
0 commit comments