diff --git a/Sources/Hub/Hub.swift b/Sources/Hub/Hub.swift index b303736..a3b0d2f 100644 --- a/Sources/Hub/Hub.swift +++ b/Sources/Hub/Hub.swift @@ -1,6 +1,6 @@ // // Hub.swift -// +// // // Created by Pedro Cuenca on 18/5/23. // @@ -10,23 +10,57 @@ import Foundation public struct Hub {} public extension Hub { - enum HubClientError: Error { - case parse + enum HubClientError: LocalizedError { case authorizationRequired - case unexpectedError case httpStatusCode(Int) + case parse + case unexpectedError + case downloadError(String) + case fileNotFound(String) + case networkError(URLError) + case resourceNotFound(String) + case configurationMissing(String) + case fileSystemError(Error) + case parseError(String) + + public var errorDescription: String? { + switch self { + case .authorizationRequired: + return String(localized: "Authentication required. Please provide a valid Hugging Face token.") + case .httpStatusCode(let code): + return String(localized: "HTTP error with status code: \(code)") + case .parse: + return String(localized: "Failed to parse server response.") + case .unexpectedError: + return String(localized: "An unexpected error occurred.") + case .downloadError(let message): + return String(localized: "Download failed: \(message)") + case .fileNotFound(let filename): + return String(localized: "File not found: \(filename)") + case .networkError(let error): + return String(localized: "Network error: \(error.localizedDescription)") + case .resourceNotFound(let resource): + return String(localized: "Resource not found: \(resource)") + case .configurationMissing(let file): + return String(localized: "Required configuration file missing: \(file)") + case .fileSystemError(let error): + return String(localized: "File system error: \(error.localizedDescription)") + case .parseError(let message): + return String(localized: "Parse error: \(message)") + } + } } - + enum RepoType: String { case models case datasets case spaces } - + struct Repo { public let id: String public let type: RepoType - + public init(id: String, type: RepoType = .models) { self.id = id self.type = type @@ -51,11 +85,11 @@ public struct Config { .map { $0.offset == 0 ? $0.element.lowercased() : $0.element.capitalized } .joined() } - + func uncamelCase(_ string: String) -> String { let scalars = string.unicodeScalars var result = "" - + var previousCharacterIsLowercase = false for scalar in scalars { if CharacterSet.uppercaseLetters.contains(scalar) { @@ -70,7 +104,7 @@ public struct Config { previousCharacterIsLowercase = true } } - + return result } @@ -88,17 +122,17 @@ public struct Config { public var value: Any? { return dictionary["value"] } - + public var intValue: Int? { value as? Int } public var boolValue: Bool? { value as? Bool } public var stringValue: String? { value as? String } - + // Instead of doing this we could provide custom classes and decode to them public var arrayValue: [Config]? { guard let list = value as? [Any] else { return nil } return list.map { Config($0 as! [NSString : Any]) } } - + /// Tuple of token identifier and string value public var tokenValue: (UInt, String)? { value as? (UInt, String) } } @@ -120,7 +154,7 @@ public class LanguageModelConfigurationFromHub { return try await self.loadConfig(modelName: modelName, hubApi: hubApi) } } - + public init( modelFolder: URL, hubApi: HubApi = .shared @@ -179,47 +213,104 @@ public class LanguageModelConfigurationFromHub { ) async throws -> Configurations { let filesToDownload = ["config.json", "tokenizer_config.json", "chat_template.json", "tokenizer.json"] let repo = Hub.Repo(id: modelName) - let downloadedModelFolder = try await hubApi.snapshot(from: repo, matching: filesToDownload) - return try await loadConfig(modelFolder: downloadedModelFolder, hubApi: hubApi) + do { + let downloadedModelFolder = try await hubApi.snapshot(from: repo, matching: filesToDownload) + return try await loadConfig(modelFolder: downloadedModelFolder, hubApi: hubApi) + } catch { + // Convert generic errors to more specific ones + if let urlError = error as? URLError { + switch urlError.code { + case .notConnectedToInternet, .networkConnectionLost: + throw Hub.HubClientError.networkError(urlError) + case .resourceUnavailable: + throw Hub.HubClientError.resourceNotFound(modelName) + default: + throw Hub.HubClientError.networkError(urlError) + } + } else { + throw error + } + } } func loadConfig( modelFolder: URL, hubApi: HubApi = .shared ) async throws -> Configurations { - // Load required configurations - let modelConfig = try hubApi.configuration(fileURL: modelFolder.appending(path: "config.json")) - let tokenizerData = try hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer.json")) - // Load tokenizer config - var tokenizerConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer_config.json")) - // Check for chat template and merge if available - if let chatTemplateConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "chat_template.json")), - let chatTemplate = chatTemplateConfig.chatTemplate?.stringValue { - // The value of chat_template could also be an array of strings, but we're not handling that case here, since it's discouraged. - // Create or update tokenizer config with chat template - if var configDict = tokenizerConfig?.dictionary { - configDict["chat_template"] = chatTemplate - tokenizerConfig = Config(configDict) - } else { - tokenizerConfig = Config(["chat_template": chatTemplate]) + do { + // Load required configurations + let modelConfigURL = modelFolder.appending(path: "config.json") + guard FileManager.default.fileExists(atPath: modelConfigURL.path) else { + throw Hub.HubClientError.configurationMissing("config.json") + } + + let modelConfig = try hubApi.configuration(fileURL: modelConfigURL) + + let tokenizerDataURL = modelFolder.appending(path: "tokenizer.json") + guard FileManager.default.fileExists(atPath: tokenizerDataURL.path) else { + throw Hub.HubClientError.configurationMissing("tokenizer.json") + } + + let tokenizerData = try hubApi.configuration(fileURL: tokenizerDataURL) + + // Load tokenizer config (optional) + var tokenizerConfig: Config? = nil + let tokenizerConfigURL = modelFolder.appending(path: "tokenizer_config.json") + if FileManager.default.fileExists(atPath: tokenizerConfigURL.path) { + tokenizerConfig = try hubApi.configuration(fileURL: tokenizerConfigURL) + } + + // Check for chat template and merge if available + let chatTemplateURL = modelFolder.appending(path: "chat_template.json") + if FileManager.default.fileExists(atPath: chatTemplateURL.path), + let chatTemplateConfig = try? hubApi.configuration(fileURL: chatTemplateURL), + let chatTemplate = chatTemplateConfig.chatTemplate?.stringValue { + // Create or update tokenizer config with chat template + if var configDict = tokenizerConfig?.dictionary { + configDict["chat_template"] = chatTemplate + tokenizerConfig = Config(configDict) + } else { + tokenizerConfig = Config(["chat_template": chatTemplate]) + } + } + + return Configurations( + modelConfig: modelConfig, + tokenizerConfig: tokenizerConfig, + tokenizerData: tokenizerData + ) + } catch let error as Hub.HubClientError { + throw error + } catch { + if let nsError = error as NSError? { + if nsError.domain == NSCocoaErrorDomain && nsError.code == NSFileReadNoSuchFileError { + throw Hub.HubClientError.fileSystemError(error) + } else if nsError.domain == "NSJSONSerialization" { + throw Hub.HubClientError.parseError("Invalid JSON format: \(nsError.localizedDescription)") + } } + throw Hub.HubClientError.fileSystemError(error) } - return Configurations( - modelConfig: modelConfig, - tokenizerConfig: tokenizerConfig, - tokenizerData: tokenizerData - ) } static func fallbackTokenizerConfig(for modelType: String) -> Config? { - guard let url = Bundle.module.url(forResource: "\(modelType)_tokenizer_config", withExtension: "json") else { return nil } + guard let url = Bundle.module.url(forResource: "\(modelType)_tokenizer_config", withExtension: "json") else { + return nil + } + do { let data = try Data(contentsOf: url) let parsed = try JSONSerialization.jsonObject(with: data, options: []) - guard let dictionary = parsed as? [NSString: Any] else { return nil } + guard let dictionary = parsed as? [NSString: Any] else { + throw Hub.HubClientError.parseError("Failed to parse fallback tokenizer config") + } return Config(dictionary) + } catch let error as Hub.HubClientError { + print("Error loading fallback tokenizer config: \(error.localizedDescription)") + return nil } catch { + print("Error loading fallback tokenizer config: \(error.localizedDescription)") return nil } } diff --git a/Sources/Hub/HubApi.swift b/Sources/Hub/HubApi.swift index 6112399..4e4c1b3 100644 --- a/Sources/Hub/HubApi.swift +++ b/Sources/Hub/HubApi.swift @@ -92,18 +92,30 @@ public extension HubApi { if let hfToken = hfToken { request.setValue("Bearer \(hfToken)", forHTTPHeaderField: "Authorization") } - let (data, response) = try await URLSession.shared.data(for: request) - guard let response = response as? HTTPURLResponse else { throw Hub.HubClientError.unexpectedError } - - switch response.statusCode { - case 200..<300: break - case 400..<500: throw Hub.HubClientError.authorizationRequired - default: throw Hub.HubClientError.httpStatusCode(response.statusCode) - } - return (data, response) + do { + let (data, response) = try await URLSession.shared.data(for: request) + guard let httpResponse = response as? HTTPURLResponse else { + throw Hub.HubClientError.unexpectedError + } + + switch httpResponse.statusCode { + case 200..<300: + return (data, httpResponse) + case 401, 403: + throw Hub.HubClientError.authorizationRequired + case 404: + throw Hub.HubClientError.fileNotFound(url.lastPathComponent) + default: + throw Hub.HubClientError.httpStatusCode(httpResponse.statusCode) + } + } catch let error as Hub.HubClientError { + throw error + } catch { + throw Hub.HubClientError.downloadError(error.localizedDescription) + } } - + /// Throws error if page does not exist or is not accessible. /// Allows relative redirects but ignores absolute ones for LFS files. func httpHead(for url: URL) async throws -> (Data, HTTPURLResponse) { @@ -162,12 +174,19 @@ public extension HubApi { enum EnvironmentError: LocalizedError { case invalidMetadataError(String) case offlineModeError(String) - + case fileIntegrityError(String) + case fileWriteError(String) + public var errorDescription: String? { switch self { - case .invalidMetadataError(let message), - .offlineModeError(let message): - return message + case .invalidMetadataError(let message): + return String(localized: "Invalid metadata: \(message)") + case .offlineModeError(let message): + return String(localized: "Offline mode error: \(message)") + case .fileIntegrityError(let message): + return String(localized: "File integrity check failed: \(message)") + case .fileWriteError(let message): + return String(localized: "Failed to write file: \(message)") } } } @@ -224,43 +243,47 @@ public extension HubApi { func readDownloadMetadata(metadataPath: URL) throws -> LocalDownloadFileMetadata? { if FileManager.default.fileExists(atPath: metadataPath.path) { do { - let attributes = try FileManager.default.attributesOfItem(atPath: metadataPath.path) - print("File attributes: \(attributes)") let contents = try String(contentsOf: metadataPath, encoding: .utf8) let lines = contents.components(separatedBy: .newlines) - + guard lines.count >= 3 else { - throw EnvironmentError.invalidMetadataError("Metadata file is missing required fields.") + throw EnvironmentError.invalidMetadataError(String(localized: "Metadata file is missing required fields")) } - + let commitHash = lines[0].trimmingCharacters(in: .whitespacesAndNewlines) let etag = lines[1].trimmingCharacters(in: .whitespacesAndNewlines) + guard let timestamp = Double(lines[2].trimmingCharacters(in: .whitespacesAndNewlines)) else { - throw EnvironmentError.invalidMetadataError("Missing or invalid timestamp.") + throw EnvironmentError.invalidMetadataError(String(localized: "Invalid timestamp format")) } + let timestampDate = Date(timeIntervalSince1970: timestamp) - - // TODO: check if file hasn't been modified since the metadata was saved - // Reference: https://github.com/huggingface/huggingface_hub/blob/2fdc6f48ef5e6b22ee9bcdc1945948ac070da675/src/huggingface_hub/_local_folder.py#L303 - let filename = metadataPath.lastPathComponent.replacingOccurrences(of: ".metadata", with: "") - + return LocalDownloadFileMetadata(commitHash: commitHash, etag: etag, filename: filename, timestamp: timestampDate) + } catch let error as EnvironmentError { + do { + HubApi.logger.warning("Invalid metadata file \(metadataPath): \(error.localizedDescription). Removing it from disk and continuing.") + try FileManager.default.removeItem(at: metadataPath) + } catch { + throw EnvironmentError.invalidMetadataError(String(localized: "Could not remove corrupted metadata file: \(error.localizedDescription)")) + } + return nil } catch { do { - HubApi.logger.warning("Invalid metadata file \(metadataPath): \(error). Removing it from disk and continue.") + HubApi.logger.warning("Error reading metadata file \(metadataPath): \(error.localizedDescription). Removing it from disk and continuing.") try FileManager.default.removeItem(at: metadataPath) } catch { - throw EnvironmentError.invalidMetadataError("Could not remove corrupted metadata file \(metadataPath): \(error)") + throw EnvironmentError.invalidMetadataError(String(localized: "Could not remove corrupted metadata file: \(error.localizedDescription)")) } return nil } } - + // metadata file does not exist return nil } - + func isValidHash(hash: String, pattern: String) -> Bool { let regex = try? NSRegularExpression(pattern: pattern) let range = NSRange(location: 0, length: hash.utf16.count) @@ -270,34 +293,35 @@ public extension HubApi { func computeFileHash(file url: URL) throws -> String { // Open file for reading guard let fileHandle = try? FileHandle(forReadingFrom: url) else { - throw Hub.HubClientError.unexpectedError + throw Hub.HubClientError.fileNotFound(url.lastPathComponent) } - + defer { try? fileHandle.close() } - + var hasher = SHA256() let chunkSize = 1024 * 1024 // 1MB chunks - + while autoreleasepool(invoking: { let nextChunk = try? fileHandle.read(upToCount: chunkSize) - + guard let nextChunk, - !nextChunk.isEmpty + !nextChunk.isEmpty else { return false } - + hasher.update(data: nextChunk) - + return true }) { } - + let digest = hasher.finalize() return digest.map { String(format: "%02x", $0) }.joined() + } - + /// Reference: https://github.com/huggingface/huggingface_hub/blob/b2c9a148d465b43ab90fab6e4ebcbbf5a9df27d4/src/huggingface_hub/_local_folder.py#L391 func writeDownloadMetadata(commitHash: String, etag: String, metadataPath: URL) throws { let metadataContent = "\(commitHash)\n\(etag)\n\(Date().timeIntervalSince1970)\n" @@ -305,10 +329,10 @@ public extension HubApi { try FileManager.default.createDirectory(at: metadataPath.deletingLastPathComponent(), withIntermediateDirectories: true) try metadataContent.write(to: metadataPath, atomically: true, encoding: .utf8) } catch { - throw EnvironmentError.invalidMetadataError("Failed to write metadata file \(metadataPath)") + throw EnvironmentError.fileWriteError(String(localized: "Failed to write metadata to \(metadataPath.path): \(error.localizedDescription)")) } } - + struct HubFileDownloader { let repo: Repo let repoDestination: URL @@ -427,39 +451,39 @@ public extension HubApi { if useOfflineMode ?? NetworkMonitor.shared.shouldUseOfflineMode() { if !FileManager.default.fileExists(atPath: repoDestination.path) { - throw EnvironmentError.offlineModeError("File not available locally in offline mode") + throw EnvironmentError.offlineModeError(String(localized: "Repository not available locally")) } - + let fileUrls = try FileManager.default.getFileUrls(at: repoDestination) if fileUrls.isEmpty { - throw EnvironmentError.offlineModeError("File not available locally in offline mode") + throw EnvironmentError.offlineModeError(String(localized: "No files available locally for this repository")) } - + for fileUrl in fileUrls { let metadataPath = URL(fileURLWithPath: fileUrl.path.replacingOccurrences( - of: repoDestination.path, + of: repoDestination.path, with: repoMetadataDestination.path ) + ".metadata") - + let localMetadata = try readDownloadMetadata(metadataPath: metadataPath) - + guard let localMetadata = localMetadata else { - throw EnvironmentError.offlineModeError("Metadata not available or invalid in offline mode") + throw EnvironmentError.offlineModeError(String(localized: "Metadata not available for \(fileUrl.lastPathComponent)")) } let localEtag = localMetadata.etag - + // LFS file so check file integrity if self.isValidHash(hash: localEtag, pattern: self.sha256Pattern) { let fileHash = try computeFileHash(file: fileUrl) if fileHash != localEtag { - throw EnvironmentError.offlineModeError("File integrity check failed in offline mode") + throw EnvironmentError.fileIntegrityError(String(localized: "Hash mismatch for \(fileUrl.lastPathComponent)")) } } } - + return repoDestination } - + let filenames = try await getFilenames(from: repo, matching: globs) let progress = Progress(totalUnitCount: Int64(filenames.count)) for filename in filenames { diff --git a/Sources/Models/LanguageModel.swift b/Sources/Models/LanguageModel.swift index 22ba7aa..9e73ad9 100644 --- a/Sources/Models/LanguageModel.swift +++ b/Sources/Models/LanguageModel.swift @@ -214,6 +214,13 @@ extension LanguageModel: TextGenerationModel { } } -public enum TokenizerError: Error { +public enum TokenizerError: LocalizedError { case tokenizerConfigNotFound + + public var errorDescription: String? { + switch self { + case .tokenizerConfigNotFound: + return String(localized: "Tokenizer configuration could not be found. The model may be missing required tokenizer files.", comment: "Error when tokenizer configuration is missing") + } + } } diff --git a/Sources/TensorUtils/Weights.swift b/Sources/TensorUtils/Weights.swift index 2050e01..b77de6e 100644 --- a/Sources/TensorUtils/Weights.swift +++ b/Sources/TensorUtils/Weights.swift @@ -3,9 +3,18 @@ import CoreML public struct Weights { - enum WeightsError: Error { + enum WeightsError: LocalizedError { case notSupported(message: String) case invalidFile + + public var errorDescription: String? { + switch self { + case .notSupported(let message): + return String(localized: "The weight format '\(message)' is not supported by this application.", comment: "Error when weight format is not supported") + case .invalidFile: + return String(localized: "The weights file is invalid or corrupted.", comment: "Error when weight file is invalid") + } + } } private let dictionary: [String: MLMultiArray] diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index db53337..935a37e 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -12,7 +12,7 @@ import Jinja public typealias Message = [String: Any] public typealias ToolSpec = [String: Any] -public enum TokenizerError: Error { +public enum TokenizerError: LocalizedError { case missingConfig case missingTokenizerClassInConfig case unsupportedTokenizer(String) @@ -21,6 +21,27 @@ public enum TokenizerError: Error { case chatTemplate(String) case tooLong(String) case mismatchedConfig(String) + + public var errorDescription: String? { + switch self { + case .missingConfig: + return String(localized: "Tokenizer configuration is missing.", comment: "Error when tokenizer config cannot be found") + case .missingTokenizerClassInConfig: + return String(localized: "The tokenizer class is not specified in the configuration.", comment: "Error when tokenizer_class is missing in config") + case .unsupportedTokenizer(let name): + return String(localized: "The tokenizer type '\(name)' is not supported.", comment: "Error when tokenizer type is not supported") + case .missingVocab: + return String(localized: "Vocabulary file is missing from the tokenizer configuration.", comment: "Error when vocab file is missing") + case .malformedVocab: + return String(localized: "The vocabulary file is malformed or corrupted.", comment: "Error when vocab file is malformed") + case .chatTemplate(let message): + return String(localized: "Chat template error: \(message)", comment: "Error with chat template") + case .tooLong(let message): + return String(localized: "Input is too long: \(message)", comment: "Error when input exceeds maximum length") + case .mismatchedConfig(let message): + return String(localized: "Tokenizer configuration mismatch: \(message)", comment: "Error when tokenizer configuration is inconsistent") + } + } } public protocol TokenizingModel { diff --git a/Tests/HubTests/DownloaderTests.swift b/Tests/HubTests/DownloaderTests.swift index 452ec54..124e609 100644 --- a/Tests/HubTests/DownloaderTests.swift +++ b/Tests/HubTests/DownloaderTests.swift @@ -10,9 +10,18 @@ import Combine @testable import Hub /// Errors that can occur during the download process -enum DownloadError: Error { +enum DownloadError: LocalizedError { case invalidDownloadLocation case unexpectedError + + var errorDescription: String? { + switch self { + case .invalidDownloadLocation: + return String(localized: "The download location is invalid or inaccessible.", comment: "Error when download destination is invalid") + case .unexpectedError: + return String(localized: "An unexpected error occurred during the download process.", comment: "Generic download error message") + } + } } final class DownloaderTests: XCTestCase { diff --git a/Tests/HubTests/HubApiTests.swift b/Tests/HubTests/HubApiTests.swift index 756060b..f64b35c 100644 --- a/Tests/HubTests/HubApiTests.swift +++ b/Tests/HubTests/HubApiTests.swift @@ -852,7 +852,7 @@ class SnapshotDownloadTests: XCTestCase { } catch let error as HubApi.EnvironmentError { switch error { case .offlineModeError(let message): - XCTAssertEqual(message, "File not available locally in offline mode") + XCTAssertEqual(message, "Repository not available locally") default: XCTFail("Wrong error type: \(error)") } @@ -889,7 +889,7 @@ class SnapshotDownloadTests: XCTestCase { } catch let error as HubApi.EnvironmentError { switch error { case .offlineModeError(let message): - XCTAssertEqual(message, "Metadata not available or invalid in offline mode") + XCTAssertEqual(message, "Metadata not available for x.bin") default: XCTFail("Wrong error type: \(error)") } @@ -924,8 +924,8 @@ class SnapshotDownloadTests: XCTestCase { XCTFail("Expected an error to be thrown") } catch let error as HubApi.EnvironmentError { switch error { - case .offlineModeError(let message): - XCTAssertEqual(message, "File integrity check failed in offline mode") + case .fileIntegrityError(let message): + XCTAssertEqual(message, "Hash mismatch for x.bin") default: XCTFail("Wrong error type: \(error)") } @@ -960,7 +960,7 @@ class SnapshotDownloadTests: XCTestCase { } catch let error as HubApi.EnvironmentError { switch error { case .offlineModeError(let message): - XCTAssertEqual(message, "File not available locally in offline mode") + XCTAssertEqual(message, "No files available locally for this repository") default: XCTFail("Wrong error type: \(error)") }