From 4979491202e37832208e9fbe7b436817c668212e Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 4 Jun 2024 09:18:00 +0000 Subject: [PATCH 01/15] Implement parse hf url --- datafusion-cli/Cargo.lock | 1 + datafusion-cli/Cargo.toml | 1 + datafusion-cli/src/hf_store.rs | 449 +++++++++++++++++++++++++++++++++ datafusion-cli/src/lib.rs | 1 + 4 files changed, 452 insertions(+) create mode 100644 datafusion-cli/src/hf_store.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 304058650164..25601d4cbf82 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1183,6 +1183,7 @@ dependencies = [ "dirs", "env_logger", "futures", + "http", "mimalloc", "object_store", "parking_lot", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 4e3d800cfe97..f1aa8f4ad3a2 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -48,6 +48,7 @@ datafusion = { path = "../datafusion/core", version = "38.0.0", features = [ dirs = "4.0.0" env_logger = "0.9" futures = "0.3" +http= "0.2.12" mimalloc = { version = "0.1", default-features = false } object_store = { version = "0.9.0", features = ["aws", "gcp", "http"] } parking_lot = { version = "0.12" } diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs new file mode 100644 index 000000000000..c78190413fb2 --- /dev/null +++ b/datafusion-cli/src/hf_store.rs @@ -0,0 +1,449 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use bytes::Bytes; +use datafusion::common::{config_err, Result}; +use datafusion::config::{ + ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, Visit, +}; +use futures::stream::BoxStream; +use object_store::http::HttpStore; +use object_store::path::Path; +use object_store::{ + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, + PutResult, +}; +use std::any::Any; +use std::fmt::Display; +use std::sync::Arc; +use tokio::io::AsyncWrite; + +#[derive(Debug, Clone)] +pub struct ParsedHFUrl { + endpoint: Option, + path: Option, + repository: Option, + revision: Option, + repo_type: Option, +} + +impl Default for ParsedHFUrl { + fn default() -> Self { + Self { + endpoint: Some("https://huggingface.com".to_string()), + path: None, + repository: None, + revision: Some("main".to_string()), + repo_type: Some("datasets".to_string()), + } + } +} + +impl ParsedHFUrl { + pub const SCHEMA: &'static str = "hf://"; + + /// Parse a HuggingFace URL into a ParsedHFUrl struct. + /// The URL should be in the format `hf:///[@revision]/` + /// where `repo_type` is either `datasets` or `spaces`. + /// If the revision is not provided, it defaults to `main`. + /// If the endpoint is not provided, it defaults to `https://huggingface.com`. + /// + /// url: The HuggingFace URL to parse. 
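+    ///
+    /// Illustrative example (not from the patch): `hf://datasets/user/repo@dev/data/train.csv`
+    /// parses to `repo_type = "datasets"`, `repository = "user/repo"`,
+    /// `revision = "dev"`, `path = "data/train.csv"`.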
+ pub fn parse(url: String) -> Result { + if !url.starts_with(Self::SCHEMA) { + return config_err!( + "Invalid HuggingFace URL: {}, only 'hf://' URLs are supported", + url + ); + } + + let mut parsed_url = Self::default(); + let mut last_delim = 5; + + // parse repository type. + if let Some(curr_delim) = url[last_delim..].find('/') { + let repo_type = &url[last_delim..last_delim + curr_delim]; + if (repo_type != "datasets") && (repo_type != "spaces") { + return config_err!( + "Invalid HuggingFace URL: {}, currently only 'datasets' or 'spaces' are supported", + url + ); + } + + parsed_url.repo_type = Some(repo_type.to_string()); + last_delim += curr_delim + 1; + } else { + return config_err!("Invalid HuggingFace URL: {}, please format as 'hf:///[@revision]/'", url); + } + + let start_delim = last_delim; + // parse repository and revision. + if let Some(curr_delim) = url[last_delim..].find('/') { + last_delim += curr_delim + 1; + } else { + return config_err!("Invalid HuggingFace URL: {}, please format as 'hf:///[@revision]/'", url); + } + + let next_slash = url[last_delim..].find('/'); + + // next slash is not found + if next_slash.is_none() { + return config_err!("Invalid HuggingFace URL: {}, please format as 'hf:///[@revision]/'", url); + } + + let next_at = url[last_delim..].find('@'); + // @ is found before the next slash. + if let Some(at) = next_at { + if let Some(slash) = next_slash { + if at < slash { + let repo = &url[start_delim..last_delim + at]; + let revision = &url[last_delim + at + 1..last_delim + slash]; + parsed_url.repository = Some(repo.to_string()); + parsed_url.revision = Some(revision.to_string()); + last_delim += slash; + } + } + } + + // @ is not found before the next slash. + if parsed_url.repository.is_none() { + last_delim += next_slash.unwrap(); + let repo = &url[start_delim..last_delim]; + parsed_url.repository = Some(repo.to_string()); + } + + if (last_delim + 1) >= url.len() { + return config_err!( + "Invalid HuggingFace URL: {}, please specify a path", + url + ); + } + + // parse path. + let path = &url[last_delim + 1..]; + parsed_url.path = Some(path.to_string()); + + Ok(parsed_url) + } + + pub fn file_url(&self) -> Result { + let mut url = self.endpoint.clone().unwrap(); + url.push_str("/"); + url.push_str(self.repo_type.as_deref().unwrap()); + url.push_str("/"); + url.push_str(self.repository.as_deref().unwrap()); + url.push_str("/resolve/"); + url.push_str(self.revision.as_deref().unwrap()); + url.push_str("/"); + url.push_str(self.path.as_deref().unwrap()); + + Ok(url) + } + + pub fn tree_url(&self) -> Result { + let mut url = self.endpoint.clone().unwrap(); + url.push_str("/api/"); + url.push_str(self.repo_type.as_deref().unwrap()); + url.push_str("/"); + url.push_str(self.repository.as_deref().unwrap()); + url.push_str("/tree/"); + url.push_str(self.revision.as_deref().unwrap()); + url.push_str("/"); + url.push_str(self.path.as_deref().unwrap()); + + Ok(url) + } +} + +/// HFOptions is the configuration options for the HFStoreBuilder. 
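+///
+/// Illustrative usage from datafusion-cli SQL (key names assume the "hf"
+/// prefix declared below; the token value is a placeholder):
+///   CREATE EXTERNAL TABLE t STORED AS CSV
+///   OPTIONS('hf.endpoint' 'https://huggingface.co', 'hf.user_access_token' '<token>')
+///   LOCATION 'hf://datasets/user/repo/data.csv';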
+#[derive(Debug, Clone, Default)] +pub struct HFOptions { + endpoint: Option, + user_access_token: Option, +} + +impl ConfigExtension for HFOptions { + const PREFIX: &'static str = "hf"; +} + +impl ExtensionOptions for HFOptions { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn cloned(&self) -> Box { + Box::new(self.clone()) + } + + fn set(&mut self, key: &str, value: &str) -> datafusion::common::Result<()> { + let (_key, rem) = key.split_once('.').unwrap_or((key, "")); + match rem { + "endpoint" => { + self.endpoint.set(rem, value)?; + } + "user_access_token" => { + self.user_access_token.set(rem, value)?; + } + _ => { + return config_err!("Config value \"{}\" not found on HFOptions", rem); + } + } + Ok(()) + } + + fn entries(&self) -> Vec { + struct Visitor(Vec); + + impl Visit for Visitor { + fn some( + &mut self, + key: &str, + value: V, + description: &'static str, + ) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: Some(value.to_string()), + description, + }) + } + + fn none(&mut self, key: &str, description: &'static str) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: None, + description, + }) + } + } + + let mut v = Visitor(vec![]); + self.endpoint + .visit(&mut v, "endpoint", "The HuggingFace API endpoint"); + self.user_access_token.visit( + &mut v, + "user_access_token", + "The HuggingFace user access token", + ); + v.0 + } +} + +// pub struct HFStoreBuilder {} + +// #[derive(Debug, Clone)] +// pub struct HFStore { +// inner: Arc, +// } + +// impl Display for HFStore { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!(f, "HFStore") +// } +// } + +// impl HFStore {} + +// impl ObjectStore for HFStore { +// async fn put_opts( +// &self, +// location: &Path, +// bytes: Bytes, +// opts: PutOptions, +// ) -> object_store::Result { +// todo!() +// } + +// async fn put_multipart( +// &self, +// location: &Path, +// ) -> object_store::Result<(MultipartId, Box)> { +// todo!() +// } + +// async fn abort_multipart( +// &self, +// location: &Path, +// multipart_id: &MultipartId, +// ) -> object_store::Result<()> { +// todo!() +// } + +// async fn get_opts( +// &self, +// location: &Path, +// options: GetOptions, +// ) -> object_store::Result { +// todo!() +// } + +// async fn delete(&self, location: &Path) -> object_store::Result<()> { +// todo!() +// } + +// fn list( +// &self, +// prefix: Option<&Path>, +// ) -> BoxStream<'_, object_store::Result> { +// todo!() +// } + +// async fn list_with_delimiter( +// &self, +// prefix: Option<&Path>, +// ) -> object_store::Result { +// todo!() +// } + +// async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> { +// todo!() +// } + +// async fn copy_if_not_exists( +// &self, +// from: &Path, +// to: &Path, +// ) -> object_store::Result<()> { +// todo!() +// } +// } + +#[cfg(test)] +mod tests { + use datafusion::error::DataFusionError; + + use crate::hf_store::ParsedHFUrl; + + #[test] + fn test_parse_hf_url() { + let url = + "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + + let parsed_url = ParsedHFUrl::parse(url).unwrap(); + + assert_eq!( + parsed_url.endpoint, + Some("https://huggingface.com".to_string()) + ); + assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); + assert_eq!( + parsed_url.repository, + Some("datasets-examples/doc-formats-csv-1".to_string()) + ); + assert_eq!(parsed_url.revision, Some("main".to_string())); + assert_eq!(parsed_url.path, 
Some("data.csv".to_string())); + } + + #[test] + fn test_parse_hf_url_with_revision() { + let url = + "hf://datasets/datasets-examples/doc-formats-csv-1@~csv/data.csv".to_string(); + + let parsed_url = ParsedHFUrl::parse(url).unwrap(); + + assert_eq!( + parsed_url.endpoint, + Some("https://huggingface.com".to_string()) + ); + assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); + assert_eq!( + parsed_url.repository, + Some("datasets-examples/doc-formats-csv-1".to_string()) + ); + assert_eq!(parsed_url.revision, Some("~csv".to_string())); + assert_eq!(parsed_url.path, Some("data.csv".to_string())); + } + + #[test] + fn test_parse_hf_url_errors() { + test_error( + "hg://datasets/datasets-examples/doc-formats-csv-1/data.csv", + "Invalid HuggingFace URL: hg://datasets/datasets-examples/doc-formats-csv-1/data.csv, only 'hf://' URLs are supported", + ); + + test_error( + "hf://datasets/datasets-examples/doc-formats-csv-1", + "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1, please format as 'hf:///[@revision]/'", + ); + + test_error( + "hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv", + "Invalid HuggingFace URL: hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv, currently only 'datasets' or 'spaces' are supported", + ); + + test_error( + "hf://datasets/datasets-examples/doc-formats-csv-1@~csv", + "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1@~csv, please format as 'hf:///[@revision]/'", + ); + + test_error( + "hf://datasets/datasets-examples/doc-formats-csv-1@~csv/", + "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1@~csv/, please specify a path", + ); + } + + #[test] + fn test_file_url() { + let url = + "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + + let parsed_url = ParsedHFUrl::parse(url).unwrap(); + + let file_url = parsed_url.file_url().unwrap(); + + assert_eq!( + file_url, + "https://huggingface.com/datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv" + ); + } + + #[test] + fn test_tree_url() { + let url = + "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + + let parsed_url = ParsedHFUrl::parse(url).unwrap(); + + let tree_url = parsed_url.tree_url().unwrap(); + + assert_eq!( + tree_url, + "https://huggingface.com/api/datasets/datasets-examples/doc-formats-csv-1/tree/main/data.csv" + ); + } + + fn test_error(url: &str, expected: &str) { + let parsed_url_result = ParsedHFUrl::parse(url.to_string()); + + match parsed_url_result { + Ok(_) => panic!("Expected error, but got success"), + Err(err) => match err { + DataFusionError::Configuration(_) => { + assert_eq!( + err.to_string(), + format!("Invalid or Unsupported Configuration: {}", expected) + ) + } + _ => panic!("Expected Configuration error, but got {:?}", err), + }, + } + } +} diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs index 139a60b8cf16..1a877eb8bb92 100644 --- a/datafusion-cli/src/lib.rs +++ b/datafusion-cli/src/lib.rs @@ -27,3 +27,4 @@ pub mod highlighter; pub mod object_storage; pub mod print_format; pub mod print_options; +pub mod hf_store; From b47cca158bd8d3d8ab217443d7e792cf702e610a Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 4 Jun 2024 15:51:55 +0000 Subject: [PATCH 02/15] Draft hf_store impl --- datafusion-cli/Cargo.lock | 1 + datafusion-cli/Cargo.toml | 1 + datafusion-cli/src/hf_store.rs | 299 ++++++++++++++++++++++----------- 3 files changed, 205 insertions(+), 96 deletions(-) diff --git 
a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 25601d4cbf82..a2118f88a441 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1177,6 +1177,7 @@ dependencies = [ "async-trait", "aws-config", "aws-credential-types", + "bytes", "clap", "ctor", "datafusion", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index f1aa8f4ad3a2..721c555b6bb6 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -34,6 +34,7 @@ arrow = "51.0.0" async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" +bytes = "1.0" clap = { version = "3", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "38.0.0", features = [ "avro", diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index c78190413fb2..b64c77ce0fcd 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -15,23 +15,55 @@ // specific language governing permissions and limitations // under the License. +use async_trait::async_trait; use bytes::Bytes; use datafusion::common::{config_err, Result}; use datafusion::config::{ ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, Visit, }; +use datafusion::error::DataFusionError; use futures::stream::BoxStream; -use object_store::http::HttpStore; +use http::{header, HeaderMap}; +use object_store::http::{HttpBuilder, HttpStore}; use object_store::path::Path; use object_store::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, - PutResult, + ClientOptions, Error as ObjectStoreError, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult }; use std::any::Any; +use std::env; use std::fmt::Display; +use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; +pub const DEFAULT_ENDPOINT: &str = "https://huggingface.co"; + +pub enum HFConfigKey { + Endpoint, + UserAccessToken, +} + +impl AsRef for HFConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::Endpoint => "endpoint", + Self::UserAccessToken => "user_access_token", + } + } +} + +impl FromStr for HFConfigKey { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "endpoint" => Ok(Self::Endpoint), + "user_access_token" => Ok(Self::UserAccessToken), + _ => config_err!("Invalid HuggingFace configuration key: {}", s), + } + } +} + #[derive(Debug, Clone)] pub struct ParsedHFUrl { endpoint: Option, @@ -44,7 +76,7 @@ pub struct ParsedHFUrl { impl Default for ParsedHFUrl { fn default() -> Self { Self { - endpoint: Some("https://huggingface.com".to_string()), + endpoint: Some(DEFAULT_ENDPOINT.to_string()), path: None, repository: None, revision: Some("main".to_string()), @@ -60,10 +92,10 @@ impl ParsedHFUrl { /// The URL should be in the format `hf:///[@revision]/` /// where `repo_type` is either `datasets` or `spaces`. /// If the revision is not provided, it defaults to `main`. - /// If the endpoint is not provided, it defaults to `https://huggingface.com`. + /// If the endpoint is not provided, it defaults to `https://huggingface.co`. /// /// url: The HuggingFace URL to parse. 
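+    /// hf_options: Overrides applied before parsing; if `hf_options.endpoint`
+    /// is set (illustrative: a self-hosted mirror URL), it replaces the
+    /// default endpoint above.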
- pub fn parse(url: String) -> Result { + pub fn parse(url: String, hf_options: HFOptions) -> Result { if !url.starts_with(Self::SCHEMA) { return config_err!( "Invalid HuggingFace URL: {}, only 'hf://' URLs are supported", @@ -72,6 +104,10 @@ impl ParsedHFUrl { } let mut parsed_url = Self::default(); + if let Some(endpoint) = hf_options.endpoint { + parsed_url.endpoint = Some(endpoint); + } + let mut last_delim = 5; // parse repository type. @@ -236,8 +272,11 @@ impl ExtensionOptions for HFOptions { } let mut v = Visitor(vec![]); - self.endpoint - .visit(&mut v, "endpoint", "The HuggingFace API endpoint"); + self.endpoint.visit( + &mut v, + "endpoint", + "The HuggingFace API endpoint", + ); self.user_access_token.visit( &mut v, "user_access_token", @@ -247,101 +286,164 @@ impl ExtensionOptions for HFOptions { } } -// pub struct HFStoreBuilder {} - -// #[derive(Debug, Clone)] -// pub struct HFStore { -// inner: Arc, -// } - -// impl Display for HFStore { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!(f, "HFStore") -// } -// } - -// impl HFStore {} - -// impl ObjectStore for HFStore { -// async fn put_opts( -// &self, -// location: &Path, -// bytes: Bytes, -// opts: PutOptions, -// ) -> object_store::Result { -// todo!() -// } - -// async fn put_multipart( -// &self, -// location: &Path, -// ) -> object_store::Result<(MultipartId, Box)> { -// todo!() -// } - -// async fn abort_multipart( -// &self, -// location: &Path, -// multipart_id: &MultipartId, -// ) -> object_store::Result<()> { -// todo!() -// } - -// async fn get_opts( -// &self, -// location: &Path, -// options: GetOptions, -// ) -> object_store::Result { -// todo!() -// } - -// async fn delete(&self, location: &Path) -> object_store::Result<()> { -// todo!() -// } - -// fn list( -// &self, -// prefix: Option<&Path>, -// ) -> BoxStream<'_, object_store::Result> { -// todo!() -// } - -// async fn list_with_delimiter( -// &self, -// prefix: Option<&Path>, -// ) -> object_store::Result { -// todo!() -// } - -// async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> { -// todo!() -// } - -// async fn copy_if_not_exists( -// &self, -// from: &Path, -// to: &Path, -// ) -> object_store::Result<()> { -// todo!() -// } -// } +#[derive(Debug, Clone, Default)] +pub struct HFStoreBuilder { + endpoint: Option, + user_access_token: Option, +} + +impl HFStoreBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn with_endpoint(mut self, endpoint: String) -> Self { + self.endpoint = Some(endpoint); + self + } + + pub fn with_user_access_token(mut self, user_access_token: String) -> Self { + self.user_access_token = Some(user_access_token); + self + } + + pub fn from_env() -> Self { + let mut builder = Self::new(); + if let Ok(endpoint) = env::var("HF_ENDPOINT") { + builder = builder.with_endpoint(endpoint); + } + + if let Ok(user_access_token) = env::var("HF_USER_ACCESS_TOKEN") { + builder = builder.with_user_access_token(user_access_token); + } + + builder + } + + pub fn build(&self) -> Result { + let mut inner_builder = HttpBuilder::new(); + + if let Some(ep) = &self.endpoint { + inner_builder = inner_builder.with_url(ep); + } else { + inner_builder = inner_builder.with_url(DEFAULT_ENDPOINT); + } + + if let Some(user_access_token) = &self.user_access_token { + if let Ok(token) = format!("Bearer {}", user_access_token).parse() { + let mut header_map = HeaderMap::new(); + header_map.insert( + header::AUTHORIZATION, + token, + ); + let options = 
ClientOptions::new().with_default_headers(header_map); + + inner_builder = inner_builder.with_client_options(options); + } + } + let inner_store = inner_builder.build()?; + + return Ok(HFStore { + inner: Arc::new(inner_store), + }); + } +} + +#[derive(Debug, Clone)] +pub struct HFStore { + inner: Arc, +} + +impl Display for HFStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "HFStore") + } +} + +#[async_trait] +impl ObjectStore for HFStore { + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> object_store::Result { + Err(ObjectStoreError::NotSupported {source: "HFStore::put_opts".to_string().into()}) + } + + async fn put_multipart( + &self, + _location: &Path, + ) -> object_store::Result<(MultipartId, Box)> { + Err(ObjectStoreError::NotSupported {source: "HFStore::put_multipart".to_string().into()}) + } + + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> object_store::Result<()> { + Err(ObjectStoreError::NotSupported {source: "HFStore::abort_multipart".to_string().into()}) + } + + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> object_store::Result { + println!("HFStore::get_opts: {:?}", location); + + self.inner.get_opts(location, options).await + } + + async fn delete(&self, _location: &Path) -> object_store::Result<()> { + Err(ObjectStoreError::NotSupported {source: "HFStore::delete".to_string().into()}) + } + + fn list( + &self, + _prefix: Option<&Path>, + ) -> BoxStream<'_, object_store::Result> { + Box::pin(futures::stream::empty()) + } + + async fn list_with_delimiter( + &self, + _prefix: Option<&Path>, + ) -> object_store::Result { + Err(ObjectStoreError::NotSupported {source: "HFStore::list_with_delimiter".to_string().into()}) + } + + async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { + Err(ObjectStoreError::NotSupported {source: "HFStore::copy".to_string().into()}) + } + + async fn copy_if_not_exists( + &self, + _from: &Path, + _to: &Path, + ) -> object_store::Result<()> { + Err(ObjectStoreError::NotSupported {source: "HFStore::copy_if_not_exists".to_string().into()}) + } +} #[cfg(test)] mod tests { use datafusion::error::DataFusionError; - use crate::hf_store::ParsedHFUrl; + use crate::hf_store::{HFOptions, ParsedHFUrl}; #[test] fn test_parse_hf_url() { let url = "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + let options = HFOptions::default(); - let parsed_url = ParsedHFUrl::parse(url).unwrap(); + let parsed_url = ParsedHFUrl::parse(url, options).unwrap(); assert_eq!( parsed_url.endpoint, - Some("https://huggingface.com".to_string()) + Some("https://huggingface.co".to_string()) ); assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); assert_eq!( @@ -356,12 +458,13 @@ mod tests { fn test_parse_hf_url_with_revision() { let url = "hf://datasets/datasets-examples/doc-formats-csv-1@~csv/data.csv".to_string(); + let options = HFOptions::default(); - let parsed_url = ParsedHFUrl::parse(url).unwrap(); + let parsed_url = ParsedHFUrl::parse(url, options).unwrap(); assert_eq!( parsed_url.endpoint, - Some("https://huggingface.com".to_string()) + Some("https://huggingface.co".to_string()) ); assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); assert_eq!( @@ -404,14 +507,15 @@ mod tests { fn test_file_url() { let url = "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + let options = HFOptions::default(); - let parsed_url = 
ParsedHFUrl::parse(url).unwrap(); + let parsed_url = ParsedHFUrl::parse(url, options).unwrap(); let file_url = parsed_url.file_url().unwrap(); assert_eq!( file_url, - "https://huggingface.com/datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv" + "https://huggingface.co/datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv" ); } @@ -419,19 +523,22 @@ mod tests { fn test_tree_url() { let url = "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + let options = HFOptions::default(); - let parsed_url = ParsedHFUrl::parse(url).unwrap(); + let parsed_url = ParsedHFUrl::parse(url, options).unwrap(); let tree_url = parsed_url.tree_url().unwrap(); assert_eq!( tree_url, - "https://huggingface.com/api/datasets/datasets-examples/doc-formats-csv-1/tree/main/data.csv" + "https://huggingface.co/api/datasets/datasets-examples/doc-formats-csv-1/tree/main/data.csv" ); } fn test_error(url: &str, expected: &str) { - let parsed_url_result = ParsedHFUrl::parse(url.to_string()); + let options = HFOptions::default(); + + let parsed_url_result = ParsedHFUrl::parse(url.to_string(), options); match parsed_url_result { Ok(_) => panic!("Expected error, but got success"), From cc16287f96f486c8ec07ad4a103105f31eced22a Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 6 Jun 2024 20:43:03 +0800 Subject: [PATCH 03/15] Complete integrations code --- datafusion-cli/src/catalog.rs | 4 + datafusion-cli/src/hf_store.rs | 214 +++++++++------------------ datafusion-cli/src/object_storage.rs | 16 ++ 3 files changed, 92 insertions(+), 142 deletions(-) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index faa657da6511..6bc3c4458180 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -18,6 +18,7 @@ use std::any::Any; use std::sync::{Arc, Weak}; +use crate::hf_store::HFOptions; use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; use datafusion::catalog::schema::SchemaProvider; @@ -183,6 +184,9 @@ impl SchemaProvider for DynamicFileSchemaProvider { "gs" | "gcs" => { state = state.add_table_options_extension(GcpOptions::default()) } + "hf" => { + state = state.add_table_options_extension(HFOptions::default()); + } _ => {} }; let store = get_object_store( diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index b64c77ce0fcd..3a89d7cf18a4 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -15,26 +15,19 @@ // specific language governing permissions and limitations // under the License. 
-use async_trait::async_trait; -use bytes::Bytes; use datafusion::common::{config_err, Result}; use datafusion::config::{ ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, Visit, }; use datafusion::error::DataFusionError; -use futures::stream::BoxStream; use http::{header, HeaderMap}; use object_store::http::{HttpBuilder, HttpStore}; -use object_store::path::Path; -use object_store::{ - ClientOptions, Error as ObjectStoreError, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult -}; +use object_store::ClientOptions; +use url::Url; use std::any::Any; use std::env; use std::fmt::Display; use std::str::FromStr; -use std::sync::Arc; -use tokio::io::AsyncWrite; pub const DEFAULT_ENDPOINT: &str = "https://huggingface.co"; @@ -66,7 +59,6 @@ impl FromStr for HFConfigKey { #[derive(Debug, Clone)] pub struct ParsedHFUrl { - endpoint: Option, path: Option, repository: Option, revision: Option, @@ -76,7 +68,6 @@ pub struct ParsedHFUrl { impl Default for ParsedHFUrl { fn default() -> Self { Self { - endpoint: Some(DEFAULT_ENDPOINT.to_string()), path: None, repository: None, revision: Some("main".to_string()), @@ -95,7 +86,7 @@ impl ParsedHFUrl { /// If the endpoint is not provided, it defaults to `https://huggingface.co`. /// /// url: The HuggingFace URL to parse. - pub fn parse(url: String, hf_options: HFOptions) -> Result { + pub fn parse(url: String) -> Result { if !url.starts_with(Self::SCHEMA) { return config_err!( "Invalid HuggingFace URL: {}, only 'hf://' URLs are supported", @@ -104,10 +95,6 @@ impl ParsedHFUrl { } let mut parsed_url = Self::default(); - if let Some(endpoint) = hf_options.endpoint { - parsed_url.endpoint = Some(endpoint); - } - let mut last_delim = 5; // parse repository type. @@ -176,32 +163,29 @@ impl ParsedHFUrl { Ok(parsed_url) } - pub fn file_url(&self) -> Result { - let mut url = self.endpoint.clone().unwrap(); - url.push_str("/"); - url.push_str(self.repo_type.as_deref().unwrap()); - url.push_str("/"); + pub fn file_path(&self) -> String { + let mut url = self.repo_type.clone().unwrap(); + url.push('/'); url.push_str(self.repository.as_deref().unwrap()); url.push_str("/resolve/"); url.push_str(self.revision.as_deref().unwrap()); - url.push_str("/"); + url.push('/'); url.push_str(self.path.as_deref().unwrap()); - Ok(url) + url } - pub fn tree_url(&self) -> Result { - let mut url = self.endpoint.clone().unwrap(); - url.push_str("/api/"); + pub fn tree_path(&self) -> String { + let mut url = "api/".to_string(); url.push_str(self.repo_type.as_deref().unwrap()); - url.push_str("/"); + url.push('/'); url.push_str(self.repository.as_deref().unwrap()); url.push_str("/tree/"); url.push_str(self.revision.as_deref().unwrap()); - url.push_str("/"); + url.push('/'); url.push_str(self.path.as_deref().unwrap()); - Ok(url) + url } } @@ -290,6 +274,7 @@ impl ExtensionOptions for HFOptions { pub struct HFStoreBuilder { endpoint: Option, user_access_token: Option, + parsed_url: Option, } impl HFStoreBuilder { @@ -297,13 +282,19 @@ impl HFStoreBuilder { Self::default() } - pub fn with_endpoint(mut self, endpoint: String) -> Self { - self.endpoint = Some(endpoint); + pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { + self.endpoint = Some(endpoint.into()); + self } - pub fn with_user_access_token(mut self, user_access_token: String) -> Self { - self.user_access_token = Some(user_access_token); + pub fn with_user_access_token(mut self, user_access_token: impl Into) -> Self { + self.user_access_token = 
Some(user_access_token.into()); + self + } + + pub fn with_parsed_url(mut self, parsed_url: ParsedHFUrl) -> Self { + self.parsed_url = Some(parsed_url); self } @@ -320,15 +311,25 @@ impl HFStoreBuilder { builder } - pub fn build(&self) -> Result { - let mut inner_builder = HttpBuilder::new(); + pub fn build(&self) -> Result { + let mut builder = HttpBuilder::new(); + + if self.parsed_url.is_none() { + return config_err!("Parsed URL is required to build HFStore"); + } - if let Some(ep) = &self.endpoint { - inner_builder = inner_builder.with_url(ep); + let ep; + if let Some(endpoint) = &self.endpoint { + ep = endpoint.to_string(); } else { - inner_builder = inner_builder.with_url(DEFAULT_ENDPOINT); + ep = DEFAULT_ENDPOINT.to_string(); } + let url = format!("{}/{}", ep, self.parsed_url.as_ref().unwrap().file_path()); + println!("URL: {}", url); + + builder = builder.with_url(url); + if let Some(user_access_token) = &self.user_access_token { if let Ok(token) = format!("Bearer {}", user_access_token).parse() { let mut header_map = HeaderMap::new(); @@ -338,113 +339,47 @@ impl HFStoreBuilder { ); let options = ClientOptions::new().with_default_headers(header_map); - inner_builder = inner_builder.with_client_options(options); + builder = builder.with_client_options(options); } } - let inner_store = inner_builder.build()?; - - return Ok(HFStore { - inner: Arc::new(inner_store), - }); - } -} -#[derive(Debug, Clone)] -pub struct HFStore { - inner: Arc, -} - -impl Display for HFStore { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "HFStore") + builder.build().map_err(|e| DataFusionError::Execution(format!("Unable to build HFStore: {}", e))) } } -#[async_trait] -impl ObjectStore for HFStore { - async fn put_opts( - &self, - _location: &Path, - _bytes: Bytes, - _opts: PutOptions, - ) -> object_store::Result { - Err(ObjectStoreError::NotSupported {source: "HFStore::put_opts".to_string().into()}) - } +pub fn get_hf_object_store_builder( + url: &Url, + options: &HFOptions, +) -> Result + { + let parsed_url = ParsedHFUrl::parse(url.to_string())?; + let mut builder = HFStoreBuilder::from_env(); + builder = builder.with_parsed_url(parsed_url); - async fn put_multipart( - &self, - _location: &Path, - ) -> object_store::Result<(MultipartId, Box)> { - Err(ObjectStoreError::NotSupported {source: "HFStore::put_multipart".to_string().into()}) + if let Some(endpoint) = &options.endpoint { + builder = builder.with_endpoint(endpoint); } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> object_store::Result<()> { - Err(ObjectStoreError::NotSupported {source: "HFStore::abort_multipart".to_string().into()}) + if let Some(user_access_token) = &options.user_access_token { + builder = builder.with_user_access_token(user_access_token); } - async fn get_opts( - &self, - location: &Path, - options: GetOptions, - ) -> object_store::Result { - println!("HFStore::get_opts: {:?}", location); - - self.inner.get_opts(location, options).await - } - - async fn delete(&self, _location: &Path) -> object_store::Result<()> { - Err(ObjectStoreError::NotSupported {source: "HFStore::delete".to_string().into()}) - } - - fn list( - &self, - _prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { - Box::pin(futures::stream::empty()) - } - - async fn list_with_delimiter( - &self, - _prefix: Option<&Path>, - ) -> object_store::Result { - Err(ObjectStoreError::NotSupported {source: "HFStore::list_with_delimiter".to_string().into()}) - } - - async fn 
copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { - Err(ObjectStoreError::NotSupported {source: "HFStore::copy".to_string().into()}) - } - - async fn copy_if_not_exists( - &self, - _from: &Path, - _to: &Path, - ) -> object_store::Result<()> { - Err(ObjectStoreError::NotSupported {source: "HFStore::copy_if_not_exists".to_string().into()}) - } + Ok(builder) } #[cfg(test)] mod tests { use datafusion::error::DataFusionError; - use crate::hf_store::{HFOptions, ParsedHFUrl}; + use crate::hf_store::ParsedHFUrl; #[test] fn test_parse_hf_url() { let url = "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - let options = HFOptions::default(); - let parsed_url = ParsedHFUrl::parse(url, options).unwrap(); + let parsed_url = ParsedHFUrl::parse(url).unwrap(); - assert_eq!( - parsed_url.endpoint, - Some("https://huggingface.co".to_string()) - ); assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); assert_eq!( parsed_url.repository, @@ -458,14 +393,9 @@ mod tests { fn test_parse_hf_url_with_revision() { let url = "hf://datasets/datasets-examples/doc-formats-csv-1@~csv/data.csv".to_string(); - let options = HFOptions::default(); - let parsed_url = ParsedHFUrl::parse(url, options).unwrap(); + let parsed_url = ParsedHFUrl::parse(url).unwrap(); - assert_eq!( - parsed_url.endpoint, - Some("https://huggingface.co".to_string()) - ); assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); assert_eq!( parsed_url.repository, @@ -504,41 +434,41 @@ mod tests { } #[test] - fn test_file_url() { + fn test_file_path() { let url = "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - let options = HFOptions::default(); - let parsed_url = ParsedHFUrl::parse(url, options).unwrap(); + let parsed_url = ParsedHFUrl::parse(url); + + assert!(parsed_url.is_ok()); - let file_url = parsed_url.file_url().unwrap(); + let file_path = parsed_url.unwrap().file_path(); assert_eq!( - file_url, - "https://huggingface.co/datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv" + file_path, + "datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv" ); } #[test] - fn test_tree_url() { + fn test_tree_path() { let url = "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - let options = HFOptions::default(); - let parsed_url = ParsedHFUrl::parse(url, options).unwrap(); + let parsed_url = ParsedHFUrl::parse(url); - let tree_url = parsed_url.tree_url().unwrap(); + assert!(parsed_url.is_ok()); + + let tree_path = parsed_url.unwrap().tree_path(); assert_eq!( - tree_url, - "https://huggingface.co/api/datasets/datasets-examples/doc-formats-csv-1/tree/main/data.csv" + tree_path, + "api/datasets/datasets-examples/doc-formats-csv-1/tree/main/data.csv" ); } fn test_error(url: &str, expected: &str) { - let options = HFOptions::default(); - - let parsed_url_result = ParsedHFUrl::parse(url.to_string(), options); + let parsed_url_result = ParsedHFUrl::parse(url.to_string()); match parsed_url_result { Ok(_) => panic!("Expected error, but got success"), diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs index 85e0009bd267..376413498afa 100644 --- a/datafusion-cli/src/object_storage.rs +++ b/datafusion-cli/src/object_storage.rs @@ -35,6 +35,8 @@ use object_store::http::HttpBuilder; use object_store::{CredentialProvider, ObjectStore}; use url::Url; +use crate::hf_store::{get_hf_object_store_builder, HFOptions}; + pub async fn get_s3_object_store_builder( url: &Url, aws_options: &AwsOptions, @@ 
-429,6 +431,10 @@ pub(crate) fn register_options(ctx: &SessionContext, scheme: &str) { // Register GCP specific table options in the session context: ctx.register_table_options_extension(GcpOptions::default()) } + "hf" => { + // Register HF specific table options in the session context: + ctx.register_table_options_extension(HFOptions::default()) + } // For unsupported schemes, do nothing: _ => {} } @@ -477,6 +483,16 @@ pub(crate) async fn get_object_store( let builder = get_gcs_object_store_builder(url, options)?; Arc::new(builder.build()?) } + "hf" => { + let Some(options) = table_options.extensions.get::() else { + return exec_err!( + "Given table options incompatible with the 'hf' scheme" + ); + }; + + let builder = get_hf_object_store_builder(url, options)?; + Arc::new(builder.build()?) + } "http" | "https" => Arc::new( HttpBuilder::new() .with_url(url.origin().ascii_serialization()) From d324c682301f98a4b930a44c2008620c17ca74c3 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 6 Jun 2024 22:37:22 +0800 Subject: [PATCH 04/15] Implement get eventually --- datafusion-cli/src/hf_store.rs | 268 ++++++++++++++++++++++++++------- datafusion-cli/src/lib.rs | 2 +- 2 files changed, 211 insertions(+), 59 deletions(-) diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index 3a89d7cf18a4..f0987370cff1 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -15,20 +15,32 @@ // specific language governing permissions and limitations // under the License. +use async_trait::async_trait; +use bytes::Bytes; use datafusion::common::{config_err, Result}; use datafusion::config::{ ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, Visit, }; use datafusion::error::DataFusionError; +use futures::stream::BoxStream; +use futures::StreamExt; use http::{header, HeaderMap}; use object_store::http::{HttpBuilder, HttpStore}; -use object_store::ClientOptions; -use url::Url; +use object_store::path::Path; +use object_store::{ + ClientOptions, Error as ObjectStoreError, GetOptions, GetResult, ListResult, + MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + Result as ObjectStoreResult, +}; use std::any::Any; use std::env; use std::fmt::Display; use std::str::FromStr; +use std::sync::Arc; +use tokio::io::AsyncWrite; +use url::Url; +pub const STORE: &str = "hf"; pub const DEFAULT_ENDPOINT: &str = "https://huggingface.co"; pub enum HFConfigKey { @@ -87,15 +99,8 @@ impl ParsedHFUrl { /// /// url: The HuggingFace URL to parse. pub fn parse(url: String) -> Result { - if !url.starts_with(Self::SCHEMA) { - return config_err!( - "Invalid HuggingFace URL: {}, only 'hf://' URLs are supported", - url - ); - } - let mut parsed_url = Self::default(); - let mut last_delim = 5; + let mut last_delim = 0; // parse repository type. 
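+        // Illustrative walk-through: for "datasets/user/repo/data.csv" the
+        // slice before the first '/' ("datasets") is taken as the repo type.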
if let Some(curr_delim) = url[last_delim..].find('/') { @@ -197,7 +202,7 @@ pub struct HFOptions { } impl ConfigExtension for HFOptions { - const PREFIX: &'static str = "hf"; + const PREFIX: &'static str = STORE; } impl ExtensionOptions for HFOptions { @@ -256,11 +261,8 @@ impl ExtensionOptions for HFOptions { } let mut v = Visitor(vec![]); - self.endpoint.visit( - &mut v, - "endpoint", - "The HuggingFace API endpoint", - ); + self.endpoint + .visit(&mut v, "endpoint", "The HuggingFace API endpoint"); self.user_access_token.visit( &mut v, "user_access_token", @@ -273,8 +275,8 @@ impl ExtensionOptions for HFOptions { #[derive(Debug, Clone, Default)] pub struct HFStoreBuilder { endpoint: Option, + repo_type: Option, user_access_token: Option, - parsed_url: Option, } impl HFStoreBuilder { @@ -282,22 +284,25 @@ impl HFStoreBuilder { Self::default() } + pub fn with_repo_type(mut self, repo_type: impl Into) -> Self { + self.repo_type = Some(repo_type.into()); + self + } + pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { self.endpoint = Some(endpoint.into()); self } - pub fn with_user_access_token(mut self, user_access_token: impl Into) -> Self { + pub fn with_user_access_token( + mut self, + user_access_token: impl Into, + ) -> Self { self.user_access_token = Some(user_access_token.into()); self } - pub fn with_parsed_url(mut self, parsed_url: ParsedHFUrl) -> Self { - self.parsed_url = Some(parsed_url); - self - } - pub fn from_env() -> Self { let mut builder = Self::new(); if let Ok(endpoint) = env::var("HF_ENDPOINT") { @@ -311,12 +316,10 @@ impl HFStoreBuilder { builder } - pub fn build(&self) -> Result { - let mut builder = HttpBuilder::new(); + pub fn build(&self) -> Result { + let mut inner_builder = HttpBuilder::new(); - if self.parsed_url.is_none() { - return config_err!("Parsed URL is required to build HFStore"); - } + let repo_type = self.repo_type.clone().unwrap_or("datasets".to_string()); let ep; if let Some(endpoint) = &self.endpoint { @@ -325,36 +328,48 @@ impl HFStoreBuilder { ep = DEFAULT_ENDPOINT.to_string(); } - let url = format!("{}/{}", ep, self.parsed_url.as_ref().unwrap().file_path()); - println!("URL: {}", url); - - builder = builder.with_url(url); + inner_builder = inner_builder.with_url(ep.clone()); if let Some(user_access_token) = &self.user_access_token { if let Ok(token) = format!("Bearer {}", user_access_token).parse() { let mut header_map = HeaderMap::new(); - header_map.insert( - header::AUTHORIZATION, - token, - ); + header_map.insert(header::AUTHORIZATION, token); let options = ClientOptions::new().with_default_headers(header_map); - builder = builder.with_client_options(options); + inner_builder = inner_builder.with_client_options(options); } } - builder.build().map_err(|e| DataFusionError::Execution(format!("Unable to build HFStore: {}", e))) + let builder = inner_builder.build().map_err(|e| { + DataFusionError::Execution(format!("Unable to build HFStore: {}", e)) + })?; + + Ok(HFStore::new(ep, repo_type, Arc::new(builder))) } } pub fn get_hf_object_store_builder( url: &Url, options: &HFOptions, -) -> Result - { - let parsed_url = ParsedHFUrl::parse(url.to_string())?; +) -> Result { let mut builder = HFStoreBuilder::from_env(); - builder = builder.with_parsed_url(parsed_url); + + // The repo type is the first part of the path, which are treated as the origin in the process. 
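+    // e.g. Url::parse("hf://datasets/user/repo/data.csv").domain() yields
+    // Some("datasets") (illustrative URL; only 'datasets' and 'spaces' pass
+    // the check below).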
+ let Some(repo_type) = url.domain() else { + return config_err!( + "Invalid HuggingFace URL: {}, please format as 'hf:///[@revision]/'", + url + ); + }; + + if repo_type != "datasets" && repo_type != "spaces" { + return config_err!( + "Invalid HuggingFace URL: {}, currently only 'datasets' or 'spaces' are supported", + url + ); + } + + builder = builder.with_repo_type(repo_type); if let Some(endpoint) = &options.endpoint { builder = builder.with_endpoint(endpoint); @@ -367,6 +382,151 @@ pub fn get_hf_object_store_builder( Ok(builder) } +#[derive(Debug)] +pub struct HFStore { + endpoint: String, + repo_type: String, + store: Arc, +} + +impl HFStore { + pub fn new(endpoint: String, repo_type: String, store: Arc) -> Self { + Self { + endpoint, + repo_type, + store, + } + } +} + +impl Display for HFStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "HFStore({})", self.endpoint) + } +} + +#[async_trait] +impl ObjectStore for HFStore { + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> ObjectStoreResult { + Err(ObjectStoreError::NotImplemented) + } + + async fn put_multipart( + &self, + _location: &Path, + ) -> ObjectStoreResult<(MultipartId, Box)> { + Err(ObjectStoreError::NotImplemented) + } + + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> ObjectStoreResult<()> { + Err(ObjectStoreError::NotImplemented) + } + + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> ObjectStoreResult { + println!("GETTING: {}", location); + + let formatted_location = format!("{}/{}", self.repo_type, location); + + let Ok(parsed_url) = ParsedHFUrl::parse(formatted_location) else { + return Err(ObjectStoreError::Generic { + store: STORE, + source: format!("Unable to parse url {location}").into(), + }); + }; + + let file_path = parsed_url.file_path(); + println!("FILE_PATH: {:?}", file_path); + + let Ok(file_path) = Path::parse(file_path.clone()) else { + return Err(ObjectStoreError::Generic { + store: STORE, + source: format!("Invalid file path {}", file_path).into(), + }); + }; + + let mut res = self.store.get_opts(&file_path, options).await?; + + res.meta.location = location.clone(); + Ok(res) + } + + async fn delete(&self, _location: &Path) -> ObjectStoreResult<()> { + Err(ObjectStoreError::NotImplemented) + } + + async fn list_with_delimiter( + &self, + prefix: Option<&Path>, + ) -> ObjectStoreResult { + println!("LISTING_WITH_DELIMITER: {:?}", prefix); + + Err(ObjectStoreError::NotImplemented) + } + + fn list( + &self, + prefix: Option<&Path>, + ) -> BoxStream<'_, ObjectStoreResult> { + let Some(prefix) = prefix else { + return futures::stream::once(async { + Err(ObjectStoreError::Generic { + store: STORE, + source: "Prefix is required".into(), + }) + }) + .boxed(); + }; + + let formatted_prefix = format!("{}/{}", self.repo_type, prefix); + let Ok(parsed_url) = ParsedHFUrl::parse(formatted_prefix.clone()) else { + return futures::stream::once(async move { + Err(ObjectStoreError::Generic { + store: STORE, + source: format!("Unable to parse url {}", formatted_prefix.clone()).into(), + }) + }) + .boxed(); + }; + + let tree_path = Path::from(parsed_url.tree_path()); + println!("LISTING: {:?}", tree_path); + + futures::stream::once(async move { + let result = self.store.get(&tree_path).await; + + println!("RESULT: {:?}", result); + + Err(ObjectStoreError::NotImplemented) + }) + .boxed() + } + + async fn copy(&self, _from: &Path, _to: &Path) -> 
ObjectStoreResult<()> { + Err(ObjectStoreError::NotImplemented) + } + + async fn copy_if_not_exists( + &self, + _from: &Path, + _to: &Path, + ) -> ObjectStoreResult<()> { + Err(ObjectStoreError::NotImplemented) + } +} + #[cfg(test)] mod tests { use datafusion::error::DataFusionError; @@ -375,8 +535,7 @@ mod tests { #[test] fn test_parse_hf_url() { - let url = - "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); let parsed_url = ParsedHFUrl::parse(url).unwrap(); @@ -392,7 +551,7 @@ mod tests { #[test] fn test_parse_hf_url_with_revision() { let url = - "hf://datasets/datasets-examples/doc-formats-csv-1@~csv/data.csv".to_string(); + "datasets/datasets-examples/doc-formats-csv-1@~csv/data.csv".to_string(); let parsed_url = ParsedHFUrl::parse(url).unwrap(); @@ -408,35 +567,29 @@ mod tests { #[test] fn test_parse_hf_url_errors() { test_error( - "hg://datasets/datasets-examples/doc-formats-csv-1/data.csv", - "Invalid HuggingFace URL: hg://datasets/datasets-examples/doc-formats-csv-1/data.csv, only 'hf://' URLs are supported", - ); - - test_error( - "hf://datasets/datasets-examples/doc-formats-csv-1", + "datasets/datasets-examples/doc-formats-csv-1", "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1, please format as 'hf:///[@revision]/'", ); test_error( - "hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv", + "datadicts/datasets-examples/doc-formats-csv-1/data.csv", "Invalid HuggingFace URL: hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv, currently only 'datasets' or 'spaces' are supported", ); test_error( - "hf://datasets/datasets-examples/doc-formats-csv-1@~csv", + "datasets/datasets-examples/doc-formats-csv-1@~csv", "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1@~csv, please format as 'hf:///[@revision]/'", ); test_error( - "hf://datasets/datasets-examples/doc-formats-csv-1@~csv/", + "datasets/datasets-examples/doc-formats-csv-1@~csv/", "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1@~csv/, please specify a path", ); } #[test] fn test_file_path() { - let url = - "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); let parsed_url = ParsedHFUrl::parse(url); @@ -452,8 +605,7 @@ mod tests { #[test] fn test_tree_path() { - let url = - "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); let parsed_url = ParsedHFUrl::parse(url); diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs index 1a877eb8bb92..3a2994084657 100644 --- a/datafusion-cli/src/lib.rs +++ b/datafusion-cli/src/lib.rs @@ -23,8 +23,8 @@ pub mod command; pub mod exec; pub mod functions; pub mod helper; +pub mod hf_store; pub mod highlighter; pub mod object_storage; pub mod print_format; pub mod print_options; -pub mod hf_store; From e97a5a0accb955f1836d9ef39b4a178f08f37d96 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Fri, 7 Jun 2024 12:07:22 +0800 Subject: [PATCH 05/15] implement for list table api --- datafusion-cli/Cargo.lock | 2 + datafusion-cli/Cargo.toml | 2 + datafusion-cli/src/hf_store.rs | 212 +++++++++++++++++++++++++++++---- 3 files changed, 191 insertions(+), 25 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 7611083deb1d..b8b2600fe5c0 100644 --- 
a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1194,6 +1194,8 @@ dependencies = [ "regex", "rstest", "rustyline", + "serde", + "serde_json", "tokio", "url", ] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 721c555b6bb6..d6af643b88c2 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -56,6 +56,8 @@ parking_lot = { version = "0.12" } parquet = { version = "51.0.0", default-features = false } regex = "1.8" rustyline = "11.0" +serde = "1.0.117" +serde_json = "1.0.117" tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] } url = "2.2" diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index f0987370cff1..84c14fee2493 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use arrow::datatypes::ToByteSlice; use async_trait::async_trait; use bytes::Bytes; use datafusion::common::{config_err, Result}; @@ -22,8 +23,9 @@ use datafusion::config::{ ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, Visit, }; use datafusion::error::DataFusionError; +use futures::future::join_all; use futures::stream::BoxStream; -use futures::StreamExt; +use futures::{StreamExt, TryStreamExt}; use http::{header, HeaderMap}; use object_store::http::{HttpBuilder, HttpStore}; use object_store::path::Path; @@ -32,6 +34,8 @@ use object_store::{ MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, Result as ObjectStoreResult, }; +use serde::Deserialize; +use serde_json; use std::any::Any; use std::env; use std::fmt::Display; @@ -98,7 +102,7 @@ impl ParsedHFUrl { /// If the endpoint is not provided, it defaults to `https://huggingface.co`. /// /// url: The HuggingFace URL to parse. - pub fn parse(url: String) -> Result { + pub fn parse_hf_style(url: String) -> Result { let mut parsed_url = Self::default(); let mut last_delim = 0; @@ -168,14 +172,101 @@ impl ParsedHFUrl { Ok(parsed_url) } + /// Parse a http style HuggingFace URL into a ParsedHFUrl struct. + /// The URL should be in the format `https://huggingface.co///resolve//` + /// where `repo_type` is either `datasets` or `spaces`. + /// + /// url: The HuggingFace URL to parse. + fn parse_http_style(url: String) -> Result { + let mut parsed_url = Self::default(); + let mut last_delim = 0; + + // parse repository type. + if let Some(curr_delim) = url[last_delim..].find('/') { + let repo_type = &url[last_delim..last_delim + curr_delim]; + if (repo_type != "datasets") && (repo_type != "spaces") { + return config_err!( + "Invalid HuggingFace URL: {}, currently only 'datasets' or 'spaces' are supported", + url + ); + } + + parsed_url.repo_type = Some(repo_type.to_string()); + last_delim += curr_delim + 1; + } else { + return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); + } + + let start_delim = last_delim; + // parse repository and revision. 
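+        // Illustrative: in "datasets/user/repo/resolve/main/data.csv" the two
+        // segments after the repo type ("user/repo") form the repository and
+        // the segment after "resolve/" ("main") is the revision.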
+ if let Some(curr_delim) = url[last_delim..].find('/') { + last_delim += curr_delim + 1; + } else { + return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); + } + + let next_slash = url[last_delim..].find('/'); + + // next slash is not found + if next_slash.is_none() { + return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); + } + + parsed_url.repository = Some(url[start_delim..last_delim + next_slash.unwrap()].to_string()); + last_delim += next_slash.unwrap(); + + let next_resolve = url[last_delim..].find("resolve"); + if next_resolve.is_none() { + return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); + } + + last_delim += next_resolve.unwrap() + "resolve".len(); + + let next_slash = url[last_delim + 1..].find('/'); + if next_slash.is_none() { + return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); + } + + parsed_url.revision = Some(url[last_delim + 1..last_delim + 1 + next_slash.unwrap()].to_string()); + last_delim += 1 + next_slash.unwrap(); + + // parse path. + let path = &url[last_delim + 1..]; + parsed_url.path = Some(path.to_string()); + + Ok(parsed_url) + } + + pub fn hf_path(&self) -> String { + let mut url = self.repository.as_deref().unwrap().to_string(); + + if let Some(revision) = &self.revision { + if revision != "main" { + url.push('@'); + url.push_str(revision); + } + } + + url.push('/'); + url.push_str(self.path.as_deref().unwrap()); + + url + } + pub fn file_path(&self) -> String { + let mut url = self.file_path_prefix(); + url.push('/'); + url.push_str(self.path.as_deref().unwrap()); + + url + } + + pub fn file_path_prefix(&self) -> String { let mut url = self.repo_type.clone().unwrap(); url.push('/'); url.push_str(self.repository.as_deref().unwrap()); url.push_str("/resolve/"); url.push_str(self.revision.as_deref().unwrap()); - url.push('/'); - url.push_str(self.path.as_deref().unwrap()); url } @@ -386,7 +477,20 @@ pub fn get_hf_object_store_builder( pub struct HFStore { endpoint: String, repo_type: String, - store: Arc, + store: Arc, +} + +#[derive(Debug, Deserialize)] +pub struct HFTreeEntry { + pub r#type: String, + pub path: String, + pub oid: String, +} + +impl HFTreeEntry { + pub fn is_file(&self) -> bool { + self.r#type == "file" + } } impl HFStore { @@ -436,11 +540,9 @@ impl ObjectStore for HFStore { location: &Path, options: GetOptions, ) -> ObjectStoreResult { - println!("GETTING: {}", location); - let formatted_location = format!("{}/{}", self.repo_type, location); - let Ok(parsed_url) = ParsedHFUrl::parse(formatted_location) else { + let Ok(parsed_url) = ParsedHFUrl::parse_hf_style(formatted_location) else { return Err(ObjectStoreError::Generic { store: STORE, source: format!("Unable to parse url {location}").into(), @@ -448,7 +550,6 @@ impl ObjectStore for HFStore { }; let file_path = parsed_url.file_path(); - println!("FILE_PATH: {:?}", file_path); let Ok(file_path) = Path::parse(file_path.clone()) else { return Err(ObjectStoreError::Generic { @@ -458,7 +559,7 @@ impl ObjectStore for HFStore { }; let mut res = self.store.get_opts(&file_path, options).await?; - + res.meta.location = location.clone(); Ok(res) } @@ -469,9 +570,8 @@ impl ObjectStore for HFStore { async fn list_with_delimiter( &self, - prefix: Option<&Path>, + _prefix: Option<&Path>, ) -> ObjectStoreResult { - println!("LISTING_WITH_DELIMITER: {:?}", prefix); 
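+        // Delimiter-aware listing is not wired to the tree API; `list` below
+        // is the supported path and drives the repository tree endpoint.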
Err(ObjectStoreError::NotImplemented) } @@ -480,6 +580,7 @@ impl ObjectStore for HFStore { &self, prefix: Option<&Path>, ) -> BoxStream<'_, ObjectStoreResult> { + let Some(prefix) = prefix else { return futures::stream::once(async { Err(ObjectStoreError::Generic { @@ -491,26 +592,71 @@ impl ObjectStore for HFStore { }; let formatted_prefix = format!("{}/{}", self.repo_type, prefix); - let Ok(parsed_url) = ParsedHFUrl::parse(formatted_prefix.clone()) else { + let Ok(parsed_url) = ParsedHFUrl::parse_hf_style(formatted_prefix.clone()) else { return futures::stream::once(async move { Err(ObjectStoreError::Generic { store: STORE, - source: format!("Unable to parse url {}", formatted_prefix.clone()).into(), + source: format!("Unable to parse url {}", formatted_prefix.clone()) + .into(), }) }) .boxed(); }; - let tree_path = Path::from(parsed_url.tree_path()); - println!("LISTING: {:?}", tree_path); + let tree_path = parsed_url.tree_path(); + let file_path_prefix = parsed_url.file_path_prefix(); futures::stream::once(async move { - let result = self.store.get(&tree_path).await; - - println!("RESULT: {:?}", result); + let result = self.store.get(&Path::from(tree_path)).await?; + let Ok(bytes) = result.bytes().await else { + return Err(ObjectStoreError::Generic { + store: STORE, + source: "Unable to get list body".into(), + }); + }; + - Err(ObjectStoreError::NotImplemented) + let Ok(tree_result) = + serde_json::from_slice::>(bytes.to_byte_slice()) + else { + return Err(ObjectStoreError::Generic { + store: STORE, + source: "Unable to parse list body".into(), + }); + }; + + let iter = join_all( + tree_result + .into_iter() + .filter(|entry| entry.is_file()) + .map(|entry| format!("{}/{}", file_path_prefix, entry.path.clone())) + .map(|meta_location| async { + self.store.head(&Path::from(meta_location)).await + }), + ) + .await + .into_iter() + .map(|result| { + result.and_then(|mut meta| { + let Ok(location) = ParsedHFUrl::parse_http_style(meta.location.to_string()) else { + return Err(ObjectStoreError::Generic { + store: STORE, + source: format!("Unable to parse location {}", meta.location) + .into(), + }); + }; + meta.location = Path::from(location.hf_path()); + if let Some(e_tag) = meta.e_tag.as_deref() { + meta.e_tag = Some(e_tag.replace("\"", "")); + } + + Ok(meta) + }) + }); + + Ok::<_, ObjectStoreError>(futures::stream::iter(iter)) }) + .try_flatten() .boxed() } @@ -537,7 +683,7 @@ mod tests { fn test_parse_hf_url() { let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - let parsed_url = ParsedHFUrl::parse(url).unwrap(); + let parsed_url = ParsedHFUrl::parse_hf_style(url).unwrap(); assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); assert_eq!( @@ -553,7 +699,7 @@ mod tests { let url = "datasets/datasets-examples/doc-formats-csv-1@~csv/data.csv".to_string(); - let parsed_url = ParsedHFUrl::parse(url).unwrap(); + let parsed_url = ParsedHFUrl::parse_hf_style(url).unwrap(); assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); assert_eq!( @@ -587,11 +733,27 @@ mod tests { ); } + #[test] + fn test_parse_http_url() { + let url = "datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv".to_string(); + + let parsed_url = ParsedHFUrl::parse_http_style(url).unwrap(); + + assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); + assert_eq!( + parsed_url.repository, + Some("datasets-examples/doc-formats-csv-1".to_string()) + ); + assert_eq!(parsed_url.revision, Some("main".to_string())); + assert_eq!(parsed_url.path, 
Some("data.csv".to_string())); + } + + #[test] fn test_file_path() { let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - let parsed_url = ParsedHFUrl::parse(url); + let parsed_url = ParsedHFUrl::parse_hf_style(url); assert!(parsed_url.is_ok()); @@ -607,7 +769,7 @@ mod tests { fn test_tree_path() { let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - let parsed_url = ParsedHFUrl::parse(url); + let parsed_url = ParsedHFUrl::parse_hf_style(url); assert!(parsed_url.is_ok()); @@ -620,7 +782,7 @@ mod tests { } fn test_error(url: &str, expected: &str) { - let parsed_url_result = ParsedHFUrl::parse(url.to_string()); + let parsed_url_result = ParsedHFUrl::parse_hf_style(url.to_string()); match parsed_url_result { Ok(_) => panic!("Expected error, but got success"), From faac3bc3835e17653a4a602016bbecb5fa1b6472 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Fri, 7 Jun 2024 12:15:09 +0800 Subject: [PATCH 06/15] Fix CI --- datafusion-cli/src/hf_store.rs | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index 84c14fee2493..745902cb88e7 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -175,8 +175,8 @@ impl ParsedHFUrl { /// Parse a http style HuggingFace URL into a ParsedHFUrl struct. /// The URL should be in the format `https://huggingface.co///resolve//` /// where `repo_type` is either `datasets` or `spaces`. - /// - /// url: The HuggingFace URL to parse. + /// + /// url: The HuggingFace URL to parse. fn parse_http_style(url: String) -> Result { let mut parsed_url = Self::default(); let mut last_delim = 0; @@ -212,14 +212,15 @@ impl ParsedHFUrl { return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); } - parsed_url.repository = Some(url[start_delim..last_delim + next_slash.unwrap()].to_string()); + parsed_url.repository = + Some(url[start_delim..last_delim + next_slash.unwrap()].to_string()); last_delim += next_slash.unwrap(); - + let next_resolve = url[last_delim..].find("resolve"); if next_resolve.is_none() { return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); } - + last_delim += next_resolve.unwrap() + "resolve".len(); let next_slash = url[last_delim + 1..].find('/'); @@ -227,7 +228,8 @@ impl ParsedHFUrl { return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); } - parsed_url.revision = Some(url[last_delim + 1..last_delim + 1 + next_slash.unwrap()].to_string()); + parsed_url.revision = + Some(url[last_delim + 1..last_delim + 1 + next_slash.unwrap()].to_string()); last_delim += 1 + next_slash.unwrap(); // parse path. 
@@ -572,7 +574,6 @@ impl ObjectStore for HFStore { &self, _prefix: Option<&Path>, ) -> ObjectStoreResult { - Err(ObjectStoreError::NotImplemented) } @@ -580,7 +581,6 @@ impl ObjectStore for HFStore { &self, prefix: Option<&Path>, ) -> BoxStream<'_, ObjectStoreResult> { - let Some(prefix) = prefix else { return futures::stream::once(async { Err(ObjectStoreError::Generic { @@ -615,7 +615,6 @@ impl ObjectStore for HFStore { }); }; - let Ok(tree_result) = serde_json::from_slice::>(bytes.to_byte_slice()) else { @@ -638,7 +637,9 @@ impl ObjectStore for HFStore { .into_iter() .map(|result| { result.and_then(|mut meta| { - let Ok(location) = ParsedHFUrl::parse_http_style(meta.location.to_string()) else { + let Ok(location) = + ParsedHFUrl::parse_http_style(meta.location.to_string()) + else { return Err(ObjectStoreError::Generic { store: STORE, source: format!("Unable to parse location {}", meta.location) @@ -714,28 +715,29 @@ mod tests { fn test_parse_hf_url_errors() { test_error( "datasets/datasets-examples/doc-formats-csv-1", - "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1, please format as 'hf:///[@revision]/'", + "Invalid HuggingFace URL: datasets/datasets-examples/doc-formats-csv-1, please format as 'hf:///[@revision]/'", ); test_error( "datadicts/datasets-examples/doc-formats-csv-1/data.csv", - "Invalid HuggingFace URL: hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv, currently only 'datasets' or 'spaces' are supported", + "Invalid HuggingFace URL: datadicts/datasets-examples/doc-formats-csv-1/data.csv, currently only 'datasets' or 'spaces' are supported", ); test_error( "datasets/datasets-examples/doc-formats-csv-1@~csv", - "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1@~csv, please format as 'hf:///[@revision]/'", + "Invalid HuggingFace URL: datasets/datasets-examples/doc-formats-csv-1@~csv, please format as 'hf:///[@revision]/'", ); test_error( "datasets/datasets-examples/doc-formats-csv-1@~csv/", - "Invalid HuggingFace URL: hf://datasets/datasets-examples/doc-formats-csv-1@~csv/, please specify a path", + "Invalid HuggingFace URL: datasets/datasets-examples/doc-formats-csv-1@~csv/, please specify a path", ); } #[test] fn test_parse_http_url() { - let url = "datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv".to_string(); + let url = "datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv" + .to_string(); let parsed_url = ParsedHFUrl::parse_http_style(url).unwrap(); @@ -748,7 +750,6 @@ mod tests { assert_eq!(parsed_url.path, Some("data.csv".to_string())); } - #[test] fn test_file_path() { let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); From 45917348fc90c64b26e191af5a08fcb74b4e6ffd Mon Sep 17 00:00:00 2001 From: Xin Li Date: Fri, 7 Jun 2024 19:01:09 +0800 Subject: [PATCH 07/15] Add e2e ci test --- datafusion-cli/Cargo.toml | 2 +- datafusion-cli/src/hf_store.rs | 9 ++--- datafusion-cli/tests/cli_integration.rs | 33 +++++++++++++++++++ .../tests/data/hf_store_expected.jsonl | 20 +++++++++++ datafusion-cli/tests/data/hf_store_sql.txt | 9 +++++ 5 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 datafusion-cli/tests/data/hf_store_expected.jsonl create mode 100644 datafusion-cli/tests/data/hf_store_sql.txt diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index d6af643b88c2..38c9bf4cb66c 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -65,4 +65,4 @@ url = "2.2" assert_cmd = "2.0" ctor = "0.2.0" predicates = 
"3.0" -rstest = "0.17" +rstest = "0.17" \ No newline at end of file diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index 745902cb88e7..f7f93e90687d 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -607,7 +607,7 @@ impl ObjectStore for HFStore { let file_path_prefix = parsed_url.file_path_prefix(); futures::stream::once(async move { - let result = self.store.get(&Path::from(tree_path)).await?; + let result = self.store.get(&Path::parse(tree_path)?).await?; let Ok(bytes) = result.bytes().await else { return Err(ObjectStoreError::Generic { store: STORE, @@ -630,7 +630,7 @@ impl ObjectStore for HFStore { .filter(|entry| entry.is_file()) .map(|entry| format!("{}/{}", file_path_prefix, entry.path.clone())) .map(|meta_location| async { - self.store.head(&Path::from(meta_location)).await + self.store.head(&Path::parse(meta_location)?).await }), ) .await @@ -646,9 +646,10 @@ impl ObjectStore for HFStore { .into(), }); }; - meta.location = Path::from(location.hf_path()); + + meta.location = Path::from_url_path(location.hf_path())?; if let Some(e_tag) = meta.e_tag.as_deref() { - meta.e_tag = Some(e_tag.replace("\"", "")); + meta.e_tag = Some(e_tag.replace('"', "")); } Ok(meta) diff --git a/datafusion-cli/tests/cli_integration.rs b/datafusion-cli/tests/cli_integration.rs index 119a0aa39d3c..72a89c511d10 100644 --- a/datafusion-cli/tests/cli_integration.rs +++ b/datafusion-cli/tests/cli_integration.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use std::fs; use std::process::Command; use assert_cmd::prelude::{CommandCargoExt, OutputAssertExt}; @@ -54,3 +55,35 @@ fn cli_quick_test<'a>( cmd.args(args); cmd.assert().stdout(predicate::eq(expected)); } + +#[rstest] +#[case::exec_hf_store_test( + ["--file", "tests/data/hf_store_sql.txt", "--format", "json", "-q"], + "tests/data/hf_store_expected.jsonl", +)] +#[test] +fn cli_hf_store_test<'a>( + #[case] args: impl IntoIterator, + #[case] expected_file: &str, +) { + let mut cmd = Command::cargo_bin("datafusion-cli").unwrap(); + cmd.args(args); + + let actual: Vec = serde_json::Deserializer::from_str( + String::from_utf8(cmd.assert().get_output().stdout.to_vec()) + .unwrap() + .as_str(), + ) + .into_iter::() + .collect::, _>>() + .unwrap(); + + let expected: Vec = serde_json::Deserializer::from_str( + fs::read_to_string(expected_file).unwrap().as_str(), + ) + .into_iter::() + .collect::, _>>() + .unwrap(); + + assert_eq!(actual, expected); +} diff --git a/datafusion-cli/tests/data/hf_store_expected.jsonl b/datafusion-cli/tests/data/hf_store_expected.jsonl new file mode 100644 index 000000000000..f27309db036c --- /dev/null +++ b/datafusion-cli/tests/data/hf_store_expected.jsonl @@ -0,0 +1,20 @@ +[ + { + "COUNT(*)": 5 + } +] +[ + { + "COUNT(*)": 152 + } +] +[ + { + "COUNT(*)": 173 + } +] +[ + { + "COUNT(*)": 152 + } +] diff --git a/datafusion-cli/tests/data/hf_store_sql.txt b/datafusion-cli/tests/data/hf_store_sql.txt new file mode 100644 index 000000000000..c56941d3a78a --- /dev/null +++ b/datafusion-cli/tests/data/hf_store_sql.txt @@ -0,0 +1,9 @@ +select count(*) from "hf://datasets/cais/mmlu/astronomy/dev-00000-of-00001.parquet"; + +select count(*) from "hf://datasets/cais/mmlu@~parquet/astronomy/test/0000.parquet"; + +create external table test stored as parquet location "hf://datasets/cais/mmlu/astronomy/"; +SELECT count(*) FROM test; + +create external table test_revision stored as parquet location 
"hf://datasets/cais/mmlu@~parquet/astronomy/test/"; +SELECT count(*) FROM test_revision; \ No newline at end of file From f59889dbb445455037f5fc7f027b452555f6f79e Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sun, 9 Jun 2024 22:44:51 +0800 Subject: [PATCH 08/15] stage path change --- datafusion-cli/Cargo.lock | 1 + datafusion-cli/Cargo.toml | 1 + datafusion-cli/src/hf_store.rs | 315 +++++++++++++-------------------- 3 files changed, 122 insertions(+), 195 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index b8b2600fe5c0..7f8d70b6be1a 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1196,6 +1196,7 @@ dependencies = [ "rustyline", "serde", "serde_json", + "snafu", "tokio", "url", ] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 38c9bf4cb66c..7747654b9d63 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -58,6 +58,7 @@ regex = "1.8" rustyline = "11.0" serde = "1.0.117" serde_json = "1.0.117" +snafu = "0.7" tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] } url = "2.2" diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index f7f93e90687d..8e72bc6029f8 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -36,6 +36,7 @@ use object_store::{ }; use serde::Deserialize; use serde_json; +use snafu::{OptionExt, ResultExt, Snafu}; use std::any::Any; use std::env; use std::fmt::Display; @@ -47,6 +48,35 @@ use url::Url; pub const STORE: &str = "hf"; pub const DEFAULT_ENDPOINT: &str = "https://huggingface.co"; +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { url: String, source: url::ParseError }, + + #[snafu(display("Unsupported schema {} in url {}, only 'hf' is supported", schema, url))] + UnsupportedUrlScheme { schema: String, url: String }, + + #[snafu(display("Invalid huggingface url: {}, please format as 'hf:///[@revision]/'", url))] + InvalidHfUrl { url: String }, + + #[snafu(display("Unsupported repository type: {}, currently only 'datasets' or 'spaces' are supported", repo_type))] + UnsupportedRepoType { repo_type: String }, + + #[snafu(display("Unable to parse location {} into ParsedHFUrl, please format as '//resolve//'", location))] + InvalidLocation { location: String }, +} + +impl From for ObjectStoreError { + fn from(source: Error) -> Self { + match source { + _ => ObjectStoreError::Generic { + store: STORE, + source: Box::new(source) + }, + } + } +} + pub enum HFConfigKey { Endpoint, UserAccessToken, @@ -93,150 +123,80 @@ impl Default for ParsedHFUrl { } impl ParsedHFUrl { - pub const SCHEMA: &'static str = "hf://"; - /// Parse a HuggingFace URL into a ParsedHFUrl struct. /// The URL should be in the format `hf:///[@revision]/` /// where `repo_type` is either `datasets` or `spaces`. /// If the revision is not provided, it defaults to `main`. - /// If the endpoint is not provided, it defaults to `https://huggingface.co`. /// /// url: The HuggingFace URL to parse. - pub fn parse_hf_style(url: String) -> Result { - let mut parsed_url = Self::default(); - let mut last_delim = 0; - - // parse repository type. 
- if let Some(curr_delim) = url[last_delim..].find('/') { - let repo_type = &url[last_delim..last_delim + curr_delim]; - if (repo_type != "datasets") && (repo_type != "spaces") { - return config_err!( - "Invalid HuggingFace URL: {}, currently only 'datasets' or 'spaces' are supported", - url - ); - } + pub fn parse_hf_style_url(url: String) -> ObjectStoreResult { + let url = Url::parse(&url).context(UnableToParseUrlSnafu { url })?; - parsed_url.repo_type = Some(repo_type.to_string()); - last_delim += curr_delim + 1; - } else { - return config_err!("Invalid HuggingFace URL: {}, please format as 'hf:///[@revision]/'", url); - } - - let start_delim = last_delim; - // parse repository and revision. - if let Some(curr_delim) = url[last_delim..].find('/') { - last_delim += curr_delim + 1; - } else { - return config_err!("Invalid HuggingFace URL: {}, please format as 'hf:///[@revision]/'", url); - } - - let next_slash = url[last_delim..].find('/'); - - // next slash is not found - if next_slash.is_none() { - return config_err!("Invalid HuggingFace URL: {}, please format as 'hf:///[@revision]/'", url); - } - - let next_at = url[last_delim..].find('@'); - // @ is found before the next slash. - if let Some(at) = next_at { - if let Some(slash) = next_slash { - if at < slash { - let repo = &url[start_delim..last_delim + at]; - let revision = &url[last_delim + at + 1..last_delim + slash]; - parsed_url.repository = Some(repo.to_string()); - parsed_url.revision = Some(revision.to_string()); - last_delim += slash; - } + if url.scheme() != "hf" { + return Err(UnsupportedUrlSchemeSnafu { + schema: url.scheme().to_string(), + url: url.to_string(), } + .build() + .into()); } - // @ is not found before the next slash. - if parsed_url.repository.is_none() { - last_delim += next_slash.unwrap(); - let repo = &url[start_delim..last_delim]; - parsed_url.repository = Some(repo.to_string()); - } - - if (last_delim + 1) >= url.len() { - return config_err!( - "Invalid HuggingFace URL: {}, please specify a path", - url - ); - } - - // parse path. - let path = &url[last_delim + 1..]; - parsed_url.path = Some(path.to_string()); - - Ok(parsed_url) + // domain is the first part of the path, which are treated as the origin in the url. + let repo_type = url.domain().context(InvalidHfUrlSnafu { url: url.clone() })?; + + Ok(Self::parse_hf_style_path(repo_type, url.path())?) } - /// Parse a http style HuggingFace URL into a ParsedHFUrl struct. - /// The URL should be in the format `https://huggingface.co///resolve//` + /// Parse a HuggingFace path into a ParsedHFUrl struct. + /// The path should be in the format `/resolve//` with given `repo_type`. /// where `repo_type` is either `datasets` or `spaces`. - /// - /// url: The HuggingFace URL to parse. - fn parse_http_style(url: String) -> Result { + /// + /// repo_type: The repository type, either `datasets` or `spaces`. + /// path: The HuggingFace path to parse. + fn parse_hf_style_path(repo_type: &str, mut path: &str) -> ObjectStoreResult { let mut parsed_url = Self::default(); - let mut last_delim = 0; - - // parse repository type. 
- if let Some(curr_delim) = url[last_delim..].find('/') { - let repo_type = &url[last_delim..last_delim + curr_delim]; - if (repo_type != "datasets") && (repo_type != "spaces") { - return config_err!( - "Invalid HuggingFace URL: {}, currently only 'datasets' or 'spaces' are supported", - url - ); - } - parsed_url.repo_type = Some(repo_type.to_string()); - last_delim += curr_delim + 1; - } else { - return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); + if (repo_type != "datasets") && (repo_type != "spaces") { + return Err(UnsupportedRepoTypeSnafu { repo_type }.build().into()); } - let start_delim = last_delim; - // parse repository and revision. - if let Some(curr_delim) = url[last_delim..].find('/') { - last_delim += curr_delim + 1; - } else { - return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); - } + parsed_url.repo_type = Some(repo_type.to_string()); - let next_slash = url[last_delim..].find('/'); + // remove leading slash which is not needed. + path = path.trim_start_matches('/'); - // next slash is not found - if next_slash.is_none() { - return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); + // parse the repository and revision. + // - case 1: // where / is the repository and defaults to main. + // - case 2: /@/ where / is the repository and is the revision. + let pathes = path.splitn(3, '/').collect::>(); + if pathes.len() != 3 { + return Err(InvalidHfUrlSnafu { url: path }.build().into()); } - parsed_url.repository = - Some(url[start_delim..last_delim + next_slash.unwrap()].to_string()); - last_delim += next_slash.unwrap(); - - let next_resolve = url[last_delim..].find("resolve"); - if next_resolve.is_none() { - return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); + let revision_parts = pathes[1].splitn(2, '@').collect::>(); + if revision_parts.len() == 2 { + parsed_url.repository = Some(format!("{}/{}", pathes[0], revision_parts[0])); + parsed_url.revision = Some(revision_parts[1].to_string()); + } else { + parsed_url.repository = Some(format!("{}/{}", pathes[0], pathes[1])); } - last_delim += next_resolve.unwrap() + "resolve".len(); + parsed_url.path = Some(pathes[2].to_string()); - let next_slash = url[last_delim + 1..].find('/'); - if next_slash.is_none() { - return config_err!("Invalid HuggingFace URL: {}, please format as 'https://huggingface.co///resolve//'", url); - } + Ok(parsed_url) + } - parsed_url.revision = - Some(url[last_delim + 1..last_delim + 1 + next_slash.unwrap()].to_string()); - last_delim += 1 + next_slash.unwrap(); + /// Parse a http style HuggingFace path into a ParsedHFUrl struct. + /// The path should be in the format `//resolve//` + /// where `repo_type` is either `datasets` or `spaces`. + /// + /// path: The HuggingFace path to parse. + fn parse_http_style_path(path: &Path) -> ObjectStoreResult { + let mut parsed_url = Self::default(); - // parse path. 
- let path = &url[last_delim + 1..]; - parsed_url.path = Some(path.to_string()); + let parts = path.parts(); - Ok(parsed_url) + parsed_url.repo_type = parts.next().map(|p| p.raw.to_string()); } pub fn hf_path(&self) -> String { @@ -544,7 +504,7 @@ impl ObjectStore for HFStore { ) -> ObjectStoreResult { let formatted_location = format!("{}/{}", self.repo_type, location); - let Ok(parsed_url) = ParsedHFUrl::parse_hf_style(formatted_location) else { + let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_location) else { return Err(ObjectStoreError::Generic { store: STORE, source: format!("Unable to parse url {location}").into(), @@ -592,7 +552,7 @@ impl ObjectStore for HFStore { }; let formatted_prefix = format!("{}/{}", self.repo_type, prefix); - let Ok(parsed_url) = ParsedHFUrl::parse_hf_style(formatted_prefix.clone()) else { + let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_prefix.clone()) else { return futures::stream::once(async move { Err(ObjectStoreError::Generic { store: STORE, @@ -638,7 +598,7 @@ impl ObjectStore for HFStore { .map(|result| { result.and_then(|mut meta| { let Ok(location) = - ParsedHFUrl::parse_http_style(meta.location.to_string()) + ParsedHFUrl::parse_http_style_path(meta.location) else { return Err(ObjectStoreError::Generic { store: STORE, @@ -677,85 +637,67 @@ impl ObjectStore for HFStore { #[cfg(test)] mod tests { - use datafusion::error::DataFusionError; - use crate::hf_store::ParsedHFUrl; #[test] fn test_parse_hf_url() { - let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - - let parsed_url = ParsedHFUrl::parse_hf_style(url).unwrap(); - - assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); - assert_eq!( - parsed_url.repository, - Some("datasets-examples/doc-formats-csv-1".to_string()) - ); - assert_eq!(parsed_url.revision, Some("main".to_string())); - assert_eq!(parsed_url.path, Some("data.csv".to_string())); - } + let repo_type = "datasets"; + let repository = "datasets-examples/doc-formats-csv-1"; + let revision = "main"; + let path = "data.csv"; + + let url = format!("hf://{}/{}/{}", repo_type, repository, path); - #[test] - fn test_parse_hf_url_with_revision() { - let url = - "datasets/datasets-examples/doc-formats-csv-1@~csv/data.csv".to_string(); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url).unwrap(); + + assert_eq!(parsed_url.repo_type, Some(repo_type.to_string())); + assert_eq!(parsed_url.repository, Some(repository.to_string())); + assert_eq!(parsed_url.revision, Some(revision.to_string())); + assert_eq!(parsed_url.path, Some(path.to_string())); - let parsed_url = ParsedHFUrl::parse_hf_style(url).unwrap(); + let hf_path = format!("{}/{}", repository, path); + let parsed_path_url = ParsedHFUrl::parse_hf_style_path(repo_type, &hf_path).unwrap(); - assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); - assert_eq!( - parsed_url.repository, - Some("datasets-examples/doc-formats-csv-1".to_string()) - ); - assert_eq!(parsed_url.revision, Some("~csv".to_string())); - assert_eq!(parsed_url.path, Some("data.csv".to_string())); + assert_eq!(parsed_path_url.repo_type, parsed_url.repo_type); + assert_eq!(parsed_path_url.repository, parsed_url.repository); + assert_eq!(parsed_path_url.revision, parsed_url.revision); + assert_eq!(parsed_path_url.path, parsed_url.path); } #[test] - fn test_parse_hf_url_errors() { - test_error( - "datasets/datasets-examples/doc-formats-csv-1", - "Invalid HuggingFace URL: datasets/datasets-examples/doc-formats-csv-1, please format as 
'hf:///[@revision]/'", - ); + fn test_parse_hf_url_with_revision() { + let repo_type = "datasets"; + let repository = "datasets-examples/doc-formats-csv-1"; + let revision = "~parquet"; + let path = "data.csv"; - test_error( - "datadicts/datasets-examples/doc-formats-csv-1/data.csv", - "Invalid HuggingFace URL: datadicts/datasets-examples/doc-formats-csv-1/data.csv, currently only 'datasets' or 'spaces' are supported", - ); + let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url).unwrap(); + + assert_eq!(parsed_url.repo_type, Some(repo_type.to_string())); + assert_eq!(parsed_url.repository, Some(repository.to_string())); + assert_eq!(parsed_url.revision, Some(revision.to_string())); + assert_eq!(parsed_url.path, Some(path.to_string())); - test_error( - "datasets/datasets-examples/doc-formats-csv-1@~csv", - "Invalid HuggingFace URL: datasets/datasets-examples/doc-formats-csv-1@~csv, please format as 'hf:///[@revision]/'", - ); + let hf_path = format!("{}@{}/{}", repository, revision, path); + let parsed_path_url = ParsedHFUrl::parse_hf_style_path(repo_type, &hf_path).unwrap(); - test_error( - "datasets/datasets-examples/doc-formats-csv-1@~csv/", - "Invalid HuggingFace URL: datasets/datasets-examples/doc-formats-csv-1@~csv/, please specify a path", - ); + assert_eq!(parsed_path_url.repo_type, parsed_url.repo_type); + assert_eq!(parsed_path_url.repository, parsed_url.repository); + assert_eq!(parsed_path_url.revision, parsed_url.revision); + assert_eq!(parsed_path_url.path, parsed_url.path); } #[test] fn test_parse_http_url() { - let url = "datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv" - .to_string(); - - let parsed_url = ParsedHFUrl::parse_http_style(url).unwrap(); - - assert_eq!(parsed_url.repo_type, Some("datasets".to_string())); - assert_eq!( - parsed_url.repository, - Some("datasets-examples/doc-formats-csv-1".to_string()) - ); - assert_eq!(parsed_url.revision, Some("main".to_string())); - assert_eq!(parsed_url.path, Some("data.csv".to_string())); + let path = "datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv"; } #[test] fn test_file_path() { - let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + let url = "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - let parsed_url = ParsedHFUrl::parse_hf_style(url); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url); assert!(parsed_url.is_ok()); @@ -771,7 +713,7 @@ mod tests { fn test_tree_path() { let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); - let parsed_url = ParsedHFUrl::parse_hf_style(url); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url); assert!(parsed_url.is_ok()); @@ -782,21 +724,4 @@ mod tests { "api/datasets/datasets-examples/doc-formats-csv-1/tree/main/data.csv" ); } - - fn test_error(url: &str, expected: &str) { - let parsed_url_result = ParsedHFUrl::parse_hf_style(url.to_string()); - - match parsed_url_result { - Ok(_) => panic!("Expected error, but got success"), - Err(err) => match err { - DataFusionError::Configuration(_) => { - assert_eq!( - err.to_string(), - format!("Invalid or Unsupported Configuration: {}", expected) - ) - } - _ => panic!("Expected Configuration error, but got {:?}", err), - }, - } - } } From ce2bb2489ba28e1aee051c609a9e32f7e001626f Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 10 Jun 2024 12:46:08 +0800 Subject: [PATCH 09/15] complete path refactoring --- datafusion-cli/src/hf_store.rs | 237 
++++++++++++++++++++++++++------- 1 file changed, 189 insertions(+), 48 deletions(-) diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index 8e72bc6029f8..72d4c34deb96 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -62,8 +62,8 @@ pub enum Error { #[snafu(display("Unsupported repository type: {}, currently only 'datasets' or 'spaces' are supported", repo_type))] UnsupportedRepoType { repo_type: String }, - #[snafu(display("Unable to parse location {} into ParsedHFUrl, please format as '//resolve//'", location))] - InvalidLocation { location: String }, + #[snafu(display("Unable to parse location {} into ParsedHFUrl, please format as '//resolve//'", url))] + InvalidLocation { url: String }, } impl From for ObjectStoreError { @@ -124,12 +124,12 @@ impl Default for ParsedHFUrl { impl ParsedHFUrl { /// Parse a HuggingFace URL into a ParsedHFUrl struct. - /// The URL should be in the format `hf:///[@revision]/` + /// The URL should be in the format `hf:////[@revision]/` /// where `repo_type` is either `datasets` or `spaces`. /// If the revision is not provided, it defaults to `main`. /// /// url: The HuggingFace URL to parse. - pub fn parse_hf_style_url(url: String) -> ObjectStoreResult { + pub fn parse_hf_style_url(url: &str) -> ObjectStoreResult { let url = Url::parse(&url).context(UnableToParseUrlSnafu { url })?; if url.scheme() != "hf" { @@ -148,12 +148,14 @@ impl ParsedHFUrl { } /// Parse a HuggingFace path into a ParsedHFUrl struct. - /// The path should be in the format `/resolve//` with given `repo_type`. + /// The path should be in the format `/[@revision]/` with given `repo_type`. /// where `repo_type` is either `datasets` or `spaces`. /// /// repo_type: The repository type, either `datasets` or `spaces`. /// path: The HuggingFace path to parse. fn parse_hf_style_path(repo_type: &str, mut path: &str) -> ObjectStoreResult { + static EXPECTED_PARTS: usize = 3; + let mut parsed_url = Self::default(); if (repo_type != "datasets") && (repo_type != "spaces") { @@ -168,38 +170,48 @@ impl ParsedHFUrl { // parse the repository and revision. // - case 1: // where / is the repository and defaults to main. // - case 2: /@/ where / is the repository and is the revision. - let pathes = path.splitn(3, '/').collect::>(); - if pathes.len() != 3 { - return Err(InvalidHfUrlSnafu { url: path }.build().into()); + let path_parts = path.splitn(EXPECTED_PARTS, '/').collect::>(); + if path_parts.len() != EXPECTED_PARTS { + return Err(InvalidHfUrlSnafu { url: format!("hf://{}/{}", repo_type, path) }.build().into()); } - let revision_parts = pathes[1].splitn(2, '@').collect::>(); + let revision_parts = path_parts[1].splitn(2, '@').collect::>(); if revision_parts.len() == 2 { - parsed_url.repository = Some(format!("{}/{}", pathes[0], revision_parts[0])); + parsed_url.repository = Some(format!("{}/{}", path_parts[0], revision_parts[0])); parsed_url.revision = Some(revision_parts[1].to_string()); } else { - parsed_url.repository = Some(format!("{}/{}", pathes[0], pathes[1])); + parsed_url.repository = Some(format!("{}/{}", path_parts[0], path_parts[1])); } - parsed_url.path = Some(pathes[2].to_string()); + parsed_url.path = Some(path_parts[2].to_string()); Ok(parsed_url) } /// Parse a http style HuggingFace path into a ParsedHFUrl struct. - /// The path should be in the format `//resolve//` + /// The path should be in the format `///resolve//` /// where `repo_type` is either `datasets` or `spaces`. /// /// path: The HuggingFace path to parse. 
- fn parse_http_style_path(path: &Path) -> ObjectStoreResult { + fn parse_http_style_path(path: &str) -> ObjectStoreResult { + static EXPECTED_PARTS: usize = 6; + let mut parsed_url = Self::default(); - let parts = path.parts(); + let path_parts = path.splitn(EXPECTED_PARTS, '/').collect::>(); + if path_parts.len() != EXPECTED_PARTS || path_parts[3] != "resolve" { + return Err(InvalidLocationSnafu { url: path.to_string() }.build().into()); + } + + parsed_url.repo_type = Some(path_parts[0].to_string()); + parsed_url.repository = Some(format!("{}/{}", path_parts[1], path_parts[2])); + parsed_url.revision = Some(path_parts[4].to_string()); + parsed_url.path = Some(path_parts[5].to_string()); - parsed_url.repo_type = parts.next().map(|p| p.raw.to_string()); + Ok(parsed_url) } - pub fn hf_path(&self) -> String { + fn as_hf_path(&self) -> String { let mut url = self.repository.as_deref().unwrap().to_string(); if let Some(revision) = &self.revision { @@ -215,15 +227,15 @@ impl ParsedHFUrl { url } - pub fn file_path(&self) -> String { - let mut url = self.file_path_prefix(); + fn as_location(&self) -> String { + let mut url = self.as_location_dir(); url.push('/'); url.push_str(self.path.as_deref().unwrap()); url } - pub fn file_path_prefix(&self) -> String { + pub fn as_location_dir(&self) -> String { let mut url = self.repo_type.clone().unwrap(); url.push('/'); url.push_str(self.repository.as_deref().unwrap()); @@ -233,7 +245,7 @@ impl ParsedHFUrl { url } - pub fn tree_path(&self) -> String { + pub fn as_tree_location(&self) -> String { let mut url = "api/".to_string(); url.push_str(self.repo_type.as_deref().unwrap()); url.push('/'); @@ -504,14 +516,14 @@ impl ObjectStore for HFStore { ) -> ObjectStoreResult { let formatted_location = format!("{}/{}", self.repo_type, location); - let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_location) else { + let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_location.as_str()) else { return Err(ObjectStoreError::Generic { store: STORE, source: format!("Unable to parse url {location}").into(), }); }; - let file_path = parsed_url.file_path(); + let file_path = parsed_url.as_location(); let Ok(file_path) = Path::parse(file_path.clone()) else { return Err(ObjectStoreError::Generic { @@ -552,7 +564,7 @@ impl ObjectStore for HFStore { }; let formatted_prefix = format!("{}/{}", self.repo_type, prefix); - let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_prefix.clone()) else { + let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_prefix.as_str()) else { return futures::stream::once(async move { Err(ObjectStoreError::Generic { store: STORE, @@ -563,8 +575,8 @@ impl ObjectStore for HFStore { .boxed(); }; - let tree_path = parsed_url.tree_path(); - let file_path_prefix = parsed_url.file_path_prefix(); + let tree_path = parsed_url.as_tree_location(); + let file_path_prefix = parsed_url.as_location_dir(); futures::stream::once(async move { let result = self.store.get(&Path::parse(tree_path)?).await?; @@ -598,7 +610,7 @@ impl ObjectStore for HFStore { .map(|result| { result.and_then(|mut meta| { let Ok(location) = - ParsedHFUrl::parse_http_style_path(meta.location) + ParsedHFUrl::parse_http_style_path(meta.location.to_string().as_str()) else { return Err(ObjectStoreError::Generic { store: STORE, @@ -607,7 +619,7 @@ impl ObjectStore for HFStore { }); }; - meta.location = Path::from_url_path(location.hf_path())?; + meta.location = Path::from_url_path(location.as_hf_path())?; if let Some(e_tag) = meta.e_tag.as_deref() { 
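+                        // The HTTP store returns quoted ETag header values; strip the quotes so they compare cleanly.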
meta.e_tag = Some(e_tag.replace('"', "")); } @@ -648,7 +660,7 @@ mod tests { let url = format!("hf://{}/{}/{}", repo_type, repository, path); - let parsed_url = ParsedHFUrl::parse_hf_style_url(url).unwrap(); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!(parsed_url.repo_type, Some(repo_type.to_string())); assert_eq!(parsed_url.repository, Some(repository.to_string())); @@ -672,7 +684,7 @@ mod tests { let path = "data.csv"; let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); - let parsed_url = ParsedHFUrl::parse_hf_style_url(url).unwrap(); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!(parsed_url.repo_type, Some(repo_type.to_string())); assert_eq!(parsed_url.repository, Some(repository.to_string())); @@ -688,40 +700,169 @@ mod tests { assert_eq!(parsed_path_url.path, parsed_url.path); } + #[test] + fn test_parse_hf_url_error() { + test_parse_hf_url_error_matches( + "abc", + "Generic hf error: Unable parse source url. Url: abc, Error: relative URL without a base" + ); + + test_parse_hf_url_error_matches( + "hf://", + "Generic hf error: Invalid huggingface url: hf://, please format as 'hf:///[@revision]/'" + ); + + test_parse_hf_url_error_matches( + "df://datasets/datasets-examples/doc-formats-csv-1", + "Generic hf error: Unsupported schema df in url df://datasets/datasets-examples/doc-formats-csv-1, only 'hf' is supported" + ); + + test_parse_hf_url_error_matches( + "hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv", + "Generic hf error: Unsupported repository type: datadicts, currently only 'datasets' or 'spaces' are supported" + ); + + test_parse_hf_url_error_matches( + "hf://datasets/datasets-examples/doc-formats-csv-1", + "Generic hf error: Invalid huggingface url: hf://datasets/datasets-examples/doc-formats-csv-1, please format as 'hf:///[@revision]/'" + ); + } + + fn test_parse_hf_url_error_matches(url: &str, expected_error: &str) { + let parsed_url_result = ParsedHFUrl::parse_hf_style_url(url); + + assert!(parsed_url_result.is_err()); + assert_eq!(parsed_url_result.unwrap_err().to_string(), expected_error); + } + #[test] fn test_parse_http_url() { - let path = "datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv"; + let repo_type = "datasets"; + let repository = "datasets-examples/doc-formats-csv-1"; + let revision = "main"; + let path = "data.csv"; + + let url = format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path); + let parsed_url = ParsedHFUrl::parse_http_style_path(&url).unwrap(); + + assert_eq!(parsed_url.repo_type, Some(repo_type.to_string())); + assert_eq!(parsed_url.repository, Some(repository.to_string())); + assert_eq!(parsed_url.revision, Some(revision.to_string())); + assert_eq!(parsed_url.path, Some(path.to_string())); } #[test] - fn test_file_path() { - let url = "hf://datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + fn test_parse_http_url_with_revision() { + let repo_type = "datasets"; + let repository = "datasets-examples/doc-formats-csv-1"; + let revision = "~parquet"; + let path = "data.csv"; - let parsed_url = ParsedHFUrl::parse_hf_style_url(url); + let url = format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path); + let parsed_url = ParsedHFUrl::parse_http_style_path(&url).unwrap(); - assert!(parsed_url.is_ok()); + assert_eq!(parsed_url.repo_type, Some(repo_type.to_string())); + assert_eq!(parsed_url.repository, Some(repository.to_string())); + assert_eq!(parsed_url.revision, 
Some(revision.to_string())); + assert_eq!(parsed_url.path, Some(path.to_string())); + } - let file_path = parsed_url.unwrap().file_path(); + #[test] + fn test_parse_http_url_error() { + test_parse_http_url_error_matches( + "datasets/datasets-examples/doc-formats-csv-1", + "Generic hf error: Unable to parse location datasets/datasets-examples/doc-formats-csv-1 into ParsedHFUrl, please format as '//resolve//'" + ); - assert_eq!( - file_path, - "datasets/datasets-examples/doc-formats-csv-1/resolve/main/data.csv" + test_parse_http_url_error_matches( + "datasets/datasets-examples/doc-formats-csv-1/data.csv", + "Generic hf error: Unable to parse location datasets/datasets-examples/doc-formats-csv-1/data.csv into ParsedHFUrl, please format as '//resolve//'" ); + + test_parse_http_url_error_matches( + "datasets/datasets-examples/doc-formats-csv-1/resolve/main", + "Generic hf error: Unable to parse location datasets/datasets-examples/doc-formats-csv-1/resolve/main into ParsedHFUrl, please format as '//resolve//'" + ); + } + + fn test_parse_http_url_error_matches(url: &str, expected_error: &str) { + let parsed_url_result = ParsedHFUrl::parse_http_style_path(url); + assert!(parsed_url_result.is_err()); + assert_eq!(parsed_url_result.unwrap_err().to_string(), expected_error); + } + + #[test] + fn test_as_hf_path() { + let repo_type = "datasets"; + let repository = "datasets-examples/doc-formats-csv-1"; + let path = "data.csv"; + + let url = format!("hf://{}/{}/{}", repo_type, repository, path); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); + + assert_eq!(parsed_url.as_hf_path(), format!("{}/{}", repository, path)); + + let revision = "~parquet"; + let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); + + assert_eq!(parsed_url.as_hf_path(), format!("{}@{}/{}", repository, revision, path)); } #[test] - fn test_tree_path() { - let url = "datasets/datasets-examples/doc-formats-csv-1/data.csv".to_string(); + fn test_as_location() { + let repo_type = "datasets"; + let repository = "datasets-examples/doc-formats-csv-1"; + let revision = "main"; + let path = "data.csv"; - let parsed_url = ParsedHFUrl::parse_hf_style_url(url); + let url = format!("hf://{}/{}/{}", repo_type, repository, path); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); + + assert_eq!(parsed_url.as_location(), format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path)); - assert!(parsed_url.is_ok()); + let revision = "~parquet"; + let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - let tree_path = parsed_url.unwrap().tree_path(); + assert_eq!(parsed_url.as_location(), format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path)); + } - assert_eq!( - tree_path, - "api/datasets/datasets-examples/doc-formats-csv-1/tree/main/data.csv" - ); + #[test] + fn test_as_location_dir() { + let repo_type = "datasets"; + let repository = "datasets-examples/doc-formats-csv-1"; + let revision = "main"; + let path = "data.csv"; + + let url = format!("hf://{}/{}/{}", repo_type, repository, path); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); + + assert_eq!(parsed_url.as_location_dir(), format!("{}/{}/resolve/{}", repo_type, repository, revision)); + + let revision = "~parquet"; + let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); + 
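+        // With an explicit revision, the hf path keeps the "@" form, e.g.
+        // "datasets-examples/doc-formats-csv-1@~parquet/data.csv".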
let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); + + assert_eq!(parsed_url.as_location_dir(), format!("{}/{}/resolve/{}", repo_type, repository, revision)); + } + + #[test] + fn test_as_tree_location() { + let repo_type = "datasets"; + let repository = "datasets-examples/doc-formats-csv-1"; + let revision = "main"; + let path = "data.csv"; + + let url = format!("hf://{}/{}/{}", repo_type, repository, path); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); + + assert_eq!(parsed_url.as_tree_location(), format!("api/{}/{}/tree/{}/{}", repo_type, repository, revision, path)); + + let revision = "~parquet"; + let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); + let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); + + assert_eq!(parsed_url.as_tree_location(), format!("api/{}/{}/tree/{}/{}", repo_type, repository, revision, path)); } } From 645fe80af40fa97f033295210fd5f725ae036fb7 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 10 Jun 2024 13:26:56 +0800 Subject: [PATCH 10/15] refining builder code --- datafusion-cli/src/hf_store.rs | 322 ++++++++++++++++++++++++++------- 1 file changed, 255 insertions(+), 67 deletions(-) diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index 72d4c34deb96..79ade4897bb1 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -51,11 +51,18 @@ pub const DEFAULT_ENDPOINT: &str = "https://huggingface.co"; #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] - UnableToParseUrl { url: String, source: url::ParseError }, - - #[snafu(display("Unsupported schema {} in url {}, only 'hf' is supported", schema, url))] + UnableToParseUrl { + url: String, + source: url::ParseError, + }, + + #[snafu(display( + "Unsupported schema {} in url {}, only 'hf' is supported", + schema, + url + ))] UnsupportedUrlScheme { schema: String, url: String }, - + #[snafu(display("Invalid huggingface url: {}, please format as 'hf:///[@revision]/'", url))] InvalidHfUrl { url: String }, @@ -64,42 +71,30 @@ pub enum Error { #[snafu(display("Unable to parse location {} into ParsedHFUrl, please format as '//resolve//'", url))] InvalidLocation { url: String }, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, } impl From for ObjectStoreError { fn from(source: Error) -> Self { match source { + Error::UnknownConfigurationKey { key } => { + ObjectStoreError::UnknownConfigurationKey { store: STORE, key } + } _ => ObjectStoreError::Generic { store: STORE, - source: Box::new(source) + source: Box::new(source), }, } } -} - -pub enum HFConfigKey { - Endpoint, - UserAccessToken, } -impl AsRef for HFConfigKey { - fn as_ref(&self) -> &str { - match self { - Self::Endpoint => "endpoint", - Self::UserAccessToken => "user_access_token", - } - } -} - -impl FromStr for HFConfigKey { - type Err = DataFusionError; - - fn from_str(s: &str) -> Result { - match s { - "endpoint" => Ok(Self::Endpoint), - "user_access_token" => Ok(Self::UserAccessToken), - _ => config_err!("Invalid HuggingFace configuration key: {}", s), - } +impl From for DataFusionError { + fn from(source: Error) -> Self { + // Only datafusion configuration errors are exposed in this mod. + // Other errors are aligned with generic object store errors. 
+ DataFusionError::Configuration(source.to_string()) } } @@ -142,15 +137,17 @@ impl ParsedHFUrl { } // domain is the first part of the path, which are treated as the origin in the url. - let repo_type = url.domain().context(InvalidHfUrlSnafu { url: url.clone() })?; - + let repo_type = url + .domain() + .context(InvalidHfUrlSnafu { url: url.clone() })?; + Ok(Self::parse_hf_style_path(repo_type, url.path())?) } /// Parse a HuggingFace path into a ParsedHFUrl struct. /// The path should be in the format `/[@revision]/` with given `repo_type`. /// where `repo_type` is either `datasets` or `spaces`. - /// + /// /// repo_type: The repository type, either `datasets` or `spaces`. /// path: The HuggingFace path to parse. fn parse_hf_style_path(repo_type: &str, mut path: &str) -> ObjectStoreResult { @@ -172,12 +169,17 @@ impl ParsedHFUrl { // - case 2: /@/ where / is the repository and is the revision. let path_parts = path.splitn(EXPECTED_PARTS, '/').collect::>(); if path_parts.len() != EXPECTED_PARTS { - return Err(InvalidHfUrlSnafu { url: format!("hf://{}/{}", repo_type, path) }.build().into()); + return Err(InvalidHfUrlSnafu { + url: format!("hf://{}/{}", repo_type, path), + } + .build() + .into()); } let revision_parts = path_parts[1].splitn(2, '@').collect::>(); if revision_parts.len() == 2 { - parsed_url.repository = Some(format!("{}/{}", path_parts[0], revision_parts[0])); + parsed_url.repository = + Some(format!("{}/{}", path_parts[0], revision_parts[0])); parsed_url.revision = Some(revision_parts[1].to_string()); } else { parsed_url.repository = Some(format!("{}/{}", path_parts[0], path_parts[1])); @@ -200,9 +202,13 @@ impl ParsedHFUrl { let path_parts = path.splitn(EXPECTED_PARTS, '/').collect::>(); if path_parts.len() != EXPECTED_PARTS || path_parts[3] != "resolve" { - return Err(InvalidLocationSnafu { url: path.to_string() }.build().into()); + return Err(InvalidLocationSnafu { + url: path.to_string(), + } + .build() + .into()); } - + parsed_url.repo_type = Some(path_parts[0].to_string()); parsed_url.repository = Some(format!("{}/{}", path_parts[1], path_parts[2])); parsed_url.revision = Some(path_parts[4].to_string()); @@ -337,6 +343,34 @@ impl ExtensionOptions for HFOptions { } } +pub enum HFConfigKey { + Endpoint, + UserAccessToken, +} + +impl AsRef for HFConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::Endpoint => "endpoint", + Self::UserAccessToken => "user_access_token", + } + } +} + +impl FromStr for HFConfigKey { + type Err = ObjectStoreError; + + fn from_str(s: &str) -> ObjectStoreResult { + match s { + "endpoint" => Ok(Self::Endpoint), + "user_access_token" => Ok(Self::UserAccessToken), + _ => Err(UnknownConfigurationKeySnafu { key: s.to_string() } + .build() + .into()), + } + } +} + #[derive(Debug, Clone, Default)] pub struct HFStoreBuilder { endpoint: Option, @@ -368,6 +402,22 @@ impl HFStoreBuilder { self } + pub fn with_config_key(mut self, key: HFConfigKey, value: impl Into) -> Self { + match key { + HFConfigKey::Endpoint => self.endpoint = Some(value.into()), + HFConfigKey::UserAccessToken => self.user_access_token = Some(value.into()), + } + + self + } + + pub fn get_config_key(&self, key: HFConfigKey) -> Option { + match key { + HFConfigKey::Endpoint => self.endpoint.clone(), + HFConfigKey::UserAccessToken => self.user_access_token.clone(), + } + } + pub fn from_env() -> Self { let mut builder = Self::new(); if let Ok(endpoint) = env::var("HF_ENDPOINT") { @@ -381,7 +431,7 @@ impl HFStoreBuilder { builder } - pub fn build(&self) -> Result { + pub fn 
build(&self) -> ObjectStoreResult { let mut inner_builder = HttpBuilder::new(); let repo_type = self.repo_type.clone().unwrap_or("datasets".to_string()); @@ -405,9 +455,7 @@ impl HFStoreBuilder { } } - let builder = inner_builder.build().map_err(|e| { - DataFusionError::Execution(format!("Unable to build HFStore: {}", e)) - })?; + let builder = inner_builder.build()?; Ok(HFStore::new(ep, repo_type, Arc::new(builder))) } @@ -421,17 +469,15 @@ pub fn get_hf_object_store_builder( // The repo type is the first part of the path, which are treated as the origin in the process. let Some(repo_type) = url.domain() else { - return config_err!( - "Invalid HuggingFace URL: {}, please format as 'hf:///[@revision]/'", - url - ); + return Err(InvalidHfUrlSnafu { + url: url.to_string(), + } + .build() + .into()); }; if repo_type != "datasets" && repo_type != "spaces" { - return config_err!( - "Invalid HuggingFace URL: {}, currently only 'datasets' or 'spaces' are supported", - url - ); + return Err(UnsupportedRepoTypeSnafu { repo_type }.build().into()); } builder = builder.with_repo_type(repo_type); @@ -516,7 +562,8 @@ impl ObjectStore for HFStore { ) -> ObjectStoreResult { let formatted_location = format!("{}/{}", self.repo_type, location); - let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_location.as_str()) else { + let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_location.as_str()) + else { return Err(ObjectStoreError::Generic { store: STORE, source: format!("Unable to parse url {location}").into(), @@ -564,7 +611,8 @@ impl ObjectStore for HFStore { }; let formatted_prefix = format!("{}/{}", self.repo_type, prefix); - let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_prefix.as_str()) else { + let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_prefix.as_str()) + else { return futures::stream::once(async move { Err(ObjectStoreError::Generic { store: STORE, @@ -609,9 +657,9 @@ impl ObjectStore for HFStore { .into_iter() .map(|result| { result.and_then(|mut meta| { - let Ok(location) = - ParsedHFUrl::parse_http_style_path(meta.location.to_string().as_str()) - else { + let Ok(location) = ParsedHFUrl::parse_http_style_path( + meta.location.to_string().as_str(), + ) else { return Err(ObjectStoreError::Generic { store: STORE, source: format!("Unable to parse location {}", meta.location) @@ -649,7 +697,7 @@ impl ObjectStore for HFStore { #[cfg(test)] mod tests { - use crate::hf_store::ParsedHFUrl; + use crate::hf_store::{HFConfigKey, HFOptions, HFStoreBuilder, ParsedHFUrl}; #[test] fn test_parse_hf_url() { @@ -657,18 +705,19 @@ mod tests { let repository = "datasets-examples/doc-formats-csv-1"; let revision = "main"; let path = "data.csv"; - + let url = format!("hf://{}/{}/{}", repo_type, repository, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - + assert_eq!(parsed_url.repo_type, Some(repo_type.to_string())); assert_eq!(parsed_url.repository, Some(repository.to_string())); assert_eq!(parsed_url.revision, Some(revision.to_string())); assert_eq!(parsed_url.path, Some(path.to_string())); let hf_path = format!("{}/{}", repository, path); - let parsed_path_url = ParsedHFUrl::parse_hf_style_path(repo_type, &hf_path).unwrap(); + let parsed_path_url = + ParsedHFUrl::parse_hf_style_path(repo_type, &hf_path).unwrap(); assert_eq!(parsed_path_url.repo_type, parsed_url.repo_type); assert_eq!(parsed_path_url.repository, parsed_url.repository); @@ -685,17 +734,18 @@ mod tests { let url = format!("hf://{}/{}@{}/{}", repo_type, repository, 
revision, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - + assert_eq!(parsed_url.repo_type, Some(repo_type.to_string())); assert_eq!(parsed_url.repository, Some(repository.to_string())); assert_eq!(parsed_url.revision, Some(revision.to_string())); assert_eq!(parsed_url.path, Some(path.to_string())); let hf_path = format!("{}@{}/{}", repository, revision, path); - let parsed_path_url = ParsedHFUrl::parse_hf_style_path(repo_type, &hf_path).unwrap(); + let parsed_path_url = + ParsedHFUrl::parse_hf_style_path(repo_type, &hf_path).unwrap(); assert_eq!(parsed_path_url.repo_type, parsed_url.repo_type); - assert_eq!(parsed_path_url.repository, parsed_url.repository); + assert_eq!(parsed_path_url.repository, parsed_url.repository); assert_eq!(parsed_path_url.revision, parsed_url.revision); assert_eq!(parsed_path_url.path, parsed_url.path); } @@ -786,7 +836,7 @@ mod tests { } fn test_parse_http_url_error_matches(url: &str, expected_error: &str) { - let parsed_url_result = ParsedHFUrl::parse_http_style_path(url); + let parsed_url_result = ParsedHFUrl::parse_http_style_path(url); assert!(parsed_url_result.is_err()); assert_eq!(parsed_url_result.unwrap_err().to_string(), expected_error); } @@ -806,7 +856,10 @@ mod tests { let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - assert_eq!(parsed_url.as_hf_path(), format!("{}@{}/{}", repository, revision, path)); + assert_eq!( + parsed_url.as_hf_path(), + format!("{}@{}/{}", repository, revision, path) + ); } #[test] @@ -819,13 +872,19 @@ mod tests { let url = format!("hf://{}/{}/{}", repo_type, repository, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - assert_eq!(parsed_url.as_location(), format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path)); + assert_eq!( + parsed_url.as_location(), + format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path) + ); let revision = "~parquet"; let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - assert_eq!(parsed_url.as_location(), format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path)); + assert_eq!( + parsed_url.as_location(), + format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path) + ); } #[test] @@ -838,13 +897,19 @@ mod tests { let url = format!("hf://{}/{}/{}", repo_type, repository, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - assert_eq!(parsed_url.as_location_dir(), format!("{}/{}/resolve/{}", repo_type, repository, revision)); + assert_eq!( + parsed_url.as_location_dir(), + format!("{}/{}/resolve/{}", repo_type, repository, revision) + ); let revision = "~parquet"; let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - assert_eq!(parsed_url.as_location_dir(), format!("{}/{}/resolve/{}", repo_type, repository, revision)); + assert_eq!( + parsed_url.as_location_dir(), + format!("{}/{}/resolve/{}", repo_type, repository, revision) + ); } #[test] @@ -857,12 +922,135 @@ mod tests { let url = format!("hf://{}/{}/{}", repo_type, repository, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - assert_eq!(parsed_url.as_tree_location(), format!("api/{}/{}/tree/{}/{}", repo_type, repository, revision, path)); + assert_eq!( + 
parsed_url.as_tree_location(), + format!( + "api/{}/{}/tree/{}/{}", + repo_type, repository, revision, path + ) + ); let revision = "~parquet"; let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - assert_eq!(parsed_url.as_tree_location(), format!("api/{}/{}/tree/{}/{}", repo_type, repository, revision, path)); + assert_eq!( + parsed_url.as_tree_location(), + format!( + "api/{}/{}/tree/{}/{}", + repo_type, repository, revision, path + ) + ); + } + + #[test] + fn test_hf_store_builder() { + let endpoint = "https://huggingface.co"; + let user_access_token = "abc"; + + let builder = HFStoreBuilder::new() + .with_endpoint(endpoint) + .with_user_access_token(user_access_token); + + assert_eq!( + builder.endpoint, + builder.get_config_key(HFConfigKey::Endpoint) + ); + assert_eq!( + builder.user_access_token, + builder.get_config_key(HFConfigKey::UserAccessToken) + ); + } + + #[test] + fn test_hf_store_builder_default() { + let builder = HFStoreBuilder::new(); + + assert_eq!(builder.endpoint, None); + assert_eq!(builder.user_access_token, None); + } + + #[test] + fn test_fn_store_from_config_key() { + let endpoint = "https://huggingface.co"; + let user_access_token = "abc"; + + let builder = HFStoreBuilder::new() + .with_config_key(HFConfigKey::Endpoint, endpoint) + .with_config_key(HFConfigKey::UserAccessToken, user_access_token); + + assert_eq!( + builder.endpoint, + builder.get_config_key(HFConfigKey::Endpoint) + ); + assert_eq!( + builder.user_access_token, + builder.get_config_key(HFConfigKey::UserAccessToken) + ); + } + + #[test] + fn test_hf_store_builder_from_env() { + let endpoint = "https://huggingface.co"; + let user_access_token = "abc"; + + let _ = std::env::set_var("HF_ENDPOINT", endpoint); + let _ = std::env::set_var("HF_USER_ACCESS_TOKEN", user_access_token); + + let builder = HFStoreBuilder::from_env(); + + assert_eq!( + builder.endpoint, + builder.get_config_key(HFConfigKey::Endpoint) + ); + assert_eq!( + builder.user_access_token, + builder.get_config_key(HFConfigKey::UserAccessToken) + ); + } + + #[test] + fn test_get_hf_object_store_builder() { + let endpoint = "https://huggingface.co"; + let user_access_token = "abc"; + + let url = + url::Url::parse("hf://datasets/datasets-examples/doc-formats-csv-1/data.csv") + .unwrap(); + let options = HFOptions { + endpoint: Some(endpoint.to_string()), + user_access_token: Some(user_access_token.to_string()), + }; + + let builder = super::get_hf_object_store_builder(&url, &options).unwrap(); + + assert_eq!(builder.endpoint, Some(endpoint.to_string())); + assert_eq!( + builder.user_access_token, + Some(user_access_token.to_string()) + ); + } + + #[test] + fn test_get_hf_object_store_builder_error() { + let endpoint = "https://huggingface.co"; + let user_access_token = "abc"; + + let url = + url::Url::parse("hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv") + .unwrap(); + let options = HFOptions { + endpoint: Some(endpoint.to_string()), + user_access_token: Some(user_access_token.to_string()), + }; + + let expected_error = super::get_hf_object_store_builder(&url, &options); + assert!(expected_error.is_err()); + + let expected_error = expected_error.unwrap_err(); + assert_eq!( + expected_error.to_string(), + "Invalid or Unsupported Configuration: Unsupported repository type: datadicts, currently only 'datasets' or 'spaces' are supported" + ); } } From b8476e812ef9dba285ab894b6c9ba290689a8fc2 Mon Sep 17 00:00:00 2001 From: Xin Li 
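The builder and option plumbing exercised by the tests above exists so the CLI can hand an `HFStore` to DataFusion's runtime. A minimal sketch of that wiring, assuming `hf_store` is exported from the `datafusion_cli` crate and relying on the public `SessionContext::register_object_store` API:

```rust
use std::sync::Arc;

use datafusion::execution::context::SessionContext;
use datafusion_cli::hf_store::HFStoreBuilder;
use url::Url;

fn register_hf_store(ctx: &SessionContext) -> datafusion::common::Result<()> {
    let store = HFStoreBuilder::new()
        .with_endpoint("https://huggingface.co")
        .with_user_access_token("******") // placeholder token
        .build()?;

    // Route every `hf://datasets/...` URL to this store instance.
    let url = Url::parse("hf://datasets").unwrap();
    ctx.register_object_store(&url, Arc::new(store));
    Ok(())
}
```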
Date: Mon, 10 Jun 2024 14:10:35 +0800 Subject: [PATCH 11/15] Refining list code --- datafusion-cli/src/hf_store.rs | 146 +++++++++------------ datafusion-cli/tests/data/hf_store_sql.txt | 2 +- 2 files changed, 63 insertions(+), 85 deletions(-) diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index 79ade4897bb1..2d1694b2123f 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -74,6 +74,12 @@ pub enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("Unable to parse tree result body, this is likely a change in the API side or a network issue, Error: {}", inner))] + UnableToParseTreeResult { inner: serde_json::Error }, + + #[snafu(display("Prefix is required for HuggingFace store"))] + PrefixRequired, } impl From for ObjectStoreError { @@ -125,7 +131,7 @@ impl ParsedHFUrl { /// /// url: The HuggingFace URL to parse. pub fn parse_hf_style_url(url: &str) -> ObjectStoreResult { - let url = Url::parse(&url).context(UnableToParseUrlSnafu { url })?; + let url = Url::parse(url).context(UnableToParseUrlSnafu { url })?; if url.scheme() != "hf" { return Err(UnsupportedUrlSchemeSnafu { @@ -141,7 +147,7 @@ impl ParsedHFUrl { .domain() .context(InvalidHfUrlSnafu { url: url.clone() })?; - Ok(Self::parse_hf_style_path(repo_type, url.path())?) + Self::parse_hf_style_path(repo_type, url.path()) } /// Parse a HuggingFace path into a ParsedHFUrl struct. @@ -217,7 +223,7 @@ impl ParsedHFUrl { Ok(parsed_url) } - fn as_hf_path(&self) -> String { + fn to_hf_path(&self) -> String { let mut url = self.repository.as_deref().unwrap().to_string(); if let Some(revision) = &self.revision { @@ -233,15 +239,15 @@ impl ParsedHFUrl { url } - fn as_location(&self) -> String { - let mut url = self.as_location_dir(); + fn to_location(&self) -> String { + let mut url = self.to_location_dir(); url.push('/'); url.push_str(self.path.as_deref().unwrap()); url } - pub fn as_location_dir(&self) -> String { + pub fn to_location_dir(&self) -> String { let mut url = self.repo_type.clone().unwrap(); url.push('/'); url.push_str(self.repository.as_deref().unwrap()); @@ -251,7 +257,7 @@ impl ParsedHFUrl { url } - pub fn as_tree_location(&self) -> String { + pub fn to_tree_location(&self) -> String { let mut url = "api/".to_string(); url.push_str(self.repo_type.as_deref().unwrap()); url.push('/'); @@ -560,25 +566,12 @@ impl ObjectStore for HFStore { location: &Path, options: GetOptions, ) -> ObjectStoreResult { - let formatted_location = format!("{}/{}", self.repo_type, location); - - let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_location.as_str()) - else { - return Err(ObjectStoreError::Generic { - store: STORE, - source: format!("Unable to parse url {location}").into(), - }); - }; - - let file_path = parsed_url.as_location(); - - let Ok(file_path) = Path::parse(file_path.clone()) else { - return Err(ObjectStoreError::Generic { - store: STORE, - source: format!("Invalid file path {}", file_path).into(), - }); - }; + let parsed_url = ParsedHFUrl::parse_hf_style_path( + &self.repo_type, + location.to_string().as_str(), + )?; + let file_path = Path::parse(parsed_url.to_location())?; let mut res = self.store.get_opts(&file_path, options).await?; res.meta.location = location.clone(); @@ -602,53 +595,37 @@ impl ObjectStore for HFStore { ) -> BoxStream<'_, ObjectStoreResult> { let Some(prefix) = prefix else { return futures::stream::once(async { - Err(ObjectStoreError::Generic { - store: 
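For orientation, a usage sketch of the parser as refactored here, written as if it lived alongside this module's tests (the `to_location` method is module-private); the expected string mirrors the `to_location` assertions kept below:

```rust
fn demo() -> object_store::Result<()> {
    // `@~parquet` pins the revision; without it the revision defaults to "main".
    let parsed = ParsedHFUrl::parse_hf_style_url(
        "hf://datasets/datasets-examples/doc-formats-csv-1@~parquet/data.csv",
    )?;

    assert_eq!(
        parsed.to_location(),
        "datasets/datasets-examples/doc-formats-csv-1/resolve/~parquet/data.csv"
    );
    Ok(())
}
```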
STORE, - source: "Prefix is required".into(), - }) + Err(PrefixRequiredSnafu {}.build().into()) }) .boxed(); }; - let formatted_prefix = format!("{}/{}", self.repo_type, prefix); - let Ok(parsed_url) = ParsedHFUrl::parse_hf_style_url(formatted_prefix.as_str()) - else { - return futures::stream::once(async move { - Err(ObjectStoreError::Generic { - store: STORE, - source: format!("Unable to parse url {}", formatted_prefix.clone()) - .into(), - }) - }) - .boxed(); - }; + let parsed_url_result = ParsedHFUrl::parse_hf_style_path( + &self.repo_type, + prefix.to_string().as_str(), + ); + + if let Err(err) = parsed_url_result { + return futures::stream::once(async { Err(err) }).boxed(); + } - let tree_path = parsed_url.as_tree_location(); - let file_path_prefix = parsed_url.as_location_dir(); + let parsed_url = parsed_url_result.unwrap(); + let tree_location = parsed_url.to_tree_location(); + let file_location_dir = parsed_url.to_location_dir(); futures::stream::once(async move { - let result = self.store.get(&Path::parse(tree_path)?).await?; - let Ok(bytes) = result.bytes().await else { - return Err(ObjectStoreError::Generic { - store: STORE, - source: "Unable to get list body".into(), - }); - }; - - let Ok(tree_result) = + let result = self.store.get(&Path::parse(tree_location)?).await?; + let bytes = result.bytes().await?; + + let tree_result = serde_json::from_slice::>(bytes.to_byte_slice()) - else { - return Err(ObjectStoreError::Generic { - store: STORE, - source: "Unable to parse list body".into(), - }); - }; + .map_err(|err| UnableToParseTreeResultSnafu { inner: err }.build())?; let iter = join_all( tree_result .into_iter() .filter(|entry| entry.is_file()) - .map(|entry| format!("{}/{}", file_path_prefix, entry.path.clone())) + .map(|entry| format!("{}/{}", file_location_dir, entry.path.clone())) .map(|meta_location| async { self.store.head(&Path::parse(meta_location)?).await }), @@ -657,17 +634,17 @@ impl ObjectStore for HFStore { .into_iter() .map(|result| { result.and_then(|mut meta| { - let Ok(location) = ParsedHFUrl::parse_http_style_path( + match ParsedHFUrl::parse_http_style_path( meta.location.to_string().as_str(), - ) else { - return Err(ObjectStoreError::Generic { - store: STORE, - source: format!("Unable to parse location {}", meta.location) - .into(), - }); - }; - - meta.location = Path::from_url_path(location.as_hf_path())?; + ) { + Ok(parsed_url) => { + meta.location = Path::from_url_path(parsed_url.to_hf_path())?; + } + Err(err) => { + return Err(err); + } + } + if let Some(e_tag) = meta.e_tag.as_deref() { meta.e_tag = Some(e_tag.replace('"', "")); } @@ -842,7 +819,7 @@ mod tests { } #[test] - fn test_as_hf_path() { + fn test_to_hf_path() { let repo_type = "datasets"; let repository = "datasets-examples/doc-formats-csv-1"; let path = "data.csv"; @@ -850,20 +827,20 @@ mod tests { let url = format!("hf://{}/{}/{}", repo_type, repository, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); - assert_eq!(parsed_url.as_hf_path(), format!("{}/{}", repository, path)); + assert_eq!(parsed_url.to_hf_path(), format!("{}/{}", repository, path)); let revision = "~parquet"; let url = format!("hf://{}/{}@{}/{}", repo_type, repository, revision, path); let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!( - parsed_url.as_hf_path(), + parsed_url.to_hf_path(), format!("{}@{}/{}", repository, revision, path) ); } #[test] - fn test_as_location() { + fn test_to_location() { let repo_type = "datasets"; let repository = 
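Listing hinges on deserializing the Hub tree endpoint's JSON array and keeping only file entries. A minimal standalone sketch of that step, assuming a reduced `HFTreeEntry` shape with only the accessors the diff uses (`is_file`, `path`); the struct in this patch may carry more fields:

```rust
use serde::Deserialize;

#[derive(Deserialize)]
struct HFTreeEntry {
    // The Hub tree API marks entries as "file" or "directory".
    r#type: String,
    path: String,
}

impl HFTreeEntry {
    fn is_file(&self) -> bool {
        self.r#type == "file"
    }
}

/// Extract file paths from a `GET api/<repo_type>/<repository>/tree/<revision>/<path>` body.
fn file_paths(body: &[u8]) -> Result<Vec<String>, serde_json::Error> {
    let entries: Vec<HFTreeEntry> = serde_json::from_slice(body)?;
    Ok(entries
        .into_iter()
        .filter(HFTreeEntry::is_file)
        .map(|entry| entry.path)
        .collect())
}
```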
"datasets-examples/doc-formats-csv-1"; let revision = "main"; @@ -873,7 +850,7 @@ mod tests { let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!( - parsed_url.as_location(), + parsed_url.to_location(), format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path) ); @@ -882,13 +859,13 @@ mod tests { let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!( - parsed_url.as_location(), + parsed_url.to_location(), format!("{}/{}/resolve/{}/{}", repo_type, repository, revision, path) ); } #[test] - fn test_as_location_dir() { + fn test_to_location_dir() { let repo_type = "datasets"; let repository = "datasets-examples/doc-formats-csv-1"; let revision = "main"; @@ -898,7 +875,7 @@ mod tests { let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!( - parsed_url.as_location_dir(), + parsed_url.to_location_dir(), format!("{}/{}/resolve/{}", repo_type, repository, revision) ); @@ -907,13 +884,13 @@ mod tests { let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!( - parsed_url.as_location_dir(), + parsed_url.to_location_dir(), format!("{}/{}/resolve/{}", repo_type, repository, revision) ); } #[test] - fn test_as_tree_location() { + fn test_to_tree_location() { let repo_type = "datasets"; let repository = "datasets-examples/doc-formats-csv-1"; let revision = "main"; @@ -923,7 +900,7 @@ mod tests { let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!( - parsed_url.as_tree_location(), + parsed_url.to_tree_location(), format!( "api/{}/{}/tree/{}/{}", repo_type, repository, revision, path @@ -935,7 +912,7 @@ mod tests { let parsed_url = ParsedHFUrl::parse_hf_style_url(url.as_str()).unwrap(); assert_eq!( - parsed_url.as_tree_location(), + parsed_url.to_tree_location(), format!( "api/{}/{}/tree/{}/{}", repo_type, repository, revision, path @@ -1036,9 +1013,10 @@ mod tests { let endpoint = "https://huggingface.co"; let user_access_token = "abc"; - let url = - url::Url::parse("hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv") - .unwrap(); + let url = url::Url::parse( + "hf://datadicts/datasets-examples/doc-formats-csv-1/data.csv", + ) + .unwrap(); let options = HFOptions { endpoint: Some(endpoint.to_string()), user_access_token: Some(user_access_token.to_string()), diff --git a/datafusion-cli/tests/data/hf_store_sql.txt b/datafusion-cli/tests/data/hf_store_sql.txt index c56941d3a78a..26f962019e93 100644 --- a/datafusion-cli/tests/data/hf_store_sql.txt +++ b/datafusion-cli/tests/data/hf_store_sql.txt @@ -6,4 +6,4 @@ create external table test stored as parquet location "hf://datasets/cais/mmlu/a SELECT count(*) FROM test; create external table test_revision stored as parquet location "hf://datasets/cais/mmlu@~parquet/astronomy/test/"; -SELECT count(*) FROM test_revision; \ No newline at end of file +SELECT count(*) FROM test_revision; From 31835ba1ae39f505e0d535e484a88c8e394fada9 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 10 Jun 2024 16:59:22 +0800 Subject: [PATCH 12/15] Add document --- datafusion-cli/Cargo.lock | 23 ++++---- datafusion-cli/Cargo.toml | 3 +- datafusion-cli/src/hf_store.rs | 69 ++++++++++++----------- docs/source/user-guide/cli/datasources.md | 57 +++++++++++++++++++ 4 files changed, 105 insertions(+), 47 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 51940d131df6..d30a4736239c 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1184,14 +1184,13 @@ 
dependencies = [ "async-trait", "aws-config", "aws-credential-types", - "bytes", "clap", "ctor", "datafusion", "dirs", "env_logger", "futures", - "http", + "http 1.1.0", "mimalloc", "object_store", "parking_lot", @@ -2885,9 +2884,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.4" +version = "1.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" dependencies = [ "aho-corasick", "memchr", @@ -2897,9 +2896,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" dependencies = [ "aho-corasick", "memchr", @@ -2908,15 +2907,15 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" +checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" @@ -3851,9 +3850,9 @@ checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" [[package]] name = "utf8parse" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 75d8a947e30b..e3765c481d86 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -34,7 +34,6 @@ arrow = { version = "52.0.0" } async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" -bytes = "1.0" clap = { version = "3", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "39.0.0", features = [ "avro", @@ -49,7 +48,7 @@ datafusion = { path = "../datafusion/core", version = "39.0.0", features = [ dirs = "4.0.0" env_logger = "0.9" futures = "0.3" -http= "0.2.12" +http= "1.1.0" mimalloc = { version = "0.1", default-features = false } object_store = { version = "0.10.1", features = ["aws", "gcp", "http"] } parking_lot = { version = "0.12" } diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs index 2d1694b2123f..4ace592b0059 100644 --- a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -17,8 +17,7 @@ use arrow::datatypes::ToByteSlice; use async_trait::async_trait; -use bytes::Bytes; -use datafusion::common::{config_err, Result}; +use datafusion::common::Result; use datafusion::config::{ ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, Visit, }; @@ -26,13 +25,13 @@ use datafusion::error::DataFusionError; use futures::future::join_all; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; -use http::{header, HeaderMap}; +use http::{header, HeaderMap, HeaderValue}; use 
object_store::http::{HttpBuilder, HttpStore}; use object_store::path::Path; use object_store::{ ClientOptions, Error as ObjectStoreError, GetOptions, GetResult, ListResult, - MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, - Result as ObjectStoreResult, + MultipartUpload, ObjectMeta, ObjectStore, PutMultipartOpts, PutOptions, PutPayload, + PutResult, Result as ObjectStoreResult, }; use serde::Deserialize; use serde_json; @@ -42,7 +41,6 @@ use std::env; use std::fmt::Display; use std::str::FromStr; use std::sync::Arc; -use tokio::io::AsyncWrite; use url::Url; pub const STORE: &str = "hf"; @@ -295,7 +293,7 @@ impl ExtensionOptions for HFOptions { Box::new(self.clone()) } - fn set(&mut self, key: &str, value: &str) -> datafusion::common::Result<()> { + fn set(&mut self, key: &str, value: &str) -> Result<()> { let (_key, rem) = key.split_once('.').unwrap_or((key, "")); match rem { "endpoint" => { @@ -305,7 +303,7 @@ impl ExtensionOptions for HFOptions { self.user_access_token.set(rem, value)?; } _ => { - return config_err!("Config value \"{}\" not found on HFOptions", rem); + return Err(UnknownConfigurationKeySnafu { key }.build().into()); } } Ok(()) @@ -452,7 +450,11 @@ impl HFStoreBuilder { inner_builder = inner_builder.with_url(ep.clone()); if let Some(user_access_token) = &self.user_access_token { - if let Ok(token) = format!("Bearer {}", user_access_token).parse() { + if let Ok(mut token) = + HeaderValue::from_str(format!("Bearer {user_access_token}").as_str()) + { + token.set_sensitive(true); + let mut header_map = HeaderMap::new(); header_map.insert(header::AUTHORIZATION, token); let options = ClientOptions::new().with_default_headers(header_map); @@ -540,24 +542,17 @@ impl ObjectStore for HFStore { async fn put_opts( &self, _location: &Path, - _bytes: Bytes, + _payload: PutPayload, _opts: PutOptions, ) -> ObjectStoreResult { Err(ObjectStoreError::NotImplemented) } - async fn put_multipart( - &self, - _location: &Path, - ) -> ObjectStoreResult<(MultipartId, Box)> { - Err(ObjectStoreError::NotImplemented) - } - - async fn abort_multipart( + async fn put_multipart_opts( &self, _location: &Path, - _multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { + _opts: PutMultipartOpts, + ) -> ObjectStoreResult> { Err(ObjectStoreError::NotImplemented) } @@ -572,6 +567,7 @@ impl ObjectStore for HFStore { )?; let file_path = Path::parse(parsed_url.to_location())?; + let mut res = self.store.get_opts(&file_path, options).await?; res.meta.location = location.clone(); @@ -929,13 +925,10 @@ mod tests { .with_endpoint(endpoint) .with_user_access_token(user_access_token); - assert_eq!( - builder.endpoint, - builder.get_config_key(HFConfigKey::Endpoint) - ); + assert_eq!(builder.endpoint, Some(endpoint.to_string())); assert_eq!( builder.user_access_token, - builder.get_config_key(HFConfigKey::UserAccessToken) + Some(user_access_token.to_string()) ); } @@ -956,13 +949,10 @@ mod tests { .with_config_key(HFConfigKey::Endpoint, endpoint) .with_config_key(HFConfigKey::UserAccessToken, user_access_token); - assert_eq!( - builder.endpoint, - builder.get_config_key(HFConfigKey::Endpoint) - ); + assert_eq!(builder.endpoint, Some(endpoint.to_string())); assert_eq!( builder.user_access_token, - builder.get_config_key(HFConfigKey::UserAccessToken) + Some(user_access_token.to_string()) ); } @@ -976,13 +966,26 @@ mod tests { let builder = HFStoreBuilder::from_env(); + assert_eq!(builder.endpoint, Some(endpoint.to_string())); assert_eq!( - builder.endpoint, - builder.get_config_key(HFConfigKey::Endpoint) 
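The token handling in `build()` below deserves a note: `HeaderValue::set_sensitive` flags the value so HTTP clients exclude it from `Debug` output and from HPACK indexing. A self-contained sketch of the same construction:

```rust
use http::{header, HeaderMap, HeaderValue};

/// Build the default headers carrying the user access token, mirroring
/// `HFStoreBuilder::build`; returns `None` if the token is not a valid
/// header value.
fn auth_headers(user_access_token: &str) -> Option<HeaderMap> {
    let mut token =
        HeaderValue::from_str(&format!("Bearer {user_access_token}")).ok()?;
    // Keep the credential out of logs and HPACK dynamic tables.
    token.set_sensitive(true);

    let mut headers = HeaderMap::new();
    headers.insert(header::AUTHORIZATION, token);
    Some(headers)
}
```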
+            builder.user_access_token,
+            Some(user_access_token.to_string())
         );
+    }
+
+    #[test]
+    fn test_hf_store_builder_preserve_case() {
+        let endpoint = "https://huggingface.co";
+        let user_access_token = "AbcD231_!@#";
+
+        let builder = HFStoreBuilder::new()
+            .with_config_key(HFConfigKey::Endpoint, endpoint)
+            .with_config_key(HFConfigKey::UserAccessToken, user_access_token);
+
+        assert_eq!(builder.endpoint, Some(endpoint.to_string()));
         assert_eq!(
             builder.user_access_token,
-            builder.get_config_key(HFConfigKey::UserAccessToken)
+            Some(user_access_token.to_string())
         );
     }
diff --git a/docs/source/user-guide/cli/datasources.md b/docs/source/user-guide/cli/datasources.md
index 2b11645c471a..c17727e9679b 100644
--- a/docs/source/user-guide/cli/datasources.md
+++ b/docs/source/user-guide/cli/datasources.md
@@ -347,3 +347,60 @@ Supported configuration options are:
 | `GOOGLE_APPLICATION_CREDENTIALS` | `gcp.application_credentials_path` | location of application credentials file |
 | `GOOGLE_BUCKET` | | bucket name |
 | `GOOGLE_BUCKET_NAME` | | (alias) bucket name |
+
+## Hugging Face
+
+The `datafusion-cli` supports querying both public and private datasets from the [Hugging Face Hub](https://huggingface.co/datasets).
+
+For example, to query a public dataset directly from the Hugging Face Hub:
+
+```sql
+SELECT question, answer
+FROM "hf://datasets/cais/mmlu/astronomy/dev-00000-of-00001.parquet";
+```
+
+It is also possible to query a whole directory of files from a dataset:
+
+```sql
+CREATE EXTERNAL TABLE astronomy
+STORED AS parquet
+LOCATION "hf://datasets/cais/mmlu/astronomy/";
+```
+
+and then
+
+```sql
+SELECT question, answer
+FROM astronomy;
+```
+
+To query a private dataset, you need to set either the `hf.user_access_token` configuration option or the `HF_USER_ACCESS_TOKEN` environment variable:
+
+```sql
+CREATE EXTERNAL TABLE astronomy
+OPTIONS (
+  'hf.user_access_token' '******'
+)
+STORED AS parquet
+LOCATION "hf://datasets/cais/mmlu/astronomy/";
+```
+
+or 
+
+```bash
+$ export HF_USER_ACCESS_TOKEN=******
+
+$ datafusion-cli
+DataFusion CLI v39.0.0
+
+> CREATE EXTERNAL TABLE astronomy
+STORED AS parquet
+LOCATION "hf://datasets/cais/mmlu/astronomy/";
+```
+
+Supported configuration options are:
+
+| Environment Variable | Configuration Option | Description |
+| -------------------- | -------------------- | ----------- |
+| `HF_ENDPOINT` | `hf.endpoint` | Hugging Face endpoint |
+| `HF_USER_ACCESS_TOKEN` | `hf.user_access_token` | Hugging Face user access token |
From 7f21b3c44139c639f0bca468a9531ae36c5169fe Mon Sep 17 00:00:00 2001
From: Xin Li 
Date: Tue, 11 Jun 2024 10:44:54 +0800
Subject: [PATCH 13/15] Fix ci

---
 datafusion-cli/Cargo.toml      | 4 ++--
 datafusion-cli/src/hf_store.rs | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index e3765c481d86..14d0282343a5 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -48,7 +48,7 @@ datafusion = { path = "../datafusion/core", version = "39.0.0", features = [
 dirs = "4.0.0"
 env_logger = "0.9"
 futures = "0.3"
-http= "1.1.0"
+http = "1.1.0"
 mimalloc = { version = "0.1", default-features = false }
 object_store = { version = "0.10.1", features = ["aws", "gcp", "http"] }
 parking_lot = { version = "0.12" }
@@ -65,4 +65,4 @@
 assert_cmd = "2.0"
 ctor = "0.2.0"
 predicates = "3.0"
-rstest = "0.17"
\ No newline at end of file
+rstest = "0.17"
diff --git a/datafusion-cli/src/hf_store.rs b/datafusion-cli/src/hf_store.rs
index 4ace592b0059..32ae45aedf28 100644
--- 
a/datafusion-cli/src/hf_store.rs +++ b/datafusion-cli/src/hf_store.rs @@ -961,8 +961,8 @@ mod tests { let endpoint = "https://huggingface.co"; let user_access_token = "abc"; - let _ = std::env::set_var("HF_ENDPOINT", endpoint); - let _ = std::env::set_var("HF_USER_ACCESS_TOKEN", user_access_token); + std::env::set_var("HF_ENDPOINT", endpoint); + std::env::set_var("HF_USER_ACCESS_TOKEN", user_access_token); let builder = HFStoreBuilder::from_env(); From bc7345b4a78565c2bc5d877a0b4f72b452b85918 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 11 Jun 2024 11:14:21 +0800 Subject: [PATCH 14/15] Fix doc --- docs/source/user-guide/cli/datasources.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/user-guide/cli/datasources.md b/docs/source/user-guide/cli/datasources.md index c17727e9679b..e9fe19f76f53 100644 --- a/docs/source/user-guide/cli/datasources.md +++ b/docs/source/user-guide/cli/datasources.md @@ -385,7 +385,7 @@ STORED AS parquet LOCATION "hf://datasets/cais/mmlu/astronomy/"; ``` -or +or ```bash $ export HF_USER_ACCESS_TOKEN=****** @@ -400,7 +400,7 @@ LOCATION "hf://datasets/cais/mmlu/astronomy/"; Supported configuration options are: -| Environment Variable | Configuration Option | Description | -| -------------------- | -------------------- | ----------- | -| `HF_ENDPOINT` | `hf.endpoint` | Hugging Face endpoint | +| Environment Variable | Configuration Option | Description | +| ---------------------- | ---------------------- | ------------------------------ | +| `HF_ENDPOINT` | `hf.endpoint` | Hugging Face endpoint | | `HF_USER_ACCESS_TOKEN` | `hf.user_access_token` | Hugging Face user access token | From 1f659b3da13bb44e7c24b6f75e2e8bf78521ee6c Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 13 Jun 2024 16:52:43 +0800 Subject: [PATCH 15/15] disable win-x64 test --- datafusion-cli/tests/cli_integration.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion-cli/tests/cli_integration.rs b/datafusion-cli/tests/cli_integration.rs index 915c593b810f..3d5691d01048 100644 --- a/datafusion-cli/tests/cli_integration.rs +++ b/datafusion-cli/tests/cli_integration.rs @@ -58,6 +58,12 @@ fn cli_quick_test<'a>( cmd.assert().stdout(predicate::eq(expected)); } +// Disabled due to https://github.com/apache/datafusion/issues/10793 +// $ ./target/debug/datafusion-cli.exe --file tests/data/hf_store_sql.txt --format json +// DataFusion CLI v39.0.0 +// +// thread 'main' has overflowed its stack +#[cfg(not(target_family = "windows"))] #[rstest] #[case::exec_hf_store_test( ["--file", "tests/data/hf_store_sql.txt", "--format", "json", "-q"],