Skip to content

Commit 649cf35

Browse files
lesteve and PGijsbers authored
ENH Use OpenML metadata for download url (scikit-learn#30708)
Co-authored-by: Pieter Gijsbers <p.gijsbers@tue.nl>
1 parent ada9947 commit 649cf35

File tree

4 files changed

+44
-27
lines changed

4 files changed

+44
-27
lines changed

sklearn/datasets/_openml.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from tempfile import TemporaryDirectory
1414
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
1515
from urllib.error import HTTPError, URLError
16+
from urllib.parse import urlparse
1617
from urllib.request import Request, urlopen
1718
from warnings import warn
1819

@@ -32,12 +33,10 @@
3233

3334
__all__ = ["fetch_openml"]
3435

35-
_OPENML_PREFIX = "https://api.openml.org/"
36-
_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
37-
_DATA_INFO = "api/v1/json/data/{}"
38-
_DATA_FEATURES = "api/v1/json/data/features/{}"
39-
_DATA_QUALITIES = "api/v1/json/data/qualities/{}"
40-
_DATA_FILE = "data/v1/download/{}"
36+
_SEARCH_NAME = "https://api.openml.org/api/v1/json/data/list/data_name/{}/limit/2"
37+
_DATA_INFO = "https://api.openml.org/api/v1/json/data/{}"
38+
_DATA_FEATURES = "https://api.openml.org/api/v1/json/data/features/{}"
39+
_DATA_QUALITIES = "https://api.openml.org/api/v1/json/data/qualities/{}"
4140

4241
OpenmlQualitiesType = List[Dict[str, str]]
4342
OpenmlFeaturesType = List[Dict[str, str]]
@@ -119,16 +118,17 @@ def wrapper(*args, **kwargs):
119118

120119

121120
def _open_openml_url(
122-
openml_path: str, data_home: Optional[str], n_retries: int = 3, delay: float = 1.0
121+
url: str, data_home: Optional[str], n_retries: int = 3, delay: float = 1.0
123122
):
124123
"""
125124
Returns a resource from OpenML.org. Caches it to data_home if required.
126125
127126
Parameters
128127
----------
129-
openml_path : str
130-
OpenML URL that will be accessed. This will be prefixes with
131-
_OPENML_PREFIX.
128+
url : str
129+
OpenML URL that will be downloaded and cached locally. The path component
130+
of the URL is used to replicate the tree structure as sub-folders of the local
131+
cache folder.
132132
133133
data_home : str
134134
Directory to which the files will be cached. If None, no caching will
@@ -150,7 +150,7 @@ def _open_openml_url(
150150
def is_gzip_encoded(_fsrc):
151151
return _fsrc.info().get("Content-Encoding", "") == "gzip"
152152

153-
req = Request(_OPENML_PREFIX + openml_path)
153+
req = Request(url)
154154
req.add_header("Accept-encoding", "gzip")
155155

156156
if data_home is None:
@@ -159,6 +159,7 @@ def is_gzip_encoded(_fsrc):
159159
return gzip.GzipFile(fileobj=fsrc, mode="rb")
160160
return fsrc
161161

162+
openml_path = urlparse(url).path.lstrip("/")
162163
local_path = _get_local_path(openml_path, data_home)
163164
dir_name, file_name = os.path.split(local_path)
164165
if not os.path.exists(local_path):
@@ -1126,7 +1127,7 @@ def fetch_openml(
11261127
shape = None
11271128

11281129
# obtain the data
1129-
url = _DATA_FILE.format(data_description["file_id"])
1130+
url = data_description["url"]
11301131
bunch = _download_data_to_bunch(
11311132
url,
11321133
return_sparse,
Binary file not shown.
Binary file not shown.

sklearn/datasets/tests/test_openml.py

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from sklearn import config_context
1818
from sklearn.datasets import fetch_openml as fetch_openml_orig
1919
from sklearn.datasets._openml import (
20-
_OPENML_PREFIX,
2120
_get_local_path,
2221
_open_openml_url,
2322
_retry_with_clean_cache,
@@ -33,6 +32,7 @@
3332
OPENML_TEST_DATA_MODULE = "sklearn.datasets.tests.data.openml"
3433
# if True, urlopen will be monkey patched to only use local files
3534
test_offline = True
35+
_MONKEY_PATCH_LOCAL_OPENML_PATH = "data/v1/download/{}"
3636

3737

3838
class _MockHTTPResponse:
@@ -74,7 +74,7 @@ def _monkey_patch_webbased_functions(context, data_id, gzip_response):
7474
# stored as cache should not be mixed up with real openml datasets
7575
url_prefix_data_description = "https://api.openml.org/api/v1/json/data/"
7676
url_prefix_data_features = "https://api.openml.org/api/v1/json/data/features/"
77-
url_prefix_download_data = "https://api.openml.org/data/v1/"
77+
url_prefix_download_data = "https://www.openml.org/data/v1/download"
7878
url_prefix_data_list = "https://api.openml.org/api/v1/json/data/list/"
7979

8080
path_suffix = ".gz"
@@ -105,7 +105,9 @@ def _file_name(url, suffix):
105105
)
106106

107107
def _mock_urlopen_shared(url, has_gzip_header, expected_prefix, suffix):
108-
assert url.startswith(expected_prefix)
108+
assert url.startswith(
109+
expected_prefix
110+
), f"{expected_prefix!r} does not match {url!r}"
109111

110112
data_file_name = _file_name(url, suffix)
111113
data_file_path = resources.files(data_module) / data_file_name
@@ -136,15 +138,27 @@ def _mock_urlopen_data_features(url, has_gzip_header):
136138
)
137139

138140
def _mock_urlopen_download_data(url, has_gzip_header):
141+
# For simplicity, the mock file names omit the dataset filename, i.e.
142+
# the last part of the data description url after the last /.
143+
# For example for id_1, data description download url is:
144+
# gunzip -c sklearn/datasets/tests/data/openml/id_1/api-v1-jd-1.json.gz | grep '"url"' # noqa: E501
145+
# "https:\/\/www.openml.org\/data\/v1\/download\/1\/anneal.arff"
146+
# but the mock filename does not contain anneal.arff and is:
147+
# sklearn/datasets/tests/data/openml/id_1/data-v1-dl-1.arff.gz.
148+
# We only keep the part of the url before the last /
149+
url_without_filename = url.rsplit("/", 1)[0]
150+
139151
return _mock_urlopen_shared(
140-
url=url,
152+
url=url_without_filename,
141153
has_gzip_header=has_gzip_header,
142154
expected_prefix=url_prefix_download_data,
143155
suffix=".arff",
144156
)
145157

146158
def _mock_urlopen_data_list(url, has_gzip_header):
147-
assert url.startswith(url_prefix_data_list)
159+
assert url.startswith(
160+
url_prefix_data_list
161+
), f"{url_prefix_data_list!r} does not match {url!r}"
148162

149163
data_file_name = _file_name(url, ".json")
150164
data_file_path = resources.files(data_module) / data_file_name
@@ -1343,22 +1357,24 @@ def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
13431357
data_id = 61
13441358

13451359
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
1346-
openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
1360+
openml_path = _MONKEY_PATCH_LOCAL_OPENML_PATH.format(data_id) + "/filename.arff"
1361+
url = f"https://www.openml.org/{openml_path}"
13471362
cache_directory = str(tmpdir.mkdir("scikit_learn_data"))
13481363
# first fill the cache
1349-
response1 = _open_openml_url(openml_path, cache_directory)
1364+
response1 = _open_openml_url(url, cache_directory)
13501365
# assert file exists
13511366
location = _get_local_path(openml_path, cache_directory)
13521367
assert os.path.isfile(location)
13531368
# redownload, to utilize cache
1354-
response2 = _open_openml_url(openml_path, cache_directory)
1369+
response2 = _open_openml_url(url, cache_directory)
13551370
assert response1.read() == response2.read()
13561371

13571372

13581373
@pytest.mark.parametrize("write_to_disk", [True, False])
13591374
def test_open_openml_url_unlinks_local_path(monkeypatch, tmpdir, write_to_disk):
13601375
data_id = 61
1361-
openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
1376+
openml_path = _MONKEY_PATCH_LOCAL_OPENML_PATH.format(data_id) + "/filename.arff"
1377+
url = f"https://www.openml.org/{openml_path}"
13621378
cache_directory = str(tmpdir.mkdir("scikit_learn_data"))
13631379
location = _get_local_path(openml_path, cache_directory)
13641380

@@ -1371,14 +1387,14 @@ def _mock_urlopen(request, *args, **kwargs):
13711387
monkeypatch.setattr(sklearn.datasets._openml, "urlopen", _mock_urlopen)
13721388

13731389
with pytest.raises(ValueError, match="Invalid request"):
1374-
_open_openml_url(openml_path, cache_directory)
1390+
_open_openml_url(url, cache_directory)
13751391

13761392
assert not os.path.exists(location)
13771393

13781394

13791395
def test_retry_with_clean_cache(tmpdir):
13801396
data_id = 61
1381-
openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
1397+
openml_path = _MONKEY_PATCH_LOCAL_OPENML_PATH.format(data_id)
13821398
cache_directory = str(tmpdir.mkdir("scikit_learn_data"))
13831399
location = _get_local_path(openml_path, cache_directory)
13841400
os.makedirs(os.path.dirname(location))
@@ -1401,7 +1417,7 @@ def _load_data():
14011417

14021418
def test_retry_with_clean_cache_http_error(tmpdir):
14031419
data_id = 61
1404-
openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
1420+
openml_path = _MONKEY_PATCH_LOCAL_OPENML_PATH.format(data_id)
14051421
cache_directory = str(tmpdir.mkdir("scikit_learn_data"))
14061422

14071423
@_retry_with_clean_cache(openml_path, cache_directory)
@@ -1487,7 +1503,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir, pars
14871503

14881504
def swap_file_mock(request, *args, **kwargs):
14891505
url = request.get_full_url()
1490-
if url.endswith("data/v1/download/1666876"):
1506+
if url.endswith("data/v1/download/1666876/anneal.arff"):
14911507
with open(corrupt_copy_path, "rb") as f:
14921508
corrupted_data = f.read()
14931509
return _MockHTTPResponse(BytesIO(corrupted_data), is_gzip=True)
@@ -1515,13 +1531,13 @@ def _mock_urlopen_network_error(request, *args, **kwargs):
15151531
sklearn.datasets._openml, "urlopen", _mock_urlopen_network_error
15161532
)
15171533

1518-
invalid_openml_url = "invalid-url"
1534+
invalid_openml_url = "https://api.openml.org/invalid-url"
15191535

15201536
with pytest.warns(
15211537
UserWarning,
15221538
match=re.escape(
15231539
"A network error occurred while downloading"
1524-
f" {_OPENML_PREFIX + invalid_openml_url}. Retrying..."
1540+
f" {invalid_openml_url}. Retrying..."
15251541
),
15261542
) as record:
15271543
with pytest.raises(HTTPError, match="Simulated network error"):

0 commit comments

Comments
 (0)