17
17
from sklearn import config_context
18
18
from sklearn .datasets import fetch_openml as fetch_openml_orig
19
19
from sklearn .datasets ._openml import (
20
- _OPENML_PREFIX ,
21
20
_get_local_path ,
22
21
_open_openml_url ,
23
22
_retry_with_clean_cache ,
33
32
OPENML_TEST_DATA_MODULE = "sklearn.datasets.tests.data.openml"
34
33
# if True, urlopen will be monkey patched to only use local files
35
34
test_offline = True
35
+ _MONKEY_PATCH_LOCAL_OPENML_PATH = "data/v1/download/{}"
36
36
37
37
38
38
class _MockHTTPResponse :
@@ -74,7 +74,7 @@ def _monkey_patch_webbased_functions(context, data_id, gzip_response):
74
74
# stored as cache should not be mixed up with real openml datasets
75
75
url_prefix_data_description = "https://api.openml.org/api/v1/json/data/"
76
76
url_prefix_data_features = "https://api.openml.org/api/v1/json/data/features/"
77
- url_prefix_download_data = "https://api.openml.org/data/v1/"
77
+ url_prefix_download_data = "https://www.openml.org/data/v1/download"
78
78
url_prefix_data_list = "https://api.openml.org/api/v1/json/data/list/"
79
79
80
80
path_suffix = ".gz"
@@ -105,7 +105,9 @@ def _file_name(url, suffix):
105
105
)
106
106
107
107
def _mock_urlopen_shared (url , has_gzip_header , expected_prefix , suffix ):
108
- assert url .startswith (expected_prefix )
108
+ assert url .startswith (
109
+ expected_prefix
110
+ ), f"{expected_prefix!r} does not match {url!r}"
109
111
110
112
data_file_name = _file_name (url , suffix )
111
113
data_file_path = resources .files (data_module ) / data_file_name
@@ -136,15 +138,27 @@ def _mock_urlopen_data_features(url, has_gzip_header):
136
138
)
137
139
138
140
def _mock_urlopen_download_data(url, has_gzip_header):
    """Resolve a mocked OpenML data-download request.

    The last path component of ``url`` is dropped and the remainder is
    handed to ``_mock_urlopen_shared`` together with the download URL prefix
    and the ``.arff`` suffix.
    """
    # The download url found in a data description ends with the dataset's
    # file name. For id_1, for instance:
    #   gunzip -c sklearn/datasets/tests/data/openml/id_1/api-v1-jd-1.json.gz | grep '"url"  # noqa: E501
    #   "https:\/\/www.openml.org\/data\/v1\/download\/1\/anneal.arff"
    # The mock file on disk is named without that trailing part:
    #   sklearn/datasets/tests/data/openml/id_1/data-v1-dl-1.arff.gz
    # so only what precedes the last "/" is kept for the lookup.
    base_url, *_ = url.rsplit("/", 1)

    return _mock_urlopen_shared(
        base_url,
        has_gzip_header,
        url_prefix_download_data,
        ".arff",
    )
145
157
146
158
def _mock_urlopen_data_list (url , has_gzip_header ):
147
- assert url .startswith (url_prefix_data_list )
159
+ assert url .startswith (
160
+ url_prefix_data_list
161
+ ), f"{url_prefix_data_list!r} does not match {url!r}"
148
162
149
163
data_file_name = _file_name (url , ".json" )
150
164
data_file_path = resources .files (data_module ) / data_file_name
@@ -1343,22 +1357,24 @@ def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
1343
1357
data_id = 61
1344
1358
1345
1359
_monkey_patch_webbased_functions (monkeypatch , data_id , gzip_response )
1346
- openml_path = sklearn .datasets ._openml ._DATA_FILE .format (data_id )
1360
+ openml_path = _MONKEY_PATCH_LOCAL_OPENML_PATH .format (data_id ) + "/filename.arff"
1361
+ url = f"https://www.openml.org/{openml_path}"
1347
1362
cache_directory = str (tmpdir .mkdir ("scikit_learn_data" ))
1348
1363
# first fill the cache
1349
- response1 = _open_openml_url (openml_path , cache_directory )
1364
+ response1 = _open_openml_url (url , cache_directory )
1350
1365
# assert file exists
1351
1366
location = _get_local_path (openml_path , cache_directory )
1352
1367
assert os .path .isfile (location )
1353
1368
# redownload, to utilize cache
1354
- response2 = _open_openml_url (openml_path , cache_directory )
1369
+ response2 = _open_openml_url (url , cache_directory )
1355
1370
assert response1 .read () == response2 .read ()
1356
1371
1357
1372
1358
1373
@pytest .mark .parametrize ("write_to_disk" , [True , False ])
1359
1374
def test_open_openml_url_unlinks_local_path (monkeypatch , tmpdir , write_to_disk ):
1360
1375
data_id = 61
1361
- openml_path = sklearn .datasets ._openml ._DATA_FILE .format (data_id )
1376
+ openml_path = _MONKEY_PATCH_LOCAL_OPENML_PATH .format (data_id ) + "/filename.arff"
1377
+ url = f"https://www.openml.org/{openml_path}"
1362
1378
cache_directory = str (tmpdir .mkdir ("scikit_learn_data" ))
1363
1379
location = _get_local_path (openml_path , cache_directory )
1364
1380
@@ -1371,14 +1387,14 @@ def _mock_urlopen(request, *args, **kwargs):
1371
1387
monkeypatch .setattr (sklearn .datasets ._openml , "urlopen" , _mock_urlopen )
1372
1388
1373
1389
with pytest .raises (ValueError , match = "Invalid request" ):
1374
- _open_openml_url (openml_path , cache_directory )
1390
+ _open_openml_url (url , cache_directory )
1375
1391
1376
1392
assert not os .path .exists (location )
1377
1393
1378
1394
1379
1395
def test_retry_with_clean_cache (tmpdir ):
1380
1396
data_id = 61
1381
- openml_path = sklearn . datasets . _openml . _DATA_FILE .format (data_id )
1397
+ openml_path = _MONKEY_PATCH_LOCAL_OPENML_PATH .format (data_id )
1382
1398
cache_directory = str (tmpdir .mkdir ("scikit_learn_data" ))
1383
1399
location = _get_local_path (openml_path , cache_directory )
1384
1400
os .makedirs (os .path .dirname (location ))
@@ -1401,7 +1417,7 @@ def _load_data():
1401
1417
1402
1418
def test_retry_with_clean_cache_http_error (tmpdir ):
1403
1419
data_id = 61
1404
- openml_path = sklearn . datasets . _openml . _DATA_FILE .format (data_id )
1420
+ openml_path = _MONKEY_PATCH_LOCAL_OPENML_PATH .format (data_id )
1405
1421
cache_directory = str (tmpdir .mkdir ("scikit_learn_data" ))
1406
1422
1407
1423
@_retry_with_clean_cache (openml_path , cache_directory )
@@ -1487,7 +1503,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir, pars
1487
1503
1488
1504
def swap_file_mock (request , * args , ** kwargs ):
1489
1505
url = request .get_full_url ()
1490
- if url .endswith ("data/v1/download/1666876" ):
1506
+ if url .endswith ("data/v1/download/1666876/anneal.arff"):
1491
1507
with open (corrupt_copy_path , "rb" ) as f :
1492
1508
corrupted_data = f .read ()
1493
1509
return _MockHTTPResponse (BytesIO (corrupted_data ), is_gzip = True )
@@ -1515,13 +1531,13 @@ def _mock_urlopen_network_error(request, *args, **kwargs):
1515
1531
sklearn .datasets ._openml , "urlopen" , _mock_urlopen_network_error
1516
1532
)
1517
1533
1518
- invalid_openml_url = "invalid-url"
1534
+ invalid_openml_url = "https://api.openml.org/invalid-url"
1519
1535
1520
1536
with pytest .warns (
1521
1537
UserWarning ,
1522
1538
match = re .escape (
1523
1539
"A network error occurred while downloading"
1524
- f" {_OPENML_PREFIX + invalid_openml_url}. Retrying..."
1540
+ f" {invalid_openml_url}. Retrying..."
1525
1541
),
1526
1542
) as record :
1527
1543
with pytest .raises (HTTPError , match = "Simulated network error" ):
0 commit comments