|
7 | 7 | import re
|
8 | 8 |
|
9 | 9 | import requests
|
10 |
| -import sys |
11 | 10 | import json
|
12 | 11 | import shutil
|
13 | 12 | import argparse
|
@@ -69,8 +68,7 @@ class TOKENIZER_TYPE(IntEnum):
|
69 | 68 | hf_token = args.hf_token if args.hf_token is not None else hf_token
|
70 | 69 |
|
71 | 70 | if hf_token is None:
|
72 |
| - logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token") |
73 |
| - sys.exit(1) |
| 71 | + logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token") |
74 | 72 |
|
75 | 73 | # TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
76 | 74 | # will be updated with time - contributions welcome
|
@@ -151,7 +149,7 @@ class TOKENIZER_TYPE(IntEnum):
|
151 | 149 |
|
152 | 150 |
|
153 | 151 | def download_file_with_auth(url, token, save_path):
|
154 |
| - headers = {"Authorization": f"Bearer {token}"} |
| 152 | + headers = {"Authorization": f"Bearer {token}"} if token else None |
155 | 153 | response = sess.get(url, headers=headers)
|
156 | 154 | response.raise_for_status()
|
157 | 155 | os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
@@ -250,20 +248,18 @@ def get_existing_models(convert_py):
|
250 | 248 | else:
|
251 | 249 | # otherwise, compute the hash of the tokenizer
|
252 | 250 |
|
253 |
| - # Skip if the tokenizer folder does not exist or there are other download issues previously |
254 |
| - if not os.path.exists(f"models/tokenizers/{name}"): |
255 |
| - logger.warning(f"Directory for tokenizer {name} not found. Skipping...") |
256 |
| - continue |
| 251 | + # Fail if the tokenizer folder with config does not exist or there are other download issues previously |
| 252 | + if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"): |
| 253 | + raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.") |
257 | 254 |
|
258 | 255 | try:
|
259 | 256 | logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
|
260 | 257 | if name == "t5":
|
261 | 258 | tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
|
262 | 259 | else:
|
263 | 260 | tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
264 |
| - except OSError as e: |
265 |
| - logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") |
266 |
| - continue # Skip to the next model if the tokenizer can't be loaded |
| 261 | + except Exception as e: |
| 262 | + raise OSError(f"Error loading tokenizer for model {name}.") from e |
267 | 263 |
|
268 | 264 | chktok = tokenizer.encode(CHK_TXT)
|
269 | 265 | chkhsh = sha256(str(chktok).encode()).hexdigest()
|
|
0 commit comments