Skip to content

Commit 9ac5c53

Browse files
committed
Merge branch 'r/1.0.3'
1 parent e5129b6 commit 9ac5c53

File tree

3 files changed

+35
-14
lines changed

3 files changed

+35
-14
lines changed

pywaybackup/SnapshotCollection.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,12 @@ def url_split(cls, url, index=False):
117117
url = "http://" + url
118118
parsed_url = urlparse(url)
119119
domain = parsed_url.netloc.split("@")[-1].split(":")[0] # split mailto: and port
120-
filename = parsed_url.path.split("/")[-1]
121-
if index is True and filename == "":
122-
filename = "index.html"
123-
subdir = parsed_url.path.strip("/").replace(filename, "").strip("/")
120+
path_parts = parsed_url.path.split("/")
121+
if not url.endswith("/") or "." in path_parts[-1]:
122+
filename = path_parts[-1]
123+
subdir = "/".join(path_parts[:-1]).strip("/")
124+
else:
125+
filename = "index.html" if index else ""
126+
subdir = "/".join(path_parts).strip("/")
124127
filename = filename.replace("%20", " ") # replace url encoded spaces
125-
return domain, subdir, filename
128+
return domain, subdir, filename

pywaybackup/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.2"
1+
__version__ = "1.0.3"

pywaybackup/archive.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515

1616

17+
1718
# GET: store page to wayback machine and respond with a redirect to the snapshot
1819
# POST: store page to wayback machine and respond with the wayback machine status page
1920
# tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
@@ -69,7 +70,6 @@ def save_page(url: str):
6970

7071

7172

72-
7373
def print_list():
7474
v.write("")
7575
count = sc.count_list()
@@ -146,6 +146,10 @@ def download_list(output, retry, no_redirect, workers):
146146
for thread in threads:
147147
thread.join()
148148

149+
150+
151+
152+
149153
def download_loop(snapshot_batch, output, worker, retry, no_redirect, attempt=1, connection=None):
150154
"""
151155
Download a list of URLs in a recursive loop. If a download fails, the function will retry the download.
@@ -174,6 +178,10 @@ def download_loop(snapshot_batch, output, worker, retry, no_redirect, attempt=1,
174178
time.sleep(15)
175179
download_loop(failed_urls, output, worker, retry, no_redirect, attempt, connection)
176180

181+
182+
183+
184+
177185
def download(output, snapshot_entry, connection, status_message, no_redirect=False):
178186
"""
179187
Download a single URL and save it to the specified filepath.
@@ -202,20 +210,21 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
202210
response = connection.getresponse()
203211
response_data = response.read()
204212
response_status = response.status
213+
response_status_message = parse_response_code(response_status)
205214
location = response.getheader("Location")
206215
if location:
207-
status_message = f"{status_message}\n" + \
208-
f" -> URL: {location}"
209216
location = urljoin(download_url, location)
210217
download_url = location
218+
status_message = f"{status_message}\n" + \
219+
f" -> URL: {download_url}"
211220
sc.snapshot_entry_modify(snapshot_entry, "redirect_timestamp", sc.url_get_timestamp(location))
212221
sc.snapshot_entry_modify(snapshot_entry, "redirect_url", location)
213222
else:
214223
break
215224
if response_status == 200:
216-
sc.snapshot_entry_modify(snapshot_entry, "file", sc.snapshot_entry_create_output(snapshot_entry, output))
217-
download_file = snapshot_entry["file"]
218-
os.makedirs(os.path.dirname(download_file), exist_ok=True)
225+
download_file = sc.snapshot_entry_create_output(snapshot_entry, output)
226+
download_path = os.path.dirname(download_file)
227+
os.makedirs(download_path, exist_ok=True)
219228
with open(download_file, 'wb') as file:
220229
if response.getheader('Content-Encoding') == 'gzip':
221230
response_data = gzip.decompress(response_data)
@@ -227,12 +236,13 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
227236
f"SUCCESS -> HTTP: {response_status} - {response_status_message}\n" + \
228237
f" -> URL: {download_url}\n" + \
229238
f" -> FILE: {download_file}"
239+
sc.snapshot_entry_modify(snapshot_entry, "file", download_file)
230240
v.write(status_message)
231241
return True
232242
else:
233243
status_message = f"{status_message}\n" + \
234244
f"UNEXPECTED -> HTTP: {response_status} - {response_status_message}\n" + \
235-
f" -> URL: {download_url}\n"
245+
f" -> URL: {download_url}"
236246
v.write(status_message)
237247
return True
238248
# exception returns false and appends the url to the failed list
@@ -262,6 +272,10 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
262272
500: "Internal Server Error",
263273
503: "Service Unavailable"
264274
}
275+
276+
277+
278+
265279
def parse_response_code(response_code: int):
266280
"""
267281
Parse the response code of the Wayback Machine and return a human-readable message.
@@ -270,6 +284,10 @@ def parse_response_code(response_code: int):
270284
return RESPONSE_CODE_DICT[response_code]
271285
return "Unknown response code"
272286

287+
288+
289+
290+
273291
def save_csv(csv_path: str, url: str):
274292
"""
275293
Write a CSV file with the list of snapshots.
@@ -285,4 +303,4 @@ def save_csv(csv_path: str, url: str):
285303
row = csv.DictWriter(file, sc.SNAPSHOT_COLLECTION[0].keys())
286304
row.writeheader()
287305
for snapshot in sc.SNAPSHOT_COLLECTION:
288-
row.writerow(snapshot)
306+
row.writerow(snapshot)

0 commit comments

Comments
 (0)