
Commit 0e9e70f

tw4l and ikreymer authored
Add WACZ filename, depth, favIconUrl, isSeed to pages (#2352)
Adds `filename` to pages, pointing to the WACZ file each page comes from, as well as `depth`, `favIconUrl`, and `isSeed`. Also adds an idempotent migration to backfill this information for existing pages, and increases the backend container's startupProbe window to 24 hours to give the migration sufficient time to finish.

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
1 parent 8cfa287 commit 0e9e70f
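
After this change, each page record returned by the backend carries the new fields alongside the existing ones. An illustrative sketch of just the added fields on one record (field names come from this commit; all values are made up):

page_record = {
    "url": "https://example.com/",                    # existing field, shown for context
    "filename": "manual-20250101-example.wacz",       # new: WACZ file the page came from
    "depth": 0,                                        # new: crawl depth (0 for a seed page)
    "favIconUrl": "https://example.com/favicon.ico",   # new
    "isSeed": True,                                    # new
}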

File tree

8 files changed: +131 -4 lines changed

backend/btrixcloud/db.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 from .migrations import BaseMigration


-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"


 # ============================================================================
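
Bumping CURR_DB_VERSION is what causes migration 0042 to run on the next backend startup; that startup-time work is also why the startupProbe window in chart/templates/backend.yaml is widened below.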

backend/btrixcloud/migrations/ (new file for migration 0042)

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+"""
+Migration 0042 - Add filename to pages
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0042"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add filename to all pages that don't currently have it stored,
+        iterating through each archived item and its WACZ files as necessary
+        """
+        pages_mdb = self.mdb["pages"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to add filename and other fields to pages, missing page_ops",
+                flush=True,
+            )
+            return
+
+        crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})
+
+        crawl_count = len(crawl_ids_to_update)
+        current_index = 1
+
+        for crawl_id in crawl_ids_to_update:
+            print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
+            try:
+                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
+                    flush=True,
+                )
+            current_index += 1
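
Because the backfill only selects crawls whose pages still have filename set to None, re-running the migration after an interruption simply picks up the remaining items, which is what makes it idempotent. A minimal sketch of checking the remaining backlog by hand with pymongo (the connection string and database name are assumptions, not taken from the deployment):

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # illustrative connection string
pages = client["btrix"]["pages"]                   # database name is an assumption

# Archived items that migration 0042 would still need to process
remaining = pages.distinct("crawl_id", {"filename": None})
print(f"{len(remaining)} archived item(s) still need backfilling")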

backend/btrixcloud/models.py

Lines changed: 4 additions & 0 deletions
@@ -2493,6 +2493,10 @@ class Page(BaseMongoModel):
     loadState: Optional[int] = None
     status: Optional[int] = None
     mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False

     # manual review
     userid: Optional[UUID] = None
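
All four new fields are optional, so existing documents without them still validate. A minimal sketch of how they behave on a pydantic model (PageFieldsSketch is a hypothetical stand-in, not a class from the codebase):

from typing import Optional

from pydantic import AnyHttpUrl, BaseModel


class PageFieldsSketch(BaseModel):
    """Hypothetical model carrying only the newly added Page fields."""

    filename: Optional[str] = None
    depth: Optional[int] = None
    favIconUrl: Optional[AnyHttpUrl] = None
    isSeed: Optional[bool] = False


# Older documents without the new fields still parse, defaulting to None/False
PageFieldsSketch()

# New documents get a validated favicon URL plus explicit seed/depth info
page = PageFieldsSketch(
    filename="manual-20250101-example.wacz",
    depth=0,
    favIconUrl="https://example.com/favicon.ico",
    isSeed=True,
)
print(page.isSeed, page.depth)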

backend/btrixcloud/pages.py

Lines changed: 53 additions & 0 deletions
@@ -1,6 +1,7 @@
 """crawl pages"""

 import asyncio
+import os
 import traceback
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union

@@ -83,6 +84,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):

             if len(pages_buffer) > batch_size:
                 await self._add_pages_to_db(crawl_id, pages_buffer)
+                pages_buffer = []

             pages_buffer.append(
                 self._get_page_from_dict(page_dict, crawl_id, crawl.oid)

@@ -100,6 +102,53 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)

+    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
+        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
+        try:
+            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
+            if not crawl.resources:
+                return
+
+            for wacz_file in crawl.resources:
+                # Strip oid directory from filename
+                filename = os.path.basename(wacz_file.name)
+
+                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
+                for page_dict in stream:
+                    if not page_dict.get("url"):
+                        continue
+
+                    page_id = page_dict.get("id")
+
+                    if not page_id:
+                        continue
+
+                    if page_id:
+                        try:
+                            page_id = UUID(page_id)
+                        # pylint: disable=broad-exception-caught
+                        except Exception:
+                            continue
+
+                    await self.pages.find_one_and_update(
+                        {"_id": page_id},
+                        {
+                            "$set": {
+                                "filename": filename,
+                                "depth": page_dict.get("depth"),
+                                "isSeed": page_dict.get("seed", False),
+                                "favIconUrl": page_dict.get("favIconUrl"),
+                            }
+                        },
+                    )
+        # pylint: disable=broad-exception-caught, raise-missing-from
+        except Exception as err:
+            traceback.print_exc()
+            print(
+                f"Error adding filename to pages from item {crawl_id} to db: {err}",
+                flush=True,
+            )
+
     def _get_page_from_dict(
         self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
     ) -> Page:

@@ -127,6 +176,10 @@ def _get_page_from_dict(
             loadState=page_dict.get("loadState"),
             status=status,
             mime=page_dict.get("mime", "text/html"),
+            filename=page_dict.get("filename"),
+            depth=page_dict.get("depth"),
+            isSeed=page_dict.get("seed", False),
+            favIconUrl=page_dict.get("favIconUrl"),
             ts=(str_to_date(ts) if ts else dt_now()),
         )
         p.compute_page_type()
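
The "Strip oid directory" comment refers to WACZ resources being stored under an org-scoped prefix; os.path.basename keeps only the final path component, so the stored filename matches what the WACZ is actually called. A tiny illustration (the prefixed name is made up):

import os

# Illustrative stored name of the form "<oid>/<wacz filename>"
print(os.path.basename("4f6e9a2b-1c3d-4e5f-8a7b-9c0d1e2f3a4b/manual-20250101-example.wacz"))
# -> "manual-20250101-example.wacz"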

backend/btrixcloud/storages.py

Lines changed: 3 additions & 1 deletion
@@ -619,7 +619,9 @@ def stream_page_lines(

            line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
            for line in line_iter:
-                yield _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json["filename"] = os.path.basename(wacz_filename)
+                yield page_json

        page_generators: List[Iterator[Dict[Any, Any]]] = []
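
With this change, every page record streamed out of a WACZ is tagged with the basename of the WACZ it came from, which _get_page_from_dict above then stores on the page. The same tag-while-streaming pattern in isolation, as a minimal sketch (the function and parameter names here are illustrative, not the storage API):

import json
import os
from typing import Any, Dict, Iterator


def tag_page_lines(lines: Iterator[bytes], wacz_path: str) -> Iterator[Dict[str, Any]]:
    """Decode JSONL page records and tag each one with its source WACZ filename."""
    for line in lines:
        record = json.loads(line.decode("utf-8", errors="ignore"))
        record["filename"] = os.path.basename(wacz_path)
        yield record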

backend/test/test_run_crawl.py

Lines changed: 16 additions & 0 deletions
@@ -673,6 +673,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)

@@ -694,6 +698,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page.get("title") or page.get("title") is None
         assert page["loadState"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)

@@ -794,6 +802,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page.get("title") or page.get("title") is None
         assert page["loadState"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)

@@ -876,6 +888,10 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
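
The new depth check uses `is not None` rather than a plain truthiness assert because seed pages have a depth of 0, which is falsy in Python; the isSeed assertion mirrors the existing boolean-field checks.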

backend/test/test_uploads.py

Lines changed: 2 additions & 0 deletions
@@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
         assert page["crawl_id"] == upload_id
         assert page["url"]
         assert page["ts"]
+        assert page["filename"]
         assert page.get("title") or page.get("title") is None

     page_id = pages[0]["id"]

@@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
     assert page["crawl_id"]
     assert page["url"]
     assert page["ts"]
+    assert page["filename"]
     assert page.get("title") or page.get("title") is None

     assert page["notes"] == []

chart/templates/backend.yaml

Lines changed: 2 additions & 2 deletions
@@ -123,8 +123,8 @@ spec:
          httpGet:
            path: /healthzStartup
            port: 8000
-          periodSeconds: 5
-          failureThreshold: 60
+          periodSeconds: 10
+          failureThreshold: 8640
          successThreshold: 1

        readinessProbe:
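
With periodSeconds: 10 and failureThreshold: 8640, the startup probe now tolerates up to 10 × 8640 = 86,400 seconds (24 hours) before the container is restarted, up from the previous 5 × 60 = 300 seconds, giving the page backfill migration time to finish during startup.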
