From 9c7c9203eec45a25708e7b3e1e860f9ab9b87d5c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 26 Nov 2024 17:15:43 -0500 Subject: [PATCH 01/57] Add list endpoint to get sorted list of URLs in collection Response is sorted desc by page count match and includes an array containing page_id, ts, and status for each snapshot with that URL. --- backend/btrixcloud/colls.py | 109 +++++++++++++++++++++++++++++++++++ backend/btrixcloud/models.py | 27 ++++++++- 2 files changed, 135 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 411e659ac9..f0035d803a 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -33,6 +33,9 @@ OrgPublicCollections, PublicOrgDetails, CollAccessType, + PageUrlCount, + PageIdTimestamp, + PaginatedPageUrlCountResponse, ) from .utils import dt_now @@ -60,6 +63,7 @@ def __init__(self, mdb, storage_ops, orgs, event_webhook_ops): self.collections = mdb["collections"] self.crawls = mdb["crawls"] self.crawl_configs = mdb["crawl_configs"] + self.pages = mdb["pages"] self.crawl_ops = cast(CrawlOps, None) self.orgs = orgs @@ -312,6 +316,17 @@ async def get_collection_search_values(self, org: Organization): names = [name for name in names if name] return {"names": names} + async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]: + """Return list of crawl ids in collection""" + crawl_ids = [] + async for crawl_raw in self.crawls.find( + {"collectionIds": coll_id}, projection=["_id"] + ): + crawl_id = crawl_raw.get("_id") + if crawl_id: + crawl_ids.append(crawl_id) + return crawl_ids + async def delete_collection(self, coll_id: UUID, org: Organization): """Delete collection and remove from associated crawls.""" await self.crawl_ops.remove_collection_from_all_crawls(coll_id) @@ -422,6 +437,78 @@ async def get_org_public_collections(self, org_slug: str): return OrgPublicCollections(org=public_org_details, collections=collections) + async def list_urls_in_collection( + self, + coll_id: UUID, + oid: UUID, + url_prefix: Optional[str] = None, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + ) -> Tuple[List[PageUrlCount], int]: + """List all URLs in collection sorted desc by snapshot count""" + # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements + # Zero-index page for query + page = page - 1 + skip = page_size * page + + crawl_ids = await self.get_collection_crawl_ids(coll_id) + + match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}} + + if url_prefix: + regex_pattern = f"^{url_prefix}" + match_query["url"] = {"$regex": regex_pattern, "$options": "i"} + + aggregate = [{"$match": match_query}] + + aggregate.extend( + [ + { + "$group": { + "_id": "$url", + "pages": {"$push": "$$ROOT"}, + "count": {"$sum": 1}, + }, + }, + {"$sort": {"count": -1}}, + {"$set": {"url": "$_id"}}, + { + "$facet": { + "items": [ + {"$skip": skip}, + {"$limit": page_size}, + ], + "total": [{"$count": "count"}], + } + }, + ] + ) + + # Get total + cursor = self.pages.aggregate(aggregate) + results = await cursor.to_list(length=1) + result = results[0] + items = result["items"] + + try: + total = int(result["total"][0]["count"]) + except (IndexError, ValueError): + total = 0 + + return [ + PageUrlCount( + url=data.get("url", ""), + count=data.get("count", 0), + snapshots=[ + PageIdTimestamp( + pageId=p["_id"], ts=p.get("ts"), status=p.get("status", 200) + ) + for p in data.get("pages", []) + ], + ) + for data in items + ], total + # 
============================================================================ # pylint: disable=too-many-locals @@ -617,4 +704,26 @@ async def download_collection( async def get_org_public_collections(org_slug: str): return await colls.get_org_public_collections(org_slug) + @app.get( + "/orgs/{oid}/collections/{coll_id}/urls", + tags=["collections"], + response_model=PaginatedPageUrlCountResponse, + ) + async def get_collection_url_list( + coll_id: UUID, + oid: UUID, + urlPrefix: Optional[str] = None, + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + ): + """Retrieve paginated list of urls in collection sorted by snapshot count""" + pages, total = await ops.list_urls_in_collection( + coll_id=coll_id, + oid=oid, + url_prefix=urlPrefix, + page_size=pageSize, + page=page, + ) + return paginated_format(pages, total, page, pageSize) + return colls diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 93e708c4d0..126b67928f 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2240,7 +2240,7 @@ class PageWithAllQA(Page): class PageOut(Page): """Model for pages output, no QA""" - status: Optional[int] = 200 + status: int = 200 # ============================================================================ @@ -2266,6 +2266,24 @@ class PageNoteUpdatedResponse(BaseModel): data: PageNote +# ============================================================================ +class PageIdTimestamp(BaseModel): + """Simplified model for page info to include in PageUrlCount""" + + pageId: UUID + ts: Optional[datetime] = None + status: int = 200 + + +# ============================================================================ +class PageUrlCount(BaseModel): + """Model for counting pages by URL""" + + url: AnyHttpUrl + count: int = 0 + snapshots: List[PageIdTimestamp] = [] + + # ============================================================================ ### GENERIC RESPONSE MODELS ### @@ -2512,3 +2530,10 @@ class PaginatedUserEmailsResponse(PaginatedResponse): """Response model for user emails with org info""" items: List[UserEmailWithOrgInfo] + + +# ============================================================================ +class PaginatedPageUrlCountResponse(PaginatedResponse): + """Response model for page count by url""" + + items: List[PageUrlCount] From 2041be5553f472eb6e732489619b7c8c9d7788bc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 27 Nov 2024 11:00:04 -0500 Subject: [PATCH 02/57] Add endpoint to set or update collection home url --- backend/btrixcloud/colls.py | 37 ++++++++++++++++++++++++++++++++++++ backend/btrixcloud/main.py | 2 ++ backend/btrixcloud/models.py | 11 +++++++++++ backend/btrixcloud/ops.py | 2 ++ 4 files changed, 52 insertions(+) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index f0035d803a..1d3fbac26b 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -36,6 +36,7 @@ PageUrlCount, PageIdTimestamp, PaginatedPageUrlCountResponse, + UpdateCollHomeUrl, ) from .utils import dt_now @@ -74,6 +75,10 @@ def set_crawl_ops(self, ops): """set crawl ops""" self.crawl_ops = ops + def set_page_ops(self, ops): + """set page ops""" + self.page_ops = ops + async def init_index(self): """init lookup index""" await self.collections.create_index( @@ -509,6 +514,26 @@ async def list_urls_in_collection( for data in items ], total + async def set_home_url( + self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization + ) -> Dict[str, bool]: + """Set home URL for collection and save 
thumbnail to database""" + page = await self.page_ops.get_page(update.pageId, org.id) + + update_query = { + "homeUrl": page.url, + "homeUrlTs": page.ts, + "homeUrlPageId": page.id, + } + + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": query}, + return_document=pymongo.ReturnDocument.AFTER, + ) + + return {"success": True} + # ============================================================================ # pylint: disable=too-many-locals @@ -726,4 +751,16 @@ async def get_collection_url_list( ) return paginated_format(pages, total, page, pageSize) + @app.post( + "/orgs/{oid}/collections/{coll_id}/home-url", + tags=["collections"], + response_model=SuccessResponse, + ) + async def set_collection_home_url( + update: UpdateCollHomeUrl, + coll_id: UUID, + org: Organization = Depends(org_crawl_dep), + ): + return await colls.set_home_url(coll_id, update, org) + return colls diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index f6b678cb82..0abd384316 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -260,6 +260,8 @@ def main() -> None: crawl_config_ops.set_coll_ops(coll_ops) + coll_ops.set_page_ops(page_ops) + # run only in first worker if run_once_lock("btrix-init-db"): asyncio.create_task( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 126b67928f..51be8c43a1 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1082,6 +1082,10 @@ class Collection(BaseMongoModel): access: CollAccessType = CollAccessType.PRIVATE + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + homeUrlPageId: Optional[UUID] = None + # ============================================================================ class CollIn(BaseModel): @@ -1110,6 +1114,13 @@ class UpdateColl(BaseModel): access: Optional[CollAccessType] = None +# ============================================================================ +class UpdateCollHomeUrl(BaseModel): + """Update home url for collection""" + + pageId: UUID + + # ============================================================================ class AddRemoveCrawlList(BaseModel): """Collections to add or remove from collection""" diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index 23629de2aa..32e5e5fee1 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -109,6 +109,8 @@ def init_ops() -> Tuple[ crawl_config_ops.set_coll_ops(coll_ops) + coll_ops.set_page_ops(page_ops) + return ( org_ops, crawl_config_ops, From 02a7d1e6a436a368f4e9594523e9e3978786bfab Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 27 Nov 2024 16:15:57 -0500 Subject: [PATCH 03/57] Fixups --- backend/btrixcloud/colls.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 1d3fbac26b..e65b3c85a0 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -4,7 +4,7 @@ from collections import Counter from uuid import UUID, uuid4 -from typing import Optional, List, TYPE_CHECKING, cast, Dict +from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple import asyncio import pymongo @@ -53,7 +53,7 @@ class CollectionOps: """ops for working with named collections of crawls""" - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods orgs: OrgOps storage_ops: StorageOps @@ -77,6 +77,7 @@ def set_crawl_ops(self, ops): def 
set_page_ops(self, ops): """set page ops""" + # pylint: disable=attribute-defined-outside-init self.page_ops = ops async def init_index(self): @@ -528,8 +529,7 @@ async def set_home_url( await self.collections.find_one_and_update( {"_id": coll_id, "oid": org.id}, - {"$set": query}, - return_document=pymongo.ReturnDocument.AFTER, + {"$set": update_query}, ) return {"success": True} @@ -742,7 +742,7 @@ async def get_collection_url_list( page: int = 1, ): """Retrieve paginated list of urls in collection sorted by snapshot count""" - pages, total = await ops.list_urls_in_collection( + pages, total = await colls.list_urls_in_collection( coll_id=coll_id, oid=oid, url_prefix=urlPrefix, From d75c4af6f4f89bb068add5993b804d9d6be4aebc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 27 Nov 2024 17:03:23 -0500 Subject: [PATCH 04/57] Use updated response for /home-urls endpoint --- backend/btrixcloud/colls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index e65b3c85a0..ff23f68f32 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -532,7 +532,7 @@ async def set_home_url( {"$set": update_query}, ) - return {"success": True} + return {"updated": True} # ============================================================================ @@ -754,7 +754,7 @@ async def get_collection_url_list( @app.post( "/orgs/{oid}/collections/{coll_id}/home-url", tags=["collections"], - response_model=SuccessResponse, + response_model=UpdatedResponse, ) async def set_collection_home_url( update: UpdateCollHomeUrl, From 867fc7aab79d8b47a6ad619d1b1aaf9848e164bf Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 27 Nov 2024 17:03:33 -0500 Subject: [PATCH 05/57] Add tests --- backend/test/test_collections.py | 67 ++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 4faf42540d..f76d842bef 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -844,6 +844,73 @@ def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers assert data["collections"] == [] +def test_set_collection_home_url( + crawler_auth_headers, default_org_id, crawler_crawl_id +): + # Get a page id from crawler_crawl_id + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] >= 1 + + page = data["items"][0] + assert page + + page_id = page["id"] + assert page_id + + page_url = page["url"] + page_ts = page["ts"] + + # Set page as home url + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url", + headers=crawler_auth_headers, + json={"pageId": page_id}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Check that fields were set in collection as expected + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["homeUrl"] == page_url + assert data["homeUrlTs"] == page_ts + assert data["homeUrlPageId"] == page_id + + +def test_collection_url_list(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/urls", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert 
data["total"] >= 1 + urls = data["items"] + assert urls + + for url in urls: + assert url["url"] + assert url["count"] >= 1 + + snapshots = url["snapshots"] + assert snapshots + + for snapshot in snapshots: + assert snapshot["pageId"] + assert snapshot["ts"] + assert snapshot["status"] + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( From 52404c6c3f88d0998aff024354f7256550ebc186 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 28 Nov 2024 15:09:49 -0500 Subject: [PATCH 06/57] WIP: Add collection thumbnail upload Also reorganizes some utility classes and collection get methods. Caveats: - Generates a new presigned url for the thumbnail image each time CollOut is created (may want to do something similar to crawl files and track expiration date) --- backend/btrixcloud/basecrawls.py | 19 +--- backend/btrixcloud/colls.py | 135 ++++++++++++++++++---- backend/btrixcloud/main.py | 4 +- backend/btrixcloud/models.py | 190 ++++++++++++++++++++++++++++++- backend/btrixcloud/uploads.py | 43 +------ 5 files changed, 313 insertions(+), 78 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index d913d3362b..5106487610 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -1,6 +1,5 @@ """ base crawl type """ -import os from datetime import timedelta from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple from uuid import UUID @@ -29,6 +28,7 @@ UpdatedResponse, DeletedResponseQuota, CrawlSearchValuesResponse, + PRESIGN_DURATION_SECONDS, ) from .pagination import paginated_format, DEFAULT_PAGE_SIZE from .utils import dt_now, date_to_str @@ -47,11 +47,6 @@ CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object StorageOps = EventWebhookOps = BackgroundJobOps = object -# Presign duration must be less than 604800 seconds (one week), -# so set this one minute short of a week. 
-PRESIGN_MINUTES_MAX = 10079 -PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX - # ============================================================================ # pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines @@ -93,16 +88,8 @@ def __init__( self.background_job_ops = background_job_ops self.page_ops = cast(PageOps, None) - presign_duration_minutes = int( - os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT - ) - - self.presign_duration_seconds = ( - min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60 - ) - # renew when <25% of time remaining - self.expire_at_duration_seconds = int(self.presign_duration_seconds * 0.75) + self.expire_at_duration_seconds = int(PRESIGN_DURATION_SECONDS * 0.75) def set_page_ops(self, page_ops): """set page ops reference""" @@ -474,7 +461,7 @@ async def resolve_signed_urls( ): exp = now + delta presigned_url = await self.storage_ops.get_presigned_url( - org, file_, self.presign_duration_seconds + org, file_, PRESIGN_DURATION_SECONDS ) prefix = "files" diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index ff23f68f32..f9a64df2cd 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -4,12 +4,14 @@ from collections import Counter from uuid import UUID, uuid4 -from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple +from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any +import os import asyncio import pymongo from fastapi import Depends, HTTPException, Response from fastapi.responses import StreamingResponse +from starlette.requests import Request from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .models import ( @@ -29,6 +31,7 @@ EmptyResponse, UpdatedResponse, SuccessResponse, + AddedResponse, CollectionSearchValuesResponse, OrgPublicCollections, PublicOrgDetails, @@ -37,6 +40,10 @@ PageIdTimestamp, PaginatedPageUrlCountResponse, UpdateCollHomeUrl, + User, + ImageFile, + ImageFilePreparer, + MIN_UPLOAD_PART_SIZE, ) from .utils import dt_now @@ -170,7 +177,7 @@ async def add_crawls_to_collection( ) ) - return await self.get_collection(coll_id, org) + return await self.get_collection_out(coll_id, org) async def remove_crawls_from_collection( self, coll_id: UUID, crawl_ids: List[str], org: Organization @@ -194,12 +201,12 @@ async def remove_crawls_from_collection( ) ) - return await self.get_collection(coll_id, org) + return await self.get_collection_out(coll_id, org) - async def get_collection( - self, coll_id: UUID, org: Organization, resources=False, public_only=False - ) -> CollOut: - """Get collection by id""" + async def get_collection_raw( + self, coll_id: UUID, public_only: bool = False + ) -> Dict[str, Any]: + """Get collection by id as dict from database""" query: dict[str, object] = {"_id": coll_id} if public_only: query["access"] = {"$in": ["public", "unlisted"]} @@ -208,10 +215,31 @@ async def get_collection( if not result: raise HTTPException(status_code=404, detail="collection_not_found") + return result + + async def get_collection( + self, coll_id: UUID, public_only: bool = False + ) -> Collection: + """Get collection by id""" + result = await self.get_collection_raw(coll_id, public_only) + return Collection.from_dict(result) + + async def get_collection_out( + self, coll_id: UUID, org: Organization, resources=False, public_only=False + ) -> CollOut: + """Get CollOut by id""" + result = await self.get_collection_raw(coll_id, public_only) + if resources: - result["resources"] = await 
self.get_collection_crawl_resources( - coll_id, org + result["resources"] = await self.get_collection_crawl_resources(coll_id) + + thumbnail = result.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + result["thumbnail"] = await image_file.get_image_file_out( + org, self.storage_ops ) + return CollOut.from_dict(result) async def list_collections( @@ -283,11 +311,10 @@ async def list_collections( return collections, total - async def get_collection_crawl_resources(self, coll_id: UUID, org: Organization): + async def get_collection_crawl_resources(self, coll_id: UUID): """Return pre-signed resources for all collection crawl files.""" - coll = await self.get_collection(coll_id, org) - if not coll: - raise HTTPException(status_code=404, detail="collection_not_found") + # Ensure collection exists + _ = await self.get_collection_raw(coll_id) all_files = [] @@ -349,7 +376,7 @@ async def delete_collection(self, coll_id: UUID, org: Organization): async def download_collection(self, coll_id: UUID, org: Organization): """Download all WACZs in collection as streaming nested WACZ""" - coll = await self.get_collection(coll_id, org, resources=True) + coll = await self.get_collection_out(coll_id, org, resources=True) metadata = { "type": "collection", @@ -534,10 +561,64 @@ async def set_home_url( return {"updated": True} + async def upload_thumbnail_stream( + self, stream, filename: str, coll_id: UUID, org: Organization, user: User + ) -> Dict[str, bool]: + """Upload file as stream to use as collection thumbnail""" + coll = await self.get_collection(coll_id) + + _, extension = os.path.splitext(filename) + + image_filename = f"thumbnail-{str(coll_id)}{extension}" + + prefix = org.storage.get_storage_extra_path(str(org.id)) + "images/" + + file_prep = ImageFilePreparer( + prefix, + image_filename, + original_filename=filename, + user=user, + created=dt_now(), + ) + + async def stream_iter(): + """iterate over each chunk and compute and digest + total size""" + async for chunk in stream: + file_prep.add_chunk(chunk) + yield chunk + + print("Collection Thumbnail Stream Upload Start", flush=True) + + if not await self.storage_ops.do_upload_multipart( + org, + file_prep.upload_name, + stream_iter(), + MIN_UPLOAD_PART_SIZE, + ): + print("Collection Thumbnail Stream Upload Failed", flush=True) + raise HTTPException(status_code=400, detail="upload_failed") + + print("Collection Thumbnail Stream Upload Complete", flush=True) + + thumbnail_file = file_prep.get_image_file(org.storage) + + if coll.thumbnail: + if not await self.storage_ops.delete_crawl_file_object(org, coll.thumbnail): + print( + f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}" + ) + + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": {"thumbnail": dict(thumbnail_file)}}, + ) + + return {"added": True} + # ============================================================================ # pylint: disable=too-many-locals -def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops): +def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_dep): """init collections api""" # pylint: disable=invalid-name, unused-argument, too-many-arguments @@ -595,7 +676,7 @@ async def get_collection_all(org: Organization = Depends(org_viewer_dep)): all_collections, _ = await colls.list_collections(org.id, page_size=10_000) for collection in all_collections: results[collection.name] = await colls.get_collection_crawl_resources( - collection.id, org + 
collection.id ) except Exception as exc: # pylint: disable=raise-missing-from @@ -623,7 +704,7 @@ async def get_collection_search_values( async def get_collection( coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): - return await colls.get_collection(coll_id, org) + return await colls.get_collection_out(coll_id, org) @app.get( "/orgs/{oid}/collections/{coll_id}/replay.json", @@ -633,7 +714,7 @@ async def get_collection( async def get_collection_replay( coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): - return await colls.get_collection(coll_id, org, resources=True) + return await colls.get_collection_out(coll_id, org, resources=True) @app.get( "/orgs/{oid}/collections/{coll_id}/public/replay.json", @@ -645,7 +726,7 @@ async def get_collection_public_replay( coll_id: UUID, org: Organization = Depends(org_public), ): - coll = await colls.get_collection( + coll = await colls.get_collection_out( coll_id, org, resources=True, public_only=True ) response.headers["Access-Control-Allow-Origin"] = "*" @@ -763,4 +844,20 @@ async def set_collection_home_url( ): return await colls.set_home_url(coll_id, update, org) + @app.put( + "/orgs/{oid}/collections/{coll_id}/upload/thumbnail", + tags=["collections"], + response_model=AddedResponse, + ) + async def upload_stream( + request: Request, + filename: str, + coll_id: UUID, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), + ): + return await colls.upload_thumbnail_stream( + request.stream(), filename, coll_id, org, user + ) + return colls diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 0abd384316..7119214fc5 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -223,7 +223,9 @@ def main() -> None: profiles, ) - coll_ops = init_collections_api(app, mdb, org_ops, storage_ops, event_webhook_ops) + coll_ops = init_collections_api( + app, mdb, org_ops, storage_ops, event_webhook_ops, current_active_user + ) base_crawl_init = ( app, diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 51be8c43a1..a7b1c13e8a 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -5,6 +5,9 @@ from datetime import datetime from enum import Enum, IntEnum from uuid import UUID +import base64 +import hashlib +import mimetypes import os from typing import Optional, List, Dict, Union, Literal, Any, get_args @@ -21,6 +24,7 @@ BeforeValidator, TypeAdapter, ) +from pathvalidate import sanitize_filename # from fastapi_users import models as fastapi_users_models @@ -29,6 +33,20 @@ # crawl scale for constraint MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) +# Presign duration must be less than 604800 seconds (one week), +# so set this one minute short of a week +PRESIGN_MINUTES_MAX = 10079 +PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX + +# Expire duration seconds for presigned urls +PRESIGN_DURATION_MINUTES = int( + os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT +) +PRESIGN_DURATION_SECONDS = min(PRESIGN_DURATION_MINUTES, PRESIGN_MINUTES_MAX) * 60 + +# Minimum part size for file uploads +MIN_UPLOAD_PART_SIZE = 10000000 + # annotated types # ============================================================================ @@ -1050,6 +1068,155 @@ class UpdateUpload(UpdateCrawl): """Update modal that also includes name""" +# ============================================================================ +class FilePreparer: + """wrapper to compute digest / name for streaming upload""" + + def __init__(self, 
prefix, filename): + self.upload_size = 0 + self.upload_hasher = hashlib.sha256() + self.upload_name = prefix + self.prepare_filename(filename) + + def add_chunk(self, chunk): + """add chunk for file""" + self.upload_size += len(chunk) + self.upload_hasher.update(chunk) + + def get_crawl_file(self, storage: StorageRef): + """get crawl file""" + return CrawlFile( + filename=self.upload_name, + hash=self.upload_hasher.hexdigest(), + size=self.upload_size, + storage=storage, + ) + + def prepare_filename(self, filename): + """prepare filename by sanitizing and adding extra string + to avoid duplicates""" + name = sanitize_filename(filename.rsplit("/", 1)[-1]) + parts = name.split(".") + randstr = base64.b32encode(os.urandom(5)).lower() + parts[0] += "-" + randstr.decode("utf-8") + return ".".join(parts) + + +# ============================================================================ + +### USER-UPLOADED IMAGES ### + + +# ============================================================================ +class ImageFileOut(BaseModel): + """output for user-upload imaged file (conformance to Data Resource Spec)""" + + name: str + path: str + hash: str + size: int + + originalFilename: str + mime: str + userid: UUID + userName: str + created: datetime + + +# ============================================================================ +# class PublicImageFileOut(BaseModel): +# """public output for user-upload imaged file (conformance to Data Resource Spec)""" + +# name: str +# path: str +# hash: str +# size: int + +# mime: str + + +# ============================================================================ +class ImageFile(BaseFile): + """User-uploaded image file""" + + originalFilename: str + mime: str + userid: UUID + userName: str + created: datetime + + async def get_image_file_out(self, org, storage_ops) -> ImageFileOut: + """Get ImageFileOut with new presigned url""" + presigned_url = await storage_ops.get_presigned_url( + org, self, PRESIGN_DURATION_SECONDS + ) + + return ImageFileOut( + name=self.filename, + path=presigned_url or "", + hash=self.hash, + size=self.size, + originalFilename=self.originalFilename, + mime=self.mime, + userid=self.userid, + userName=self.userName, + created=self.created, + ) + + # async def get_public_image_file_out(self, org, storage_ops) -> PublicImageFileOut: + # """Get PublicImageFileOut with new presigned url""" + # presigned_url = await storage_ops.get_presigned_url( + # org, self, PRESIGN_DURATION_SECONDS + # ) + + # return PublicImageFileOut( + # name=self.filename, + # path=presigned_url or "", + # hash=self.hash, + # size=self.size, + # mime=self.mime, + # ) + + +# ============================================================================ +class ImageFilePreparer(FilePreparer): + """Wrapper for user image streaming uploads""" + + # pylint: disable=too-many-arguments, too-many-function-args + + def __init__( + self, + prefix, + filename, + original_filename: str, + user: User, + created: datetime, + ): + super().__init__(prefix, filename) + + self.original_filename = original_filename + self.mime, _ = mimetypes.guess_type(original_filename) or ("image/jpeg", None) + self.userid = user.id + self.user_name = user.name + self.created = created + + def get_image_file( + self, + storage: StorageRef, + ) -> ImageFile: + """get user-uploaded image file""" + return ImageFile( + filename=self.upload_name, + hash=self.upload_hasher.hexdigest(), + size=self.upload_size, + storage=storage, + originalFilename=self.original_filename, + mime=self.mime, + 
userid=self.userid, + userName=self.user_name, + created=self.created, + ) + + # ============================================================================ ### COLLECTIONS ### @@ -1086,6 +1253,8 @@ class Collection(BaseMongoModel): homeUrlTs: Optional[datetime] = None homeUrlPageId: Optional[UUID] = None + thumbnail: Optional[ImageFile] = None + # ============================================================================ class CollIn(BaseModel): @@ -1099,10 +1268,29 @@ class CollIn(BaseModel): # ============================================================================ -class CollOut(Collection): +class CollOut(BaseMongoModel): """Collection output model with annotations.""" + name: str + oid: UUID + description: Optional[str] = None + modified: Optional[datetime] = None + + crawlCount: Optional[int] = 0 + pageCount: Optional[int] = 0 + totalSize: Optional[int] = 0 + + # Sorted by count, descending + tags: Optional[List[str]] = [] + + access: CollAccessType = CollAccessType.PRIVATE + + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + homeUrlPageId: Optional[UUID] = None + resources: List[CrawlFileOut] = [] + thumbnail: Optional[ImageFileOut] = None # ============================================================================ diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index ded0630719..7257c875e6 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -1,9 +1,6 @@ """ handle user uploads into browsertrix """ import uuid -import hashlib -import os -import base64 from urllib.parse import unquote from uuid import UUID @@ -13,7 +10,6 @@ from fastapi import Depends, UploadFile, File from fastapi import HTTPException from starlette.requests import Request -from pathvalidate import sanitize_filename from .basecrawls import BaseCrawlOps from .storages import CHUNK_SIZE @@ -27,18 +23,16 @@ Organization, PaginatedCrawlOutResponse, User, - StorageRef, UpdatedResponse, DeletedResponseQuota, AddedResponseIdQuota, + FilePreparer, + MIN_UPLOAD_PART_SIZE, ) from .pagination import paginated_format, DEFAULT_PAGE_SIZE from .utils import dt_now -MIN_UPLOAD_PART_SIZE = 10000000 - - # ============================================================================ class UploadOps(BaseCrawlOps): """upload ops""" @@ -224,39 +218,6 @@ async def delete_uploads( return {"deleted": True, "storageQuotaReached": quota_reached} -# ============================================================================ -class FilePreparer: - """wrapper to compute digest / name for streaming upload""" - - def __init__(self, prefix, filename): - self.upload_size = 0 - self.upload_hasher = hashlib.sha256() - self.upload_name = prefix + self.prepare_filename(filename) - - def add_chunk(self, chunk): - """add chunk for file""" - self.upload_size += len(chunk) - self.upload_hasher.update(chunk) - - def get_crawl_file(self, storage: StorageRef): - """get crawl file""" - return CrawlFile( - filename=self.upload_name, - hash=self.upload_hasher.hexdigest(), - size=self.upload_size, - storage=storage, - ) - - def prepare_filename(self, filename): - """prepare filename by sanitizing and adding extra string - to avoid duplicates""" - name = sanitize_filename(filename.rsplit("/", 1)[-1]) - parts = name.split(".") - randstr = base64.b32encode(os.urandom(5)).lower() - parts[0] += "-" + randstr.decode("utf-8") - return ".".join(parts) - - # ============================================================================ class UploadFileReader(BufferedReader): 
"""Compute digest on file upload""" From 130ee51ed2bd206dcceafd47a9785c8cf6516f13 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 28 Nov 2024 15:55:23 -0500 Subject: [PATCH 07/57] Add collection thumbnail upload test --- backend/btrixcloud/colls.py | 8 +- backend/test/data/thumbnail.jpg | 738 +++++++++++++++++++++++++++++++ backend/test/test_collections.py | 29 ++ 3 files changed, 773 insertions(+), 2 deletions(-) create mode 100644 backend/test/data/thumbnail.jpg diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index f9a64df2cd..22ad1e8d3d 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -608,9 +608,13 @@ async def stream_iter(): f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}" ) + coll.thumbnail = thumbnail_file + + # Update entire document to avoid bson.errors.InvalidDocument error + # with thumbnail await self.collections.find_one_and_update( {"_id": coll_id, "oid": org.id}, - {"$set": {"thumbnail": dict(thumbnail_file)}}, + {"$set": coll.to_dict()}, ) return {"added": True} @@ -845,7 +849,7 @@ async def set_collection_home_url( return await colls.set_home_url(coll_id, update, org) @app.put( - "/orgs/{oid}/collections/{coll_id}/upload/thumbnail", + "/orgs/{oid}/collections/{coll_id}/stream/thumbnail", tags=["collections"], response_model=AddedResponse, ) diff --git a/backend/test/data/thumbnail.jpg b/backend/test/data/thumbnail.jpg new file mode 100644 index 0000000000..e746e341f2 --- /dev/null +++ b/backend/test/data/thumbnail.jpg @@ -0,0 +1,738 @@ + + + + + + + + + + + + + + + + + + + ReplayWeb.page Docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ [HTML markup of the ReplayWeb.page Docs "404 - Not found" page used as the thumbnail.jpg test fixture; tags not reproduced]
+ 404 - Not found
+ + + + + + + + + + \ No newline at end of file diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index f76d842bef..f2a41c00e0 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -911,6 +911,35 @@ def test_collection_url_list(crawler_auth_headers, default_org_id): assert snapshot["status"] +def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id): + with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh: + r = requests.put( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/stream/thumbnail?filename=thumbnail.jpg", + headers=crawler_auth_headers, + data=read_in_chunks(fh), + ) + assert r.status_code == 200 + assert r.json()["added"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + thumbnail = r.json()["thumbnail"] + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] > 0 + + assert thumbnail["originalFilename"] == "thumbnail.jpg" + assert thumbnail["mime"] == "image/jpeg" + assert thumbnail["userid"] + assert thumbnail["userName"] + assert thumbnail["created"] + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( From 3b26828396f0b6c687cf3fc0e5b232faeac1b0b8 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 28 Nov 2024 16:50:53 -0500 Subject: [PATCH 08/57] WIP: Only share subset of coll and image data for public endpoint TODO: - Add data ranges to collections (maybe precompute on base model) - Add caption for public collections? --- backend/btrixcloud/colls.py | 66 +++++++++++++++++++++++++------ backend/btrixcloud/models.py | 58 +++++++++++++++++---------- backend/test/test_collections.py | 67 +++++++++++++++++++++++++++++--- 3 files changed, 153 insertions(+), 38 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 22ad1e8d3d..cdaece5ce3 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -4,7 +4,7 @@ from collections import Counter from uuid import UUID, uuid4 -from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any +from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union import os import asyncio @@ -44,6 +44,7 @@ ImageFile, ImageFilePreparer, MIN_UPLOAD_PART_SIZE, + PublicCollOut, ) from .utils import dt_now @@ -244,7 +245,8 @@ async def get_collection_out( async def list_collections( self, - oid: UUID, + org: Organization, + public_colls_out: bool = False, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: Optional[str] = None, @@ -259,16 +261,17 @@ async def list_collections( page = page - 1 skip = page * page_size - match_query: dict[str, object] = {"oid": oid} + match_query: dict[str, object] = {"oid": org.id} if name: match_query["name"] = name - elif name_prefix: regex_pattern = f"^{name_prefix}" match_query["name"] = {"$regex": regex_pattern, "$options": "i"} - if access: + if public_colls_out: + match_query["access"] = CollAccessType.PUBLIC + elif access: match_query["access"] = access aggregate = [{"$match": match_query}] @@ -307,7 +310,22 @@ async def list_collections( except (IndexError, ValueError): total = 0 - collections = [CollOut.from_dict(res) for res in items] + collections: List[Union[CollOut, PublicCollOut]] = [] + + for res in items: + if public_colls_out: + res["resources"] = await 
self.get_collection_crawl_resources(res["_id"]) + + thumbnail = res.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + res["thumbnail"] = await image_file.get_public_image_file_out( + org, self.storage_ops + ) + + collections.append(PublicCollOut.from_dict(res)) + else: + collections.append(CollOut.from_dict(res)) return collections, total @@ -446,7 +464,14 @@ async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID): ) await self.update_crawl_collections(crawl_id) - async def get_org_public_collections(self, org_slug: str): + async def get_org_public_collections( + self, + org_slug: str, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sort_by: Optional[str] = None, + sort_direction: int = 1, + ): """List public collections for org""" try: org = await self.orgs.get_org_by_slug(org_slug) @@ -459,7 +484,12 @@ async def get_org_public_collections(self, org_slug: str): raise HTTPException(status_code=404, detail="public_profile_not_found") collections, _ = await self.list_collections( - org.id, access=CollAccessType.PUBLIC + org, + page_size=page_size, + page=page, + sort_by=sort_by, + sort_direction=sort_direction, + public_colls_out=True, ) public_org_details = PublicOrgDetails( @@ -658,7 +688,7 @@ async def list_collection_all( access: Optional[str] = None, ): collections, total = await colls.list_collections( - org.id, + org, page_size=pageSize, page=page, sort_by=sortBy, @@ -677,7 +707,7 @@ async def list_collection_all( async def get_collection_all(org: Organization = Depends(org_viewer_dep)): results = {} try: - all_collections, _ = await colls.list_collections(org.id, page_size=10_000) + all_collections, _ = await colls.list_collections(org, page_size=10_000) for collection in all_collections: results[collection.name] = await colls.get_collection_crawl_resources( collection.id @@ -811,8 +841,20 @@ async def download_collection( tags=["collections"], response_model=OrgPublicCollections, ) - async def get_org_public_collections(org_slug: str): - return await colls.get_org_public_collections(org_slug) + async def get_org_public_collections( + org_slug: str, + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sortBy: Optional[str] = None, + sortDirection: int = 1, + ): + return await colls.get_org_public_collections( + org_slug, + page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + ) @app.get( "/orgs/{oid}/collections/{coll_id}/urls", diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index a7b1c13e8a..ec5fed6f8c 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1123,15 +1123,15 @@ class ImageFileOut(BaseModel): # ============================================================================ -# class PublicImageFileOut(BaseModel): -# """public output for user-upload imaged file (conformance to Data Resource Spec)""" +class PublicImageFileOut(BaseModel): + """public output for user-upload imaged file (conformance to Data Resource Spec)""" -# name: str -# path: str -# hash: str -# size: int + name: str + path: str + hash: str + size: int -# mime: str + mime: str # ============================================================================ @@ -1162,19 +1162,19 @@ async def get_image_file_out(self, org, storage_ops) -> ImageFileOut: created=self.created, ) - # async def get_public_image_file_out(self, org, storage_ops) -> PublicImageFileOut: - # """Get PublicImageFileOut with new presigned url""" - # presigned_url = await storage_ops.get_presigned_url( - # 
org, self, PRESIGN_DURATION_SECONDS - # ) + async def get_public_image_file_out(self, org, storage_ops) -> PublicImageFileOut: + """Get PublicImageFileOut with new presigned url""" + presigned_url = await storage_ops.get_presigned_url( + org, self, PRESIGN_DURATION_SECONDS + ) - # return PublicImageFileOut( - # name=self.filename, - # path=presigned_url or "", - # hash=self.hash, - # size=self.size, - # mime=self.mime, - # ) + return PublicImageFileOut( + name=self.filename, + path=presigned_url or "", + hash=self.hash, + size=self.size, + mime=self.mime, + ) # ============================================================================ @@ -1293,6 +1293,24 @@ class CollOut(BaseMongoModel): thumbnail: Optional[ImageFileOut] = None +# ============================================================================ +class PublicCollOut(BaseMongoModel): + """Collection output model with annotations.""" + + name: str + description: Optional[str] = None + # caption: Optional[str] = None + + # earliestDate: Optional[datetime] = None + # latestDate: Optional[datetime] = None + + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + + resources: List[CrawlFileOut] = [] + thumbnail: Optional[PublicImageFileOut] = None + + # ============================================================================ class UpdateColl(BaseModel): """Update collection""" @@ -1366,7 +1384,7 @@ class OrgPublicCollections(BaseModel): org: PublicOrgDetails - collections: List[CollOut] = [] + collections: List[PublicCollOut] = [] # ============================================================================ diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index f2a41c00e0..5f3c5d2628 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -16,8 +16,10 @@ _coll_id = None _second_coll_id = None _public_coll_id = None +_second_public_coll_id = None upload_id = None modified = None +default_org_slug = None curr_dir = os.path.dirname(os.path.realpath(__file__)) @@ -742,11 +744,14 @@ def test_list_public_collections( json={ "crawlIds": [crawler_crawl_id], "name": "Second public collection", + "description": "Lorem ipsum", "access": "public", }, ) assert r.status_code == 200 - second_public_coll_id = r.json()["id"] + + global _second_public_coll_id + _second_public_coll_id = r.json()["id"] # Get default org slug r = requests.get( @@ -755,7 +760,10 @@ def test_list_public_collections( ) assert r.status_code == 200 data = r.json() - org_slug = data["slug"] + + global default_org_slug + default_org_slug = data["slug"] + org_name = data["name"] # Verify that public profile isn't enabled @@ -764,7 +772,7 @@ def test_list_public_collections( assert data["publicUrl"] == "" # Try listing public collections without org public profile enabled - r = requests.get(f"{API_PREFIX}/public-collections/{org_slug}") + r = requests.get(f"{API_PREFIX}/public-collections/{default_org_slug}") assert r.status_code == 404 assert r.json()["detail"] == "public_profile_not_found" @@ -795,7 +803,7 @@ def test_list_public_collections( assert data["publicUrl"] == public_url # List public collections with no auth (no public profile) - r = requests.get(f"{API_PREFIX}/public-collections/{org_slug}") + r = requests.get(f"{API_PREFIX}/public-collections/{default_org_slug}") assert r.status_code == 200 data = r.json() @@ -807,8 +815,8 @@ def test_list_public_collections( collections = data["collections"] assert len(collections) == 2 for collection in collections: - assert 
collection["id"] in (_public_coll_id, second_public_coll_id) - assert collection["access"] == "public" + assert collection["id"] in (_public_coll_id, _second_public_coll_id) + assert collection["name"] # Test non-existing slug - it should return a 404 but not reveal # whether or not an org exists with that slug @@ -940,6 +948,53 @@ def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id): assert thumbnail["created"] +def test_list_public_colls_home_url_thumbnail(): + # Check we get expected data for each public collection + # and nothing we don't expect + non_public_fields = ( + "oid", + "modified", + "crawlCount", + "pageCount", + "totalSize", + "tags", + "access", + "homeUrlPageId", + ) + non_public_image_fields = ("originalFilename", "userid", "userName", "created") + + r = requests.get(f"{API_PREFIX}/public-collections/{default_org_slug}") + assert r.status_code == 200 + collections = r.json()["collections"] + assert len(collections) == 2 + + for coll in collections: + assert coll["id"] in (_public_coll_id, _second_public_coll_id) + assert coll["name"] + assert coll["resources"] + + for field in non_public_fields: + assert field not in coll + + if coll["id"] == _public_coll_id: + assert coll["homeUrl"] + assert coll["homeUrlTs"] + + if coll["id"] == _second_public_coll_id: + assert coll["description"] + thumbnail = coll["thumbnail"] + assert thumbnail + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] + assert thumbnail["mime"] + + for field in non_public_image_fields: + assert field not in thumbnail + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( From ff64933751ce9b5d53633a6353598ce8b14ebf0a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 28 Nov 2024 18:15:05 -0500 Subject: [PATCH 09/57] Fix test that was checking wrong collection --- backend/test/test_collections.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 5f3c5d2628..716856f766 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -980,8 +980,6 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["homeUrl"] assert coll["homeUrlTs"] - if coll["id"] == _second_public_coll_id: - assert coll["description"] thumbnail = coll["thumbnail"] assert thumbnail @@ -994,6 +992,9 @@ def test_list_public_colls_home_url_thumbnail(): for field in non_public_image_fields: assert field not in thumbnail + if coll["id"] == _second_public_coll_id: + assert coll["description"] + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection From 0ee13d84162360daa096713cfc21cd25b27ffc86 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 10:43:25 -0500 Subject: [PATCH 10/57] Change path for thumbnail endpoint --- backend/btrixcloud/colls.py | 4 ++-- backend/test/test_collections.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index cdaece5ce3..8d8d27de0c 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -891,11 +891,11 @@ async def set_collection_home_url( return await colls.set_home_url(coll_id, update, org) @app.put( - "/orgs/{oid}/collections/{coll_id}/stream/thumbnail", + "/orgs/{oid}/collections/{coll_id}/thumbnail", tags=["collections"], response_model=AddedResponse, ) 
- async def upload_stream( + async def upload_thumbnail_stream( request: Request, filename: str, coll_id: UUID, diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 716856f766..ff38c46770 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -922,7 +922,7 @@ def test_collection_url_list(crawler_auth_headers, default_org_id): def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id): with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh: r = requests.put( - f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/stream/thumbnail?filename=thumbnail.jpg", + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail?filename=thumbnail.jpg", headers=crawler_auth_headers, data=read_in_chunks(fh), ) From ce52e87c403bad0eb88a917d3f62991d76b92b52 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 10:53:02 -0500 Subject: [PATCH 11/57] Add endpoint to delete collection thumbnail Also renames StorageOps.delete_crawl_file_object to simply delete_file_object, since we use it across all types of files we store in s3. --- backend/btrixcloud/basecrawls.py | 2 +- backend/btrixcloud/colls.py | 33 +++++++++++++++++++++++++++++++- backend/btrixcloud/crawls.py | 2 +- backend/btrixcloud/profiles.py | 2 +- backend/btrixcloud/storages.py | 4 +--- backend/test/test_collections.py | 23 ++++++++++++++++++++++ 6 files changed, 59 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 5106487610..063ecb9dec 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -369,7 +369,7 @@ async def _delete_crawl_files( size = 0 for file_ in crawl.files: size += file_.size - if not await self.storage_ops.delete_crawl_file_object(org, file_): + if not await self.storage_ops.delete_file_object(org, file_): raise HTTPException(status_code=400, detail="file_deletion_error") # Not replicating QA run WACZs yet if not isinstance(crawl, QARun): diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 8d8d27de0c..6704081c55 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -32,6 +32,7 @@ UpdatedResponse, SuccessResponse, AddedResponse, + DeletedResponse, CollectionSearchValuesResponse, OrgPublicCollections, PublicOrgDetails, @@ -633,7 +634,7 @@ async def stream_iter(): thumbnail_file = file_prep.get_image_file(org.storage) if coll.thumbnail: - if not await self.storage_ops.delete_crawl_file_object(org, coll.thumbnail): + if not await self.storage_ops.delete_file_object(org, coll.thumbnail): print( f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}" ) @@ -649,6 +650,25 @@ async def stream_iter(): return {"added": True} + async def delete_thumbnail(self, coll_id: UUID, org: Organization): + """Delete collection thumbnail""" + coll = await self.get_collection(coll_id) + + if not coll.thumbnail: + raise HTTPException(status_code=404, detail="thumbnail_not_found") + + if not await self.storage_ops.delete_file_object(org, coll.thumbnail): + print(f"Unable to delete collection thumbnail: {coll.thumbnail.filename}") + raise HTTPException(status_code=400, detail="file_deletion_error") + + # Delete from database + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": {"thumbnail": None}}, + ) + + return {"deleted": True} + # ============================================================================ # 
pylint: disable=too-many-locals @@ -906,4 +926,15 @@ async def upload_thumbnail_stream( request.stream(), filename, coll_id, org, user ) + @app.delete( + "/orgs/{oid}/collections/{coll_id}/thumbnail", + tags=["collections"], + response_model=DeletedResponse, + ) + async def delete_thumbnail_stream( + coll_id: UUID, + org: Organization = Depends(org_crawl_dep), + ): + return await colls.delete_thumbnail(coll_id, org) + return colls diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 5a0994fe70..539c408ee6 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -918,7 +918,7 @@ async def delete_crawl_qa_run_files( """delete crawl qa wacz files""" qa_run = await self.get_qa_run(crawl_id, qa_run_id, org) for file_ in qa_run.files: - if not await self.storage_ops.delete_crawl_file_object(org, file_): + if not await self.storage_ops.delete_file_object(org, file_): raise HTTPException(status_code=400, detail="file_deletion_error") # Not replicating QA run WACZs yet # await self.background_job_ops.create_delete_replica_jobs( diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index ab72422472..9b8ae8da8f 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -426,7 +426,7 @@ async def delete_profile( # Delete file from storage if profile.resource: - await self.storage_ops.delete_crawl_file_object(org, profile.resource) + await self.storage_ops.delete_file_object(org, profile.resource) await self.orgs.inc_org_bytes_stored( org.id, -profile.resource.size, "profile" ) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 50b9557a92..43b6fabcd5 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -476,9 +476,7 @@ async def get_presigned_url( return presigned_url - async def delete_crawl_file_object( - self, org: Organization, crawlfile: BaseFile - ) -> bool: + async def delete_file_object(self, org: Organization, crawlfile: BaseFile) -> bool: """delete crawl file from storage.""" return await self._delete_file(org, crawlfile.filename, crawlfile.storage) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index ff38c46770..6705c3c192 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -996,6 +996,29 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["description"] +def test_delete_thumbnail(crawler_auth_headers, default_org_id): + r = requests.delete( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["deleted"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json().get("thumbnail") is None + + r = requests.delete( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}/thumbnail", + headers=crawler_auth_headers, + ) + assert r.status_code == 404 + assert r.json()["detail"] == "thumbnail_not_found" + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( From 5e880d90ad508c1296a06b2e351eeb8b075ddcf5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 11:03:13 -0500 Subject: [PATCH 12/57] Add caption field to collections --- backend/btrixcloud/colls.py | 1 + backend/btrixcloud/models.py | 6 +++++- 
backend/test/test_collections.py | 30 +++++++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 6704081c55..b250e68669 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -110,6 +110,7 @@ async def add_collection(self, oid: UUID, coll_in: CollIn): oid=oid, name=coll_in.name, description=coll_in.description, + caption=coll_in.caption, modified=modified, access=coll_in.access, ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index ec5fed6f8c..5e0bebe4f8 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1238,6 +1238,7 @@ class Collection(BaseMongoModel): name: str = Field(..., min_length=1) oid: UUID description: Optional[str] = None + caption: Optional[str] = None modified: Optional[datetime] = None crawlCount: Optional[int] = 0 @@ -1262,6 +1263,7 @@ class CollIn(BaseModel): name: str = Field(..., min_length=1) description: Optional[str] = None + caption: Optional[str] = None crawlIds: Optional[List[str]] = [] access: CollAccessType = CollAccessType.PRIVATE @@ -1274,6 +1276,7 @@ class CollOut(BaseMongoModel): name: str oid: UUID description: Optional[str] = None + caption: Optional[str] = None modified: Optional[datetime] = None crawlCount: Optional[int] = 0 @@ -1299,7 +1302,7 @@ class PublicCollOut(BaseMongoModel): name: str description: Optional[str] = None - # caption: Optional[str] = None + caption: Optional[str] = None # earliestDate: Optional[datetime] = None # latestDate: Optional[datetime] = None @@ -1317,6 +1320,7 @@ class UpdateColl(BaseModel): name: Optional[str] = None description: Optional[str] = None + caption: Optional[str] = None access: Optional[CollAccessType] = None diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 6705c3c192..a1202d72ba 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -12,6 +12,8 @@ UPDATED_NAME = "Updated tést cöllection" SECOND_COLLECTION_NAME = "second-collection" DESCRIPTION = "Test description" +CAPTION = "Short caption" +UPDATED_CAPTION = "Updated caption" _coll_id = None _second_coll_id = None @@ -33,6 +35,7 @@ def test_create_collection( json={ "crawlIds": [crawler_crawl_id], "name": COLLECTION_NAME, + "caption": CAPTION, }, ) assert r.status_code == 200 @@ -51,6 +54,23 @@ def test_create_collection( assert _coll_id in r.json()["collectionIds"] assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == _coll_id + assert data["name"] == COLLECTION_NAME + assert data["caption"] == CAPTION + assert data["crawlCount"] == 1 + assert data["pageCount"] > 0 + assert data["totalSize"] > 0 + modified = data["modified"] + assert modified + assert modified.endswith("Z") + def test_create_public_collection( crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id @@ -61,6 +81,7 @@ def test_create_public_collection( json={ "crawlIds": [crawler_crawl_id], "name": PUBLIC_COLLECTION_NAME, + "caption": CAPTION, "access": "public", }, ) @@ -117,6 +138,7 @@ def test_update_collection( headers=crawler_auth_headers, json={ "description": DESCRIPTION, + "caption": UPDATED_CAPTION, }, ) assert r.status_code == 200 @@ -132,6 +154,7 @@ def test_update_collection( assert data["id"] == _coll_id 
assert data["name"] == COLLECTION_NAME assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 1 assert data["pageCount"] > 0 assert data["totalSize"] > 0 @@ -276,6 +299,7 @@ def test_get_collection(crawler_auth_headers, default_org_id): assert data["name"] == UPDATED_NAME assert data["oid"] == default_org_id assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 2 assert data["pageCount"] > 0 assert data["totalSize"] > 0 @@ -294,6 +318,7 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id): assert data["name"] == UPDATED_NAME assert data["oid"] == default_org_id assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 2 assert data["pageCount"] > 0 assert data["totalSize"] > 0 @@ -461,10 +486,11 @@ def test_list_collections( assert len(items) == 3 first_coll = [coll for coll in items if coll["name"] == UPDATED_NAME][0] - assert first_coll["id"] + assert first_coll["id"] == _coll_id assert first_coll["name"] == UPDATED_NAME assert first_coll["oid"] == default_org_id assert first_coll["description"] == DESCRIPTION + assert first_coll["caption"] == UPDATED_CAPTION assert first_coll["crawlCount"] == 3 assert first_coll["pageCount"] > 0 assert first_coll["totalSize"] > 0 @@ -977,6 +1003,8 @@ def test_list_public_colls_home_url_thumbnail(): assert field not in coll if coll["id"] == _public_coll_id: + assert coll["caption"] == CAPTION + assert coll["homeUrl"] assert coll["homeUrlTs"] From 355c7f44953bc49c2f3c1a968a78d5aa01bb90e7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 11:35:43 -0500 Subject: [PATCH 13/57] Calculate and store earliest and latest dates in collection - Based on page timestamps - Not all archived items have pages written to the db yet, so this won't count pages in older crawls and in uploads yet - Might want to move to asyncio task or background job --- backend/btrixcloud/colls.py | 43 ++++++++++++++++++++++++++++++++ backend/btrixcloud/models.py | 10 ++++++-- backend/test/test_collections.py | 25 +++++++++++++++++++ 3 files changed, 76 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index b250e68669..0485a1218c 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -120,6 +120,7 @@ async def add_collection(self, oid: UUID, coll_in: CollIn): if crawl_ids: await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org) await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_added_to_collection_notification( crawl_ids, coll_id, org @@ -173,6 +174,7 @@ async def add_crawls_to_collection( raise HTTPException(status_code=404, detail="collection_not_found") await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_added_to_collection_notification( @@ -197,6 +199,7 @@ async def remove_crawls_from_collection( raise HTTPException(status_code=404, detail="collection_not_found") await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_removed_from_collection_notification( @@ -448,6 +451,46 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): }, ) + async def update_collection_dates(self, coll_id: UUID): 
+ """Update collection earliest and latest dates from page timestamps""" + coll = await self.get_collection(coll_id) + crawl_ids = await self.get_collection_crawl_ids(coll_id) + + earliest_ts = None + latest_ts = None + + match_query = { + "oid": coll.oid, + "crawl_id": {"$in": crawl_ids}, + "ts": {"$ne": None}, + } + + cursor = self.pages.find(match_query).sort("ts", 1).limit(1) + pages = await cursor.to_list(length=1) + try: + earliest_page = pages[0] + earliest_ts = earliest_page.get("ts") + except IndexError: + pass + + cursor = self.pages.find(match_query).sort("ts", -1).limit(1) + pages = await cursor.to_list(length=1) + try: + latest_page = pages[0] + latest_ts = latest_page.get("ts") + except IndexError: + pass + + await self.collections.find_one_and_update( + {"_id": coll_id}, + { + "$set": { + "dateEarliest": earliest_ts, + "dateLatest": latest_ts, + } + }, + ) + async def update_crawl_collections(self, crawl_id: str): """Update counts and tags for all collections in crawl""" crawl = await self.crawls.find_one({"_id": crawl_id}) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 5e0bebe4f8..307c40c803 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1245,6 +1245,9 @@ class Collection(BaseMongoModel): pageCount: Optional[int] = 0 totalSize: Optional[int] = 0 + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None + # Sorted by count, descending tags: Optional[List[str]] = [] @@ -1283,6 +1286,9 @@ class CollOut(BaseMongoModel): pageCount: Optional[int] = 0 totalSize: Optional[int] = 0 + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None + # Sorted by count, descending tags: Optional[List[str]] = [] @@ -1304,8 +1310,8 @@ class PublicCollOut(BaseMongoModel): description: Optional[str] = None caption: Optional[str] = None - # earliestDate: Optional[datetime] = None - # latestDate: Optional[datetime] = None + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None homeUrl: Optional[AnyHttpUrl] = None homeUrlTs: Optional[datetime] = None diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index a1202d72ba..388d4e39cd 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -71,6 +71,9 @@ def test_create_collection( assert modified assert modified.endswith("Z") + assert data["dateEarliest"] + assert data["dateLatest"] + def test_create_public_collection( crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id @@ -162,6 +165,8 @@ def test_update_collection( modified = data["modified"] assert modified assert modified.endswith("Z") + assert data["dateEarliest"] + assert data["dateLatest"] def test_rename_collection( @@ -236,6 +241,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] # Verify it was added r = requests.get( @@ -258,6 +265,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] == 0 assert data["modified"] >= modified assert data.get("tags", []) == [] + assert data["dateEarliest"] + assert data["dateLatest"] # Verify they were removed r = requests.get( @@ -286,6 +295,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] 
def test_get_collection(crawler_auth_headers, default_org_id): @@ -305,6 +316,8 @@ def test_get_collection(crawler_auth_headers, default_org_id): assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] def test_get_collection_replay(crawler_auth_headers, default_org_id): @@ -324,6 +337,8 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id): assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] resources = data["resources"] assert resources @@ -440,6 +455,8 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id): assert data["totalSize"] > 0 assert data["modified"] assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] # Verify it was added r = requests.get( @@ -497,6 +514,8 @@ def test_list_collections( assert first_coll["modified"] assert first_coll["tags"] == ["wr-test-2", "wr-test-1"] assert first_coll["access"] == "private" + assert first_coll["dateEarliest"] + assert first_coll["dateLatest"] second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0] assert second_coll["id"] @@ -509,6 +528,8 @@ def test_list_collections( assert second_coll["modified"] assert second_coll["tags"] == ["wr-test-2"] assert second_coll["access"] == "private" + assert second_coll["dateEarliest"] + assert second_coll["dateLatest"] def test_remove_upload_from_collection(crawler_auth_headers, default_org_id): @@ -843,6 +864,8 @@ def test_list_public_collections( for collection in collections: assert collection["id"] in (_public_coll_id, _second_public_coll_id) assert collection["name"] + assert collection["dateEarliest"] + assert collection["dateLatest"] # Test non-existing slug - it should return a 404 but not reveal # whether or not an org exists with that slug @@ -998,6 +1021,8 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["id"] in (_public_coll_id, _second_public_coll_id) assert coll["name"] assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] for field in non_public_fields: assert field not in coll From 571c3f75842ce2849e3088341bd107c4524deeb0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 13:19:38 -0500 Subject: [PATCH 14/57] Add comment to get CI to run --- backend/btrixcloud/colls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 0485a1218c..f5fc671c8f 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -465,6 +465,7 @@ async def update_collection_dates(self, coll_id: UUID): "ts": {"$ne": None}, } + # Note: Pages for uploads are not currently in the db cursor = self.pages.find(match_query).sort("ts", 1).limit(1) pages = await cursor.to_list(length=1) try: From d904dc06d8c4a10f73f3f0dbb409c146e134186e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 13:44:09 -0500 Subject: [PATCH 15/57] Fix test --- backend/test/test_collections.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 388d4e39cd..a5f0c6610a 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -265,8 +265,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] == 0 assert 
data["modified"] >= modified assert data.get("tags", []) == [] - assert data["dateEarliest"] - assert data["dateLatest"] + assert data.get("dateEarliest") is None + assert data.get("dateLatest") is None # Verify they were removed r = requests.get( From 99229e061af7021191e7dec332a50c8c49de2c13 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 11:55:10 -0500 Subject: [PATCH 16/57] Add pages to database for uploads Note: - Page endpoints have crawls in path, not uploads or all-crawls, but will now work with uploads - To add pages from older uploads, the pages endpoints to re-add pages for a particular crawl or all crawls will now work for uploads as well --- backend/btrixcloud/main.py | 5 ++-- backend/btrixcloud/main_bg.py | 2 +- backend/btrixcloud/main_op.py | 1 + backend/btrixcloud/models.py | 6 ++--- backend/btrixcloud/ops.py | 23 +++++++--------- backend/btrixcloud/pages.py | 11 +++----- backend/btrixcloud/uploads.py | 4 +++ backend/test/test_uploads.py | 51 +++++++++++++++++++++++++++++++++++ 8 files changed, 76 insertions(+), 27 deletions(-) diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 7119214fc5..a9fe790730 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -245,14 +245,15 @@ def main() -> None: crawls = init_crawls_api(crawl_manager, *base_crawl_init) + upload_ops = init_uploads_api(*base_crawl_init) + page_ops = init_pages_api( app, mdb, crawls, org_ops, storage_ops, current_active_user ) base_crawl_ops.set_page_ops(page_ops) crawls.set_page_ops(page_ops) - - init_uploads_api(*base_crawl_init) + upload_ops.set_page_ops(page_ops) org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 2fba05e53f..6ce4264126 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -27,7 +27,7 @@ async def main(): ) return 1 - (org_ops, _, _, _, _, _, _, _, _, _, user_manager) = init_ops() + (org_ops, _, _, _, _, _, _, _, _, _, _, user_manager) = init_ops() if not oid: print("Org id missing, quitting") diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index a6f6654be3..af7a2d0956 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -31,6 +31,7 @@ def main(): crawl_config_ops, _, crawl_ops, + _, page_ops, coll_ops, _, diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 307c40c803..660b107ee5 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -797,6 +797,9 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel): reviewStatus: ReviewStatus = None + filePageCount: Optional[int] = 0 + errorPageCount: Optional[int] = 0 + # ============================================================================ class CollIdName(BaseModel): @@ -1013,9 +1016,6 @@ class Crawl(BaseCrawl, CrawlConfigCore): qa: Optional[QARun] = None qaFinished: Optional[Dict[str, QARun]] = {} - filePageCount: Optional[int] = 0 - errorPageCount: Optional[int] = 0 - # ============================================================================ class CrawlCompleteIn(BaseModel): diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index 32e5e5fee1..2a282b8e09 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -16,6 +16,7 @@ from .pages import PageOps from .profiles import ProfileOps from .storages import StorageOps +from .uploads import UploadOps from .users import UserManager from .webhooks import EventWebhookOps @@ 
-26,6 +27,7 @@ def init_ops() -> Tuple[ CrawlConfigOps, BaseCrawlOps, CrawlOps, + UploadOps, PageOps, CollectionOps, ProfileOps, @@ -70,7 +72,7 @@ def init_ops() -> Tuple[ coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops) - base_crawl_ops = BaseCrawlOps( + base_crawl_init = ( mdb, user_manager, org_ops, @@ -81,23 +83,17 @@ def init_ops() -> Tuple[ background_job_ops, ) - crawl_ops = CrawlOps( - crawl_manager, - mdb, - user_manager, - org_ops, - crawl_config_ops, - coll_ops, - storage_ops, - event_webhook_ops, - background_job_ops, - ) + base_crawl_ops = BaseCrawlOps(*base_crawl_init) + + crawl_ops = CrawlOps(crawl_manager, *base_crawl_init) + + upload_ops = UploadOps(*base_crawl_init) page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) base_crawl_ops.set_page_ops(page_ops) - crawl_ops.set_page_ops(page_ops) + upload_ops.set_page_ops(page_ops) background_job_ops.set_ops(crawl_ops, profile_ops) @@ -116,6 +112,7 @@ def init_ops() -> Tuple[ crawl_config_ops, base_crawl_ops, crawl_ops, + upload_ops, page_ops, coll_ops, profile_ops, diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index a980567c49..6f83409c8f 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -199,10 +199,7 @@ async def update_crawl_file_and_error_counts( inc_query["errorPageCount"] = error_count await self.crawls.find_one_and_update( - { - "_id": crawl_id, - "type": "crawl", - }, + {"_id": crawl_id}, {"$inc": inc_query}, ) @@ -555,10 +552,8 @@ async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): await self.add_crawl_pages_to_db_from_wacz(crawl_id) async def re_add_all_crawl_pages(self, oid: UUID): - """Re-add pages for all crawls in org""" - crawl_ids = await self.crawls.distinct( - "_id", {"type": "crawl", "finished": {"$ne": None}} - ) + """Re-add pages for all crawls and uploads in org""" + crawl_ids = await self.crawls.distinct("_id", {"finished": {"$ne": None}}) for crawl_id in crawl_ids: await self.re_add_crawl_pages(crawl_id, oid) diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 7257c875e6..0473e1cf58 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -189,6 +189,8 @@ async def _create_upload( self.event_webhook_ops.create_upload_finished_notification(crawl_id, org.id) ) + asyncio.create_task(self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)) + await self.orgs.inc_org_bytes_stored(org.id, file_size, "upload") quota_reached = self.orgs.storage_quota_reached(org) @@ -407,3 +409,5 @@ async def delete_uploads( org: Organization = Depends(org_crawl_dep), ): return await ops.delete_uploads(delete_list, org, user) + + return ops diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index fb7543d0a2..0049e53269 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -132,6 +132,57 @@ def test_get_stream_upload( assert r.status_code == 200 +def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): + time.sleep(10) + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{upload_id}/pages", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] > 0 + + pages = data["items"] + for page in pages: + assert page["id"] + assert page["oid"] + assert page["crawl_id"] == upload_id + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page["loadState"] + assert page["status"] + assert page["mime"] + assert 
page["isError"] in (True, False) + assert page["isFile"] in (True, False) + + page_id = pages[0]["id"] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{upload_id}/pages/{page_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + page = r.json() + + assert page["id"] == page_id + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page["loadState"] + assert page["mime"] + assert page["isError"] in (True, False) + assert page["isFile"] in (True, False) + + assert page["notes"] == [] + assert page.get("userid") is None + assert page.get("modified") is None + assert page.get("approved") is None + + def test_list_uploads( admin_auth_headers, default_org_id, uploads_collection_id, upload_id_2 ): From af9265a9ab4ee5b394462570c82515493b8ec7ce Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 13:09:14 -0500 Subject: [PATCH 17/57] WIP: Add all-crawls/ and uploads/ versions of GET pages endpoints - QA-related endpoints are still restricted to crawls only for now - Do we want to enforce the collection types or are these multiple paths per endpoint method sufficient? tbd --- backend/btrixcloud/pages.py | 48 +++++++++++++++++++++++++++++++++--- backend/test/test_uploads.py | 4 +-- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 6f83409c8f..74383d9d71 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -638,7 +638,17 @@ def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep): @app.post( "/orgs/{oid}/crawls/all/pages/reAdd", - tags=["pages"], + tags=["pages", "crawls"], + response_model=StartedResponseBool, + ) + @app.post( + "/orgs/{oid}/uploads/all/pages/reAdd", + tags=["pages", "uploads"], + response_model=StartedResponseBool, + ) + @app.post( + "/orgs/{oid}/all-crawls/all/pages/reAdd", + tags=["pages", "all-crawls"], response_model=StartedResponseBool, ) async def re_add_all_crawl_pages( @@ -653,7 +663,17 @@ async def re_add_all_crawl_pages( @app.post( "/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", - tags=["pages"], + tags=["pages", "crawls"], + response_model=StartedResponseBool, + ) + @app.post( + "/orgs/{oid}/uploads/{crawl_id}/pages/reAdd", + tags=["pages", "uploads"], + response_model=StartedResponseBool, + ) + @app.post( + "/orgs/{oid}/all-crawls/{crawl_id}/pages/reAdd", + tags=["pages", "all-crawls"], response_model=StartedResponseBool, ) async def re_add_crawl_pages( @@ -665,7 +685,17 @@ async def re_add_crawl_pages( @app.get( "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}", - tags=["pages"], + tags=["pages", "crawls"], + response_model=PageOut, + ) + @app.get( + "/orgs/{oid}/uploads/{crawl_id}/pages/{page_id}", + tags=["pages", "uploads"], + response_model=PageOut, + ) + @app.get( + "/orgs/{oid}/all-crawls/{crawl_id}/pages/{page_id}", + tags=["pages", "all-crawls"], response_model=PageOut, ) async def get_page( @@ -753,7 +783,17 @@ async def delete_page_notes( @app.get( "/orgs/{oid}/crawls/{crawl_id}/pages", - tags=["pages"], + tags=["pages", "crawls"], + response_model=PaginatedPageOutResponse, + ) + @app.get( + "/orgs/{oid}/uploads/{crawl_id}/pages", + tags=["pages", "uploads"], + response_model=PaginatedPageOutResponse, + ) + @app.get( + "/orgs/{oid}/all-crawls/{crawl_id}/pages", + tags=["pages", "all-crawls"], response_model=PaginatedPageOutResponse, ) async def get_pages_list( diff --git a/backend/test/test_uploads.py 
b/backend/test/test_uploads.py index 0049e53269..86d1abb854 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -136,7 +136,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): time.sleep(10) r = requests.get( - f"{API_PREFIX}/orgs/{default_org_id}/crawls/{upload_id}/pages", + f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages", headers=admin_auth_headers, ) assert r.status_code == 200 @@ -160,7 +160,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): page_id = pages[0]["id"] r = requests.get( - f"{API_PREFIX}/orgs/{default_org_id}/crawls/{upload_id}/pages/{page_id}", + f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages/{page_id}", headers=admin_auth_headers, ) assert r.status_code == 200 From 1ad1b271c98ef8d6d084e1e9a42fc4443e1f5055 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 16:56:20 -0500 Subject: [PATCH 18/57] Add fallbacks if pages have no id or a non-UUID id --- backend/btrixcloud/pages.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 74383d9d71..37982b8d7c 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -94,9 +94,19 @@ def _get_page_from_dict( self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID ) -> Page: """Return Page object from dict""" - page_id = page_dict.get("id") + page_id = page_dict.get("id", "") if not page_id: print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True) + page_id = uuid4() + + try: + UUID(page_id) + except ValueError: + print( + f'Page {page_dict.get("url")} is not a valid UUID - assigning UUID', + flush=True, + ) + page_id = uuid4() status = page_dict.get("status") if not status and page_dict.get("loadState"): From 34ebaed6509e580963c0f4725683c2a944615596 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 17:09:37 -0500 Subject: [PATCH 19/57] Move pages test later to give it more time --- backend/test/test_uploads.py | 102 +++++++++++++++++------------------ 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 86d1abb854..73c57cf8c2 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -132,57 +132,6 @@ def test_get_stream_upload( assert r.status_code == 200 -def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): - time.sleep(10) - - r = requests.get( - f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages", - headers=admin_auth_headers, - ) - assert r.status_code == 200 - data = r.json() - - assert data["total"] > 0 - - pages = data["items"] - for page in pages: - assert page["id"] - assert page["oid"] - assert page["crawl_id"] == upload_id - assert page["url"] - assert page["ts"] - assert page.get("title") or page.get("title") is None - assert page["loadState"] - assert page["status"] - assert page["mime"] - assert page["isError"] in (True, False) - assert page["isFile"] in (True, False) - - page_id = pages[0]["id"] - r = requests.get( - f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages/{page_id}", - headers=admin_auth_headers, - ) - assert r.status_code == 200 - page = r.json() - - assert page["id"] == page_id - assert page["oid"] - assert page["crawl_id"] - assert page["url"] - assert page["ts"] - assert page.get("title") or page.get("title") is None - assert page["loadState"] - assert page["mime"] - assert page["isError"] in 
(True, False) - assert page["isFile"] in (True, False) - - assert page["notes"] == [] - assert page.get("userid") is None - assert page.get("modified") is None - assert page.get("approved") is None - - def test_list_uploads( admin_auth_headers, default_org_id, uploads_collection_id, upload_id_2 ): @@ -283,6 +232,57 @@ def test_get_upload_replay_json_admin( assert "files" not in data +def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): + time.sleep(10) + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] > 0 + + pages = data["items"] + for page in pages: + assert page["id"] + assert page["oid"] + assert page["crawl_id"] == upload_id + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page["loadState"] + assert page["status"] + assert page["mime"] + assert page["isError"] in (True, False) + assert page["isFile"] in (True, False) + + page_id = pages[0]["id"] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages/{page_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + page = r.json() + + assert page["id"] == page_id + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page["loadState"] + assert page["mime"] + assert page["isError"] in (True, False) + assert page["isFile"] in (True, False) + + assert page["notes"] == [] + assert page.get("userid") is None + assert page.get("modified") is None + assert page.get("approved") is None + + def test_replace_upload( admin_auth_headers, default_org_id, uploads_collection_id, upload_id ): From e14c807c7fc44c93026bd08302593f5de6bf5d46 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 17:11:05 -0500 Subject: [PATCH 20/57] Delete upload pages when deleted or replaced --- backend/btrixcloud/basecrawls.py | 3 ++- backend/btrixcloud/uploads.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 063ecb9dec..01e700f8b5 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -323,8 +323,9 @@ async def delete_crawls( status_code=400, detail=f"Error Stopping Crawl: {exc}" ) + await self.page_ops.delete_crawl_pages(crawl_id, org.id) + if type_ == "crawl": - await self.page_ops.delete_crawl_pages(crawl_id, org.id) await self.delete_all_crawl_qa_files(crawl_id, org) crawl_size = await self._delete_crawl_files(crawl, org) diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 0473e1cf58..e95b30427f 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -99,9 +99,10 @@ async def stream_iter(): if prev_upload: try: await self._delete_crawl_files(prev_upload, org) + await self.page_ops.delete_crawl_pages(prev_upload.id, org.id) # pylint: disable=broad-exception-caught except Exception as exc: - print("replace file deletion failed", exc) + print(f"Error handling previous upload: {exc}", flush=True) return await self._create_upload( files, name, description, collections, tags, id_, org, user From 594deddbc515ea0add54be8d30d21a0c98b1bf63 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 17:26:32 -0500 Subject: [PATCH 21/57] Remove asserts for upload pages for optional fields --- backend/test/test_uploads.py 
| 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 73c57cf8c2..3fb1c1c44b 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -233,6 +233,7 @@ def test_get_upload_replay_json_admin( def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): + # Give time for pages to finish being uploaded time.sleep(10) r = requests.get( @@ -252,11 +253,6 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): assert page["url"] assert page["ts"] assert page.get("title") or page.get("title") is None - assert page["loadState"] - assert page["status"] - assert page["mime"] - assert page["isError"] in (True, False) - assert page["isFile"] in (True, False) page_id = pages[0]["id"] r = requests.get( @@ -272,10 +268,6 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): assert page["url"] assert page["ts"] assert page.get("title") or page.get("title") is None - assert page["loadState"] - assert page["mime"] - assert page["isError"] in (True, False) - assert page["isFile"] in (True, False) assert page["notes"] == [] assert page.get("userid") is None From f00cc9fc99a744ec0b66a7fd3e5c413bf9c534ed Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 2 Dec 2024 17:27:29 -0500 Subject: [PATCH 22/57] Remove outdated comment --- backend/btrixcloud/colls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index f5fc671c8f..0485a1218c 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -465,7 +465,6 @@ async def update_collection_dates(self, coll_id: UUID): "ts": {"$ne": None}, } - # Note: Pages for uploads are not currently in the db cursor = self.pages.find(match_query).sort("ts", 1).limit(1) pages = await cursor.to_list(length=1) try: From 19cfca6ad13e673e82576d12e501b8509eb3aca0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 11:25:28 -0500 Subject: [PATCH 23/57] Filter (re-)add all pages endpoint by crawl type in path If a user POSTS to /crawls/all/ or /uploads/all/, filter the crawl objects in the database by that type before adding pages. This enables us to easily add pages to the database for all uploads in an organization, for example. GET endpoints and endpoints that specify just a single crawl have been kept more permissive as long as the crawl_id is found. 
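For illustration, a minimal sketch of how the type-filtered endpoint might be exercised, in the style of the existing backend tests (API_PREFIX, default_org_id, and admin_auth_headers refer to fixtures already used in backend/test; this snippet is an example only and is not part of this patch):

    import requests

    # Queue a re-add of pages for uploads only (superuser required).
    # POSTing to /crawls/all/pages/reAdd instead limits the re-add to crawls,
    # and /all-crawls/all/pages/reAdd covers both crawls and uploads.
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/all/pages/reAdd",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
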
--- backend/btrixcloud/pages.py | 38 +++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 37982b8d7c..567ff4f7e5 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union from uuid import UUID, uuid4 -from fastapi import Depends, HTTPException +from fastapi import Depends, HTTPException, Request import pymongo from .models import ( @@ -561,9 +561,15 @@ async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): print(f"Deleted pages for crawl {crawl_id}", flush=True) await self.add_crawl_pages_to_db_from_wacz(crawl_id) - async def re_add_all_crawl_pages(self, oid: UUID): + async def re_add_all_crawl_pages( + self, oid: UUID, type_filter: Optional[str] = None + ): """Re-add pages for all crawls and uploads in org""" - crawl_ids = await self.crawls.distinct("_id", {"finished": {"$ne": None}}) + match_query: Dict[str, object] = {"finished": {"$ne": None}} + if type_filter: + match_query["type"] = type_filter + + crawl_ids = await self.crawls.distinct("_id", match_query) for crawl_id in crawl_ids: await self.re_add_crawl_pages(crawl_id, oid) @@ -662,13 +668,28 @@ def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep): response_model=StartedResponseBool, ) async def re_add_all_crawl_pages( - org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep) + request: Request, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), ): """Re-add pages for all crawls in org (superuser only)""" if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - asyncio.create_task(ops.re_add_all_crawl_pages(org.id)) + type_filter = None + + try: + route_path = request.scope["route"].path + type_path = route_path.split("/")[4] + + if type_path == "uploads": + type_filter = "upload" + if type_path == "crawls": + type_filter = "crawl" + except (IndexError, AttributeError): + pass + + asyncio.create_task(ops.re_add_all_crawl_pages(org.id, type_filter=type_filter)) return {"started": True} @app.post( @@ -687,7 +708,8 @@ async def re_add_all_crawl_pages( response_model=StartedResponseBool, ) async def re_add_crawl_pages( - crawl_id: str, org: Organization = Depends(org_crawl_dep) + crawl_id: str, + org: Organization = Depends(org_crawl_dep), ): """Re-add pages for crawl""" asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id)) @@ -727,7 +749,7 @@ async def get_page_with_qa( page_id: UUID, org: Organization = Depends(org_crawl_dep), ): - """GET single page""" + """GET single page with QA details""" return await ops.get_page_out(page_id, org.id, crawl_id, qa_run_id=qa_run_id) @app.patch( @@ -788,7 +810,7 @@ async def delete_page_notes( delete: PageNoteDelete, org: Organization = Depends(org_crawl_dep), ): - """Edit page note""" + """Delete page note""" return await ops.delete_page_notes(page_id, org.id, delete, crawl_id) @app.get( From 4fd6e489d1e48e44fd7d298383eea72d809b5528 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 12:26:02 -0500 Subject: [PATCH 24/57] Move re-adding all pages in org to background job --- backend/btrixcloud/background_jobs.py | 68 ++++++++++++++++++++++--- backend/btrixcloud/crawlmanager.py | 38 +++++++++----- backend/btrixcloud/main.py | 2 +- backend/btrixcloud/main_bg.py | 12 ++++- backend/btrixcloud/models.py | 10 ++++ backend/btrixcloud/ops.py | 2 +- 
backend/btrixcloud/pages.py | 40 ++++++++++----- chart/app-templates/background_job.yaml | 5 +- 8 files changed, 139 insertions(+), 38 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index b9667078e0..6ebb43ad01 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -1,7 +1,6 @@ """k8s background jobs""" import asyncio -import os from datetime import datetime from typing import Optional, Tuple, Union, List, Dict, TYPE_CHECKING, cast from uuid import UUID @@ -22,6 +21,7 @@ DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob, + ReAddOrgPagesJob, PaginatedBackgroundJobResponse, AnyJob, StorageRef, @@ -286,8 +286,6 @@ async def create_delete_org_job( try: job_id = await self.crawl_manager.run_delete_org_job( oid=str(org.id), - backend_image=os.environ.get("BACKEND_IMAGE", ""), - pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), existing_job_id=existing_job_id, ) if existing_job_id: @@ -331,8 +329,6 @@ async def create_recalculate_org_stats_job( try: job_id = await self.crawl_manager.run_recalculate_org_stats_job( oid=str(org.id), - backend_image=os.environ.get("BACKEND_IMAGE", ""), - pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), existing_job_id=existing_job_id, ) if existing_job_id: @@ -366,6 +362,52 @@ async def create_recalculate_org_stats_job( print(f"warning: recalculate org stats job could not be started: {exc}") return None + async def create_re_add_org_pages_job( + self, + oid: UUID, + type_filter: Optional[str] = None, + existing_job_id: Optional[str] = None, + ): + """Create job to (re)add all pages in an org, optionally filtered by crawl type""" + + try: + job_id = await self.crawl_manager.run_re_add_org_pages_job( + oid=str(oid), + type_filter=type_filter, + existing_job_id=existing_job_id, + ) + if existing_job_id: + readd_pages_job = await self.get_background_job(existing_job_id, oid) + previous_attempt = { + "started": readd_pages_job.started, + "finished": readd_pages_job.finished, + } + if readd_pages_job.previousAttempts: + readd_pages_job.previousAttempts.append(previous_attempt) + else: + readd_pages_job.previousAttempts = [previous_attempt] + readd_pages_job.started = dt_now() + readd_pages_job.finished = None + readd_pages_job.success = None + else: + readd_pages_job = ReAddOrgPagesJob( + id=job_id, + oid=oid, + type_filter=type_filter, + started=dt_now(), + ) + + await self.jobs.find_one_and_update( + {"_id": job_id}, {"$set": readd_pages_job.to_dict()}, upsert=True + ) + + return job_id + # pylint: disable=broad-exception-caught + except Exception as exc: + # pylint: disable=raise-missing-from + print(f"warning: re-add org pages job could not be started: {exc}") + return None + async def job_finished( self, job_id: str, @@ -411,7 +453,11 @@ async def job_finished( async def get_background_job( self, job_id: str, oid: Optional[UUID] = None ) -> Union[ - CreateReplicaJob, DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob + CreateReplicaJob, + DeleteReplicaJob, + DeleteOrgJob, + RecalculateOrgStatsJob, + ReAddOrgPagesJob, ]: """Get background job""" query: dict[str, object] = {"_id": job_id} @@ -435,6 +481,9 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): if data["type"] == BgJobType.RECALCULATE_ORG_STATS: return RecalculateOrgStatsJob.from_dict(data) + if data["type"] == BgJobType.READD_ORG_PAGES: + return ReAddOrgPagesJob.from_dict(data) + return DeleteOrgJob.from_dict(data) async def list_background_jobs( @@ -575,6 +624,13 
@@ async def retry_background_job( existing_job_id=job_id, ) + if job.type == BgJobType.READD_ORG_PAGES: + await self.create_re_add_org_pages_job( + org.id, + job.type_filter, + existing_job_id=job_id, + ) + return {"success": True} async def retry_failed_background_jobs(
diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 7921ca4856..55ed6c3072 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py
@@ -115,8 +115,6 @@ async def run_replica_job( async def run_delete_org_job( self, oid: str, - backend_image: str, - pull_policy: str, existing_job_id: Optional[str] = None, ) -> str: """run job to delete org and all of its data"""
@@ -127,14 +125,12 @@ async def run_delete_org_job( job_id = f"delete-org-{oid}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - oid, backend_image, pull_policy, job_id, job_type=BgJobType.DELETE_ORG.value + oid, job_id, job_type=BgJobType.DELETE_ORG.value ) async def run_recalculate_org_stats_job( self, oid: str, - backend_image: str, - pull_policy: str, existing_job_id: Optional[str] = None, ) -> str: """run job to recalculate storage stats for the org"""
@@ -146,19 +142,32 @@ async def run_recalculate_org_stats_job( return await self._run_bg_job_with_ops_classes( oid, - backend_image, - pull_policy, job_id, job_type=BgJobType.RECALCULATE_ORG_STATS.value, ) - async def _run_bg_job_with_ops_classes( + async def run_re_add_org_pages_job( self, oid: str, - backend_image: str, - pull_policy: str, - job_id: str, - job_type: str, + type_filter: Optional[str] = None, + existing_job_id: Optional[str] = None, + ) -> str: + """run job to (re)add all pages in the org, optionally filtered by crawl type""" + + if existing_job_id: + job_id = existing_job_id + else: + job_id = f"org-pages-{oid}-{secrets.token_hex(5)}" + + return await self._run_bg_job_with_ops_classes( + oid, + job_id, + job_type=BgJobType.READD_ORG_PAGES.value, + type_filter=type_filter, + ) + + async def _run_bg_job_with_ops_classes( + self, oid: str, job_id: str, job_type: str, **kwargs ) -> str: """run background job with access to ops classes"""
@@ -166,8 +175,9 @@ async def _run_bg_job_with_ops_classes( "id": job_id, "oid": oid, "job_type": job_type, - "backend_image": backend_image, - "pull_policy": pull_policy, + "backend_image": os.environ.get("BACKEND_IMAGE", ""), + "pull_policy": os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), **kwargs, } data = self.templates.env.get_template("background_job.yaml").render(params)
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index a9fe790730..75c7a32cdd 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -248,7 +248,7 @@ def main() -> None: upload_ops = init_uploads_api(*base_crawl_init) page_ops = init_pages_api( - app, mdb, crawls, org_ops, storage_ops, current_active_user + app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user ) base_crawl_ops.set_page_ops(page_ops)
diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 6ce4264126..7c77dcba78 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -12,6 +12,7 @@ job_type = os.environ.get("BG_JOB_TYPE") oid = os.environ.get("OID") +type_filter = os.environ.get("TYPE_FILTER") # ============================================================================ @@ -27,7 +28,7 @@ async def main(): ) return 1 - (org_ops, _, _, _, _, _, _, _, _, _, _, user_manager) = init_ops() + (org_ops, _, _, _, _, page_ops, _, _, _, _, _, 
user_manager) = init_ops() if not oid: print("Org id missing, quitting") @@ -57,6 +58,15 @@ async def main(): traceback.print_exc() return 1 + if job_type == BgJobType.READD_ORG_PAGES: + try: + await page_ops.re_add_all_crawl_pages(org, type_filter=type_filter) + return 0 + # pylint: disable=broad-exception-caught + except Exception: + traceback.print_exc() + return 1 + print(f"Provided job type {job_type} not currently supported") return 1 diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 660b107ee5..8ed1b52d74 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2294,6 +2294,7 @@ class BgJobType(str, Enum): DELETE_REPLICA = "delete-replica" DELETE_ORG = "delete-org" RECALCULATE_ORG_STATS = "recalculate-org-stats" + READD_ORG_PAGES = "readd-org-pages" # ============================================================================ @@ -2346,6 +2347,14 @@ class RecalculateOrgStatsJob(BackgroundJob): type: Literal[BgJobType.RECALCULATE_ORG_STATS] = BgJobType.RECALCULATE_ORG_STATS +# ============================================================================ +class ReAddOrgPagesJob(BackgroundJob): + """Model for tracking jobs to readd an org's pages""" + + type: Literal[BgJobType.READD_ORG_PAGES] = BgJobType.READD_ORG_PAGES + type_filter: Optional[str] = None + + # ============================================================================ # Union of all job types, for response model @@ -2356,6 +2365,7 @@ class RecalculateOrgStatsJob(BackgroundJob): BackgroundJob, DeleteOrgJob, RecalculateOrgStatsJob, + ReAddOrgPagesJob, ] ] diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index 2a282b8e09..bee24d00c5 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -89,7 +89,7 @@ def init_ops() -> Tuple[ upload_ops = UploadOps(*base_crawl_init) - page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops) base_crawl_ops.set_page_ops(page_ops) crawl_ops.set_page_ops(page_ops) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 567ff4f7e5..afc16ab7d0 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -24,6 +24,7 @@ PageNoteEdit, PageNoteDelete, QARunBucketStats, + StartedResponse, StartedResponseBool, UpdatedResponse, DeletedResponse, @@ -34,11 +35,12 @@ from .utils import str_to_date, str_list_to_bools, dt_now if TYPE_CHECKING: + from .background_jobs import BackgroundJobOps from .crawls import CrawlOps from .orgs import OrgOps from .storages import StorageOps else: - CrawlOps = StorageOps = OrgOps = object + CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object # ============================================================================ @@ -49,18 +51,24 @@ class PageOps: crawl_ops: CrawlOps org_ops: OrgOps storage_ops: StorageOps + background_job_ops: BackgroundJobOps - def __init__(self, mdb, crawl_ops, org_ops, storage_ops): + def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops): self.pages = mdb["pages"] self.crawls = mdb["crawls"] self.crawl_ops = crawl_ops self.org_ops = org_ops self.storage_ops = storage_ops + self.background_job_ops = background_job_ops async def init_index(self): """init index for pages db collection""" await self.pages.create_index([("crawl_id", pymongo.HASHED)]) + async def set_ops(self, background_job_ops: BackgroundJobOps): + """Set ops classes as needed""" + self.background_job_ops = background_job_ops + async def 
add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): """Add pages to database from WACZ files""" pages_buffer: List[Page] = [] @@ -562,16 +570,16 @@ async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): await self.add_crawl_pages_to_db_from_wacz(crawl_id) async def re_add_all_crawl_pages( - self, oid: UUID, type_filter: Optional[str] = None + self, org: Organization, type_filter: Optional[str] = None ): """Re-add pages for all crawls and uploads in org""" match_query: Dict[str, object] = {"finished": {"$ne": None}} - if type_filter: + if type_filter in ("crawl", "upload"): match_query["type"] = type_filter crawl_ids = await self.crawls.distinct("_id", match_query) for crawl_id in crawl_ids: - await self.re_add_crawl_pages(crawl_id, oid) + await self.re_add_crawl_pages(crawl_id, org.id) async def get_qa_run_aggregate_counts( self, @@ -644,35 +652,37 @@ async def get_qa_run_aggregate_counts( # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme -def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep): +def init_pages_api( + app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep +): """init pages API""" # pylint: disable=invalid-name - ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops) org_crawl_dep = org_ops.org_crawl_dep @app.post( "/orgs/{oid}/crawls/all/pages/reAdd", tags=["pages", "crawls"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) @app.post( "/orgs/{oid}/uploads/all/pages/reAdd", tags=["pages", "uploads"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) @app.post( "/orgs/{oid}/all-crawls/all/pages/reAdd", tags=["pages", "all-crawls"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) async def re_add_all_crawl_pages( request: Request, org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep), ): - """Re-add pages for all crawls in org (superuser only)""" + """Re-add pages for all crawls in org (superuser only, may delete page QA data!)""" if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") @@ -689,8 +699,10 @@ async def re_add_all_crawl_pages( except (IndexError, AttributeError): pass - asyncio.create_task(ops.re_add_all_crawl_pages(org.id, type_filter=type_filter)) - return {"started": True} + job_id = await ops.background_job_ops.create_re_add_org_pages_job( + org.id, type_filter=type_filter + ) + return {"started": job_id or ""} @app.post( "/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", @@ -711,7 +723,7 @@ async def re_add_crawl_pages( crawl_id: str, org: Organization = Depends(org_crawl_dep), ): - """Re-add pages for crawl""" + """Re-add pages for crawl (may delete page QA data!)""" asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id)) return {"started": True} diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml index 132d3bf8fe..b6301457df 100644 --- a/chart/app-templates/background_job.yaml +++ b/chart/app-templates/background_job.yaml @@ -8,7 +8,7 @@ metadata: btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 0 + ttlSecondsAfterFinished: 90 backoffLimit: 3 template: spec: @@ -38,6 +38,9 @@ spec: - name: OID value: {{ oid }} + - name: TYPE_FILTER + value: {{ type_filter }} + envFrom: - configMapRef: name: backend-env-config From 7271dbf798de0aa8b8b1d1c70496ea8775874efc 
Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 12:40:34 -0500 Subject: [PATCH 25/57] Add migration to add upload files for all orgs in bg jobs --- backend/btrixcloud/db.py | 11 ++++-- backend/btrixcloud/main.py | 1 + .../migrations/migration_0037_upload_pages.py | 37 +++++++++++++++++++ backend/btrixcloud/pages.py | 2 +- 4 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 backend/btrixcloud/migrations/migration_0037_upload_pages.py diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index f453442191..460f1a1082 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -17,7 +17,7 @@ from .migrations import BaseMigration -CURR_DB_VERSION = "0036" +CURR_DB_VERSION = "0037" # ============================================================================ @@ -82,6 +82,7 @@ async def update_and_prepare_db( invite_ops, storage_ops, page_ops, + background_job_ops, db_inited, ): """Prepare database for application. @@ -94,7 +95,7 @@ async def update_and_prepare_db( """ await ping_db(mdb) print("Database setup started", flush=True) - if await run_db_migrations(mdb, user_manager, page_ops): + if await run_db_migrations(mdb, user_manager, background_job_ops): await drop_indexes(mdb) await create_indexes( org_ops, @@ -113,7 +114,7 @@ async def update_and_prepare_db( # ============================================================================ -async def run_db_migrations(mdb, user_manager, page_ops): +async def run_db_migrations(mdb, user_manager, background_job_ops): """Run database migrations.""" # if first run, just set version and exit @@ -145,7 +146,9 @@ async def run_db_migrations(mdb, user_manager, page_ops): assert spec.loader migration_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(migration_module) - migration = migration_module.Migration(mdb, page_ops=page_ops) + migration = migration_module.Migration( + mdb, background_job_ops=background_job_ops + ) if await migration.run(): migrations_run = True except ImportError as err: diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 75c7a32cdd..927a03dcb8 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -278,6 +278,7 @@ def main() -> None: invites, storage_ops, page_ops, + background_job_ops, db_inited, ) ) diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py new file mode 100644 index 0000000000..e228782556 --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -0,0 +1,37 @@ +""" +Migration 0037 -- upload pages +""" + +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0037" + + +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + self.background_job_ops = kwargs.get("background_job_ops") + + async def migrate_up(self): + """Perform migration up. 
+ + Start background jobs to parse uploads and add their pages to db + """ + mdb_orgs = self.mdb["organizations"] + async for org in mdb_orgs.find(): + oid = org["_id"] + try: + await self.background_job_ops.create_re_add_org_pages_job( + oid, type_filter="upload" + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error starting background job to add upload pages to org {oid}: {err}", + flush=True, + )
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index afc16ab7d0..1e62a9130f 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -111,7 +111,7 @@ def _get_page_from_dict( UUID(page_id) except ValueError: print( - f'Page {page_dict.get("url")} is not a valid UUID - assigning UUID', + f'Page {page_dict.get("url")} id "{page_id}" is not a valid UUID - assigning UUID', flush=True, ) page_id = uuid4()
From 0ddbcd019d5778f7da866051cc14d97769edd260 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 14:29:47 -0500 Subject: [PATCH 26/57] Add API endpoint to GET single public collection --- backend/btrixcloud/colls.py | 58 ++++++++++++-- .../migrations/migration_0037_upload_pages.py | 6 ++ backend/test/test_collections.py | 75 ++++++++++++++++++- 3 files changed, 130 insertions(+), 9 deletions(-)
diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 0485a1218c..a61857e416 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -2,6 +2,8 @@ Collections API """ +# pylint: disable=too-many-lines + from collections import Counter from uuid import UUID, uuid4 from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union
@@ -210,11 +212,11 @@ async def remove_crawls_from_collection( return await self.get_collection_out(coll_id, org) async def get_collection_raw( - self, coll_id: UUID, public_only: bool = False + self, coll_id: UUID, public_or_unlisted_only: bool = False ) -> Dict[str, Any]: """Get collection by id as dict from database""" query: dict[str, object] = {"_id": coll_id} - if public_only: + if public_or_unlisted_only: query["access"] = {"$in": ["public", "unlisted"]} result = await self.collections.find_one(query)
@@ -224,17 +226,21 @@ async def get_collection_raw( return result async def get_collection( - self, coll_id: UUID, public_only: bool = False + self, coll_id: UUID, public_or_unlisted_only: bool = False ) -> Collection: """Get collection by id""" - result = await self.get_collection_raw(coll_id, public_only) + result = await self.get_collection_raw(coll_id, public_or_unlisted_only) return Collection.from_dict(result) async def get_collection_out( - self, coll_id: UUID, org: Organization, resources=False, public_only=False + self, + coll_id: UUID, + org: Organization, + resources=False, + public_or_unlisted_only=False, ) -> CollOut: """Get CollOut by id""" - result = await self.get_collection_raw(coll_id, public_only) + result = await self.get_collection_raw(coll_id, public_or_unlisted_only) if resources: result["resources"] = await self.get_collection_crawl_resources(coll_id)
@@ -248,6 +254,26 @@ async def get_collection_out( return CollOut.from_dict(result) + async def get_public_collection_out( + self, coll_id: UUID, org: Organization + ) -> PublicCollOut: + """Get PublicCollOut by id""" + result = await self.get_collection_raw(coll_id) + + if result.get("access") != "public": + raise HTTPException(status_code=404, detail="collection_not_found") + + result["resources"] = await self.get_collection_crawl_resources(coll_id) + + 
thumbnail = result.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + result["thumbnail"] = await image_file.get_public_image_file_out( + org, self.storage_ops + ) + + return PublicCollOut.from_dict(result) + async def list_collections( self, org: Organization, @@ -825,7 +851,7 @@ async def get_collection_public_replay( org: Organization = Depends(org_public), ): coll = await colls.get_collection_out( - coll_id, org, resources=True, public_only=True + coll_id, org, resources=True, public_or_unlisted_only=True ) response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Headers"] = "*" @@ -920,6 +946,24 @@ async def get_org_public_collections( sort_direction=sortDirection, ) + @app.get( + "/public-collections/{org_slug}/collections/{coll_id}", + tags=["collections"], + response_model=PublicCollOut, + ) + async def get_public_collection( + org_slug: str, + coll_id: UUID, + ): + try: + org = await colls.orgs.get_org_by_slug(org_slug) + # pylint: disable=broad-exception-caught + except Exception: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=404, detail="collection_not_found") + + return await colls.get_public_collection_out(coll_id, org) + @app.get( "/orgs/{oid}/collections/{coll_id}/urls", tags=["collections"], diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py index e228782556..5ae79c51aa 100644 --- a/backend/btrixcloud/migrations/migration_0037_upload_pages.py +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -22,6 +22,12 @@ async def migrate_up(self): Start background jobs to parse uploads and add their pages to db """ + if self.background_job_ops is None: + print( + "Unable to start background job, missing background_job_ops", flush=True + ) + return + mdb_orgs = self.mdb["organizations"] async for org in mdb_orgs.find(): oid = org["_id"] diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index a5f0c6610a..f48955642b 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -1,5 +1,6 @@ import requests import os +from uuid import uuid4 from zipfile import ZipFile, ZIP_STORED from tempfile import TemporaryFile @@ -15,6 +16,19 @@ CAPTION = "Short caption" UPDATED_CAPTION = "Updated caption" +NON_PUBLIC_COLL_FIELDS = ( + "oid", + "modified", + "crawlCount", + "pageCount", + "totalSize", + "tags", + "access", + "homeUrlPageId", +) +NON_PUBLIC_IMAGE_FIELDS = ("originalFilename", "userid", "userName", "created") + + _coll_id = None _second_coll_id = None _public_coll_id = None @@ -1024,7 +1038,7 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["dateEarliest"] assert coll["dateLatest"] - for field in non_public_fields: + for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll if coll["id"] == _public_coll_id: @@ -1042,13 +1056,70 @@ def test_list_public_colls_home_url_thumbnail(): assert thumbnail["size"] assert thumbnail["mime"] - for field in non_public_image_fields: + for field in NON_PUBLIC_IMAGE_FIELDS: assert field not in thumbnail if coll["id"] == _second_public_coll_id: assert coll["description"] +def test_get_public_collection(): + r = requests.get( + f"{API_PREFIX}/public-collections/{default_org_slug}/collections/{_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _public_coll_id + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert 
coll["dateLatest"] + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + assert coll["caption"] == CAPTION + + assert coll["homeUrl"] + assert coll["homeUrlTs"] + + thumbnail = coll["thumbnail"] + assert thumbnail + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] + assert thumbnail["mime"] + + for field in NON_PUBLIC_IMAGE_FIELDS: + assert field not in thumbnail + + # Invalid org slug - don't reveal whether org exists or not, use + # same exception as if collection doesn't exist + r = requests.get( + f"{API_PREFIX}/public-collections/doesntexist/collections/{_public_coll_id}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + # Invalid collection id + random_uuid = uuid4() + r = requests.get( + f"{API_PREFIX}/public-collections/{default_org_slug}/collections/{random_uuid}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + # Collection isn't public + r = requests.get( + f"{API_PREFIX}/public-collections/{default_org_slug}/collections/{ _coll_id}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + def test_delete_thumbnail(crawler_auth_headers, default_org_id): r = requests.delete( f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail", From 34ea14395e4b7d912e7f52f89e28b200418568da Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 14:45:21 -0500 Subject: [PATCH 27/57] Recalculate collection dates after adding upload pages --- backend/btrixcloud/colls.py | 9 +++++++++ backend/btrixcloud/main_bg.py | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index a61857e416..4199b0e1f2 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -477,6 +477,15 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): }, ) + async def recalculate_org_collection_dates(self, org: Organization): + """Recalculate earliest and latest dates for collections in org""" + collections, _ = await self.list_collections( + org, + page_size=100_000, + ) + for coll in collections: + await self.update_collection_dates(coll.id) + async def update_collection_dates(self, coll_id: UUID): """Update collection earliest and latest dates from page timestamps""" coll = await self.get_collection(coll_id) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 7c77dcba78..a742e4c408 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -28,7 +28,7 @@ async def main(): ) return 1 - (org_ops, _, _, _, _, page_ops, _, _, _, _, _, user_manager) = init_ops() + (org_ops, _, _, _, _, page_ops, coll_ops, _, _, _, _, user_manager) = init_ops() if not oid: print("Org id missing, quitting") @@ -61,6 +61,7 @@ async def main(): if job_type == BgJobType.READD_ORG_PAGES: try: await page_ops.re_add_all_crawl_pages(org, type_filter=type_filter) + await coll_ops.recalculate_org_collection_dates(org) return 0 # pylint: disable=broad-exception-caught except Exception: From 60522c15b1641724537d7423bb2a6cc7231ac7e4 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 14:56:21 -0500 Subject: [PATCH 28/57] Rename type_filter to crawl_type --- backend/btrixcloud/background_jobs.py | 8 ++-- backend/btrixcloud/crawlmanager.py | 4 +- backend/btrixcloud/main_bg.py | 4 +- .../migrations/migration_0037_upload_pages.py | 2 +- backend/btrixcloud/models.py 
| 2 +- backend/btrixcloud/pages.py | 39 +++++++++++-------- chart/app-templates/background_job.yaml | 4 +- 7 files changed, 34 insertions(+), 29 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 6ebb43ad01..35a2e75d59 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -365,7 +365,7 @@ async def create_recalculate_org_stats_job( async def create_re_add_org_pages_job( self, oid: UUID, - type_filter: Optional[str] = None, + crawl_type: Optional[str] = None, existing_job_id: Optional[str] = None, ): """Create job to (re)add all pages in an org, optionally filtered by crawl type""" @@ -373,7 +373,7 @@ async def create_re_add_org_pages_job( try: job_id = await self.crawl_manager.run_re_add_org_pages_job( oid=str(oid), - type_filter=type_filter, + crawl_type=crawl_type, existing_job_id=existing_job_id, ) if existing_job_id: @@ -393,7 +393,7 @@ async def create_re_add_org_pages_job( readd_pages_job = ReAddOrgPagesJob( id=job_id, oid=oid, - type_filter=type_filter, + crawl_type=crawl_type, started=dt_now(), ) @@ -627,7 +627,7 @@ async def retry_background_job( if job.type == BgJobType.READD_ORG_PAGES: await self.create_re_add_org_pages_job( org.id, - job.type_filter, + job.crawl_type, existing_job_id=job_id, ) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 55ed6c3072..6810929f51 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -149,7 +149,7 @@ async def run_recalculate_org_stats_job( async def run_re_add_org_pages_job( self, oid: str, - type_filter: Optional[str] = None, + crawl_type: Optional[str] = None, existing_job_id: Optional[str] = None, ) -> str: """run job to recalculate storage stats for the org""" @@ -163,7 +163,7 @@ async def run_re_add_org_pages_job( oid, job_id, job_type=BgJobType.READD_ORG_PAGES.value, - type_filter=type_filter, + crawl_type=crawl_type, ) async def _run_bg_job_with_ops_classes( diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index a742e4c408..e798f78ec3 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -12,7 +12,7 @@ job_type = os.environ.get("BG_JOB_TYPE") oid = os.environ.get("OID") -type_filter = os.environ.get("TYPE_FILTER") +crawl_type = os.environ.get("CRAWL_TYPE") # ============================================================================ @@ -60,7 +60,7 @@ async def main(): if job_type == BgJobType.READD_ORG_PAGES: try: - await page_ops.re_add_all_crawl_pages(org, type_filter=type_filter) + await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) await coll_ops.recalculate_org_collection_dates(org) return 0 # pylint: disable=broad-exception-caught diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py index 5ae79c51aa..9bb4408a0d 100644 --- a/backend/btrixcloud/migrations/migration_0037_upload_pages.py +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -33,7 +33,7 @@ async def migrate_up(self): oid = org["_id"] try: await self.background_job_ops.create_re_add_org_pages_job( - oid, type_filter="upload" + oid, crawl_type="upload" ) # pylint: disable=broad-exception-caught except Exception as err: diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 8ed1b52d74..e799aaa72d 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2352,7 +2352,7 @@ class 
ReAddOrgPagesJob(BackgroundJob): """Model for tracking jobs to readd an org's pages""" type: Literal[BgJobType.READD_ORG_PAGES] = BgJobType.READD_ORG_PAGES - type_filter: Optional[str] = None + crawl_type: Optional[str] = None # ============================================================================ diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 1e62a9130f..d3f4c9dc05 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -570,12 +570,12 @@ async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): await self.add_crawl_pages_to_db_from_wacz(crawl_id) async def re_add_all_crawl_pages( - self, org: Organization, type_filter: Optional[str] = None + self, org: Organization, crawl_type: Optional[str] = None ): """Re-add pages for all crawls and uploads in org""" match_query: Dict[str, object] = {"finished": {"$ne": None}} - if type_filter in ("crawl", "upload"): - match_query["type"] = type_filter + if crawl_type in ("crawl", "upload"): + match_query["type"] = crawl_type crawl_ids = await self.crawls.distinct("_id", match_query) for crawl_id in crawl_ids: @@ -649,6 +649,23 @@ async def get_qa_run_aggregate_counts( return sorted(return_data, key=lambda bucket: bucket.lowerBoundary) + def get_crawl_type_from_pages_route(self, request: Request): + """Get crawl type to filter on from request route""" + crawl_type = None + + try: + route_path = request.scope["route"].path + type_path = route_path.split("/")[4] + + if type_path == "uploads": + crawl_type = "upload" + if type_path == "crawls": + crawl_type = "crawl" + except (IndexError, AttributeError): + pass + + return crawl_type + # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme @@ -686,21 +703,9 @@ async def re_add_all_crawl_pages( if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - type_filter = None - - try: - route_path = request.scope["route"].path - type_path = route_path.split("/")[4] - - if type_path == "uploads": - type_filter = "upload" - if type_path == "crawls": - type_filter = "crawl" - except (IndexError, AttributeError): - pass - + crawl_type = ops.get_crawl_type_from_pages_route(request) job_id = await ops.background_job_ops.create_re_add_org_pages_job( - org.id, type_filter=type_filter + org.id, crawl_type=crawl_type ) return {"started": job_id or ""} diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml index b6301457df..f47dd2acfd 100644 --- a/chart/app-templates/background_job.yaml +++ b/chart/app-templates/background_job.yaml @@ -38,8 +38,8 @@ spec: - name: OID value: {{ oid }} - - name: TYPE_FILTER - value: {{ type_filter }} + - name: CRAWL_TYPE + value: {{ crawl_type }} envFrom: - configMapRef: From 67b52af8005679eb92671abf88f3b97cd1dc1560 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 15:00:39 -0500 Subject: [PATCH 29/57] Recalculate collection stats after adding upload pages --- backend/btrixcloud/colls.py | 9 +++++++++ backend/btrixcloud/main_bg.py | 1 + 2 files changed, 10 insertions(+) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 4199b0e1f2..787c4ac783 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -443,6 +443,15 @@ async def download_collection(self, coll_id: UUID, org: Organization): resp, headers=headers, media_type="application/wacz+zip" ) + async def recalculate_org_collection_counts_tags(self, 
org: Organization): + """Recalculate counts and tags for collections in org""" + collections, _ = await self.list_collections( + org, + page_size=100_000, + ) + for coll in collections: + await self.update_collection_counts_and_tags(coll.id) + async def update_collection_counts_and_tags(self, collection_id: UUID): """Set current crawl info in config when crawl begins""" crawl_count = 0 diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index e798f78ec3..709139d8d2 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -62,6 +62,7 @@ async def main(): try: await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) await coll_ops.recalculate_org_collection_dates(org) + await coll_ops.recalculate_org_collection_counts_tags(org) return 0 # pylint: disable=broad-exception-caught except Exception: From 2b5ba8ecaefaee8ba95a13daa544b603caef8f59 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 15:20:50 -0500 Subject: [PATCH 30/57] Reduce per-page print logging --- backend/btrixcloud/pages.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index d3f4c9dc05..251d959be1 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -104,16 +104,11 @@ def _get_page_from_dict( """Return Page object from dict""" page_id = page_dict.get("id", "") if not page_id: - print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True) page_id = uuid4() try: UUID(page_id) except ValueError: - print( - f'Page {page_dict.get("url")} id "{page_id}" is not a valid UUID - assigning UUID', - flush=True, - ) page_id = uuid4() status = page_dict.get("status") From 2e72abe58e5136e74aedd4cf15bf9b83d764faa3 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 16:06:17 -0500 Subject: [PATCH 31/57] Include upload pages in collection pageCount --- backend/btrixcloud/colls.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 787c4ac783..c46a8a511e 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -467,8 +467,17 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): files = crawl.files or [] for file in files: total_size += file.size - if crawl.stats: - page_count += crawl.stats.done + + try: + org = await self.orgs.get_org_by_id(crawl.oid) + _, crawl_pages = await self.page_ops.list_pages( + crawl.id, org, page_size=1_000_000 + ) + page_count += crawl_pages + # pylint: disable=broad-exception-caught + except Exception: + pass + if crawl.tags: tags.extend(crawl.tags) From b96346b7501a0689217b05bd9775eed02f2ef24d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 16:12:42 -0500 Subject: [PATCH 32/57] Return 404 for GET public collection if org isn't public --- backend/btrixcloud/colls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index c46a8a511e..ffcf397767 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -989,6 +989,9 @@ async def get_public_collection( # pylint: disable=raise-missing-from raise HTTPException(status_code=404, detail="collection_not_found") + if not org.enablePublicProfile: + raise HTTPException(status_code=404, detail="collection_not_found") + return await colls.get_public_collection_out(coll_id, org) @app.get( From b260386e30c522852f25c9713cb4495438dfe829 Mon Sep 17 00:00:00 2001 From: 
Tessa Walsh Date: Tue, 3 Dec 2024 16:59:26 -0500 Subject: [PATCH 33/57] Add oid to PublicCollOut model --- backend/btrixcloud/models.py | 1 + backend/test/test_collections.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index e799aaa72d..36ae904877 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1307,6 +1307,7 @@ class PublicCollOut(BaseMongoModel): """Collection output model with annotations.""" name: str + oid: UUID description: Optional[str] = None caption: Optional[str] = None diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index f48955642b..2d1ea98614 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -17,7 +17,6 @@ UPDATED_CAPTION = "Updated caption" NON_PUBLIC_COLL_FIELDS = ( - "oid", "modified", "crawlCount", "pageCount", @@ -877,6 +876,7 @@ def test_list_public_collections( assert len(collections) == 2 for collection in collections: assert collection["id"] in (_public_coll_id, _second_public_coll_id) + assert collection["oid"] assert collection["name"] assert collection["dateEarliest"] assert collection["dateLatest"] @@ -1033,6 +1033,7 @@ def test_list_public_colls_home_url_thumbnail(): for coll in collections: assert coll["id"] in (_public_coll_id, _second_public_coll_id) + assert coll["oid"] assert coll["name"] assert coll["resources"] assert coll["dateEarliest"] @@ -1063,7 +1064,7 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["description"] -def test_get_public_collection(): +def test_get_public_collection(default_org_id): r = requests.get( f"{API_PREFIX}/public-collections/{default_org_slug}/collections/{_public_coll_id}" ) @@ -1071,6 +1072,7 @@ def test_get_public_collection(): coll = r.json() assert coll["id"] == _public_coll_id + assert coll["oid"] == default_org_id assert coll["name"] assert coll["resources"] assert coll["dateEarliest"] From de9907106e99ca76fe89e10b891dd04159982d7c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 17:08:09 -0500 Subject: [PATCH 34/57] Modify public collections endpoint paths Use public namespace, with form: - /public/orgs/slug/collections - /public/orgs/slug/collections/collection_id --- backend/btrixcloud/colls.py | 8 ++++---- backend/test/test_collections.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index ffcf397767..a60f0f074b 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -954,8 +954,8 @@ async def download_collection( return await colls.download_collection(coll_id, org) @app.get( - "/public-collections/{org_slug}", - tags=["collections"], + "/public/orgs/{org_slug}/collections", + tags=["collections", "public"], response_model=OrgPublicCollections, ) async def get_org_public_collections( @@ -974,8 +974,8 @@ async def get_org_public_collections( ) @app.get( - "/public-collections/{org_slug}/collections/{coll_id}", - tags=["collections"], + "/public/orgs/{org_slug}/collections/{coll_id}", + tags=["collections", "public"], response_model=PublicCollOut, ) async def get_public_collection( diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 2d1ea98614..a150a40cf0 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -832,7 +832,7 @@ def test_list_public_collections( assert data["publicUrl"] == "" # Try listing public 
collections without org public profile enabled - r = requests.get(f"{API_PREFIX}/public-collections/{default_org_slug}") + r = requests.get(f"{API_PREFIX}/public/org/{default_org_slug}/collections") assert r.status_code == 404 assert r.json()["detail"] == "public_profile_not_found" @@ -863,7 +863,7 @@ def test_list_public_collections( assert data["publicUrl"] == public_url # List public collections with no auth (no public profile) - r = requests.get(f"{API_PREFIX}/public-collections/{default_org_slug}") + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") assert r.status_code == 200 data = r.json() @@ -883,7 +883,7 @@ def test_list_public_collections( # Test non-existing slug - it should return a 404 but not reveal # whether or not an org exists with that slug - r = requests.get(f"{API_PREFIX}/public-collections/nonexistentslug") + r = requests.get(f"{API_PREFIX}/public/orgs/nonexistentslug/collections") assert r.status_code == 404 assert r.json()["detail"] == "public_profile_not_found" @@ -891,7 +891,7 @@ def test_list_public_collections( def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers): # Test existing org that's not public - should return same 404 as # if org doesn't exist - r = requests.get(f"{API_PREFIX}/public-collections/{NON_DEFAULT_ORG_SLUG}") + r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections") assert r.status_code == 404 assert r.json()["detail"] == "public_profile_not_found" @@ -908,7 +908,7 @@ def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers # List public collections with no auth - should still get profile even # with no public collections - r = requests.get(f"{API_PREFIX}/public-collections/{NON_DEFAULT_ORG_SLUG}") + r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections") assert r.status_code == 200 data = r.json() assert data["org"]["name"] == NON_DEFAULT_ORG_NAME @@ -1026,7 +1026,7 @@ def test_list_public_colls_home_url_thumbnail(): ) non_public_image_fields = ("originalFilename", "userid", "userName", "created") - r = requests.get(f"{API_PREFIX}/public-collections/{default_org_slug}") + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") assert r.status_code == 200 collections = r.json()["collections"] assert len(collections) == 2 @@ -1066,7 +1066,7 @@ def test_list_public_colls_home_url_thumbnail(): def test_get_public_collection(default_org_id): r = requests.get( - f"{API_PREFIX}/public-collections/{default_org_slug}/collections/{_public_coll_id}" + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}" ) assert r.status_code == 200 coll = r.json() @@ -1101,7 +1101,7 @@ def test_get_public_collection(default_org_id): # Invalid org slug - don't reveal whether org exists or not, use # same exception as if collection doesn't exist r = requests.get( - f"{API_PREFIX}/public-collections/doesntexist/collections/{_public_coll_id}" + f"{API_PREFIX}/public/orgs/doesntexist/collections/{_public_coll_id}" ) assert r.status_code == 404 assert r.json()["detail"] == "collection_not_found" @@ -1109,14 +1109,14 @@ def test_get_public_collection(default_org_id): # Invalid collection id random_uuid = uuid4() r = requests.get( - f"{API_PREFIX}/public-collections/{default_org_slug}/collections/{random_uuid}" + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{random_uuid}" ) assert r.status_code == 404 assert r.json()["detail"] == "collection_not_found" # Collection isn't public r = 
requests.get( - f"{API_PREFIX}/public-collections/{default_org_slug}/collections/{ _coll_id}" + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{ _coll_id}" ) assert r.status_code == 404 assert r.json()["detail"] == "collection_not_found" From cabed6d5d8353b14a56e510e9a263bdc4ef09940 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 17:35:24 -0500 Subject: [PATCH 35/57] Fix endpoint path in test --- backend/test/test_collections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index a150a40cf0..7a0b723903 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -832,7 +832,7 @@ def test_list_public_collections( assert data["publicUrl"] == "" # Try listing public collections without org public profile enabled - r = requests.get(f"{API_PREFIX}/public/org/{default_org_slug}/collections") + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") assert r.status_code == 404 assert r.json()["detail"] == "public_profile_not_found" From 5fbf3f37d7a309195876ecdc8063824ae605baee Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 4 Dec 2024 09:59:49 -0500 Subject: [PATCH 36/57] Only fetch org once, not per-crawl --- backend/btrixcloud/colls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index a60f0f074b..b9b533ecef 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -459,6 +459,9 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size = 0 tags = [] + coll = await self.get_collection(collection_id) + org = await self.orgs.get_org_by_id(coll.oid) + async for crawl_raw in self.crawls.find({"collectionIds": collection_id}): crawl = BaseCrawl.from_dict(crawl_raw) if crawl.state not in SUCCESSFUL_STATES: @@ -469,7 +472,6 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size += file.size try: - org = await self.orgs.get_org_by_id(crawl.oid) _, crawl_pages = await self.page_ops.list_pages( crawl.id, org, page_size=1_000_000 ) From ea86db4e2c76414936f00f74bb2f775c3cc161b6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 4 Dec 2024 10:03:08 -0500 Subject: [PATCH 37/57] Add counts and size to PublicCollOut --- backend/btrixcloud/models.py | 4 ++++ backend/test/test_collections.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 36ae904877..430ca14a22 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1311,6 +1311,10 @@ class PublicCollOut(BaseMongoModel): description: Optional[str] = None caption: Optional[str] = None + crawlCount: Optional[int] = 0 + pageCount: Optional[int] = 0 + totalSize: Optional[int] = 0 + dateEarliest: Optional[datetime] = None dateLatest: Optional[datetime] = None diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 7a0b723903..854c2c430b 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -18,9 +18,6 @@ NON_PUBLIC_COLL_FIELDS = ( "modified", - "crawlCount", - "pageCount", - "totalSize", "tags", "access", "homeUrlPageId", @@ -880,6 +877,9 @@ def test_list_public_collections( assert collection["name"] assert collection["dateEarliest"] assert collection["dateLatest"] + assert collection["crawlCount"] > 0 + assert collection["pageCount"] > 0 + assert 
collection["totalSize"] > 0 # Test non-existing slug - it should return a 404 but not reveal # whether or not an org exists with that slug @@ -1038,6 +1038,9 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["resources"] assert coll["dateEarliest"] assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll @@ -1077,6 +1080,9 @@ def test_get_public_collection(default_org_id): assert coll["resources"] assert coll["dateEarliest"] assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll From 90052bc7f5ecbc00d1c2e82b13d80a0db827cb32 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 4 Dec 2024 11:22:50 -0500 Subject: [PATCH 38/57] Enforce max thumbnail file size of 2MB --- backend/btrixcloud/colls.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index b9b533ecef..7e60544309 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -60,6 +60,9 @@ OrgOps = StorageOps = EventWebhookOps = CrawlOps = object +THUMBNAIL_MAX_SIZE = 2_000_000 + + # ============================================================================ class CollectionOps: """ops for working with named collections of crawls""" @@ -717,7 +720,7 @@ async def stream_iter(): file_prep.add_chunk(chunk) yield chunk - print("Collection Thumbnail Stream Upload Start", flush=True) + print("Collection thumbnail stream upload starting", flush=True) if not await self.storage_ops.do_upload_multipart( org, @@ -725,13 +728,21 @@ async def stream_iter(): stream_iter(), MIN_UPLOAD_PART_SIZE, ): - print("Collection Thumbnail Stream Upload Failed", flush=True) + print("Collection thumbnail stream upload failed", flush=True) raise HTTPException(status_code=400, detail="upload_failed") - print("Collection Thumbnail Stream Upload Complete", flush=True) + print("Collection thumbnail stream upload complete", flush=True) thumbnail_file = file_prep.get_image_file(org.storage) + if thumbnail_file.size > THUMBNAIL_MAX_SIZE: + print( + "Collection thumbnail stream upload failed: max size (2 MB) exceeded", + flush=True, + ) + await self.storage_ops.delete_file_object(org, thumbnail_file) + raise HTTPException(status_code=400, detail="upload_failed") + if coll.thumbnail: if not await self.storage_ops.delete_file_object(org, coll.thumbnail): print( @@ -740,8 +751,7 @@ async def stream_iter(): coll.thumbnail = thumbnail_file - # Update entire document to avoid bson.errors.InvalidDocument error - # with thumbnail + # Update entire document to avoid bson.errors.InvalidDocument exception await self.collections.find_one_and_update( {"_id": coll_id, "oid": org.id}, {"$set": coll.to_dict()}, From 921c6bb5ffb6900c906df82ea1f69f3f0f56ed8b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 4 Dec 2024 11:38:09 -0500 Subject: [PATCH 39/57] Use proper thumbnail as test data --- backend/test/data/thumbnail.jpg | Bin 27462 -> 27636 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/backend/test/data/thumbnail.jpg b/backend/test/data/thumbnail.jpg index e746e341f2c778c4ac69de34063000d624169e89..133cfae2b048546ea31f8eb79ff6dd4517cbee50 100644 GIT binary patch literal 27636 zcmb5Vby%BC(>NN6gyNLoL5h1R8XBY&r^Vf&KyeLP+}%QfLLoSTP@uRL*HYZAxE6P5 
z;qW}~`+etJzw7*Q&i--V$?WdT?99r@-Vd`63jiqq76v93CI%K3CKeEgg^fdmgY)z$ z4v2sdkBA&ZK|u~8Bcq~WrlX=}1e1}`bI~)hu(EToQ_^wsaSG-*@gp}+o}gi% zJ;A`l{FgDbC&cK_o@0=_lvHCl=5q<^zyz69jxi?XuAaPqm{|s zKnQU;1)fO^SV{MG_V%_@^acXZO-Y-qmoRwj>QnB|6Bs>7M{GHVbZYsA-amQ!wT$^_ z>&J(`NbhTQ`>vr6n-zji>LwgnhFl-LfJ+5*HEFa5oUa;x`J*|UZ^k3~Bp2J?C5xV@ zK<`CNE)&w=NTu1`ubM0r0S?ds-wVR)f`F*gQv-Do*v4lw&xIG+mmgG`N=@Iw>V(eq zzby5B7@}#NrTQJZF2cAJEG{b-;Jl}4!qXcnr65CX%i|7}1W4#KcjTH>>MuA1ZRx$c zNN$eLa}`edWejk`MP^p$8nh87+0=JA6xb7O_A!qnEi}o|I(nw-4RuhUTLN4=Q)#M% zcAD1XTVxQV!Xm@`V}_32(5{`bSix2x$GV)%IEUPsRt=#Eej`e9yQfW$UFxwnP)@jbE%0`6@ISCpp{w~%{ z0m|H@cc!YX5QMzSzEE4_?^)KWa723bfgb?h2|6r%puR>IYWw>%Mi&|jkKhNuWbu!I zOOL-uKTRK~sL_RrV})1W@Ez<;6*T<1c`vKWdvop)n6*ylCOUP$n*WGZd9{`27Fn<2 z$_2YQ?{quWxl4U)ncqNho%K)4wc!oN{4Et!+hVM*av#c*o37;uh4>}cNahlPu$1+c z*SSl&%({>wI`NzmYw}4-KxBT}&Sv&$bc$&$SrqPB*1S9w>#q?x3iM5MrQS3qdIV{S zbv^Q};)vMLMZ@}&GE)VaH@pE)$6KTf?&0fT<{jaqMOAsmnTN#2+g!zqV9{-*C1`l` z4;x;uZZ+{YoJc+Z&|EIE^7PmpwLp9_#+Jc(ik@V(1up#Q+k+1Pt#>r}v&c_9VE2Qh zOqMr0?e=CpIgmC8eHvP&M0D%$+rzc};~bin89KRn%TKQ=;muHUfMCLNjGNN#RrFQV3q z9ssY7a{r{f6FBlRXJ+nm77f6-tP)%F2jM6C8-W^}TVhY=MDWwZY`*8y2&-dhOsSx- z{1v9;Wh7snK3zkibL37QepZf*glB7s@#dltOTIo;)hGh_PD)$KdbDp8suLO;fuzA_ zot+s;L$HFvPaV&%B71e$x0a76IJq)~{o=J)&2}3H^jg&`t!@7n37=FqX9$1Z`~d4* zjW?zb#umpknY|bmqK4_;Nrkq~=#JgJO7~jdO~neY`E#sNMCLuhbu=+#Mcr8EA3>58 z|0h*Y%g&|Sd5E4NXm<+4X~ebDEu(r(+j4z=5LL z{Dd`{jhl_}FWg%ld$;dW+@DdH`G%eliH!pJM+2DiCUv45eugjEo7*~(q7%zqiduLk z8XI23F1<}nt(Cd;LZw?(@~U_jQ~PCjxcjsaZ28vD za_SrR(WH`#9SM?Nu*Rbb4lO(bA2*WT)8o_5VX()q8{Y>&m5R8n0w5?s6!UAVk%418 zBrjp!Zttl*X#Gn<)UoxRrg8aDedKC1zlMnj1CFz&XSx{mnx>uuK_OhRX--C_*Jv|2 z$7$9oL~gCt)5%bKxmfpWXVBTy^iK|+6zGTuwfVkqN3LTK{XuNj$yy0Dt?4r@FUk|3 z`A@Q0zLVJ=Q?EaKmP&z;rH} zAfur;okD-mwuW*tnqn-vbJGf19ncP{Doy+mG@v3K_u8L*4Vf2@LcMJDIDZH0e1ylv zi!kIWYoNj~NJhCDBucFEdL;-3SqS&^cIx|1dmXs@RE6SEMP%bjFPx{-AeLuJ^%>Xs# zd%z$;LB|>$Pv_xzUzh60hE7YURlX!`o0eu)>B8TlCTxc(*8wF_xAk!O<884&=n(Nk z<#o|^$VSuLd10{sU2FAdTdT8gdZf=VIv`v0nHZ@%)%!lx86i6v+>m(rL)q<^n480j z@LI!79qXFO<0hD0nY)A^l$zex|BQa#o7X5)U>_*#hKnUWb#Pi&GP=NNdFj23d(D)P zOwF0;&mC#cBg*Pac3s}7#BZK*=$0ByE;4}=L4vDnfsS_Hu$Mw*+@rdvVwIEV0$d#x z*a%)DoH~2}tckO)x*LXB*-r1-nydfz(uV*V~H5dNAqy%*gZ{@OT}U`{{H z<_lw%hy3DwZmO-TOqs%= zt)D^g8LW`TRR}K242s)Rv2T3*Eh7@Gr|e~e3)VVh;WI16cA|&g#Js~qTf8lm4NXLB z6TQm!6TgpzVyI;$C<`*TC|X;fvckfYEF?D2Uuq}1SwT-l7i?2-M78+jH3Zl-=}z;i z%&~A%VBW8$yfoSdQ)&?Cfwl^;NKR=&r{S%%D`w~HeqWbC+3mPynv^@sfzS17x{r~O z1tr_KLNvU2$mmP*jXuzW$)2Fc&1;#C8Cg0ZWc#gyyOp?$(em%-mBBB^+Yywsa+VFw z1Q^7L9^%#_uax}Wmw=7#=Ey;@d}J;HWDsH3#r2q|!p7}Hk^So2DUYZP**3MOD%}TKer2AZK$n=F=YjyYa;t@)j_!7$~xh!ijPz zABf+T5XdMxBgJK(!P>`3b^U96MzD)M!mHOD!I&bJ#MW9(meDB zKOOq*`PZxFx@4@vCj;xK3U+8%--J&Sn8UD15egotq)oa{O_P z1U_EeWfVENp*?b7URhKz3^CFPY3oR)rX~ey%da3vbbchJn8K5?4U)vH11)~qN zgH3}8GrH>M_fv0K#>dAyXZ5>i!Q*RN2-9RZEGYAuYfnQ;aeV$mABh4+IDaS@08{}y z`G?jT>8k|oMaMKbwO=MA&r5x|!vgUDYgkw!XmLL}QLVpGw%)86+v>Bkd~3|HNg*NM zmL2o+c7NXqe|7I|GVsr}C(``6+WL&4*;jX5}esZ7j?^P%eR(-_lF;Se$_{_Tac%3SSb6LbY6xWp0zXj1wp+? 
zf8WxsYIu$dy_&y)&Xeqb{;^_Dnom$_36VF9Oa~t&Dy(aDU z>h=NPtSUOC{s1sxt+>K4Az|NYQxGTeN6Gg4M{1%E=EZU(LpXNcBn^>Fp|g+VWtwNu z`pZXZtRXwzpsA`B_ujk~x0Hlz4k<`5mN0kkdctYP?7co36N{^Ryodx73g;9zzF&89E@zT3BWi2v1Lyb8!mup9Ih z&O%4j_k@3Y$ZU|{Yrd%F#pPZ8t3C}v=d+!yOsh#A5lhc4FIe3@4W}~|FBYJ^zgP4L zGI!ZYa76FootFFB0<0)w#;w;tMS>j|0~T`KA$Vm@u~RPcvuJJbMJHUIa-dh#wN}{I z#|RA^)AkG31};|-Jhjx;#}+X20Px~jr0b2-u4wW98XljK@xr>i_>(7S<&T9cYe%C_ z%~Ilm?q1W4Sl(}Sn3}|0+yg+yu1u)0xODCNG7~}LB&720u4do4OZSz-H`XC{4JA@2 z4c93jTl>g{!Kp+8tSHLA!~wtpG(DTF%)VsC!X za_+QNN#k7F3~j6y=uuW_GvO3legMR^wfNp}mz6JLHj(pXii{+lC@{vGFt!*5tYB7% zT=%V?Jpk~X0`TkvOH!{C#NHK^Zwj4FN@U-b(NMj@tV4hAeOR)}xFph!vwi40ebQ+z zNXF5@2obqZ$8rg(3dDA#PI$Rijwi4k0d>`%Fc6}F%{Uz#LDycd#;NG-6^ga{sag*147+# zE(c!qwAmHk5&IEt1*7sMFan|*sGbA@;U4|JK~$)OO7gY*dDP3}zH^C;#?F_mphQt& zjXIO$ulko63Dx9lhzYezhFek?dY-Ad+db$Mm}M{Cjivqs!Rox(KNos6tF#eLXg7ivC<+&@!`0Fmc_bLBaj_cQ+z z*wL8h5brjLZlQWvM}h{maE~z_6ZfY$JJUE2r28PzccvBiG7yR0@lTEa;|Rt!h9@mUiIJunHn|_mYSfTrcACH zB#!3>6369S`f&rGE;*N0%3%>+()_IR<`ru&LOz2iGTK^_ajQh#7ep&he^|1;gA)D5 zo3js#Em2c|Y+q=KXlvDbaZRdw?)kl`gH=LfQA?EzCz@xp#j8qhga&==Wn07=JD~^F;>bhgG=T4UL1)CTwjC zlF$#woqYp=Cx(0{nKKJnC!gHY7wrk-wX1d$Y;*eNAJ;T@e*Fj++LY~K+q#=*t9WKB z{de)~>v_voAtjP@Myc7=U~CMcZ`78xRjOwS)G;D1&yK~=`VE9mmv=yLSKeP>w-LeJ z!FZF;rw#zihSkl%G0o)&U#q*xqq{y!$W8X4MyYyXI=u@Xkq6~4C_HdQVI$V2_fI3K z0Tq=nI_+~MK{l*wamrr4a$#x$J;)CvH5{h|@myd$2a-9eGN8F=<*4@~PzV6qEA90q zQIf7PhgS@}M(@xBYEGE;4=ZbiqG@6+IL1j0<|Pse?4gtdkl3hqEMd_Zk?bK#(kD_q z{r$&_w-%)_9S?waJ}R}BE|SjXez!Q68`nazsoC9v<{mK%G|FH01LUUXSYj^zu4Rzd z^pCN6^a?x$OD=d4(4`;i|E}4;+AWO7{+T=owg)d>VI`DkP|rQuN{Bg%)x@ntNm}U_ z=@Rnay_HIQ2*)oxt_0i4_YAm_Lf~B_D%9dC^nT=pzXbhCD~Es7?0d==W)yx(($E-} zcD^aV!@NT%VI2DQ@oAtgKPJuDIA-Ma*fgsf@Oaw3G6*vkLm>1ZCOj(Gt2C}2#%gaD z!sh4PYURq$#f-D-jA$uU^sUxchs9Y^6NnMg_{R;)xWyp-;2^qXRY%CLY zQUXzK8ikE#X4^kJj&EbAbGBpcTd41i4~6{HWF*ULls+JG$~_p0Q`HT7x53h6fh&Cc zAxT@`e^dUMb`%oi8I@HH+os9=tz`DTRVd``j$+I{7S@lw-eLXrHD$ecT66RYu^HwH% z_|Re-BPGnL^lBL`>gy%-(_UyZUT^owJ}j=j^XK^AN&`ou&xV4vYjKzJ9QtV!LRbU8 zTGb(5QXc40i-wJOB$=%G9tmoUw53Y0DKemJ3vZSR=u>ywk2bvfttoz97z z!^p*{%JzU%&kWfhKP|~M-q1g~?OfYam_wWuFXCN%aJz1rcR(jC`Eg?0@HcZ(9wN8) zFz~g{W!x;4*VVGCC}`Jzc*INec0V)t3_4@mR^nrxH%1N>hN&2J>|aUP%xxZsN>s1j znO3izsm&AGwvGAun2mekUea3Km2KWZKsa;(*H zP@4E#U?>ndp>@Ua+048|3HN%VIO(U|HSWv9z^8*%-mc2oq5Bt`kbcf?HywwIdhyYf zas6sV71J`5DDTOSZ3v{MF?R?i+l`<3>f9j9bG}v!X(*NNmYM~B02G@LrOHwT;ki8d z#q~^Xu0w<#FYbq1GgJYaChuK&x&w$jGu0YDpP9n+E;Esm3GC~F9$6&u~E zi4ArMnerp}nCbheVJFQ&D1Wj}N$}mk1?F`85-0*s@LBi?qJ=U2hU0uIx5ieVTzi&B zUh}~`n-U{)siswId;axSJ5ft4Z<0ES`O^vk(d+BV9drf#1~26i-R;4Q%om6Rr`zb% z1B(na3hmC!f#CBE!sv5+*nAdgL^YL>omC;ZJ_oX!A$D*PPAe}0 zfza0kzZ_Cd@sZ0*s#}~~^}5Kf_u+m3)Zh}NPn`wqL~2W(>>&rvgpJc-e~8~O4yy4 z5b4oCVHx#!L`jYeTY0G2n-V$Aar)vN6bC$mxK&s)k-tqwM@BJ2C_#%FJzh~UAF@#u z2~B*e38mxO)byN%X+i_GD%F(N;e~xb){NxX(^Jg`7}V4s7i4=uI#y>OXz7;j5>@ar z|Ed5gnB$ck7MAbeRK44zX!B@{rz2VatdEeiA;jN40|?5Ta&&!9{q?5)GQrIvHz=!&!V`H$3d z^sUd2@p)RDqI&ynblOrXM-ECJ40kOs!nfsMp&eMuqQ;ikLk~43Yt^MLS33 zc6zK^|Mi^XDbB}V_sVzMr`wg54tW6J+}xe(j*agPemRlOfOKySrwrFMyk}Se zhDgVYSJ2}AE@8A!YJ>|`m zeC~0=$0yq_hdcvEo>D?B2uaTkr{~#!=ayx}XDIf|G8FqNxy91ryHJ8hUXUhn*{2Hs z6Czl|HJrf<7R46r{2nK+-l$=XZ>QiGelmZUQ$d&>j-%gjjTkUGjI9qbZ{;5x!DT&9 zsX$g=uIKWQR>0WVR>N%&(`jUtwFFEnjvlx>(SGDb?0esfvX$r7i-D2yKiE2e-40dr zlLo~o4g)ncIVEl>U32>?=^0UhGixR*Agje3a$vcDrLo!rsZr#%6~=GGiLa6M3!9Dt zb45S^2=G;aw4?c1$vs8pAp@J^#$>0-y%GTC}KIN_BZ}Gx8Q>+;j1KBg5F5(4BT0=IbE-aT-<|@>s?|M+d-;W;v+Itxu{dzm{ zb0%b^KDH|;5GWXh5Z|K=3?rnngVH$UR@DaeCBcHP%e(+gcKdlRTSbL}A}Y6|Tfoc3$}zA)pX`fsm&RCP~AM3USNrA(H=5ztexfL$PGC75S=P4Qho;6t~9X5jJ{Hz 
z@27TX(9|HJhUyQP5N`oI_(ZiocpbS2H{2YZKjs1oehs%BwcSNdoM%4(3R@rl=so6h zYW?U8>~qR9*zK%sY%N2#gokjAx16$~FyeYW)QoKEbta|6+*{#A*2?B^Yih>Nuxz(P z`DMS2X|r;rE0t&AV7gV45NPxq)g zbsz;P3)WRj<;i+X>f#OS-WSi`FQ&yi6mGjHF3VTHA94A5jc_-fDrNYt&P)7tJoo&h znSYXGduQ38Hs2;6iCXwrJa$36@OgV_L2)k^I_SJSN8qq}^>op)C2bO>?=Z2*Se_9K zSKK+KRK2F!rHt2gp`;l2J>`w{b$?rL!XwLN`jyb;2b8-nez+pE+Yx4NjSuFgr>Pp1 zefjfDTTi|3#hl%_)^;t`+k=@JJwfHKD~H7k+~_;`HXQ)L0@Fm?Z)ntTXT@82{Im$w zOQTYEvc_+F+&%{HkfkRuIpWDRK}zzVG!!))(vdoKn#*$2V)%?D*|?IEdBn zI}3EQ{DY3ZsWz1Tj26pt$e&a*bxL&Z`!tuf?7!}YlrRL0I~E}nh|y664~R3wmDh7& zvtpV|@j&ACxjtKYWh)hOx=$o*9ZP6{adA3nbR{G{-9%wx7hgoln6qS`>~kr4L+E~f ztSB9L-F&Mqlw*EydC@W` zSq04fOnyz6$Ab^H_npLDMo?6s=A=Uj%M_ zvwp+A6P1fyNb-hQktLKNlfha79?Q|h*4ae#c}@`|&4#1t#S+Sf7?Fl*FLNF!2X7J~0%sI_wGT zaX)_m^x-td#+{60UvY#uFZ2JTRis#?Y43f7>T9mUHc@UNreJay>4S0vURAb|(6%w_ zI?gga0GPi}g~*v4Y1LR|i`%|l^do&u;H6uh1hCWua-FFI6UMX}Z`kJI{YaHg)v>&@ zQAzb$RDqk{wCFX0^Izj=hIR^=fkHbuKw)CU7%$lVJ_r68w0Z#i%6b33E@8Wva?cnl3axd0I`}hd&U%1i!?T@|se-V4qi?>! zX=o;)EsfS5X^^lmevw+1Kkk9@Mw$D`NMBokPBJO6(V^$)i~+oj)$=mFGh+O;vgyTj zqmxklV#2l63yPcdYQigS`45*nMuM1lxz*0G$E!omtTVtZGCJFg){zpo12I&G zNP~`+=F6dY$WrRM=&4BNI&Ep1u(;P|_TG$@|HWn<;L z)dm7Q$GHc9#aH8*3->Q-HYi>D3!V~RSe=c0RJ$xhzAK6X$O?D8&HQ%7+*G9$0ONTp zbk1$^m9nTz8+Bs?-TBR8;Y71d=!WYR)993dpAsilQS44oo;qS1bfzIkQXp73OF$nK zxahi^F2Vf6WAXLqd;k;LGJpC5pptWPfAZ6$8|Y+gq6@<^PV^dwOsWvS*y2cI@7vYDlaVrK5fW-V`WTYiZusM4}McBb(iug za3PUt2_b+EG)KJAh?%rZ^>zo6AEPcjXqcImS|By@w}~@QT4M<B5$5Sz1DS~ z{r9h$8nDaVoG0J)Rs+!%nrVcVpNjAg;+-`5mk6(G2do^}SNrSKU=_7aY3LQ(x=mRZ zxG9-6iaW9-QdjpgLT@j$rtJ1^2RGg8-|8|oQL&N@@oy2=)32-#(qAltQ+^Fy#6_16 zSk&|xj=_k8=&WEy2$p)kllb(l#xgmj{IIoE6Wgh(IY(coqdD|UeG6cKr};<@#GoZz znah##=xobhoOS%Nni6$B^wPOuv1~Zw#Lm4uw~7q=U8e7Aw4*0L=6XX`^R(u|vv)&+ ztHbGI%I49zLmep%7~6e7`|=YXFTSHbzCX=op0UL;Y)v)6#ESHxotX^S@7Se@vOsy( zyr18(OH!ylMwmhw*d?(GWr_aT>p?OQNo?fQn>R=4sGX>0LF1P=udNzq>S$7_%z9AQP!^_Xjnb>a1X5F9D)#T$o4T_2A~ zZtzws{~4K5b$(UuNo>Co3*gONR$DJIN#|zoZ*-J3rGa)D>;K^)6mLXU(Vpz?L^q02n$l_|}Qw z(>F(9D4mQhj@0Yz>l9rYPin4u26WE_YC2Q}ViRk6KfYdcsQn2@V&&8>as;!;1R zK0$8(?xkSppUxo}M77UY;WOC*b2r!83iELAzrKX7o3@<;Uavl4w|Ik)8`)7zBAupH_{nt<*jM^9b0-lA&FNt zh(|2Pgya*81+-iz&umqE>U?K$9iC5{8l91xFlUn!61%xHX_hHBg6{r~>=wcRXJcny zi#LgjjVdF&063lL$!D1rm5Ygeh&L1P$iVtRAQcAXBOJ5u$1ptv<^6-JW=7JyR$~e$ z!^Z)n1=vvRrveiup{a>ySC6o#`e_$LF;I@zIBD-JhuFBp*~z{umgpi zJ}=-HO|y{fleCN^F>ek7Lr&!9*V57$(yyg&@&Y4+RHbhpUm;^?BzYXv3=Dy}ieYM) zZ;DZ>a$1;*!I2@Sn#n0@8zpSf${;!!)**FgSPEO=Lh|F3^ycUnw^a9krq~mi6BOSv zY3c%ZCJa=U!~qi;8(!j)2_PZMNqZ$s*_difnS?#qv)LqLu9ccrtkmXXCP<0|THCSB z{^YQaFG6q2jz^|zUpw9%t!YU%$8t2d0ueXr)3;`Rz<8;o-wBwi=y#)-^juByn5wqs zma5cTH2~DPx(Gckw-ine5j&b}Vju~sX1s&Xcx~N>>M>eDMd|b)UovQjjxZS zZISW2Z=X6sB%aD=~jpy^JT-B!QN_~b-eLan_a`-VVZD4Lasw)5&R$>x&V}Ap6$B5zaFQ# zy{{Y-(Npg$59))F&52j2JEuapkdg?^3V~f9lSpB?vom86mFG$OUP<@^z&^t(FTN~X z!Ig@Ck}Q@E{Ro3<;DXyiT`;=@IYMk+^|v7qi0*Jnzdh5TBa~kfbr`wuw$RipIb4n` zUg$)C(32?>;!3>^`LV0T9rqQsI9*%rJs3&B1L?hI6X~zTH%IN7`4wqLZQ!r3L+liO zrzZHM_i;CtCvO!Jf!JlxEvebxaWGn_M`A|?RWUGNN9ILFqOc>Q85o{0(1&hCMzRnS z$%06kWr>Nri#Zg7gW)BNJ(`-dlA7vTtRj#lZC}2UzTn7$Sa;2p^>i)MXC=Lgk#%w_ z-eC$GDvycL%Guj6h0V4X%rRta^dt@EKhVC>(W8bRO|qPgxcj$RaqQG!Hyu;j-DMGu zc&@{8x&G1~L3nBJ-J{c+QH+O4&)#|I&?_Sfp@wOF{Hqy=LbG%Wk9E}|e4;^%a?V#n zlixsslc|e~Kt-V9yiZSuK+@`gVIcZX?-)3rv`)yFyzthd&O9A{V!-$%^{B4zFDoIh zyf%u<R2$vS`yQCka-<8WNKcJvQQhloj@{5C@2@8btQ3- zQC=#{P65|-7zfPsJhygnwoYHc%S^=bwm^^9(;O%|Q2%kmRKx?zpnx$}2L1w@i=*QT zx0T#wUk)kXmmUTz2nuSnn01-v2BY9!6ngs`jrmL`(9oJ^DuJw9urW&HK`h*x5-OJj4LPqqc%NgU|M-)?nNQu~nNIMazGsXm zN%MNic51SC;yJz zeJDbCMjP29(;E4sSloDc;zp9X>sdw5+MW-+??ZCGORBKs0gnaU$$4$}^+{-~RYPwp 
zngZ;n#Qv)M%Y$u&S7(Cq0%NKkCHnl$+t|>@+;~9DXXv45BYT>P0W-l7drZuiPD2Y9 z&%VCQ?QF7}t*X0!QA|DxZ}(AnTF{>8Qs$=8=F4c)!^7veVr#skD=R|c+;j<1+W8Z8 z{fjg-(DVp$z?|V&Bai|}F(`>%MCRo8uln=bP>3*tV}395(6h({AKr?MoOL&FbNwRAFl*z6o@g+OJI6`kszN z1P`YuEFS$Vjom2eQ)?>8CKTxOw(-T|2t3gUMe|sodKvbdSZ1^U14mN>8+0xMQYR*s zR`y|kts?z{{`sRpBTaRiDWf6mwbCJK7Mqv}ko8+U-_Ax&%Got_QBx~9hjaHnsty1o zTZlVLO0L??9NkK7crsCm0n@$jQn=C&Sgk)bNZ%dg5vz^QYpBYUQ!yUB|08e0I2h%K zWiZ%H)96@?m@{ThI#1)-7dJkRfX2Mv20I}#(ZO}%oKE`!+3q3N4u3xR;Il5-JOIL% z2cXE>y9Moe+@zwYs>KkbeYuN|Q75i^uMjN~nCiV>Qk2tJRW_2kN*q?^9F@-+n)sTD z*0*oW;cs~I;kN+%Hh6+r!QEH7E~kyh2*yXTieK}d^QFuMDEr!=v!9H1F#d1TwcSaX z?ldlYxplv5oO}BZA5UW!_~T{Yy4hvq=ylo1w{gGe>}#bz$+I@%zk$bJZaA03vVYB6 zsTzBs8@MNkLU^G5m$M7zU9nkLyG<1r4qZ%5>)nQ9J!y%!;of-mVq^>d>A7&j`VTI` zMagDR7FWQBcR@L+k#mQKpeyUP(ju0@QAN2FuVV7>L@sdlzgi`Xfe zi@6kmxr+y^Zo&NL(acUg;!#h!5|5G<`yUB?e-zAz)Cuqj$`BcD{yaqVV4^+g|Y(6auLN~NjekI=osKWanXQwJ~|-Hjdi zUyfJJxSFF=!)&Yo3BQ;!u^ywEWi#z52JGeErp7;fQzE3`L-6vxI*9Ao0LuIq3(J47 zg#07swj;UP$37mR21hEU&JA+l&~?u=vlczEmkqlUJvx}Ebpk0KUBW&}zQ(=}fi6ii zw7O>l1|$EY=luT_Hf-+iL4=Hg9%!5@`i#lZE)Wn~1Lu{Mlgvh?uOHf=Ox@q5Je%B* z?IV)!@0|rV!K?mv)->Z0OCh#cMnG7HEqJ~6DuyH^+uQwtePxQe5_Xo{CED}pgOaBkQM0t;g{eSuW&o5f}NHj@} z{yc29Nbj8BEHxtaFhvG70h_$|I_2m ze};ww%EMMg=Fx}Y3%WejI7pszm9^ z>NCLf?B_^uRAoVV^p^is)uRx2|B)1~RQ?gEumih-d$SBaQ?=k}W0}8x|8-WUg2Yas z%8*^NR)$xNmQ}S9lZwxJ?B`z;YvZe_5wv>L-luyl3v$}ia|nzxT$-G^{~O!FXcXVh0(ffZa!bcq>5JDjigx{7*na?PY3H* z)53M!=YBOHr-?M>Xav6LsJ)E)>c~&c?BUwm{G}DVP0>N)N*-csgsbCIQu4K~3zO^d zQzS(NOUxDv$1MnckQnx#aN+lY8naM?{JO=rM$qn0kKZp{e@PsF&1Ao|y`AYr+LwUX zbWeMXAs^0ft_&t{k1U^N-1&Woj2Yjk7ai-MgxgS31BD?gm`co=q@VT?sn%OzH>*B0 ziO$Z>es508mx@bsK*CUs`93Y#J0mrpZ(}6iFPodNyt{#vmDH!c{!do+p z*HC~%mI(9bL#$WwC!4|KA6o(YBf3j9#>m32HOlu}HhMHlFPBn?Pt3!*`e2VYwXbe> zws`a_R41qTyt%gEhy~mjI=3>W?rrp722=Whkb1J$BRtI6Ukl)3!Ul`|?|p;5T?MXu z6FV?-7Q-gs44ej$`^+Md)srJu`E@!%S&8F#inIB1-4=nk5m>@=5zvh=?&M9E-c5JP zi~xPyDVUA@ZTYg6@t@DToZJ;i5&-`p1PbYWOvU@AIheOl-B=Zr7P*rSe?}MN&jdr1 zy^Q+^qauJnhYpMs=lY6T7*>jB@Ot+Tq`t zw!M1p5ndp@yRl~P(mfG;)!0$y_|YP%O$Je;|0}p|y~vKCh730ufJlp7El&v* zDPzrHGT#b`A8(pp{TN>-r6Wn>M+`%8pYb=x1VV!KgkW)%-K&*bfaea}RKxXJ+MP?R zjs$gb^rW6t;|A0&CvqgRr=qTF*K!#r4)|uxwq;VT%}e{FAdjMn0a3RJ3n80D59`ft zKJZ&*AiWOi(!r~OoFcc$;6q(Z4DIv$R9SI(pQOXIV*ro@*6-6`z5Lu`YcSlh{Ds8G z@B4rN_Hb|-&BpnLw4qklWn+1I&iOjWdbG$PDW9i6rTo4?GbN9Qa61i<@3pb_+zSVC z&O=mqUr9~@R!!=JrpC9v&f}+eFa;4H2KxG5ZCgR6Cl0yJOa$m;3oiSm?`Ae6T z`P!?()unH`U!10E(yokmc4d6>iVkh*f``cxjj6lQzYMZ6QtmZ3$mOa|o0RP8>8a^S zy2W+HcBv4$cTcqKXLm~_1d9H7vsc2wP+An7m$iEQWAqZ@j$z;8zgtv(K5a)+_4onA zAa&s<@sD*UB42%eW@_$JlLx*Aics`#X$jl?9Zyiu(&IYd)6tsz!QECvJivudv(dsx z-%P+9=iCdc=RS!yxG8U3U3b~vX;GeAh;_cB5u5MP=8rTHcvd?M>w;8KxKy5r7&n@> zsF@sqt~d?9A8@U6Gm#}qcGv|i5=mlsg(1m0h)FU2F=U7&HHJX+Y+7uR*kO-Rc-ulu zVg(uXFf|qFUeGEI_Ey>36G9npy)SDwOqAC7Zk=(=Aa?4UFt-2G8WE%_pSV3!nKqce zKUC1ibGmzBLtt|o*lL?vRW#&`N>#AThjXO)$dF`WL6fqMZGJN3DJCPp5P+9UCB>aV*0=A$Jz@?@so{2IP#e3Auy_Pn zX1HjA7hy){gueb1(mY^q7R-`-JFw~Pn=m5UaD9!3*%X?(&p&CN8BOcz8Je}i%J zymDZE^{%IurtA;gKPgAiSQE|h9L0nBZzDx3kH9P7Od{8o%aW9;+PTBrSQm@!$m2{? 
z!k6)=lUo(N0Uzg+t2CJB-o1a@>i$~icKk`tR!S^rO8w0)l@WDi_e9*xFTc27rL}|~ zv~ja0)3k@s>=PKVl*vVc9ssB>A_9gd{GtWkYc(VCi3q@z(g{3Yy<(i)S~ zVPokG)pTXhSF|$gc7Q3MNZ0ViyH@kTPg4`Ln{uR`!Flik61@GF%@Oa>_C&F_L}wyr ze&15_d~MJ$>Q2%6COOR#R9)X-scWb2;dloLHBr6WdM8g7Fu zMmmA=D6YLe5A}lAj`=6tw=}PCXf)Kg+_mjUigymiJsq71K7G$Qc5VQGf9dtkv7%Dn zidOSAEcK=R#w~ob4ZOahQoZo_es0&t7dDtbZAUAdb!rYfPqW}xT`H?wWVfYZv|jyd z^cH>dnK-syY-2Z=Tr?am+YT7(Pwq50XU0=`Ci{bT8$W}=43Nz*{?%a56xBGYp}%!$ zP(bub*VZOPU-~P-pgf~j)kIH%r}M8;ems|ZvZ{>{i8Vih7<83I{P`Zo1~Tj=EfsK2 z;D$hBPSSm&Ely@4>e((hokS;%ZiosnBSV zCXF273o-}`doVaR zhv;59ws>PC;D%CHI=H%&OinlpI`A3|~hX zghRJi*=3wI?&C#e+hXIbS>M#^GBSQ) z5-q^uXVxmD)5Et)J1TO&3B27<@Q$`Rt-WDt%2bu$I|*5Z zjI#={$L4Z-M`!_9DMRoE|Oy*!yr6u4JHksZ#%qh7+@tf*3{ou$GdQm|V zOmf7){+!r7i2-R1M)0hQrnJG}>+MxJI=v)KPoL$_1#%biszbLWF3MhPE;S|8^g{R0 z5tGJK*{H~@2D<(5FzOAV^^p3(*#>au@5xww3&Rb)=7I!L6)*?f1XV>db}|%`+p8(P zxA*kdtNOa5G~r(L_cdD$&l|^~0&s^yqz;RWeScn2-aB<`NVpRXp{21YS=XdXbF~G? zLp;~fr{*t!a3?Ey+733$Wm*9w5zUqgVRn@RB3wu2otw7s9(Gw_jo&5`th zDVy#T>@b4*SlW6$2@h`lj`95%7no}6N53Pwi&0|=eWNn9W5)bZ%efogEzNlae3Am& zay*|$-u`vii%*!a zoTY0UI~G+<7bR(%-IL}1=n-8A8TNT=QkB?Q&}O(|{qQ$X6LD<_ZXr6} zYL*-~MwaUD!20%DdRe57=oH(jJFerR_5vh+#j9OGP7`z8ZqW66LlmN82M87dgwu($ z#Ut%kT_er6xt6Cys#JqxF{S>TG2GnkP1PRJE~7r?zI=OQwM^m(y9b32S5o4Jvr>Sa ztE`_)vPAY)s|6tH6la2dx_i-;V+a6G!E^vqiv-Yxuz>T7PaBd`KUZi~W0F7MZ=Eqe zYw4=muB6$;$WA!PEN~-(xy&r3oAjMW*US?uIORoW-FmqqyfP=n9vXPyg=q*G3ffGH zAG(b+vMG4a!Um&OCbbiN{h}2pG-pC*4@6Illo6boX+}>lP_p4m8+3FgOm=LoOCwc* zUEeS6mv>~}qEe!w5-0VaOd8+eDRJ*Ft*^1wmn>{bnqxmAAdC6rP~ps=m{-U(mh8hP zP@Szeu156kHPdj5Z<~vE>_h^xaN1@u>;sd(n>Ms+-rr2MZp73#Z~V3H1r_QMszx6r zy61U&=yrPxh}+q@mhV;gzZD-mQ9(zYqG|UJS&>7RH%>$RvOtnO?@s?B^{K&Ok=9X+ zqGj*v@XQ=K$8Z8ooriaBJ#KTPR{Nh45dHC@9aGF%Nn_m{Pj)+W!=7$v+He_A<@xleTl2Y_w^v%DL62_Y#C1t-?X264;`kVS5{=I7pBxbt zqJthlFJ&QNaq}T%l(ffYYK^SHuRK=2DM&|GVzSF6?vKT#4@vEwn3-y>MtP!a=Q~$3 z-DPoRQzqB8;Q(h9#v!?ogey@h0V98}wZ~bEXR&|OF?$x32V1Yix`s_*j(3avqWR;& zt1WQ3soxW}R+8V;w{f4NJFy#mN%juNw^#aTFL~1yGS}PS*15RAwFVd4j@>!0@@Dh* zR8O@3z+8Ny)+_v+u4U662LuriQF{_`T$bWEo;=)Bt|mu*8M`;_kf=M{J!QgOHJ;2W z`>-6MBuucQI!6;Sl#sy6?`!KPdE~q~ko@Y=&!9cr0itA5v20kwz$v8vYqlTjq3}KM zY)q8}KGk2rKwnfmIUqdq4#upfNY2~dkj(IOc@8|(wi<c2! 
zMYkuoYF|xcGh!VW|KokhkWiCU6|*SItfnqgZ=k}J5-yYX+X<)0TKfICSY_?akkgyz zziOYe(X}H7zxB-cAkCj#eF#hY^XXmAjOP9&(}A_WC*2beP&qJT6?o3>dSpJk1^gMb$Up))`%zlv7gj)kd^F!NdgU zOoIVFi}|#&(SI1Pbh*)l*U1%g!3u68Gs<*2RQ0mCa>A9ldKa}9`eURlKJsbhf?e*3yNvO7gCMG1BNm7NU zAO#fkU=|-&Lr+8zoRarTx$J404|ZVh+yV3P!F{{xD;lemIs*u6Hq^tf=y&LdI494Y5MnNh=J zXc?ZzGicx78_cOOw~C>|3>oo5t znFvV;negVU6{#Sw>pr4JzwTMxfc22gh&!Hr0=i*`+3lk5<)R`ZQ}xX-y&n*D592Vxzo{%O zeGt~ArWPVU)6}`TswMNCD6zG#zbjHWxtS_GCU4p%f12>!MGn}sNZbr`sdr**@_tsK z^+c!!hw>2*=!RgIogG`Gz#wKvKQW7UTU!N7BOZ47sjIJg!3s8W-Al~r&*=wY147e9 zB6>bbU`SC$xbL&K(gOJ(qf&l_Uhgis)(_3_1^kf7$IUbBc+Tb)au`U;lAxUL?JYr^ z)x$j*s9M}Ot$c0_uzeG%MedUqo9)RAbSsPL$xXym55LC0!1rDZs~we0xt}>y?8$yX zO8W4^A}v|;bh~JTyx2pjm!n(;uYb8;?jOkP@##`3`xTbPt(jzyFzI7nvcG>~z;Xjt zR{U84On;^`F|N=HtK)kALuFtNtSp`GMf+oK z+*kOLR?)1K31l$fmf?&ouO5tad?ZP0M7Fln)zno$*m0z*|heJDy{d%ags{?iF?&Qn#lyQ`qLTeDGREVT%BS7ySM#%~pAgx1Xq zo99=bp*gvUv@d=Iv%Hzu|4sXiN`>?zrkM{Zn}u|_*1vS``y>_SeYm7CG?Twyef9*E zaZLL;`z9+#IvD94wYtVIiRK;b(?g4UHr*(y8}U~b`c^*E(s6oxX|4RKKMAe?s zI!6UJzhL0{8G`zr>b)$Ucw`{;vaCU8tFQyZEn2DfLV;`UV=$G~^F2WGV^_v^bpslH zHDs`qW6q$Aj2H+m#+Q+m$j!npse5|Q%_q_~df`M-%Wyg`9ut@J^foh#v~95EaxF7@ zlXi?|>HZP!-9WQSQ1f7KzJmV3zR_JWU$D#EUO%u@ek48&-Jqv#q$9`o5CgVlSwiZM zwT)~bY{MdA*jF*PmVC{!iYtH47HJRbYnv2s+PiV+`v%d(_5&?zrTru%I!P6S-%ik< z_fBCKI0`8!ts+)~1r8pphEXZc0guJaq|_gz?ELc2b;G1`ZnWBMUOfKx7r>`uS7Ds* z`?AEHz9cAZ*aW)JCI71Pe*K8%Z6eADLUPw^r_-4%4U+?1{0p8Ba=_Fh`zSS5?vy?1 z%y2CIG}!eo;2dZ1$NpzRrk!fWg!mnuM$ZQ(rtM4T&K>Mmp2`~rFRFy9#(w%ls9rqI zXCymuD;4xyrj;%9`%f1NG_3aU#p@jEmprGh12Qyeh#t}9=~9>P$Ufbw&=*zil%Wsr zR%BF6($vlGWlvP-%}C;(_1Y-cRK>)x6V5>lpB0fXL3miUZLJ0?}leJ%Z(fsssM<&46W7z8Z2+$B$yYh84o zMkb_c7ny!*S4}Z&Ap0IJ@6d4@!FnwC@YCGg$k-Tb;D_bdEEyG<@ zpTBu?WQ#n2V{0d6hH|h`9ei@F0@8;*^wkQ(4aaLDn1IUdmzM%GV7+dktAZilXJKJ9 zYto^&wtXW`{s1!6Ksd)tl6v=l*QXg}L#q{mbQhA5HDu=2NHwE*@)?!Zi!aU6 zO$-80Ao@lRjDOZ-wI1PAKA=xebw5uZnToDfe(@ifwEc?0tyAcuJl%$J3!bVlw~;n3 ziBAz1CYN8oDi(-8$Tx2vD8;GCnWMvB6&>-JXgHxzXkIUF{?JEcQ*(X-gaH3r^}E#; zKF<|XBeD@QrE(_KuYRcGZ-*j5DRaJ&K-zJ0I+QPdr;o|Kmi#(;K{7F{JKpoUQz8x> zLD)`Mo(dq9CPqRT)LhVZQ4DJ{%BWrSgyS@Se zHNl;J^==zDB3;W!SW(PjR^aGIet7ifWJy=zj>I$P1kCMa`{*m0U&Xk`B4RS)s#e+Y z=sr3Tkz*AbH|I(~<0|(dxnT1Yd@C=zC{|rVio!)GCL-H1q*8$u!UmCaon_S46X+}x zoiiKI&~~%F3?Kk?CvDN43g(7_p$i5;vTO!}7Q#;3H zagy4m%$302_nn%yFO2HvHawhrl}9N-DkH-H3y<>g(I0IEiY%8aGRqrw&oE5{b_2u+ zdyJ5?w1Nqygz0<2ZQVfET0-C^Wuwal|C(H>e+h#bvom}Hng6T-5xpj+6tyJNh!l4A zjyf@FkKy?92SY(+nm2TXMm;*|^nG@9cWxE* z#s+9&mRhh?aLGf)r52T|cQ2CeE6?k2F4}lrveY;{cJL1z z{co3_Dj$Db4f*dM{$E$O9*2hq{m)bB|3ef=wDF5B^?gyTPMP~^odti&eE9BEcGM`# zs<0%%xou3$kha|YCHE@oQ!#4tk=5}e*O$D(Ve&W!1Gyb6u?ck6U4VVHN6&Py_XGV4 zUb$&8>{Y>?s3mNm0j%Iw4BEylGVWz+Vfo`MEC(0f_Bd6TCpy7D^W(^`|Np=A-}uLW zf)zB>)c@=G`C;i%KKx11IyaW5`t(68@e-St1%lnP*e-Z^LwM)q;FO^lUq3?%Hzuxh zpOG)~t|!masJOL8|23S;tn=Nkz#5(M=Rp-f$v?bhyMF=C^@^*jC$5NJOS$s)sYu7H z6v8VHS2s##J}sp+(nS@@eP8v9S;LaXJLI&y#}@78lOF2C>dmUP9N<%aBToNUL-)djLA$PAejszpje;t}^q z2~z1rMFl4y;ewR3E?L;g)B8UOq41L2Tobb$!M*Y>V3gCY zSKz7@_ovt{J$K5cj3nx2@os$V!oJ%zIFPbS8uLlVYRt``FKr_-T(t|HmU!H_8nW+7 z*a{^@1U2dl5PJLxH_gnfn1A^Bp!V9V!b*ck57BmPZS6l=wn;NE*hSrTa1AQgo!*x$ zf$}?6XdCEv$Z+xSUqJTJRQ;TUu`ZH4`3~dIXf65t%p!j_Z<^dfbaek0ftV3^jXCRGw!D}gFVS>GbLN6=^Z9220&r20N%4g=PD@`lnj6gEinP4r zy{FCkvz2g)!}Dk}r(~Reo|&W2A+GqhRrcMGeoI$pPlEOiGiKkH9Q=5I@qe!> zDgQO+_=$h{k1+M`nLP_0wl4Z-1o%ivOC5!^r_6{SyB6|ccrpO{FFEi|O{`dNm!PY1OneH)R7IeLg(1r^DkCPo zh|QG$U4wLB*JlG`zkB+bhaSp`CeHPRgISRSKD8qgN{iOm3$xc|?Q?ah;t~(oQvn1k zL$4g`c`=hv=t^w!@M|w-7sO5Fhu;`5?)|yPp>h1gLSg#nHw;lGQqW?YK^MGwEApyg zbo-0xjNLx zwog0dZhrxDA;iY9IdA$g4CW=@kdpT3cBhxI=CjfG_(DEMf8CH3NVSaVbhIg2fA*?$ 
z;Q<5T!gX21eDl*PYpLX9(JgVlvtCJb_}-Ya=35)i=;%4Qp?mee9f%xanVybvyEi?W zf9Lt84z&Z%9y_Cdts}R)GV(J@!%tXr{nD92>MtPk_JhG+CpuJlI7vKoat>zGFd*|A zEiYM)=oq1}21i{76Z*w_GV^6Qj;ozqIaqo5xs{Lo&}r4>1qEf-J#T2w^b6eOj1H&V z@ZdyRx!>oVxxzLSxeo*unr($LhwxG#>)%#9Nr$CpTVE-g z=}i@TfO(g{dLc87BTTzge&3P@FcYL&Z_1L7;dNo*@fiVng_^DGoPxaN1D~@85pyK7CX&0eYhw4eIAm3`t$l+cWNumOq-|6 zW1~7_5binn!g)}?d3t7mYoMYA+2IkNgVMKP)|6@%#|IEavp#vQE_8*Vp{%G+L#H2L zC=G~zb}h6UTd0uuvL-hpBU4hygQnXRBOPUF<8$j4$Ld*4W9En_6EE*4hmqz?>}{;P z9fy5LTnAE|+EfBTr6NUSx=vC%s&_Q};E67WTCbcLg|HeUC_`XnbxA5PK&$%~FUhwN zA_b+q%C|b9Z1(S1n#Cj*{bSv_>yL^>T0=vv@&(MsUHjdJ&U4?|5(Y6h|B~`aO7m`> zu@6l#7u3}y$<^FpgMKc*Kpr}3u9NiKuZg^Ymf*%`AMd;AF)(QzMcU=Uvp?Ou3yv*r zE#P+@GmxWPnY-MQ6Mf~N{tN)@>-7I2xlS^DpB}U4)TDiP1j6NvH zaBr&41aB`c2H5a(ZUP9}74KB006hFI(kATbGw);k9cDn5ixcwS`q{%^Gi6QsCwFL zY0FK>^4fi?o@z++6F%0Qi<`^;rrY=N9J?U7-K(I zogS-$raG&$k5rZ>F0C;tf|+CQn{uNQDf8j8GS*L$q99&@%Z{c5Yg%EEos!M{6Dr%h zrtK`!qED=>1npg7nN(P_WC2lMw`lB0imN4gvXQEo?(en{|02c28Cx8f_g72fSy3At-hM6R@6EopaBSlq8EDrX| zq55f4$1&PA6vM75?nE=&>eG9Rqgqc2T9AX*m}OQEXYq?QQyl!DX6esD{EF@%uG(!d zA5PwGO0LcX=`vXV)fLVM*E8YNZZ_oUKT=Rh2w4I{Fr=nl*6f3d0euN<8gzTn9Th6) z1nHF>--g1D43@F>Mcf~&khDSGLBe-WuGqU30jc{>Pu>KSV?PwJe&T}Y{)P-2N_b7P z@No((!@bh@=CsC2Wk>ejV<+5=BeQAbRrc@bW3pZTc%MCVs_sAOf!j2PWK?@H)TALv z6C~=cAlDNcT~4``vpieSaGJI`R&T`4>V~F4cdzFl^p_6=Y=B5^#+a9k$_5x!gd*pKCZ@=d?uK0jzU;PLY!r;z)}1cw2(2 zcUAVwXYWzTXF)Dn1E!>XYei4%xZipncG7p&xl24hs@U2FVsb{3=+WrK{Nhoa0X>bj zQDyVN<6HgvTYmuqJy8eac7;+RK?dp%g6hf#N-dqSQ9!-b#+wBQ>%kD;sC2n?KgCTTi?%zM`-#d-N((UAg=(Kz`p6^DEichWk?OKEBj_%cNtw;`BO6_N!~! zE)X$_o7wSDUI?vEI1HM+ahVIgEXmNlf6as_8{PrQy_pG{@I_=$2JYtH5*s)`kW%cs zw#g$sHspFyIxr78^W#f4-OdU%kBipw?u{YPr>6-4TYV%qPnQ}vu*OLuBgU3>t$rov z_1viN4(h-I4`UTOD)7ow@~!u5=mqS{Bo8jNkcTi(zv#P<`N`(bUqgRtl#ib!=HC7Z z!#riKGySpBMx}LT%WlixI<;=RYof4Y_S^Es+*lFNHwmnm|vB2>$YPb36ZUe&gb z`(!2+=cdbOd+@!lniurJ4j8GElsN1g&&cx3?;ejxr?#^%PsOgL#RP3*uZ8Q(c^fp{ zN2fu-x^f!3(e@!5Jdl{TdtuyxL6XEjcl4(729v2esudkkH3-xEF9p0X>^C#rh`SV3 zXFvg8V=vE#<&}hlE(QpbEO+8U^Z7Fp6mq`B?YDlRpKGr%BOj}fDOd{!7{UlOY0%mo z?e|HEe>?ZgpXCy0!>mGaqha&k^MaK0DKN+EB)bwIQt~P9O!pV2vNgN=i$(4l=yh)a zOi&OylXMs(c9E+IWPMa;57wOBToQ3e8E7G_}2`Q%4N>t(8I)rj97( zr~GY3BseP!%}61sii8hpfBnc*d4`ARPKU>HkY(gNO8UuM)PR- zWGxxs0U#Ka4jTIl=y{69+%fL!*Ub59A2b80?VN@@`M|K3x}wmY;m9=U3Ic=1hQ%ZX z9_HCrsJdZGiz7>xUD7Hww|H(a58GNgLn|YurW^3$${J2YfvT{_%jtXv_aOiJd6S+z zthAso_sdx^twN^`J7>_LJOUKXOjBb}%yE|#)8&tvvHSAXol_f^R#hUKSsrD0&_r702&g*8BWvNqG+>xS=) zMbwN-!9Uf4d>g*w$WL-0AI!PDymb8+Kz2YynmJObxe!(L5KVGPXcdw;6gNq3kHpvb z4>8xpw~@~Y_ur$ESc>scnBw4|U`}GF?o)n3-64&PEi031Bt}9VH=QE%UwN!3r9|K>*%wZ_{OHFyC>4$t6lE6cF=?aJ{=pG-loucT z{sM}$G+zklnHsl=4YKFg_C?x#w|@5VbKRFtXKxAwWQdt~f`iI?apXD7le8j52Fvkd zmvHXz&kr_PLFLkb4IG^U@xaf$CyIqdFXx?+83XqH7#?Wueom;q<7D|f*`Ko!j=O?? 
z0r6iHJapve&A1EQT?I>Us>YLg=-lYEP9?EU2?()y)47#xBg~{}ry{g?+#vF<6?VD6 z4nmmnV^g$I>iL}AP%_uH-4|9P_-?S~%#PccM)RVMhMDc>3T4yY(9b!5)Eb>AiM;SN z!1L!i3NQ4r4{58T5z+#_<}ZE*$KD-cQ*@`cTgGTteWA7C*`X!PK>RON$w=m3+8BLswCh)T4M;5cc+KCK~%Sj^dTD%JT9@#S?)Eu|KThj^{bdS01cF)(h7dh89v+{fLD%)`W{#V&3i{&o zRUwGag#K4nG@xu|-Gx(d2g1${}Z*?y!DN5OTMfu$bxoprE8>=ns{4MP>#j>(SLYvUQ!=|j z=EMD6g6>t)E*mcx?+LBB{U~YjVaoCT!uI#(vSiFNDztf8TNmPj^hd z*eQ%mbNQk{M9*RtT`N;>%|%(B{Hzrif3qW%`o64_|M~^hyhkqw zNVevZlTLTE)d`a^5=7wu4}`R^ZqGW}e&7mw^tHZgA=z*iKt0L8voS>K183LjUC+ zUNd!ntv0=4AD7VPr*t>afQC-{zcW|=T+!jPuXwpS{GR~qzjJ5*QcGLvRQY&$X6dDJ Z;PY)4VWAiDY&4|lp(I<8ZQuS*{|5?yN#y_l literal 27462 zcmeI5`*RyPlIMT#zoL#8*K0?crFvJfNAjJ=H?i&wdnRIf)*H7Q8;TNH5;-oiG-OGB zbj<(0pD$1kkfLn4$8+u87@I7j@JIlOL?V$$q`o=*AB$$*zQ3-#WxKAQesiiHuP!%N zXOn6(@#b~8+nr4|&GgT^$k#^PA)zd##*LC^+->M7$y1c5q z{|n616Px6A(FNi3YuCQ7tKG7yKxWswu47}$?qprAHhvJLag@w1{WUN1ogGDI_=a($ zX(QrBFdwVnnjvc2$`7*gG7A5Iazuz`Z`_pu~UQElyx)DJ( zo2e*eI^8sF_5IDZe)La^vMs+~tr@)&=;yn4>-snSBrtQ}$tlqu(HkbvvTd)wKRJ2( z_O1Um_M7e1NfZQu3Qjyb2_|6%s;kx2l6j%<<*J5)=QrCeQ{{P68!EKZ@nYp&tftr1 zX1=PY$~W;exqjQcJDUVv;Ds5V$x}LXUA9YaaW;7sMt+edewbt>B$|?m|5HB?axZup z2E>GZK8w9D@gw5MQbXcNE<)ccV@nnFG@iyE)=Km;2q1hpGvy;7#eM{&*^Agu$I3!a z_1^mF@^6i$eRbxiQ@$V8)7($8!ZXijL(f7F3{d^+FkVjg>f?>i>oB?}KD;MBu<^-k z=JPe^3#WdV11*T7TEQ9LV(c*34MRW7Kvm>9hzLACOZn94G;(r#%Bmu@zzAOPVRG{H zR5bGRpC13_I60OVyLMa7+pmNqp~Y%imD_c>eKT#VckKhIb7m+ec#)Km7j?8r=r)YT z2oj=Y;l59!*NG_jS?GnfkOo~Ooj}r_U^+=%9Uor7qao4sb{f^O2Yp0kUp}1j|HG?< z46|`)fISVD`Cfzvo#f>7G&u>Ea}qijutuRg^q{5LTPZ<|`or zHDx+qZRgO!!<&K#DP0B07H5jgQ|W}s3er=}!@IRphz3IN1K$UsLn$A?#ixNApq zhVtMK0|nCb=mEpsuCBk5(XQShBO?E3iTtcVXOSv~K{vxb3_94zyg-;@GR%ThUl3zf zteHQXrL`ZMIUHzKri@|QaSc{6Kq;AVN?{HA#UZT(T4HRyY#0Fn9!EfxJY)oRH}iS5 z+kGXYFke>lH(wh6Yj8Nrc?5Ck=P93V5JcC^yXYm$Zk~IZml7G`B4Htpqr16TjhQen zGr)N)iU;tNm@_*97&4<9O%tB`HTnUZ*$>hOOpU+XR69}QS28kvbH4Q8ykes6mcm5hhfa31@nWED(u4_DJ1@u5Q_k)5Kp8L z0n3<4ew;HR5&U$PNluJ&xLcOP2#Xm|qQDNX08%LQXGLB!$7n#v{Gw>Nmn5UT zVR$D+ZM~)C@ODY2Vc|!4EEWyT*zr$_%+J)DBvV791QtAl0BtaWKIjY0NJCLelxI`e zZW8jOwOyF_44GGPsaQS%=$d>5l5rI0`dTsP!nVN63T+|pr|-z`A*W@&vRJNjFrOAvm5|6 zu#^B8GDt-sA)R%d6&r3s22f7Um&I6_S&HWf4I)o}#`Y-5BHu1B4DHbsMuf%q!}&ru z9!t^w3KO!zuq!!OVMH+F3d7rjR+!6jRo`sCmfhyNDT<=rDPiEWLMZT|+=BMIpU>$xl3{yf%m zar3n-E41W+0}V=85{h)#P>atptjv6NZi!ZkIE|i>tw@<4dYUE1i@j_a2IZXuhAxRC zvyX@cfOi1&ETV_5O&~kIXvRvFW&)pU`sr!GLn?xu&rGaU9ZCGi<4|IdDQnuPIk~5@ zqhYt<3#jduMJrFArC#4;t&!nzMViw-ZKe^Z*lWUO_3f9`R8;vQj9<@|1o{$Ps^W8C zKiezih18DX%+1YKJh*ks!?MT&0Jx!8J4aMva#3@Q(p8ERD{{=}hzaU|QkWV0DU<>%#Q;g7vyg)2nF$9o6wM5Ad4(nP`MXUniF`feL)J7HJhv=^}|T} zFLXmxQ34yXw$Gjk24tc(P5AC%Q~XSOkt8jivB?M{NLo+Z*%@zi^))ukZx%-X&Cn9xj}~_h{FT|DgUhGi@NYPRG@)zxZakFd=V^~X*<4}didyV~-4 zwG+G(PE-H)Ww~C}@6Z0&Tr_R-eH;W&_^zHL1o*C=KfZi4Ihj1Eo4HXEKYn5| zEX&>UtU7!8AHMHb|LJe5#m&5W^ypFbiPujDwcEJ40kAL_*=j#|IJUaI&@k3j!AGO4^OPM5mH}NOm zRRFbxI*0(-gIVsw{Hfi(|Ecw{A~4X(187f7ogB)cnOR-Fe{}x%@n7lqeEFz){L^1G z5Ee@#{WBcnOCLHt>Fq3t=_8MaPA{6pdvCGYo=xf&O9h=v>*#u}BV)`cmX(Yz-c}cy zDcePLC(HGgQ|#?~tR*&!)m3u{p+wIuTa7MN6K8DD>1uO*)AnW})Y>*zOpy_WeLtHl zw&hz@En5>#wWY4UXx>e{6^GO3=Y48<(>Ba3E_K^1f>N)O)L` z%ZsXZdehuiTP7uk@eSx5+R3tRa#|*Gx>(&hSYKZ^8-uXjo7MF&iw(+7PssZ5Lgi-D z+-&C6y#+_yTf+GUn;lGYs;$%PE`s5(q3LwDEEmmN&xxs5muO405cs$J^x!>lS0AQPxa> z4qi8wWjB(CL}StC_Q0AU9ng@0(#<9D_=g#b?Aduli5Df4r=yjJ*~%8T(F=B9ftN1B zyWwF$?s{=Z$eqQPGZJRpK0+u{o1w?EnGScMRCT?&-mOOaBl@g~b2n$(T(9N>{cF8C z-CdU(C+87Z%65b-IwTt)CxX(q5}@N`G|3-p%(3a+O|75ZTBm8MfA7_KVa2$=*-jsv zU{~7w9DC6B&!pLoagN9v8u(@C)|g~GL#&sM&B)pa(%Ro~6ofFIX&*2{e&_55nO?iB z2$f8Pwi8W7KC9WnD?v(*DgSTTPZ*^PyM-thT+Vy-kQ5&Ztf22#hrv=;{n!N4EGAO) 
zoa0UY>q*>?oH!*G3a3AO*f4bOSo)ZRbbp$O+MJ2T>pr#h3)2xE)+cQnF;8@#Fd@sD z5695vJU>5NwEDqwnsBuGj;&8TTUB4o4D~6@YwgUyzAi;iJZs;%SAZ^B(ZDYF5?Ea> z-o?Y89nw!J0B|&sC zu2A9au3;fY2sxV|aPJ&}#aRb!Bd1-nw$)#5R>-OSB)Bt&hCZ<&(Ym$A^=+cV)&4Pq zK63^+Eb_}Rok-_Vtd&JAT_s;wNMiQ>(M!}g9BN0Xx-du3<0S`+CKHD%VK^S2bi?S( zEwh;Hi%mUcG}kgr*c~PDo~+z%b~2EiLfeWx$I&^oM{k{!^_+bw;)u?$x)ijV2t9En9FssVyHIR#F0o2lhek|Ony$rdjNpb$7EqN z6z>@7jPvQlqOC{h=?R0;zl^z#G2rv_-Obe%<|9X?osJ#!S=a;sV-N5tN{8**M%}|= zo12ym)4&Jr$;BqIsq44f@*1&u7_VS#md$Xvde_%~&yq5QYHsR}_NU7+0{S>8_R84% z4;>KUc-{}zXn*yAU9V^HG+-KVE~1qEv~f3C9bRoa&d=+W@Nd^3v^zWYQbuuroO{x6 zML_`cZx5>TG6cK8@ZQa=4!{BCd!c>T+-#lQ7uEDzm|hihwQ83)7e)c(Y^qO&?SnDs zA9g$c+<0xnWrXWyw?aw5@-_;*U0uh7OQ#tJ4Yet()$XjfSTQ|&=+ynWR_C_+$MwWC zVBCvv{`DN5X^+9W0Qz8=-m+~$zbgSOhVLf1)Q1?6$)Gbu^=mY~=!PlE3#wm?zp!vJ zzp!YT>;$qx;#X(*CL$x7H1)C$iWG@jSnm{*H3^YKgaDVmSm(cjiOE4mQUE8BMVmR* zjbh6990y?fqj)Su3!TklqmyCijzo!0S-?oZ%9Db!k(b4`jtXTIicypaV9SOqYcit9 zMl?lR9nogBLdWnlBF+SW?wG@9E1zPkM4GV}RZ9dh)NwOMkqEP%dD06>vXrO8n3gDk z3SvPs$W_OMWc^~bL?ZuXriFSv(3v&&0Z_|xIBG0!kfcq14*4`drc3l1TN<6dV0xpz zNF~n+5UP7jzZgS?!nQ(4F*w9VR{`n;41J)hG6KQQD#p~4qzFR>J*_sK2I+tRvm0~D zT1?GGtamOv4d|^}$RZ^C12}@c|P)ShoLU zVvY#xCacra*nJ%@nssG$k-|~0-Z-gOYF3{>(Ec0RWn#w11DZ*eo~zo1R_lvuv0(8y`DBy( z#`eFz>x=dcblR?;;eS2wFg_4$4HSIT<-tg@fk ziPMuCvmy>l-5~0yQ&x#wZ7o%qo*mF6Ye>r_v3;HJpgtQ@59qQ(Qir$U=TsNY90Jwr zD6%*kE0fT}1e5!+m{Jc_jAQ1Zwt)(Jg4FNOG8=gSc?X6sN~5^!g41cI{XW{xF4LH1 zC2^3vy0S8f=QiwPbs}1hGTq%mcaYFk-S5edm7upG7aj-9N}CV%pZpBn8{T`hue&k$ zz0;qbq1Dx`E>H%BwcOe20{Z;C(+eI`YmU?V!Fpc<@>iqu-Ct$Q^-YGt*##OxJRm>Q zMMts|y$qQ8hj?i&HU2}q{D*k?s}?T@fmQrqPtG&b-|lXEE_<+zH%wz##CUT*fZG%Q z47i6%e_h>2guhRmtr(e5oUK2F=p~eU1jJ8t!6fhJ8Tu=B*d+~tqwhz6d?MdHnCn(W z7FmR7?slX3;YhS!YY$-#jRGBq$KD3N_N-e#yU=~-(8?W69NA&0%Bf>7cx)vxb}M$B zKR>@Ks|lU7^dW-wKpo!Dd0X1FfyRA;NJbg_7^PTH1HCwq`=EXu6zKK1$YHuXoW^%( zv?c`R<=mW!?CZ?>;S^N+@31$f`}_Kok>2}4`gPX#G1_VPF!ychkaAZ(8%4@a&g9+ZWR;&fHRWNB4pFa7VpI#}N#DE)(oQ z4cI$Crd~gm)}@nW_=ud`owM!WM>ft?zo|UIWM(%-@!qmI-RZS+T&}KeWFh1||HB`Dq|iuipJ1;R7k5wqi2h%{CA9B@azn2sSHB~+6?Fy2A40>Y zD={WonBDsT(FqJm_mufr>88*=z@u=+eD{p@FUjuXUxeM?qm&p(_h<5ZM_+M_b}gCw zoftld92tK6-;v=Do%&)7KXzD@2Ir7LJ&@t=4Z9s=4ga2Dg8~1UVs8&>nAJ97?{)8` zV@!QH{gUo}XxHa)d}pTAO?gKZIm{RbleWG?Tplp{mll}!470@5ZME=zVAFyU=|9-r=mK!i|@v^fCmXYwU3@6B8nAaX^tbaC{{H_N#5(;91 zoTUA_NCi97FNc&LEwB+KzrwkL1rjo1HHE)?8R_v47N7?+{<5U;*@B}juqR=0z^6zv@WPz3Yxbbq%Bks}jV~g%eZGB^oSNC^P@Za4! 
[remaining GIT binary patch data truncated]

From a4ca9893ee422acc8b25442d71066b2df3251a86 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Mon, 9 Dec 2024 16:09:09 -0500
Subject: [PATCH 40/57] Allow unlisted collections in single public collection
 GET

---
 backend/btrixcloud/colls.py      | 10 +++++--
 backend/test/test_collections.py | 50 ++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
index 7e60544309..40e4180658 100644
--- a/backend/btrixcloud/colls.py
+++ b/backend/btrixcloud/colls.py
@@ -258,12 +258,16 @@ async def get_collection_out(
         return CollOut.from_dict(result)
 
     async def get_public_collection_out(
-        self, coll_id: UUID, org: Organization
+        self, coll_id: UUID, org: Organization, allow_unlisted: bool = False
     ) -> PublicCollOut:
         """Get PublicCollOut by id"""
         result = await self.get_collection_raw(coll_id)
 
-        if result.get("access") != "public":
+        allowed_access = [CollAccessType.PUBLIC]
+        if allow_unlisted:
+            allowed_access.append(CollAccessType.UNLISTED)
+
+        if result.get("access") not in
allowed_access: raise HTTPException(status_code=404, detail="collection_not_found") result["resources"] = await self.get_collection_crawl_resources(coll_id) @@ -1004,7 +1008,7 @@ async def get_public_collection( if not org.enablePublicProfile: raise HTTPException(status_code=404, detail="collection_not_found") - return await colls.get_public_collection_out(coll_id, org) + return await colls.get_public_collection_out(coll_id, org, allow_unlisted=True) @app.get( "/orgs/{oid}/collections/{coll_id}/urls", diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 854c2c430b..2ca5c9159a 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -1128,6 +1128,56 @@ def test_get_public_collection(default_org_id): assert r.json()["detail"] == "collection_not_found" +def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): + # Make second public coll unlisted + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + json={ + "access": "unlisted", + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Verify single public collection GET endpoint works for unlisted collection + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_second_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _public_coll_id + assert coll["oid"] == default_org_id + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + assert coll["caption"] == CAPTION + + assert coll["homeUrl"] + assert coll["homeUrlTs"] + + thumbnail = coll["thumbnail"] + assert thumbnail + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] + assert thumbnail["mime"] + + for field in NON_PUBLIC_IMAGE_FIELDS: + assert field not in thumbnail + + def test_delete_thumbnail(crawler_auth_headers, default_org_id): r = requests.delete( f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail", From af0d798693bf19fe4b8f86ec41f6b5b5fa1118dc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 9 Dec 2024 16:13:20 -0500 Subject: [PATCH 41/57] Add access field to PublicCollOut --- backend/btrixcloud/models.py | 4 +++- backend/test/test_collections.py | 12 +++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 430ca14a22..118900b969 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1292,7 +1292,7 @@ class CollOut(BaseMongoModel): # Sorted by count, descending tags: Optional[List[str]] = [] - access: CollAccessType = CollAccessType.PRIVATE + access: CollAccessType homeUrl: Optional[AnyHttpUrl] = None homeUrlTs: Optional[datetime] = None @@ -1318,6 +1318,8 @@ class PublicCollOut(BaseMongoModel): dateEarliest: Optional[datetime] = None dateLatest: Optional[datetime] = None + access: CollAccessType + homeUrl: Optional[AnyHttpUrl] = None homeUrlTs: Optional[datetime] = None diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 2ca5c9159a..9350d4c6b4 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -19,7 +19,6 @@ NON_PUBLIC_COLL_FIELDS = ( "modified", "tags", - 
"access", "homeUrlPageId", ) NON_PUBLIC_IMAGE_FIELDS = ("originalFilename", "userid", "userName", "created") @@ -874,6 +873,7 @@ def test_list_public_collections( for collection in collections: assert collection["id"] in (_public_coll_id, _second_public_coll_id) assert collection["oid"] + assert collection["access"] == "public" assert collection["name"] assert collection["dateEarliest"] assert collection["dateLatest"] @@ -1034,6 +1034,7 @@ def test_list_public_colls_home_url_thumbnail(): for coll in collections: assert coll["id"] in (_public_coll_id, _second_public_coll_id) assert coll["oid"] + assert coll["access"] == "public" assert coll["name"] assert coll["resources"] assert coll["dateEarliest"] @@ -1076,6 +1077,7 @@ def test_get_public_collection(default_org_id): assert coll["id"] == _public_coll_id assert coll["oid"] == default_org_id + assert coll["access"] == "public" assert coll["name"] assert coll["resources"] assert coll["dateEarliest"] @@ -1147,8 +1149,9 @@ def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): assert r.status_code == 200 coll = r.json() - assert coll["id"] == _public_coll_id + assert coll["id"] == _second_public_coll_id assert coll["oid"] == default_org_id + assert coll["access"] == "unlisted" assert coll["name"] assert coll["resources"] assert coll["dateEarliest"] @@ -1160,11 +1163,6 @@ def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll - assert coll["caption"] == CAPTION - - assert coll["homeUrl"] - assert coll["homeUrlTs"] - thumbnail = coll["thumbnail"] assert thumbnail From 72afd60a8f837fd62eced201479d0b2a21a52fd6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 9 Dec 2024 17:14:30 -0500 Subject: [PATCH 42/57] Don't expect thumbnail for unlisted collection --- backend/test/test_collections.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 9350d4c6b4..13ddb62dac 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -1163,18 +1163,6 @@ def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll - thumbnail = coll["thumbnail"] - assert thumbnail - - assert thumbnail["name"] - assert thumbnail["path"] - assert thumbnail["hash"] - assert thumbnail["size"] - assert thumbnail["mime"] - - for field in NON_PUBLIC_IMAGE_FIELDS: - assert field not in thumbnail - def test_delete_thumbnail(crawler_auth_headers, default_org_id): r = requests.delete( From 902e49e0ab4da0bd7a4d34de291b7543cc79190c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 10 Dec 2024 10:28:16 -0500 Subject: [PATCH 43/57] Fix thumbnail-related 500 error with private collections list --- backend/btrixcloud/colls.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 40e4180658..629d94f14e 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -294,7 +294,7 @@ async def list_collections( access: Optional[str] = None, ): """List all collections for org""" - # pylint: disable=too-many-locals, duplicate-code + # pylint: disable=too-many-locals, duplicate-code, too-many-branches # Zero-index page for query page = page - 1 skip = page * page_size @@ -351,16 +351,22 @@ async def list_collections( collections: List[Union[CollOut, PublicCollOut]] = [] for res in 
items: - if public_colls_out: - res["resources"] = await self.get_collection_crawl_resources(res["_id"]) + res["resources"] = await self.get_collection_crawl_resources(res["_id"]) + + thumbnail = res.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) - thumbnail = res.get("thumbnail") - if thumbnail: - image_file = ImageFile(**thumbnail) + if public_colls_out: res["thumbnail"] = await image_file.get_public_image_file_out( org, self.storage_ops ) + else: + res["thumbnail"] = await image_file.get_image_file_out( + org, self.storage_ops + ) + if public_colls_out: collections.append(PublicCollOut.from_dict(res)) else: collections.append(CollOut.from_dict(res)) From 57825a2de39dc8d98b37d6d058fa64671fe893e9 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 10 Dec 2024 16:21:26 -0500 Subject: [PATCH 44/57] Add defaultThumbnailName field to collections This allows the frontend to specify which default thumbnail to use without needing to re-upload the same file to the backend and stream if from s3 each time it's used. The field can be set on collection creation and is is updatable via the collection PATCH endpoint. --- backend/btrixcloud/colls.py | 1 + backend/btrixcloud/models.py | 6 ++++++ backend/test/test_collections.py | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 629d94f14e..c41d090b6f 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -118,6 +118,7 @@ async def add_collection(self, oid: UUID, coll_in: CollIn): caption=coll_in.caption, modified=modified, access=coll_in.access, + defaultThumbnailName=coll_in.defaultThumbnailName, ) try: await self.collections.insert_one(coll.to_dict()) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 118900b969..97fb4816f8 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1258,6 +1258,7 @@ class Collection(BaseMongoModel): homeUrlPageId: Optional[UUID] = None thumbnail: Optional[ImageFile] = None + defaultThumbnailName: Optional[str] = None # ============================================================================ @@ -1271,6 +1272,8 @@ class CollIn(BaseModel): access: CollAccessType = CollAccessType.PRIVATE + defaultThumbnailName: Optional[str] = None + # ============================================================================ class CollOut(BaseMongoModel): @@ -1300,6 +1303,7 @@ class CollOut(BaseMongoModel): resources: List[CrawlFileOut] = [] thumbnail: Optional[ImageFileOut] = None + defaultThumbnailName: Optional[str] = None # ============================================================================ @@ -1325,6 +1329,7 @@ class PublicCollOut(BaseMongoModel): resources: List[CrawlFileOut] = [] thumbnail: Optional[PublicImageFileOut] = None + defaultThumbnailName: Optional[str] = None # ============================================================================ @@ -1335,6 +1340,7 @@ class UpdateColl(BaseModel): description: Optional[str] = None caption: Optional[str] = None access: Optional[CollAccessType] = None + defaultThumbnailName: Optional[str] = None # ============================================================================ diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 13ddb62dac..630b95690e 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -38,6 +38,8 @@ def test_create_collection( crawler_auth_headers, default_org_id, crawler_crawl_id, 
admin_crawl_id ): + default_thumbnail_name = "default-thumbnail.jpg" + r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=crawler_auth_headers, @@ -45,6 +47,7 @@ def test_create_collection( "crawlIds": [crawler_crawl_id], "name": COLLECTION_NAME, "caption": CAPTION, + "defaultThumbnailName": default_thumbnail_name, }, ) assert r.status_code == 200 @@ -83,6 +86,8 @@ def test_create_collection( assert data["dateEarliest"] assert data["dateLatest"] + assert data["defaultThumbnailName"] == default_thumbnail_name + def test_create_public_collection( crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id @@ -176,6 +181,7 @@ def test_update_collection( assert modified.endswith("Z") assert data["dateEarliest"] assert data["dateLatest"] + assert data["defaultThumbnailName"] def test_rename_collection( @@ -327,6 +333,7 @@ def test_get_collection(crawler_auth_headers, default_org_id): assert data["tags"] == ["wr-test-2", "wr-test-1"] assert data["dateEarliest"] assert data["dateLatest"] + assert data["defaultThumbnailName"] def test_get_collection_replay(crawler_auth_headers, default_org_id): @@ -348,6 +355,7 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id): assert data["tags"] == ["wr-test-2", "wr-test-1"] assert data["dateEarliest"] assert data["dateLatest"] + assert data["defaultThumbnailName"] resources = data["resources"] assert resources @@ -466,6 +474,7 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id): assert data["tags"] == ["wr-test-2", "wr-test-1"] assert data["dateEarliest"] assert data["dateLatest"] + assert data["defaultThumbnailName"] # Verify it was added r = requests.get( @@ -525,6 +534,7 @@ def test_list_collections( assert first_coll["access"] == "private" assert first_coll["dateEarliest"] assert first_coll["dateLatest"] + assert first_coll["defaultThumbnailName"] second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0] assert second_coll["id"] @@ -1011,6 +1021,28 @@ def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id): assert thumbnail["created"] +def test_set_collection_default_thumbnail(crawler_auth_headers, default_org_id): + default_thumbnail_name = "orange-default.avif" + + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + json={"defaultThumbnailName": default_thumbnail_name}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == _second_public_coll_id + assert data["defaultThumbnailName"] == default_thumbnail_name + + def test_list_public_colls_home_url_thumbnail(): # Check we get expected data for each public collection # and nothing we don't expect @@ -1066,6 +1098,7 @@ def test_list_public_colls_home_url_thumbnail(): if coll["id"] == _second_public_coll_id: assert coll["description"] + assert coll["defaultThumbnailName"] == "orange-default.avif" def test_get_public_collection(default_org_id): @@ -1159,6 +1192,7 @@ def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): assert coll["crawlCount"] > 0 assert coll["pageCount"] > 0 assert coll["totalSize"] > 0 + assert coll["defaultThumbnailName"] == "orange-default.avif" for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll From 
35474a47bece5bdac7631d65365af06c32c453b7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 11 Dec 2024 10:43:00 -0500 Subject: [PATCH 45/57] Add fallback values for access in coll out models --- backend/btrixcloud/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 97fb4816f8..d189ee1d71 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1295,7 +1295,7 @@ class CollOut(BaseMongoModel): # Sorted by count, descending tags: Optional[List[str]] = [] - access: CollAccessType + access: CollAccessType = CollAccessType.PRIVATE homeUrl: Optional[AnyHttpUrl] = None homeUrlTs: Optional[datetime] = None @@ -1322,7 +1322,7 @@ class PublicCollOut(BaseMongoModel): dateEarliest: Optional[datetime] = None dateLatest: Optional[datetime] = None - access: CollAccessType + access: CollAccessType = CollAccessType.PUBLIC homeUrl: Optional[AnyHttpUrl] = None homeUrlTs: Optional[datetime] = None From 7e8583aa2043a1c6563519cf2e43c957f24c68c2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 11 Dec 2024 14:36:15 -0500 Subject: [PATCH 46/57] Add ability to unset home url via /home-url endpoint --- backend/btrixcloud/colls.py | 20 +++++++++++++------- backend/btrixcloud/models.py | 2 +- backend/test/test_collections.py | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index c41d090b6f..0e1cedd990 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -690,13 +690,19 @@ async def set_home_url( self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization ) -> Dict[str, bool]: """Set home URL for collection and save thumbnail to database""" - page = await self.page_ops.get_page(update.pageId, org.id) - - update_query = { - "homeUrl": page.url, - "homeUrlTs": page.ts, - "homeUrlPageId": page.id, - } + if update.pageId: + page = await self.page_ops.get_page(update.pageId, org.id) + update_query = { + "homeUrl": page.url, + "homeUrlTs": page.ts, + "homeUrlPageId": page.id, + } + else: + update_query = { + "homeUrl": None, + "homeUrlTs": None, + "homeUrlPageId": None, + } await self.collections.find_one_and_update( {"_id": coll_id, "oid": org.id}, diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index d189ee1d71..1e37592366 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1347,7 +1347,7 @@ class UpdateColl(BaseModel): class UpdateCollHomeUrl(BaseModel): """Update home url for collection""" - pageId: UUID + pageId: Optional[UUID] = None # ============================================================================ diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 630b95690e..6d503c0f9d 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -1221,6 +1221,30 @@ def test_delete_thumbnail(crawler_auth_headers, default_org_id): assert r.json()["detail"] == "thumbnail_not_found" +def test_unset_collection_home_url( + crawler_auth_headers, default_org_id, crawler_crawl_id +): + # Unset home url + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url", + headers=crawler_auth_headers, + json={"pageId": None}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Check that fields were set in collection as expected + r = requests.get( + 
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data.get("homeUrl") is None + assert data.get("homeUrlTs") is None + assert data.get("homeUrlPageId") is None + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( From 4b9a60df08af4d61099f9782332d7109f761809c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 11 Dec 2024 17:04:21 -0500 Subject: [PATCH 47/57] Add public collection download endpoint --- backend/btrixcloud/colls.py | 24 ++++++++++++++++++++++++ backend/test/test_collections.py | 20 ++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 0e1cedd990..580b37098a 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -1023,6 +1023,30 @@ async def get_public_collection( return await colls.get_public_collection_out(coll_id, org, allow_unlisted=True) + @app.get( + "/public/orgs/{org_slug}/collections/{coll_id}/download", + tags=["collections", "public"], + response_model=bytes, + ) + async def download_public_collection( + org_slug: str, + coll_id: UUID, + ): + try: + org = await colls.orgs.get_org_by_slug(org_slug) + # pylint: disable=broad-exception-caught + except Exception: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=404, detail="collection_not_found") + + if not org.enablePublicProfile: + raise HTTPException(status_code=404, detail="collection_not_found") + + # Make sure collection exists and is public/unlisted + coll = await colls.get_collection(coll_id, public_or_unlisted_only=True) + + return await colls.download_collection(coll.id, org) + @app.get( "/orgs/{oid}/collections/{coll_id}/urls", tags=["collections"], diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 6d503c0f9d..2e44580093 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -1245,6 +1245,26 @@ def test_unset_collection_home_url( assert data.get("homeUrlPageId") is None +def test_download_streaming_public_collection(): + with TemporaryFile() as fh: + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 200 + for chunk in r.iter_content(): + fh.write(chunk) + + fh.seek(0) + with ZipFile(fh, "r") as zip_file: + contents = zip_file.namelist() + + assert len(contents) == 2 + for filename in contents: + assert filename.endswith(".wacz") or filename == "datapackage.json" + assert zip_file.getinfo(filename).compress_type == ZIP_STORED + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( From 7cf0b6238a33d6c1767034da9e9749529b0d8cfe Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 11 Dec 2024 17:14:06 -0500 Subject: [PATCH 48/57] Implement and enforce Collection.allowPublicDownload --- backend/btrixcloud/colls.py | 4 ++++ backend/btrixcloud/models.py | 8 ++++++++ backend/test/test_collections.py | 28 +++++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 580b37098a..87c7893d45 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -119,6 +119,7 @@ async def add_collection(self, oid: UUID, coll_in: CollIn): 
modified=modified, access=coll_in.access, defaultThumbnailName=coll_in.defaultThumbnailName, + allowPublicDownload=coll_in.allowPublicDownload, ) try: await self.collections.insert_one(coll.to_dict()) @@ -1045,6 +1046,9 @@ async def download_public_collection( # Make sure collection exists and is public/unlisted coll = await colls.get_collection(coll_id, public_or_unlisted_only=True) + if coll.allowPublicDownload is False: + raise HTTPException(status_code=403, detail="not_allowed") + return await colls.download_collection(coll.id, org) @app.get( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 1e37592366..315013a0fa 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1260,6 +1260,8 @@ class Collection(BaseMongoModel): thumbnail: Optional[ImageFile] = None defaultThumbnailName: Optional[str] = None + allowPublicDownload: Optional[bool] = True + # ============================================================================ class CollIn(BaseModel): @@ -1273,6 +1275,7 @@ class CollIn(BaseModel): access: CollAccessType = CollAccessType.PRIVATE defaultThumbnailName: Optional[str] = None + allowPublicDownload: bool = True # ============================================================================ @@ -1305,6 +1308,8 @@ class CollOut(BaseMongoModel): thumbnail: Optional[ImageFileOut] = None defaultThumbnailName: Optional[str] = None + allowPublicDownload: Optional[bool] = True + # ============================================================================ class PublicCollOut(BaseMongoModel): @@ -1331,6 +1336,8 @@ class PublicCollOut(BaseMongoModel): thumbnail: Optional[PublicImageFileOut] = None defaultThumbnailName: Optional[str] = None + allowPublicDownload: Optional[bool] = True + # ============================================================================ class UpdateColl(BaseModel): @@ -1341,6 +1348,7 @@ class UpdateColl(BaseModel): caption: Optional[str] = None access: Optional[CollAccessType] = None defaultThumbnailName: Optional[str] = None + allowPublicDownload: Optional[bool] = None # ============================================================================ diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 2e44580093..5ec7b5d326 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -87,6 +87,7 @@ def test_create_collection( assert data["dateLatest"] assert data["defaultThumbnailName"] == default_thumbnail_name + assert data["allowPublicDownload"] def test_create_public_collection( @@ -100,6 +101,7 @@ def test_create_public_collection( "name": PUBLIC_COLLECTION_NAME, "caption": CAPTION, "access": "public", + "allowPublicDownload": False, }, ) assert r.status_code == 200 @@ -1079,6 +1081,8 @@ def test_list_public_colls_home_url_thumbnail(): assert field not in coll if coll["id"] == _public_coll_id: + assert coll["allowPublicDownload"] is False + assert coll["caption"] == CAPTION assert coll["homeUrl"] @@ -1099,6 +1103,7 @@ def test_list_public_colls_home_url_thumbnail(): if coll["id"] == _second_public_coll_id: assert coll["description"] assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] def test_get_public_collection(default_org_id): @@ -1127,6 +1132,8 @@ def test_get_public_collection(default_org_id): assert coll["homeUrl"] assert coll["homeUrlTs"] + assert coll["allowPublicDownload"] is False + thumbnail = coll["thumbnail"] assert thumbnail @@ -1193,6 +1200,7 @@ def 
test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): assert coll["pageCount"] > 0 assert coll["totalSize"] > 0 assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] for field in NON_PUBLIC_COLL_FIELDS: assert field not in coll @@ -1245,7 +1253,25 @@ def test_unset_collection_home_url( assert data.get("homeUrlPageId") is None -def test_download_streaming_public_collection(): +def test_download_streaming_public_collection(crawler_auth_headers, default_org_id): + # Check that download is blocked if allowPublicDownload is False + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 403 + + # Set allowPublicDownload to True and then check downloading works + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + json={ + "allowPublicDownload": True, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + with TemporaryFile() as fh: with requests.get( f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", From 7ac420d992cb1c8b238e4d9774cdd156bb9538ef Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 11 Dec 2024 17:21:42 -0500 Subject: [PATCH 49/57] Fix linting --- backend/btrixcloud/colls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 87c7893d45..53f7185f16 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -1049,7 +1049,7 @@ async def download_public_collection( if coll.allowPublicDownload is False: raise HTTPException(status_code=403, detail="not_allowed") - return await colls.download_collection(coll.id, org) + return await colls.download_collection(coll_id, org) @app.get( "/orgs/{oid}/collections/{coll_id}/urls", From d00cc39384d8f310c1d56df4e14ec5cda0ddbbe1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 11 Dec 2024 17:24:47 -0500 Subject: [PATCH 50/57] Make sure coll out models return allowPublicDownload as bool --- backend/btrixcloud/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 315013a0fa..5517067948 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1308,7 +1308,7 @@ class CollOut(BaseMongoModel): thumbnail: Optional[ImageFileOut] = None defaultThumbnailName: Optional[str] = None - allowPublicDownload: Optional[bool] = True + allowPublicDownload: bool = True # ============================================================================ @@ -1336,7 +1336,7 @@ class PublicCollOut(BaseMongoModel): thumbnail: Optional[PublicImageFileOut] = None defaultThumbnailName: Optional[str] = None - allowPublicDownload: Optional[bool] = True + allowPublicDownload: bool = True # ============================================================================ From 4482b381fa05faa9ff126ac0475b26e68dc5be37 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 17 Dec 2024 15:35:22 -0500 Subject: [PATCH 51/57] Make migration idempotent - don't readd existing upload pages --- backend/btrixcloud/db.py | 6 ++--- .../migrations/migration_0037_upload_pages.py | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 460f1a1082..0723258e40 100644 --- a/backend/btrixcloud/db.py +++ 
b/backend/btrixcloud/db.py @@ -95,7 +95,7 @@ async def update_and_prepare_db( """ await ping_db(mdb) print("Database setup started", flush=True) - if await run_db_migrations(mdb, user_manager, background_job_ops): + if await run_db_migrations(mdb, user_manager, background_job_ops, page_ops): await drop_indexes(mdb) await create_indexes( org_ops, @@ -114,7 +114,7 @@ async def update_and_prepare_db( # ============================================================================ -async def run_db_migrations(mdb, user_manager, background_job_ops): +async def run_db_migrations(mdb, user_manager, background_job_ops, page_ops): """Run database migrations.""" # if first run, just set version and exit @@ -147,7 +147,7 @@ async def run_db_migrations(mdb, user_manager, background_job_ops): migration_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(migration_module) migration = migration_module.Migration( - mdb, background_job_ops=background_job_ops + mdb, background_job_ops=background_job_ops, page_ops=page_ops ) if await migration.run(): migrations_run = True diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py index 9bb4408a0d..174e0e31d2 100644 --- a/backend/btrixcloud/migrations/migration_0037_upload_pages.py +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -2,6 +2,8 @@ Migration 0037 -- upload pages """ +from uuid import UUID + from btrixcloud.migrations import BaseMigration @@ -16,6 +18,17 @@ def __init__(self, mdb, **kwargs): super().__init__(mdb, migration_version=MIGRATION_VERSION) self.background_job_ops = kwargs.get("background_job_ops") + self.page_ops = kwargs.get("page_ops") + + async def org_upload_pages_already_added(self, oid: UUID) -> bool: + """Check if upload pages have already been added for this org""" + mdb_crawls = self.mdb["crawls"] + async for upload in mdb_crawls.find({"oid": oid, "type": "upload"}): + upload_id = upload["_id"] + _, total = await self.page_ops.list_pages(upload_id) + if total > 0: + return True + return False async def migrate_up(self): """Perform migration up. 
@@ -28,9 +41,22 @@ async def migrate_up(self): ) return + if self.page_ops is None: + print( + "Unable to start background job, missing page_ops", flush=True + ) + return + mdb_orgs = self.mdb["organizations"] async for org in mdb_orgs.find(): oid = org["_id"] + + pages_already_added = await self.org_upload_pages_already_added(oid) + + if pages_already_added: + print(f"Skipping org {oid}, upload pages already added to db", flush=True) + continue + try: await self.background_job_ops.create_re_add_org_pages_job( oid, crawl_type="upload" From 73813ddab0731b73c2a569c8b4122b22f05cf536 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 17 Dec 2024 15:58:43 -0500 Subject: [PATCH 52/57] Reformat migration --- .../btrixcloud/migrations/migration_0037_upload_pages.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py index 174e0e31d2..bab57b97ea 100644 --- a/backend/btrixcloud/migrations/migration_0037_upload_pages.py +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -42,9 +42,7 @@ async def migrate_up(self): return if self.page_ops is None: - print( - "Unable to start background job, missing page_ops", flush=True - ) + print("Unable to start background job, missing page_ops", flush=True) return mdb_orgs = self.mdb["organizations"] @@ -54,7 +52,9 @@ async def migrate_up(self): pages_already_added = await self.org_upload_pages_already_added(oid) if pages_already_added: - print(f"Skipping org {oid}, upload pages already added to db", flush=True) + print( + f"Skipping org {oid}, upload pages already added to db", flush=True + ) continue try: From 2dce950508e7963eae2e8ead6ea5510950e62a07 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 17 Dec 2024 16:04:37 -0500 Subject: [PATCH 53/57] Fix typing --- .../migrations/migration_0037_upload_pages.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py index bab57b97ea..62bfe98237 100644 --- a/backend/btrixcloud/migrations/migration_0037_upload_pages.py +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -22,6 +22,13 @@ def __init__(self, mdb, **kwargs): async def org_upload_pages_already_added(self, oid: UUID) -> bool: """Check if upload pages have already been added for this org""" + if self.page_ops is None: + print( + f"page_ops missing, assuming pages need to be added for org {oid}", + flush=True, + ) + return False + mdb_crawls = self.mdb["crawls"] async for upload in mdb_crawls.find({"oid": oid, "type": "upload"}): upload_id = upload["_id"] @@ -41,10 +48,6 @@ async def migrate_up(self): ) return - if self.page_ops is None: - print("Unable to start background job, missing page_ops", flush=True) - return - mdb_orgs = self.mdb["organizations"] async for org in mdb_orgs.find(): oid = org["_id"] From 979884eed0d8a67292e6339f586ee463ff0102e5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 17 Dec 2024 23:13:45 -0500 Subject: [PATCH 54/57] Allow getting and downloading public collections if org profile disabled --- backend/btrixcloud/colls.py | 6 --- backend/test/test_collections.py | 82 ++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 53f7185f16..b0c65671fb 100644 --- a/backend/btrixcloud/colls.py +++ 
b/backend/btrixcloud/colls.py @@ -1019,9 +1019,6 @@ async def get_public_collection( # pylint: disable=raise-missing-from raise HTTPException(status_code=404, detail="collection_not_found") - if not org.enablePublicProfile: - raise HTTPException(status_code=404, detail="collection_not_found") - return await colls.get_public_collection_out(coll_id, org, allow_unlisted=True) @app.get( @@ -1040,9 +1037,6 @@ async def download_public_collection( # pylint: disable=raise-missing-from raise HTTPException(status_code=404, detail="collection_not_found") - if not org.enablePublicProfile: - raise HTTPException(status_code=404, detail="collection_not_found") - # Make sure collection exists and is public/unlisted coll = await colls.get_collection(coll_id, public_or_unlisted_only=True) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 5ec7b5d326..c2999bb5ad 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -1206,6 +1206,55 @@ def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): assert field not in coll +def test_get_public_collection_unlisted_org_profile_disabled( + admin_auth_headers, default_org_id +): + # Disable org profile + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": False, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Verify we can still get public details for unlisted collection + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_second_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _second_public_coll_id + assert coll["oid"] == default_org_id + assert coll["access"] == "unlisted" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + # Re-enable org profile + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": True, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + def test_delete_thumbnail(crawler_auth_headers, default_org_id): r = requests.delete( f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail", @@ -1291,6 +1340,39 @@ def test_download_streaming_public_collection(crawler_auth_headers, default_org_ assert zip_file.getinfo(filename).compress_type == ZIP_STORED +def test_download_streaming_public_collection_profile_disabled( + admin_auth_headers, default_org_id +): + # Disable org public profile and ensure download still works for public collection + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": False, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + with TemporaryFile() as fh: + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 200 + for chunk in r.iter_content(): + fh.write(chunk) + + fh.seek(0) + with ZipFile(fh, "r") as zip_file: + contents = zip_file.namelist() + + assert len(contents) == 2 + for filename in contents: + assert 
filename.endswith(".wacz") or filename == "datapackage.json" + assert zip_file.getinfo(filename).compress_type == ZIP_STORED + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( From 320b218d64a26ec43aa1af76a5bf267f635187eb Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 17 Dec 2024 23:53:34 -0500 Subject: [PATCH 55/57] URL decode urlPrefix in /urls endpoint --- backend/btrixcloud/colls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index b0c65671fb..d934db0a7a 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -8,6 +8,7 @@ from uuid import UUID, uuid4 from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union import os +import urllib.parse import asyncio import pymongo @@ -1058,10 +1059,11 @@ async def get_collection_url_list( page: int = 1, ): """Retrieve paginated list of urls in collection sorted by snapshot count""" + url_prefix = urllib.parse.unquote(urlPrefix) if urlPrefix else None pages, total = await colls.list_urls_in_collection( coll_id=coll_id, oid=oid, - url_prefix=urlPrefix, + url_prefix=url_prefix, page_size=pageSize, page=page, ) From fb0ad7bbc468ef50a2a4a67ec59750101b871f78 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 18 Dec 2024 00:45:22 -0500 Subject: [PATCH 56/57] Do URL decoding inside list urls method --- backend/btrixcloud/colls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index d934db0a7a..be21c25a5c 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -635,6 +635,7 @@ async def list_urls_in_collection( match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}} if url_prefix: + url_prefix = urllib.parse.unquote(url_prefix) regex_pattern = f"^{url_prefix}" match_query["url"] = {"$regex": regex_pattern, "$options": "i"} @@ -1059,11 +1060,10 @@ async def get_collection_url_list( page: int = 1, ): """Retrieve paginated list of urls in collection sorted by snapshot count""" - url_prefix = urllib.parse.unquote(urlPrefix) if urlPrefix else None pages, total = await colls.list_urls_in_collection( coll_id=coll_id, oid=oid, - url_prefix=url_prefix, + url_prefix=urlPrefix, page_size=pageSize, page=page, ) From f8824092b0d2a2c0eab5a477f47b5bfe0b665049 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 18 Dec 2024 00:47:54 -0500 Subject: [PATCH 57/57] Escape special characters in url_prefix regex --- backend/btrixcloud/colls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index be21c25a5c..d9ab766aa9 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -8,6 +8,7 @@ from uuid import UUID, uuid4 from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union import os +import re import urllib.parse import asyncio @@ -636,7 +637,7 @@ async def list_urls_in_collection( if url_prefix: url_prefix = urllib.parse.unquote(url_prefix) - regex_pattern = f"^{url_prefix}" + regex_pattern = f"^{re.escape(url_prefix)}" match_query["url"] = {"$regex": regex_pattern, "$options": "i"} aggregate = [{"$match": match_query}]