diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index b9667078e0..35a2e75d59 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -1,7 +1,6 @@ """k8s background jobs""" import asyncio -import os from datetime import datetime from typing import Optional, Tuple, Union, List, Dict, TYPE_CHECKING, cast from uuid import UUID @@ -22,6 +21,7 @@ DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob, + ReAddOrgPagesJob, PaginatedBackgroundJobResponse, AnyJob, StorageRef, @@ -286,8 +286,6 @@ async def create_delete_org_job( try: job_id = await self.crawl_manager.run_delete_org_job( oid=str(org.id), - backend_image=os.environ.get("BACKEND_IMAGE", ""), - pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), existing_job_id=existing_job_id, ) if existing_job_id: @@ -331,8 +329,6 @@ async def create_recalculate_org_stats_job( try: job_id = await self.crawl_manager.run_recalculate_org_stats_job( oid=str(org.id), - backend_image=os.environ.get("BACKEND_IMAGE", ""), - pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), existing_job_id=existing_job_id, ) if existing_job_id: @@ -366,6 +362,52 @@ async def create_recalculate_org_stats_job( print(f"warning: recalculate org stats job could not be started: {exc}") return None + async def create_re_add_org_pages_job( + self, + oid: UUID, + crawl_type: Optional[str] = None, + existing_job_id: Optional[str] = None, + ): + """Create job to (re)add all pages in an org, optionally filtered by crawl type""" + + try: + job_id = await self.crawl_manager.run_re_add_org_pages_job( + oid=str(oid), + crawl_type=crawl_type, + existing_job_id=existing_job_id, + ) + if existing_job_id: + readd_pages_job = await self.get_background_job(existing_job_id, oid) + previous_attempt = { + "started": readd_pages_job.started, + "finished": readd_pages_job.finished, + } + if readd_pages_job.previousAttempts: + readd_pages_job.previousAttempts.append(previous_attempt) + else: + readd_pages_job.previousAttempts = [previous_attempt] + readd_pages_job.started = dt_now() + readd_pages_job.finished = None + readd_pages_job.success = None + else: + readd_pages_job = ReAddOrgPagesJob( + id=job_id, + oid=oid, + crawl_type=crawl_type, + started=dt_now(), + ) + + await self.jobs.find_one_and_update( + {"_id": job_id}, {"$set": readd_pages_job.to_dict()}, upsert=True + ) + + return job_id + # pylint: disable=broad-exception-caught + except Exception as exc: + # pylint: disable=raise-missing-from + print(f"warning: re-add org pages job could not be started: {exc}") + return None + async def job_finished( self, job_id: str, @@ -411,7 +453,11 @@ async def job_finished( async def get_background_job( self, job_id: str, oid: Optional[UUID] = None ) -> Union[ - CreateReplicaJob, DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob + CreateReplicaJob, + DeleteReplicaJob, + DeleteOrgJob, + RecalculateOrgStatsJob, + ReAddOrgPagesJob, ]: """Get background job""" query: dict[str, object] = {"_id": job_id} @@ -435,6 +481,9 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): if data["type"] == BgJobType.RECALCULATE_ORG_STATS: return RecalculateOrgStatsJob.from_dict(data) + if data["type"] == BgJobType.READD_ORG_PAGES: + return ReAddOrgPagesJob.from_dict(data) + return DeleteOrgJob.from_dict(data) async def list_background_jobs( @@ -575,6 +624,13 @@ async def retry_background_job( existing_job_id=job_id, ) + if job.type == BgJobType.READD_ORG_PAGES: + await self.create_re_add_org_pages_job( + 
org.id, + job.crawl_type, + existing_job_id=job_id, + ) + return {"success": True} async def retry_failed_background_jobs( diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index d913d3362b..01e700f8b5 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -1,6 +1,5 @@ """ base crawl type """ -import os from datetime import timedelta from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple from uuid import UUID @@ -29,6 +28,7 @@ UpdatedResponse, DeletedResponseQuota, CrawlSearchValuesResponse, + PRESIGN_DURATION_SECONDS, ) from .pagination import paginated_format, DEFAULT_PAGE_SIZE from .utils import dt_now, date_to_str @@ -47,11 +47,6 @@ CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object StorageOps = EventWebhookOps = BackgroundJobOps = object -# Presign duration must be less than 604800 seconds (one week), -# so set this one minute short of a week. -PRESIGN_MINUTES_MAX = 10079 -PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX - # ============================================================================ # pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines @@ -93,16 +88,8 @@ def __init__( self.background_job_ops = background_job_ops self.page_ops = cast(PageOps, None) - presign_duration_minutes = int( - os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT - ) - - self.presign_duration_seconds = ( - min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60 - ) - # renew when <25% of time remaining - self.expire_at_duration_seconds = int(self.presign_duration_seconds * 0.75) + self.expire_at_duration_seconds = int(PRESIGN_DURATION_SECONDS * 0.75) def set_page_ops(self, page_ops): """set page ops reference""" @@ -336,8 +323,9 @@ async def delete_crawls( status_code=400, detail=f"Error Stopping Crawl: {exc}" ) + await self.page_ops.delete_crawl_pages(crawl_id, org.id) + if type_ == "crawl": - await self.page_ops.delete_crawl_pages(crawl_id, org.id) await self.delete_all_crawl_qa_files(crawl_id, org) crawl_size = await self._delete_crawl_files(crawl, org) @@ -382,7 +370,7 @@ async def _delete_crawl_files( size = 0 for file_ in crawl.files: size += file_.size - if not await self.storage_ops.delete_crawl_file_object(org, file_): + if not await self.storage_ops.delete_file_object(org, file_): raise HTTPException(status_code=400, detail="file_deletion_error") # Not replicating QA run WACZs yet if not isinstance(crawl, QARun): @@ -474,7 +462,7 @@ async def resolve_signed_urls( ): exp = now + delta presigned_url = await self.storage_ops.get_presigned_url( - org, file_, self.presign_duration_seconds + org, file_, PRESIGN_DURATION_SECONDS ) prefix = "files" diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 411e659ac9..d9ab766aa9 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -2,14 +2,20 @@ Collections API """ +# pylint: disable=too-many-lines + from collections import Counter from uuid import UUID, uuid4 -from typing import Optional, List, TYPE_CHECKING, cast, Dict +from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union +import os +import re +import urllib.parse import asyncio import pymongo from fastapi import Depends, HTTPException, Response from fastapi.responses import StreamingResponse +from starlette.requests import Request from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .models import ( @@ -29,10 +35,21 @@ EmptyResponse, 
UpdatedResponse, SuccessResponse, + AddedResponse, + DeletedResponse, CollectionSearchValuesResponse, OrgPublicCollections, PublicOrgDetails, CollAccessType, + PageUrlCount, + PageIdTimestamp, + PaginatedPageUrlCountResponse, + UpdateCollHomeUrl, + User, + ImageFile, + ImageFilePreparer, + MIN_UPLOAD_PART_SIZE, + PublicCollOut, ) from .utils import dt_now @@ -45,11 +62,14 @@ OrgOps = StorageOps = EventWebhookOps = CrawlOps = object +THUMBNAIL_MAX_SIZE = 2_000_000 + + # ============================================================================ class CollectionOps: """ops for working with named collections of crawls""" - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods orgs: OrgOps storage_ops: StorageOps @@ -60,6 +80,7 @@ def __init__(self, mdb, storage_ops, orgs, event_webhook_ops): self.collections = mdb["collections"] self.crawls = mdb["crawls"] self.crawl_configs = mdb["crawl_configs"] + self.pages = mdb["pages"] self.crawl_ops = cast(CrawlOps, None) self.orgs = orgs @@ -70,6 +91,11 @@ def set_crawl_ops(self, ops): """set crawl ops""" self.crawl_ops = ops + def set_page_ops(self, ops): + """set page ops""" + # pylint: disable=attribute-defined-outside-init + self.page_ops = ops + async def init_index(self): """init lookup index""" await self.collections.create_index( @@ -91,8 +117,11 @@ async def add_collection(self, oid: UUID, coll_in: CollIn): oid=oid, name=coll_in.name, description=coll_in.description, + caption=coll_in.caption, modified=modified, access=coll_in.access, + defaultThumbnailName=coll_in.defaultThumbnailName, + allowPublicDownload=coll_in.allowPublicDownload, ) try: await self.collections.insert_one(coll.to_dict()) @@ -100,6 +129,7 @@ async def add_collection(self, oid: UUID, coll_in: CollIn): if crawl_ids: await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org) await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_added_to_collection_notification( crawl_ids, coll_id, org @@ -153,6 +183,7 @@ async def add_crawls_to_collection( raise HTTPException(status_code=404, detail="collection_not_found") await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_added_to_collection_notification( @@ -160,7 +191,7 @@ async def add_crawls_to_collection( ) ) - return await self.get_collection(coll_id, org) + return await self.get_collection_out(coll_id, org) async def remove_crawls_from_collection( self, coll_id: UUID, crawl_ids: List[str], org: Organization @@ -177,6 +208,7 @@ async def remove_crawls_from_collection( raise HTTPException(status_code=404, detail="collection_not_found") await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_removed_from_collection_notification( @@ -184,29 +216,79 @@ async def remove_crawls_from_collection( ) ) - return await self.get_collection(coll_id, org) + return await self.get_collection_out(coll_id, org) - async def get_collection( - self, coll_id: UUID, org: Organization, resources=False, public_only=False - ) -> CollOut: - """Get collection by id""" + async def get_collection_raw( + self, coll_id: UUID, public_or_unlisted_only: bool = False + ) -> Dict[str, Any]: + """Get collection by id as dict from database""" query: dict[str, object] = {"_id": coll_id} - if public_only: + 
if public_or_unlisted_only: query["access"] = {"$in": ["public", "unlisted"]} result = await self.collections.find_one(query) if not result: raise HTTPException(status_code=404, detail="collection_not_found") + return result + + async def get_collection( + self, coll_id: UUID, public_or_unlisted_only: bool = False + ) -> Collection: + """Get collection by id""" + result = await self.get_collection_raw(coll_id, public_or_unlisted_only) + return Collection.from_dict(result) + + async def get_collection_out( + self, + coll_id: UUID, + org: Organization, + resources=False, + public_or_unlisted_only=False, + ) -> CollOut: + """Get CollOut by id""" + result = await self.get_collection_raw(coll_id, public_or_unlisted_only) + if resources: - result["resources"] = await self.get_collection_crawl_resources( - coll_id, org + result["resources"] = await self.get_collection_crawl_resources(coll_id) + + thumbnail = result.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + result["thumbnail"] = await image_file.get_image_file_out( + org, self.storage_ops ) + return CollOut.from_dict(result) + async def get_public_collection_out( + self, coll_id: UUID, org: Organization, allow_unlisted: bool = False + ) -> PublicCollOut: + """Get PublicCollOut by id""" + result = await self.get_collection_raw(coll_id) + + allowed_access = [CollAccessType.PUBLIC] + if allow_unlisted: + allowed_access.append(CollAccessType.UNLISTED) + + if result.get("access") not in allowed_access: + raise HTTPException(status_code=404, detail="collection_not_found") + + result["resources"] = await self.get_collection_crawl_resources(coll_id) + + thumbnail = result.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + result["thumbnail"] = await image_file.get_public_image_file_out( + org, self.storage_ops + ) + + return PublicCollOut.from_dict(result) + async def list_collections( self, - oid: UUID, + org: Organization, + public_colls_out: bool = False, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: Optional[str] = None, @@ -216,21 +298,22 @@ async def list_collections( access: Optional[str] = None, ): """List all collections for org""" - # pylint: disable=too-many-locals, duplicate-code + # pylint: disable=too-many-locals, duplicate-code, too-many-branches # Zero-index page for query page = page - 1 skip = page * page_size - match_query: dict[str, object] = {"oid": oid} + match_query: dict[str, object] = {"oid": org.id} if name: match_query["name"] = name - elif name_prefix: regex_pattern = f"^{name_prefix}" match_query["name"] = {"$regex": regex_pattern, "$options": "i"} - if access: + if public_colls_out: + match_query["access"] = CollAccessType.PUBLIC + elif access: match_query["access"] = access aggregate = [{"$match": match_query}] @@ -269,15 +352,35 @@ async def list_collections( except (IndexError, ValueError): total = 0 - collections = [CollOut.from_dict(res) for res in items] + collections: List[Union[CollOut, PublicCollOut]] = [] + + for res in items: + res["resources"] = await self.get_collection_crawl_resources(res["_id"]) + + thumbnail = res.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + + if public_colls_out: + res["thumbnail"] = await image_file.get_public_image_file_out( + org, self.storage_ops + ) + else: + res["thumbnail"] = await image_file.get_image_file_out( + org, self.storage_ops + ) + + if public_colls_out: + collections.append(PublicCollOut.from_dict(res)) + else: + collections.append(CollOut.from_dict(res)) return collections, total 
- async def get_collection_crawl_resources(self, coll_id: UUID, org: Organization): + async def get_collection_crawl_resources(self, coll_id: UUID): """Return pre-signed resources for all collection crawl files.""" - coll = await self.get_collection(coll_id, org) - if not coll: - raise HTTPException(status_code=404, detail="collection_not_found") + # Ensure collection exists + _ = await self.get_collection_raw(coll_id) all_files = [] @@ -312,6 +415,17 @@ async def get_collection_search_values(self, org: Organization): names = [name for name in names if name] return {"names": names} + async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]: + """Return list of crawl ids in collection""" + crawl_ids = [] + async for crawl_raw in self.crawls.find( + {"collectionIds": coll_id}, projection=["_id"] + ): + crawl_id = crawl_raw.get("_id") + if crawl_id: + crawl_ids.append(crawl_id) + return crawl_ids + async def delete_collection(self, coll_id: UUID, org: Organization): """Delete collection and remove from associated crawls.""" await self.crawl_ops.remove_collection_from_all_crawls(coll_id) @@ -328,7 +442,7 @@ async def delete_collection(self, coll_id: UUID, org: Organization): async def download_collection(self, coll_id: UUID, org: Organization): """Download all WACZs in collection as streaming nested WACZ""" - coll = await self.get_collection(coll_id, org, resources=True) + coll = await self.get_collection_out(coll_id, org, resources=True) metadata = { "type": "collection", @@ -346,6 +460,15 @@ async def download_collection(self, coll_id: UUID, org: Organization): resp, headers=headers, media_type="application/wacz+zip" ) + async def recalculate_org_collection_counts_tags(self, org: Organization): + """Recalculate counts and tags for collections in org""" + collections, _ = await self.list_collections( + org, + page_size=100_000, + ) + for coll in collections: + await self.update_collection_counts_and_tags(coll.id) + async def update_collection_counts_and_tags(self, collection_id: UUID): """Set current crawl info in config when crawl begins""" crawl_count = 0 @@ -353,6 +476,9 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size = 0 tags = [] + coll = await self.get_collection(collection_id) + org = await self.orgs.get_org_by_id(coll.oid) + async for crawl_raw in self.crawls.find({"collectionIds": collection_id}): crawl = BaseCrawl.from_dict(crawl_raw) if crawl.state not in SUCCESSFUL_STATES: @@ -361,8 +487,16 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): files = crawl.files or [] for file in files: total_size += file.size - if crawl.stats: - page_count += crawl.stats.done + + try: + _, crawl_pages = await self.page_ops.list_pages( + crawl.id, org, page_size=1_000_000 + ) + page_count += crawl_pages + # pylint: disable=broad-exception-caught + except Exception: + pass + if crawl.tags: tags.extend(crawl.tags) @@ -380,6 +514,55 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): }, ) + async def recalculate_org_collection_dates(self, org: Organization): + """Recalculate earliest and latest dates for collections in org""" + collections, _ = await self.list_collections( + org, + page_size=100_000, + ) + for coll in collections: + await self.update_collection_dates(coll.id) + + async def update_collection_dates(self, coll_id: UUID): + """Update collection earliest and latest dates from page timestamps""" + coll = await self.get_collection(coll_id) + crawl_ids = await self.get_collection_crawl_ids(coll_id) 
+ + earliest_ts = None + latest_ts = None + + match_query = { + "oid": coll.oid, + "crawl_id": {"$in": crawl_ids}, + "ts": {"$ne": None}, + } + + cursor = self.pages.find(match_query).sort("ts", 1).limit(1) + pages = await cursor.to_list(length=1) + try: + earliest_page = pages[0] + earliest_ts = earliest_page.get("ts") + except IndexError: + pass + + cursor = self.pages.find(match_query).sort("ts", -1).limit(1) + pages = await cursor.to_list(length=1) + try: + latest_page = pages[0] + latest_ts = latest_page.get("ts") + except IndexError: + pass + + await self.collections.find_one_and_update( + {"_id": coll_id}, + { + "$set": { + "dateEarliest": earliest_ts, + "dateLatest": latest_ts, + } + }, + ) + async def update_crawl_collections(self, crawl_id: str): """Update counts and tags for all collections in crawl""" crawl = await self.crawls.find_one({"_id": crawl_id}) @@ -398,7 +581,14 @@ async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID): ) await self.update_crawl_collections(crawl_id) - async def get_org_public_collections(self, org_slug: str): + async def get_org_public_collections( + self, + org_slug: str, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sort_by: Optional[str] = None, + sort_direction: int = 1, + ): """List public collections for org""" try: org = await self.orgs.get_org_by_slug(org_slug) @@ -411,7 +601,12 @@ async def get_org_public_collections(self, org_slug: str): raise HTTPException(status_code=404, detail="public_profile_not_found") collections, _ = await self.list_collections( - org.id, access=CollAccessType.PUBLIC + org, + page_size=page_size, + page=page, + sort_by=sort_by, + sort_direction=sort_direction, + public_colls_out=True, ) public_org_details = PublicOrgDetails( @@ -422,10 +617,192 @@ async def get_org_public_collections(self, org_slug: str): return OrgPublicCollections(org=public_org_details, collections=collections) + async def list_urls_in_collection( + self, + coll_id: UUID, + oid: UUID, + url_prefix: Optional[str] = None, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + ) -> Tuple[List[PageUrlCount], int]: + """List all URLs in collection sorted desc by snapshot count""" + # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements + # Zero-index page for query + page = page - 1 + skip = page_size * page + + crawl_ids = await self.get_collection_crawl_ids(coll_id) + + match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}} + + if url_prefix: + url_prefix = urllib.parse.unquote(url_prefix) + regex_pattern = f"^{re.escape(url_prefix)}" + match_query["url"] = {"$regex": regex_pattern, "$options": "i"} + + aggregate = [{"$match": match_query}] + + aggregate.extend( + [ + { + "$group": { + "_id": "$url", + "pages": {"$push": "$$ROOT"}, + "count": {"$sum": 1}, + }, + }, + {"$sort": {"count": -1}}, + {"$set": {"url": "$_id"}}, + { + "$facet": { + "items": [ + {"$skip": skip}, + {"$limit": page_size}, + ], + "total": [{"$count": "count"}], + } + }, + ] + ) + + # Get total + cursor = self.pages.aggregate(aggregate) + results = await cursor.to_list(length=1) + result = results[0] + items = result["items"] + + try: + total = int(result["total"][0]["count"]) + except (IndexError, ValueError): + total = 0 + + return [ + PageUrlCount( + url=data.get("url", ""), + count=data.get("count", 0), + snapshots=[ + PageIdTimestamp( + pageId=p["_id"], ts=p.get("ts"), status=p.get("status", 200) + ) + for p in data.get("pages", []) + ], + ) + for data in items + ], total + + async 
def set_home_url( + self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization + ) -> Dict[str, bool]: + """Set home URL for collection and save thumbnail to database""" + if update.pageId: + page = await self.page_ops.get_page(update.pageId, org.id) + update_query = { + "homeUrl": page.url, + "homeUrlTs": page.ts, + "homeUrlPageId": page.id, + } + else: + update_query = { + "homeUrl": None, + "homeUrlTs": None, + "homeUrlPageId": None, + } + + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": update_query}, + ) + + return {"updated": True} + + async def upload_thumbnail_stream( + self, stream, filename: str, coll_id: UUID, org: Organization, user: User + ) -> Dict[str, bool]: + """Upload file as stream to use as collection thumbnail""" + coll = await self.get_collection(coll_id) + + _, extension = os.path.splitext(filename) + + image_filename = f"thumbnail-{str(coll_id)}{extension}" + + prefix = org.storage.get_storage_extra_path(str(org.id)) + "images/" + + file_prep = ImageFilePreparer( + prefix, + image_filename, + original_filename=filename, + user=user, + created=dt_now(), + ) + + async def stream_iter(): + """iterate over each chunk and compute and digest + total size""" + async for chunk in stream: + file_prep.add_chunk(chunk) + yield chunk + + print("Collection thumbnail stream upload starting", flush=True) + + if not await self.storage_ops.do_upload_multipart( + org, + file_prep.upload_name, + stream_iter(), + MIN_UPLOAD_PART_SIZE, + ): + print("Collection thumbnail stream upload failed", flush=True) + raise HTTPException(status_code=400, detail="upload_failed") + + print("Collection thumbnail stream upload complete", flush=True) + + thumbnail_file = file_prep.get_image_file(org.storage) + + if thumbnail_file.size > THUMBNAIL_MAX_SIZE: + print( + "Collection thumbnail stream upload failed: max size (2 MB) exceeded", + flush=True, + ) + await self.storage_ops.delete_file_object(org, thumbnail_file) + raise HTTPException(status_code=400, detail="upload_failed") + + if coll.thumbnail: + if not await self.storage_ops.delete_file_object(org, coll.thumbnail): + print( + f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}" + ) + + coll.thumbnail = thumbnail_file + + # Update entire document to avoid bson.errors.InvalidDocument exception + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": coll.to_dict()}, + ) + + return {"added": True} + + async def delete_thumbnail(self, coll_id: UUID, org: Organization): + """Delete collection thumbnail""" + coll = await self.get_collection(coll_id) + + if not coll.thumbnail: + raise HTTPException(status_code=404, detail="thumbnail_not_found") + + if not await self.storage_ops.delete_file_object(org, coll.thumbnail): + print(f"Unable to delete collection thumbnail: {coll.thumbnail.filename}") + raise HTTPException(status_code=400, detail="file_deletion_error") + + # Delete from database + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": {"thumbnail": None}}, + ) + + return {"deleted": True} + # ============================================================================ # pylint: disable=too-many-locals -def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops): +def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_dep): """init collections api""" # pylint: disable=invalid-name, unused-argument, too-many-arguments @@ -461,7 +838,7 @@ async def list_collection_all( 
access: Optional[str] = None, ): collections, total = await colls.list_collections( - org.id, + org, page_size=pageSize, page=page, sort_by=sortBy, @@ -480,10 +857,10 @@ async def list_collection_all( async def get_collection_all(org: Organization = Depends(org_viewer_dep)): results = {} try: - all_collections, _ = await colls.list_collections(org.id, page_size=10_000) + all_collections, _ = await colls.list_collections(org, page_size=10_000) for collection in all_collections: results[collection.name] = await colls.get_collection_crawl_resources( - collection.id, org + collection.id ) except Exception as exc: # pylint: disable=raise-missing-from @@ -511,7 +888,7 @@ async def get_collection_search_values( async def get_collection( coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): - return await colls.get_collection(coll_id, org) + return await colls.get_collection_out(coll_id, org) @app.get( "/orgs/{oid}/collections/{coll_id}/replay.json", @@ -521,7 +898,7 @@ async def get_collection( async def get_collection_replay( coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): - return await colls.get_collection(coll_id, org, resources=True) + return await colls.get_collection_out(coll_id, org, resources=True) @app.get( "/orgs/{oid}/collections/{coll_id}/public/replay.json", @@ -533,8 +910,8 @@ async def get_collection_public_replay( coll_id: UUID, org: Organization = Depends(org_public), ): - coll = await colls.get_collection( - coll_id, org, resources=True, public_only=True + coll = await colls.get_collection_out( + coll_id, org, resources=True, public_or_unlisted_only=True ) response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Headers"] = "*" @@ -610,11 +987,126 @@ async def download_collection( return await colls.download_collection(coll_id, org) @app.get( - "/public-collections/{org_slug}", - tags=["collections"], + "/public/orgs/{org_slug}/collections", + tags=["collections", "public"], response_model=OrgPublicCollections, ) - async def get_org_public_collections(org_slug: str): - return await colls.get_org_public_collections(org_slug) + async def get_org_public_collections( + org_slug: str, + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sortBy: Optional[str] = None, + sortDirection: int = 1, + ): + return await colls.get_org_public_collections( + org_slug, + page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + ) + + @app.get( + "/public/orgs/{org_slug}/collections/{coll_id}", + tags=["collections", "public"], + response_model=PublicCollOut, + ) + async def get_public_collection( + org_slug: str, + coll_id: UUID, + ): + try: + org = await colls.orgs.get_org_by_slug(org_slug) + # pylint: disable=broad-exception-caught + except Exception: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=404, detail="collection_not_found") + + return await colls.get_public_collection_out(coll_id, org, allow_unlisted=True) + + @app.get( + "/public/orgs/{org_slug}/collections/{coll_id}/download", + tags=["collections", "public"], + response_model=bytes, + ) + async def download_public_collection( + org_slug: str, + coll_id: UUID, + ): + try: + org = await colls.orgs.get_org_by_slug(org_slug) + # pylint: disable=broad-exception-caught + except Exception: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=404, detail="collection_not_found") + + # Make sure collection exists and is public/unlisted + coll = await colls.get_collection(coll_id, public_or_unlisted_only=True) 
+ + if coll.allowPublicDownload is False: + raise HTTPException(status_code=403, detail="not_allowed") + + return await colls.download_collection(coll_id, org) + + @app.get( + "/orgs/{oid}/collections/{coll_id}/urls", + tags=["collections"], + response_model=PaginatedPageUrlCountResponse, + ) + async def get_collection_url_list( + coll_id: UUID, + oid: UUID, + urlPrefix: Optional[str] = None, + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + ): + """Retrieve paginated list of urls in collection sorted by snapshot count""" + pages, total = await colls.list_urls_in_collection( + coll_id=coll_id, + oid=oid, + url_prefix=urlPrefix, + page_size=pageSize, + page=page, + ) + return paginated_format(pages, total, page, pageSize) + + @app.post( + "/orgs/{oid}/collections/{coll_id}/home-url", + tags=["collections"], + response_model=UpdatedResponse, + ) + async def set_collection_home_url( + update: UpdateCollHomeUrl, + coll_id: UUID, + org: Organization = Depends(org_crawl_dep), + ): + return await colls.set_home_url(coll_id, update, org) + + @app.put( + "/orgs/{oid}/collections/{coll_id}/thumbnail", + tags=["collections"], + response_model=AddedResponse, + ) + async def upload_thumbnail_stream( + request: Request, + filename: str, + coll_id: UUID, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), + ): + return await colls.upload_thumbnail_stream( + request.stream(), filename, coll_id, org, user + ) + + @app.delete( + "/orgs/{oid}/collections/{coll_id}/thumbnail", + tags=["collections"], + response_model=DeletedResponse, + ) + async def delete_thumbnail_stream( + coll_id: UUID, + org: Organization = Depends(org_crawl_dep), + ): + return await colls.delete_thumbnail(coll_id, org) return colls diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 7921ca4856..6810929f51 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -115,8 +115,6 @@ async def run_replica_job( async def run_delete_org_job( self, oid: str, - backend_image: str, - pull_policy: str, existing_job_id: Optional[str] = None, ) -> str: """run job to delete org and all of its data""" @@ -127,14 +125,12 @@ async def run_delete_org_job( job_id = f"delete-org-{oid}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - oid, backend_image, pull_policy, job_id, job_type=BgJobType.DELETE_ORG.value + oid, job_id, job_type=BgJobType.DELETE_ORG.value ) async def run_recalculate_org_stats_job( self, oid: str, - backend_image: str, - pull_policy: str, existing_job_id: Optional[str] = None, ) -> str: """run job to recalculate storage stats for the org""" @@ -146,19 +142,32 @@ async def run_recalculate_org_stats_job( return await self._run_bg_job_with_ops_classes( oid, - backend_image, - pull_policy, job_id, job_type=BgJobType.RECALCULATE_ORG_STATS.value, ) - async def _run_bg_job_with_ops_classes( + async def run_re_add_org_pages_job( self, oid: str, - backend_image: str, - pull_policy: str, - job_id: str, - job_type: str, + crawl_type: Optional[str] = None, + existing_job_id: Optional[str] = None, + ) -> str: + """run job to recalculate storage stats for the org""" + + if existing_job_id: + job_id = existing_job_id + else: + job_id = f"org-pages-{oid}-{secrets.token_hex(5)}" + + return await self._run_bg_job_with_ops_classes( + oid, + job_id, + job_type=BgJobType.READD_ORG_PAGES.value, + crawl_type=crawl_type, + ) + + async def _run_bg_job_with_ops_classes( + self, oid: str, job_id: str, job_type: str, **kwargs ) -> str: 
"""run background job with access to ops classes""" @@ -166,8 +175,9 @@ async def _run_bg_job_with_ops_classes( "id": job_id, "oid": oid, "job_type": job_type, - "backend_image": backend_image, - "pull_policy": pull_policy, + "backend_image": os.environ.get("BACKEND_IMAGE", ""), + "pull_policy": os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), + **kwargs, } data = self.templates.env.get_template("background_job.yaml").render(params) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 5a0994fe70..539c408ee6 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -918,7 +918,7 @@ async def delete_crawl_qa_run_files( """delete crawl qa wacz files""" qa_run = await self.get_qa_run(crawl_id, qa_run_id, org) for file_ in qa_run.files: - if not await self.storage_ops.delete_crawl_file_object(org, file_): + if not await self.storage_ops.delete_file_object(org, file_): raise HTTPException(status_code=400, detail="file_deletion_error") # Not replicating QA run WACZs yet # await self.background_job_ops.create_delete_replica_jobs( diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index f453442191..0723258e40 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -17,7 +17,7 @@ from .migrations import BaseMigration -CURR_DB_VERSION = "0036" +CURR_DB_VERSION = "0037" # ============================================================================ @@ -82,6 +82,7 @@ async def update_and_prepare_db( invite_ops, storage_ops, page_ops, + background_job_ops, db_inited, ): """Prepare database for application. @@ -94,7 +95,7 @@ async def update_and_prepare_db( """ await ping_db(mdb) print("Database setup started", flush=True) - if await run_db_migrations(mdb, user_manager, page_ops): + if await run_db_migrations(mdb, user_manager, background_job_ops, page_ops): await drop_indexes(mdb) await create_indexes( org_ops, @@ -113,7 +114,7 @@ async def update_and_prepare_db( # ============================================================================ -async def run_db_migrations(mdb, user_manager, page_ops): +async def run_db_migrations(mdb, user_manager, background_job_ops, page_ops): """Run database migrations.""" # if first run, just set version and exit @@ -145,7 +146,9 @@ async def run_db_migrations(mdb, user_manager, page_ops): assert spec.loader migration_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(migration_module) - migration = migration_module.Migration(mdb, page_ops=page_ops) + migration = migration_module.Migration( + mdb, background_job_ops=background_job_ops, page_ops=page_ops + ) if await migration.run(): migrations_run = True except ImportError as err: diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index f6b678cb82..927a03dcb8 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -223,7 +223,9 @@ def main() -> None: profiles, ) - coll_ops = init_collections_api(app, mdb, org_ops, storage_ops, event_webhook_ops) + coll_ops = init_collections_api( + app, mdb, org_ops, storage_ops, event_webhook_ops, current_active_user + ) base_crawl_init = ( app, @@ -243,14 +245,15 @@ def main() -> None: crawls = init_crawls_api(crawl_manager, *base_crawl_init) + upload_ops = init_uploads_api(*base_crawl_init) + page_ops = init_pages_api( - app, mdb, crawls, org_ops, storage_ops, current_active_user + app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user ) base_crawl_ops.set_page_ops(page_ops) crawls.set_page_ops(page_ops) - - 
init_uploads_api(*base_crawl_init) + upload_ops.set_page_ops(page_ops) org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops) @@ -260,6 +263,8 @@ def main() -> None: crawl_config_ops.set_coll_ops(coll_ops) + coll_ops.set_page_ops(page_ops) + # run only in first worker if run_once_lock("btrix-init-db"): asyncio.create_task( @@ -273,6 +278,7 @@ def main() -> None: invites, storage_ops, page_ops, + background_job_ops, db_inited, ) ) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 2fba05e53f..709139d8d2 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -12,6 +12,7 @@ job_type = os.environ.get("BG_JOB_TYPE") oid = os.environ.get("OID") +crawl_type = os.environ.get("CRAWL_TYPE") # ============================================================================ @@ -27,7 +28,7 @@ async def main(): ) return 1 - (org_ops, _, _, _, _, _, _, _, _, _, user_manager) = init_ops() + (org_ops, _, _, _, _, page_ops, coll_ops, _, _, _, _, user_manager) = init_ops() if not oid: print("Org id missing, quitting") @@ -57,6 +58,17 @@ async def main(): traceback.print_exc() return 1 + if job_type == BgJobType.READD_ORG_PAGES: + try: + await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) + await coll_ops.recalculate_org_collection_dates(org) + await coll_ops.recalculate_org_collection_counts_tags(org) + return 0 + # pylint: disable=broad-exception-caught + except Exception: + traceback.print_exc() + return 1 + print(f"Provided job type {job_type} not currently supported") return 1 diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index a6f6654be3..af7a2d0956 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -31,6 +31,7 @@ def main(): crawl_config_ops, _, crawl_ops, + _, page_ops, coll_ops, _, diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py new file mode 100644 index 0000000000..62bfe98237 --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -0,0 +1,72 @@ +""" +Migration 0037 -- upload pages +""" + +from uuid import UUID + +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0037" + + +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + self.background_job_ops = kwargs.get("background_job_ops") + self.page_ops = kwargs.get("page_ops") + + async def org_upload_pages_already_added(self, oid: UUID) -> bool: + """Check if upload pages have already been added for this org""" + if self.page_ops is None: + print( + f"page_ops missing, assuming pages need to be added for org {oid}", + flush=True, + ) + return False + + mdb_crawls = self.mdb["crawls"] + async for upload in mdb_crawls.find({"oid": oid, "type": "upload"}): + upload_id = upload["_id"] + _, total = await self.page_ops.list_pages(upload_id) + if total > 0: + return True + return False + + async def migrate_up(self): + """Perform migration up. 
+ + Start background jobs to parse uploads and add their pages to db + """ + if self.background_job_ops is None: + print( + "Unable to start background job, missing background_job_ops", flush=True + ) + return + + mdb_orgs = self.mdb["organizations"] + async for org in mdb_orgs.find(): + oid = org["_id"] + + pages_already_added = await self.org_upload_pages_already_added(oid) + + if pages_already_added: + print( + f"Skipping org {oid}, upload pages already added to db", flush=True + ) + continue + + try: + await self.background_job_ops.create_re_add_org_pages_job( + oid, crawl_type="upload" + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error starting background job to add upload pges to org {oid}: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 93e708c4d0..5517067948 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -5,6 +5,9 @@ from datetime import datetime from enum import Enum, IntEnum from uuid import UUID +import base64 +import hashlib +import mimetypes import os from typing import Optional, List, Dict, Union, Literal, Any, get_args @@ -21,6 +24,7 @@ BeforeValidator, TypeAdapter, ) +from pathvalidate import sanitize_filename # from fastapi_users import models as fastapi_users_models @@ -29,6 +33,20 @@ # crawl scale for constraint MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) +# Presign duration must be less than 604800 seconds (one week), +# so set this one minute short of a week +PRESIGN_MINUTES_MAX = 10079 +PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX + +# Expire duration seconds for presigned urls +PRESIGN_DURATION_MINUTES = int( + os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT +) +PRESIGN_DURATION_SECONDS = min(PRESIGN_DURATION_MINUTES, PRESIGN_MINUTES_MAX) * 60 + +# Minimum part size for file uploads +MIN_UPLOAD_PART_SIZE = 10000000 + # annotated types # ============================================================================ @@ -779,6 +797,9 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel): reviewStatus: ReviewStatus = None + filePageCount: Optional[int] = 0 + errorPageCount: Optional[int] = 0 + # ============================================================================ class CollIdName(BaseModel): @@ -995,9 +1016,6 @@ class Crawl(BaseCrawl, CrawlConfigCore): qa: Optional[QARun] = None qaFinished: Optional[Dict[str, QARun]] = {} - filePageCount: Optional[int] = 0 - errorPageCount: Optional[int] = 0 - # ============================================================================ class CrawlCompleteIn(BaseModel): @@ -1050,6 +1068,155 @@ class UpdateUpload(UpdateCrawl): """Update modal that also includes name""" +# ============================================================================ +class FilePreparer: + """wrapper to compute digest / name for streaming upload""" + + def __init__(self, prefix, filename): + self.upload_size = 0 + self.upload_hasher = hashlib.sha256() + self.upload_name = prefix + self.prepare_filename(filename) + + def add_chunk(self, chunk): + """add chunk for file""" + self.upload_size += len(chunk) + self.upload_hasher.update(chunk) + + def get_crawl_file(self, storage: StorageRef): + """get crawl file""" + return CrawlFile( + filename=self.upload_name, + hash=self.upload_hasher.hexdigest(), + size=self.upload_size, + storage=storage, + ) + + def prepare_filename(self, filename): + """prepare filename by sanitizing and adding extra string + to avoid duplicates""" + name = 
sanitize_filename(filename.rsplit("/", 1)[-1]) + parts = name.split(".") + randstr = base64.b32encode(os.urandom(5)).lower() + parts[0] += "-" + randstr.decode("utf-8") + return ".".join(parts) + + +# ============================================================================ + +### USER-UPLOADED IMAGES ### + + +# ============================================================================ +class ImageFileOut(BaseModel): + """output for user-upload imaged file (conformance to Data Resource Spec)""" + + name: str + path: str + hash: str + size: int + + originalFilename: str + mime: str + userid: UUID + userName: str + created: datetime + + +# ============================================================================ +class PublicImageFileOut(BaseModel): + """public output for user-upload imaged file (conformance to Data Resource Spec)""" + + name: str + path: str + hash: str + size: int + + mime: str + + +# ============================================================================ +class ImageFile(BaseFile): + """User-uploaded image file""" + + originalFilename: str + mime: str + userid: UUID + userName: str + created: datetime + + async def get_image_file_out(self, org, storage_ops) -> ImageFileOut: + """Get ImageFileOut with new presigned url""" + presigned_url = await storage_ops.get_presigned_url( + org, self, PRESIGN_DURATION_SECONDS + ) + + return ImageFileOut( + name=self.filename, + path=presigned_url or "", + hash=self.hash, + size=self.size, + originalFilename=self.originalFilename, + mime=self.mime, + userid=self.userid, + userName=self.userName, + created=self.created, + ) + + async def get_public_image_file_out(self, org, storage_ops) -> PublicImageFileOut: + """Get PublicImageFileOut with new presigned url""" + presigned_url = await storage_ops.get_presigned_url( + org, self, PRESIGN_DURATION_SECONDS + ) + + return PublicImageFileOut( + name=self.filename, + path=presigned_url or "", + hash=self.hash, + size=self.size, + mime=self.mime, + ) + + +# ============================================================================ +class ImageFilePreparer(FilePreparer): + """Wrapper for user image streaming uploads""" + + # pylint: disable=too-many-arguments, too-many-function-args + + def __init__( + self, + prefix, + filename, + original_filename: str, + user: User, + created: datetime, + ): + super().__init__(prefix, filename) + + self.original_filename = original_filename + self.mime, _ = mimetypes.guess_type(original_filename) or ("image/jpeg", None) + self.userid = user.id + self.user_name = user.name + self.created = created + + def get_image_file( + self, + storage: StorageRef, + ) -> ImageFile: + """get user-uploaded image file""" + return ImageFile( + filename=self.upload_name, + hash=self.upload_hasher.hexdigest(), + size=self.upload_size, + storage=storage, + originalFilename=self.original_filename, + mime=self.mime, + userid=self.userid, + userName=self.user_name, + created=self.created, + ) + + # ============================================================================ ### COLLECTIONS ### @@ -1071,17 +1238,30 @@ class Collection(BaseMongoModel): name: str = Field(..., min_length=1) oid: UUID description: Optional[str] = None + caption: Optional[str] = None modified: Optional[datetime] = None crawlCount: Optional[int] = 0 pageCount: Optional[int] = 0 totalSize: Optional[int] = 0 + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None + # Sorted by count, descending tags: Optional[List[str]] = [] access: CollAccessType = 
CollAccessType.PRIVATE + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + homeUrlPageId: Optional[UUID] = None + + thumbnail: Optional[ImageFile] = None + defaultThumbnailName: Optional[str] = None + + allowPublicDownload: Optional[bool] = True + # ============================================================================ class CollIn(BaseModel): @@ -1089,16 +1269,74 @@ class CollIn(BaseModel): name: str = Field(..., min_length=1) description: Optional[str] = None + caption: Optional[str] = None crawlIds: Optional[List[str]] = [] access: CollAccessType = CollAccessType.PRIVATE + defaultThumbnailName: Optional[str] = None + allowPublicDownload: bool = True + # ============================================================================ -class CollOut(Collection): +class CollOut(BaseMongoModel): """Collection output model with annotations.""" + name: str + oid: UUID + description: Optional[str] = None + caption: Optional[str] = None + modified: Optional[datetime] = None + + crawlCount: Optional[int] = 0 + pageCount: Optional[int] = 0 + totalSize: Optional[int] = 0 + + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None + + # Sorted by count, descending + tags: Optional[List[str]] = [] + + access: CollAccessType = CollAccessType.PRIVATE + + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + homeUrlPageId: Optional[UUID] = None + + resources: List[CrawlFileOut] = [] + thumbnail: Optional[ImageFileOut] = None + defaultThumbnailName: Optional[str] = None + + allowPublicDownload: bool = True + + +# ============================================================================ +class PublicCollOut(BaseMongoModel): + """Collection output model with annotations.""" + + name: str + oid: UUID + description: Optional[str] = None + caption: Optional[str] = None + + crawlCount: Optional[int] = 0 + pageCount: Optional[int] = 0 + totalSize: Optional[int] = 0 + + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None + + access: CollAccessType = CollAccessType.PUBLIC + + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + resources: List[CrawlFileOut] = [] + thumbnail: Optional[PublicImageFileOut] = None + defaultThumbnailName: Optional[str] = None + + allowPublicDownload: bool = True # ============================================================================ @@ -1107,7 +1345,17 @@ class UpdateColl(BaseModel): name: Optional[str] = None description: Optional[str] = None + caption: Optional[str] = None access: Optional[CollAccessType] = None + defaultThumbnailName: Optional[str] = None + allowPublicDownload: Optional[bool] = None + + +# ============================================================================ +class UpdateCollHomeUrl(BaseModel): + """Update home url for collection""" + + pageId: Optional[UUID] = None # ============================================================================ @@ -1167,7 +1415,7 @@ class OrgPublicCollections(BaseModel): org: PublicOrgDetails - collections: List[CollOut] = [] + collections: List[PublicCollOut] = [] # ============================================================================ @@ -2067,6 +2315,7 @@ class BgJobType(str, Enum): DELETE_REPLICA = "delete-replica" DELETE_ORG = "delete-org" RECALCULATE_ORG_STATS = "recalculate-org-stats" + READD_ORG_PAGES = "readd-org-pages" # ============================================================================ @@ -2119,6 +2368,14 @@ class RecalculateOrgStatsJob(BackgroundJob): type: 
Literal[BgJobType.RECALCULATE_ORG_STATS] = BgJobType.RECALCULATE_ORG_STATS +# ============================================================================ +class ReAddOrgPagesJob(BackgroundJob): + """Model for tracking jobs to readd an org's pages""" + + type: Literal[BgJobType.READD_ORG_PAGES] = BgJobType.READD_ORG_PAGES + crawl_type: Optional[str] = None + + # ============================================================================ # Union of all job types, for response model @@ -2129,6 +2386,7 @@ class RecalculateOrgStatsJob(BackgroundJob): BackgroundJob, DeleteOrgJob, RecalculateOrgStatsJob, + ReAddOrgPagesJob, ] ] @@ -2240,7 +2498,7 @@ class PageWithAllQA(Page): class PageOut(Page): """Model for pages output, no QA""" - status: Optional[int] = 200 + status: int = 200 # ============================================================================ @@ -2266,6 +2524,24 @@ class PageNoteUpdatedResponse(BaseModel): data: PageNote +# ============================================================================ +class PageIdTimestamp(BaseModel): + """Simplified model for page info to include in PageUrlCount""" + + pageId: UUID + ts: Optional[datetime] = None + status: int = 200 + + +# ============================================================================ +class PageUrlCount(BaseModel): + """Model for counting pages by URL""" + + url: AnyHttpUrl + count: int = 0 + snapshots: List[PageIdTimestamp] = [] + + # ============================================================================ ### GENERIC RESPONSE MODELS ### @@ -2512,3 +2788,10 @@ class PaginatedUserEmailsResponse(PaginatedResponse): """Response model for user emails with org info""" items: List[UserEmailWithOrgInfo] + + +# ============================================================================ +class PaginatedPageUrlCountResponse(PaginatedResponse): + """Response model for page count by url""" + + items: List[PageUrlCount] diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index 23629de2aa..bee24d00c5 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -16,6 +16,7 @@ from .pages import PageOps from .profiles import ProfileOps from .storages import StorageOps +from .uploads import UploadOps from .users import UserManager from .webhooks import EventWebhookOps @@ -26,6 +27,7 @@ def init_ops() -> Tuple[ CrawlConfigOps, BaseCrawlOps, CrawlOps, + UploadOps, PageOps, CollectionOps, ProfileOps, @@ -70,7 +72,7 @@ def init_ops() -> Tuple[ coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops) - base_crawl_ops = BaseCrawlOps( + base_crawl_init = ( mdb, user_manager, org_ops, @@ -81,23 +83,17 @@ def init_ops() -> Tuple[ background_job_ops, ) - crawl_ops = CrawlOps( - crawl_manager, - mdb, - user_manager, - org_ops, - crawl_config_ops, - coll_ops, - storage_ops, - event_webhook_ops, - background_job_ops, - ) + base_crawl_ops = BaseCrawlOps(*base_crawl_init) - page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + crawl_ops = CrawlOps(crawl_manager, *base_crawl_init) - base_crawl_ops.set_page_ops(page_ops) + upload_ops = UploadOps(*base_crawl_init) + page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops) + + base_crawl_ops.set_page_ops(page_ops) crawl_ops.set_page_ops(page_ops) + upload_ops.set_page_ops(page_ops) background_job_ops.set_ops(crawl_ops, profile_ops) @@ -109,11 +105,14 @@ def init_ops() -> Tuple[ crawl_config_ops.set_coll_ops(coll_ops) + coll_ops.set_page_ops(page_ops) + return ( org_ops, crawl_config_ops, base_crawl_ops, crawl_ops, + 
upload_ops, page_ops, coll_ops, profile_ops, diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index a980567c49..251d959be1 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union from uuid import UUID, uuid4 -from fastapi import Depends, HTTPException +from fastapi import Depends, HTTPException, Request import pymongo from .models import ( @@ -24,6 +24,7 @@ PageNoteEdit, PageNoteDelete, QARunBucketStats, + StartedResponse, StartedResponseBool, UpdatedResponse, DeletedResponse, @@ -34,11 +35,12 @@ from .utils import str_to_date, str_list_to_bools, dt_now if TYPE_CHECKING: + from .background_jobs import BackgroundJobOps from .crawls import CrawlOps from .orgs import OrgOps from .storages import StorageOps else: - CrawlOps = StorageOps = OrgOps = object + CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object # ============================================================================ @@ -49,18 +51,24 @@ class PageOps: crawl_ops: CrawlOps org_ops: OrgOps storage_ops: StorageOps + background_job_ops: BackgroundJobOps - def __init__(self, mdb, crawl_ops, org_ops, storage_ops): + def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops): self.pages = mdb["pages"] self.crawls = mdb["crawls"] self.crawl_ops = crawl_ops self.org_ops = org_ops self.storage_ops = storage_ops + self.background_job_ops = background_job_ops async def init_index(self): """init index for pages db collection""" await self.pages.create_index([("crawl_id", pymongo.HASHED)]) + async def set_ops(self, background_job_ops: BackgroundJobOps): + """Set ops classes as needed""" + self.background_job_ops = background_job_ops + async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): """Add pages to database from WACZ files""" pages_buffer: List[Page] = [] @@ -94,9 +102,14 @@ def _get_page_from_dict( self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID ) -> Page: """Return Page object from dict""" - page_id = page_dict.get("id") + page_id = page_dict.get("id", "") if not page_id: - print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True) + page_id = uuid4() + + try: + UUID(page_id) + except ValueError: + page_id = uuid4() status = page_dict.get("status") if not status and page_dict.get("loadState"): @@ -199,10 +212,7 @@ async def update_crawl_file_and_error_counts( inc_query["errorPageCount"] = error_count await self.crawls.find_one_and_update( - { - "_id": crawl_id, - "type": "crawl", - }, + {"_id": crawl_id}, {"$inc": inc_query}, ) @@ -554,13 +564,17 @@ async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): print(f"Deleted pages for crawl {crawl_id}", flush=True) await self.add_crawl_pages_to_db_from_wacz(crawl_id) - async def re_add_all_crawl_pages(self, oid: UUID): - """Re-add pages for all crawls in org""" - crawl_ids = await self.crawls.distinct( - "_id", {"type": "crawl", "finished": {"$ne": None}} - ) + async def re_add_all_crawl_pages( + self, org: Organization, crawl_type: Optional[str] = None + ): + """Re-add pages for all crawls and uploads in org""" + match_query: Dict[str, object] = {"finished": {"$ne": None}} + if crawl_type in ("crawl", "upload"): + match_query["type"] = crawl_type + + crawl_ids = await self.crawls.distinct("_id", match_query) for crawl_id in crawl_ids: - await self.re_add_crawl_pages(crawl_id, oid) + await self.re_add_crawl_pages(crawl_id, org.id) async def get_qa_run_aggregate_counts( self, 
@@ -630,47 +644,102 @@ async def get_qa_run_aggregate_counts( return sorted(return_data, key=lambda bucket: bucket.lowerBoundary) + def get_crawl_type_from_pages_route(self, request: Request): + """Get crawl type to filter on from request route""" + crawl_type = None + + try: + route_path = request.scope["route"].path + type_path = route_path.split("/")[4] + + if type_path == "uploads": + crawl_type = "upload" + if type_path == "crawls": + crawl_type = "crawl" + except (IndexError, AttributeError): + pass + + return crawl_type + # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme -def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep): +def init_pages_api( + app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep +): """init pages API""" # pylint: disable=invalid-name - ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops) org_crawl_dep = org_ops.org_crawl_dep @app.post( "/orgs/{oid}/crawls/all/pages/reAdd", - tags=["pages"], - response_model=StartedResponseBool, + tags=["pages", "crawls"], + response_model=StartedResponse, + ) + @app.post( + "/orgs/{oid}/uploads/all/pages/reAdd", + tags=["pages", "uploads"], + response_model=StartedResponse, + ) + @app.post( + "/orgs/{oid}/all-crawls/all/pages/reAdd", + tags=["pages", "all-crawls"], + response_model=StartedResponse, ) async def re_add_all_crawl_pages( - org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep) + request: Request, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), ): - """Re-add pages for all crawls in org (superuser only)""" + """Re-add pages for all crawls in org (superuser only, may delete page QA data!)""" if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - asyncio.create_task(ops.re_add_all_crawl_pages(org.id)) - return {"started": True} + crawl_type = ops.get_crawl_type_from_pages_route(request) + job_id = await ops.background_job_ops.create_re_add_org_pages_job( + org.id, crawl_type=crawl_type + ) + return {"started": job_id or ""} @app.post( "/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", - tags=["pages"], + tags=["pages", "crawls"], + response_model=StartedResponseBool, + ) + @app.post( + "/orgs/{oid}/uploads/{crawl_id}/pages/reAdd", + tags=["pages", "uploads"], + response_model=StartedResponseBool, + ) + @app.post( + "/orgs/{oid}/all-crawls/{crawl_id}/pages/reAdd", + tags=["pages", "all-crawls"], response_model=StartedResponseBool, ) async def re_add_crawl_pages( - crawl_id: str, org: Organization = Depends(org_crawl_dep) + crawl_id: str, + org: Organization = Depends(org_crawl_dep), ): - """Re-add pages for crawl""" + """Re-add pages for crawl (may delete page QA data!)""" asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id)) return {"started": True} @app.get( "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}", - tags=["pages"], + tags=["pages", "crawls"], + response_model=PageOut, + ) + @app.get( + "/orgs/{oid}/uploads/{crawl_id}/pages/{page_id}", + tags=["pages", "uploads"], + response_model=PageOut, + ) + @app.get( + "/orgs/{oid}/all-crawls/{crawl_id}/pages/{page_id}", + tags=["pages", "all-crawls"], response_model=PageOut, ) async def get_page( @@ -692,7 +761,7 @@ async def get_page_with_qa( page_id: UUID, org: Organization = Depends(org_crawl_dep), ): - """GET single page""" + """GET single page with QA details""" return 
await ops.get_page_out(page_id, org.id, crawl_id, qa_run_id=qa_run_id) @app.patch( @@ -753,12 +822,22 @@ async def delete_page_notes( delete: PageNoteDelete, org: Organization = Depends(org_crawl_dep), ): - """Edit page note""" + """Delete page note""" return await ops.delete_page_notes(page_id, org.id, delete, crawl_id) @app.get( "/orgs/{oid}/crawls/{crawl_id}/pages", - tags=["pages"], + tags=["pages", "crawls"], + response_model=PaginatedPageOutResponse, + ) + @app.get( + "/orgs/{oid}/uploads/{crawl_id}/pages", + tags=["pages", "uploads"], + response_model=PaginatedPageOutResponse, + ) + @app.get( + "/orgs/{oid}/all-crawls/{crawl_id}/pages", + tags=["pages", "all-crawls"], response_model=PaginatedPageOutResponse, ) async def get_pages_list( diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index ab72422472..9b8ae8da8f 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -426,7 +426,7 @@ async def delete_profile( # Delete file from storage if profile.resource: - await self.storage_ops.delete_crawl_file_object(org, profile.resource) + await self.storage_ops.delete_file_object(org, profile.resource) await self.orgs.inc_org_bytes_stored( org.id, -profile.resource.size, "profile" ) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 50b9557a92..43b6fabcd5 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -476,9 +476,7 @@ async def get_presigned_url( return presigned_url - async def delete_crawl_file_object( - self, org: Organization, crawlfile: BaseFile - ) -> bool: + async def delete_file_object(self, org: Organization, crawlfile: BaseFile) -> bool: """delete crawl file from storage.""" return await self._delete_file(org, crawlfile.filename, crawlfile.storage) diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index ded0630719..e95b30427f 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -1,9 +1,6 @@ """ handle user uploads into browsertrix """ import uuid -import hashlib -import os -import base64 from urllib.parse import unquote from uuid import UUID @@ -13,7 +10,6 @@ from fastapi import Depends, UploadFile, File from fastapi import HTTPException from starlette.requests import Request -from pathvalidate import sanitize_filename from .basecrawls import BaseCrawlOps from .storages import CHUNK_SIZE @@ -27,18 +23,16 @@ Organization, PaginatedCrawlOutResponse, User, - StorageRef, UpdatedResponse, DeletedResponseQuota, AddedResponseIdQuota, + FilePreparer, + MIN_UPLOAD_PART_SIZE, ) from .pagination import paginated_format, DEFAULT_PAGE_SIZE from .utils import dt_now -MIN_UPLOAD_PART_SIZE = 10000000 - - # ============================================================================ class UploadOps(BaseCrawlOps): """upload ops""" @@ -105,9 +99,10 @@ async def stream_iter(): if prev_upload: try: await self._delete_crawl_files(prev_upload, org) + await self.page_ops.delete_crawl_pages(prev_upload.id, org.id) # pylint: disable=broad-exception-caught except Exception as exc: - print("replace file deletion failed", exc) + print(f"Error handling previous upload: {exc}", flush=True) return await self._create_upload( files, name, description, collections, tags, id_, org, user @@ -195,6 +190,8 @@ async def _create_upload( self.event_webhook_ops.create_upload_finished_notification(crawl_id, org.id) ) + asyncio.create_task(self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)) + await self.orgs.inc_org_bytes_stored(org.id, 
file_size, "upload") quota_reached = self.orgs.storage_quota_reached(org) @@ -224,39 +221,6 @@ async def delete_uploads( return {"deleted": True, "storageQuotaReached": quota_reached} -# ============================================================================ -class FilePreparer: - """wrapper to compute digest / name for streaming upload""" - - def __init__(self, prefix, filename): - self.upload_size = 0 - self.upload_hasher = hashlib.sha256() - self.upload_name = prefix + self.prepare_filename(filename) - - def add_chunk(self, chunk): - """add chunk for file""" - self.upload_size += len(chunk) - self.upload_hasher.update(chunk) - - def get_crawl_file(self, storage: StorageRef): - """get crawl file""" - return CrawlFile( - filename=self.upload_name, - hash=self.upload_hasher.hexdigest(), - size=self.upload_size, - storage=storage, - ) - - def prepare_filename(self, filename): - """prepare filename by sanitizing and adding extra string - to avoid duplicates""" - name = sanitize_filename(filename.rsplit("/", 1)[-1]) - parts = name.split(".") - randstr = base64.b32encode(os.urandom(5)).lower() - parts[0] += "-" + randstr.decode("utf-8") - return ".".join(parts) - - # ============================================================================ class UploadFileReader(BufferedReader): """Compute digest on file upload""" @@ -446,3 +410,5 @@ async def delete_uploads( org: Organization = Depends(org_crawl_dep), ): return await ops.delete_uploads(delete_list, org, user) + + return ops diff --git a/backend/test/data/thumbnail.jpg b/backend/test/data/thumbnail.jpg new file mode 100644 index 0000000000..133cfae2b0 Binary files /dev/null and b/backend/test/data/thumbnail.jpg differ diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 4faf42540d..c2999bb5ad 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -1,5 +1,6 @@ import requests import os +from uuid import uuid4 from zipfile import ZipFile, ZIP_STORED from tempfile import TemporaryFile @@ -12,12 +13,24 @@ UPDATED_NAME = "Updated tést cöllection" SECOND_COLLECTION_NAME = "second-collection" DESCRIPTION = "Test description" +CAPTION = "Short caption" +UPDATED_CAPTION = "Updated caption" + +NON_PUBLIC_COLL_FIELDS = ( + "modified", + "tags", + "homeUrlPageId", +) +NON_PUBLIC_IMAGE_FIELDS = ("originalFilename", "userid", "userName", "created") + _coll_id = None _second_coll_id = None _public_coll_id = None +_second_public_coll_id = None upload_id = None modified = None +default_org_slug = None curr_dir = os.path.dirname(os.path.realpath(__file__)) @@ -25,12 +38,16 @@ def test_create_collection( crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id ): + default_thumbnail_name = "default-thumbnail.jpg" + r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=crawler_auth_headers, json={ "crawlIds": [crawler_crawl_id], "name": COLLECTION_NAME, + "caption": CAPTION, + "defaultThumbnailName": default_thumbnail_name, }, ) assert r.status_code == 200 @@ -49,6 +66,29 @@ def test_create_collection( assert _coll_id in r.json()["collectionIds"] assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == _coll_id + assert data["name"] == COLLECTION_NAME + assert data["caption"] == CAPTION + assert data["crawlCount"] == 1 + assert 
data["pageCount"] > 0 + assert data["totalSize"] > 0 + modified = data["modified"] + assert modified + assert modified.endswith("Z") + + assert data["dateEarliest"] + assert data["dateLatest"] + + assert data["defaultThumbnailName"] == default_thumbnail_name + assert data["allowPublicDownload"] + def test_create_public_collection( crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id @@ -59,7 +99,9 @@ def test_create_public_collection( json={ "crawlIds": [crawler_crawl_id], "name": PUBLIC_COLLECTION_NAME, + "caption": CAPTION, "access": "public", + "allowPublicDownload": False, }, ) assert r.status_code == 200 @@ -115,6 +157,7 @@ def test_update_collection( headers=crawler_auth_headers, json={ "description": DESCRIPTION, + "caption": UPDATED_CAPTION, }, ) assert r.status_code == 200 @@ -130,6 +173,7 @@ def test_update_collection( assert data["id"] == _coll_id assert data["name"] == COLLECTION_NAME assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 1 assert data["pageCount"] > 0 assert data["totalSize"] > 0 @@ -137,6 +181,9 @@ def test_update_collection( modified = data["modified"] assert modified assert modified.endswith("Z") + assert data["dateEarliest"] + assert data["dateLatest"] + assert data["defaultThumbnailName"] def test_rename_collection( @@ -211,6 +258,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] # Verify it was added r = requests.get( @@ -233,6 +282,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] == 0 assert data["modified"] >= modified assert data.get("tags", []) == [] + assert data.get("dateEarliest") is None + assert data.get("dateLatest") is None # Verify they were removed r = requests.get( @@ -261,6 +312,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] def test_get_collection(crawler_auth_headers, default_org_id): @@ -274,11 +327,15 @@ def test_get_collection(crawler_auth_headers, default_org_id): assert data["name"] == UPDATED_NAME assert data["oid"] == default_org_id assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 2 assert data["pageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] + assert data["defaultThumbnailName"] def test_get_collection_replay(crawler_auth_headers, default_org_id): @@ -292,11 +349,15 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id): assert data["name"] == UPDATED_NAME assert data["oid"] == default_org_id assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 2 assert data["pageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] + assert data["defaultThumbnailName"] resources = data["resources"] assert resources @@ -413,6 +474,9 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id): assert data["totalSize"] > 0 assert data["modified"] assert data["tags"] == ["wr-test-2", 
"wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] + assert data["defaultThumbnailName"] # Verify it was added r = requests.get( @@ -459,16 +523,20 @@ def test_list_collections( assert len(items) == 3 first_coll = [coll for coll in items if coll["name"] == UPDATED_NAME][0] - assert first_coll["id"] + assert first_coll["id"] == _coll_id assert first_coll["name"] == UPDATED_NAME assert first_coll["oid"] == default_org_id assert first_coll["description"] == DESCRIPTION + assert first_coll["caption"] == UPDATED_CAPTION assert first_coll["crawlCount"] == 3 assert first_coll["pageCount"] > 0 assert first_coll["totalSize"] > 0 assert first_coll["modified"] assert first_coll["tags"] == ["wr-test-2", "wr-test-1"] assert first_coll["access"] == "private" + assert first_coll["dateEarliest"] + assert first_coll["dateLatest"] + assert first_coll["defaultThumbnailName"] second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0] assert second_coll["id"] @@ -481,6 +549,8 @@ def test_list_collections( assert second_coll["modified"] assert second_coll["tags"] == ["wr-test-2"] assert second_coll["access"] == "private" + assert second_coll["dateEarliest"] + assert second_coll["dateLatest"] def test_remove_upload_from_collection(crawler_auth_headers, default_org_id): @@ -742,11 +812,14 @@ def test_list_public_collections( json={ "crawlIds": [crawler_crawl_id], "name": "Second public collection", + "description": "Lorem ipsum", "access": "public", }, ) assert r.status_code == 200 - second_public_coll_id = r.json()["id"] + + global _second_public_coll_id + _second_public_coll_id = r.json()["id"] # Get default org slug r = requests.get( @@ -755,7 +828,10 @@ def test_list_public_collections( ) assert r.status_code == 200 data = r.json() - org_slug = data["slug"] + + global default_org_slug + default_org_slug = data["slug"] + org_name = data["name"] # Verify that public profile isn't enabled @@ -764,7 +840,7 @@ def test_list_public_collections( assert data["publicUrl"] == "" # Try listing public collections without org public profile enabled - r = requests.get(f"{API_PREFIX}/public-collections/{org_slug}") + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") assert r.status_code == 404 assert r.json()["detail"] == "public_profile_not_found" @@ -795,7 +871,7 @@ def test_list_public_collections( assert data["publicUrl"] == public_url # List public collections with no auth (no public profile) - r = requests.get(f"{API_PREFIX}/public-collections/{org_slug}") + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") assert r.status_code == 200 data = r.json() @@ -807,12 +883,19 @@ def test_list_public_collections( collections = data["collections"] assert len(collections) == 2 for collection in collections: - assert collection["id"] in (_public_coll_id, second_public_coll_id) + assert collection["id"] in (_public_coll_id, _second_public_coll_id) + assert collection["oid"] assert collection["access"] == "public" + assert collection["name"] + assert collection["dateEarliest"] + assert collection["dateLatest"] + assert collection["crawlCount"] > 0 + assert collection["pageCount"] > 0 + assert collection["totalSize"] > 0 # Test non-existing slug - it should return a 404 but not reveal # whether or not an org exists with that slug - r = requests.get(f"{API_PREFIX}/public-collections/nonexistentslug") + r = requests.get(f"{API_PREFIX}/public/orgs/nonexistentslug/collections") assert r.status_code == 404 assert r.json()["detail"] == 
"public_profile_not_found" @@ -820,7 +903,7 @@ def test_list_public_collections( def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers): # Test existing org that's not public - should return same 404 as # if org doesn't exist - r = requests.get(f"{API_PREFIX}/public-collections/{NON_DEFAULT_ORG_SLUG}") + r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections") assert r.status_code == 404 assert r.json()["detail"] == "public_profile_not_found" @@ -837,13 +920,459 @@ def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers # List public collections with no auth - should still get profile even # with no public collections - r = requests.get(f"{API_PREFIX}/public-collections/{NON_DEFAULT_ORG_SLUG}") + r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections") assert r.status_code == 200 data = r.json() assert data["org"]["name"] == NON_DEFAULT_ORG_NAME assert data["collections"] == [] +def test_set_collection_home_url( + crawler_auth_headers, default_org_id, crawler_crawl_id +): + # Get a page id from crawler_crawl_id + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] >= 1 + + page = data["items"][0] + assert page + + page_id = page["id"] + assert page_id + + page_url = page["url"] + page_ts = page["ts"] + + # Set page as home url + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url", + headers=crawler_auth_headers, + json={"pageId": page_id}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Check that fields were set in collection as expected + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["homeUrl"] == page_url + assert data["homeUrlTs"] == page_ts + assert data["homeUrlPageId"] == page_id + + +def test_collection_url_list(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/urls", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 1 + urls = data["items"] + assert urls + + for url in urls: + assert url["url"] + assert url["count"] >= 1 + + snapshots = url["snapshots"] + assert snapshots + + for snapshot in snapshots: + assert snapshot["pageId"] + assert snapshot["ts"] + assert snapshot["status"] + + +def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id): + with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh: + r = requests.put( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail?filename=thumbnail.jpg", + headers=crawler_auth_headers, + data=read_in_chunks(fh), + ) + assert r.status_code == 200 + assert r.json()["added"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + thumbnail = r.json()["thumbnail"] + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] > 0 + + assert thumbnail["originalFilename"] == "thumbnail.jpg" + assert thumbnail["mime"] == "image/jpeg" + assert thumbnail["userid"] + assert thumbnail["userName"] + assert thumbnail["created"] + + +def 
test_set_collection_default_thumbnail(crawler_auth_headers, default_org_id): + default_thumbnail_name = "orange-default.avif" + + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + json={"defaultThumbnailName": default_thumbnail_name}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == _second_public_coll_id + assert data["defaultThumbnailName"] == default_thumbnail_name + + +def test_list_public_colls_home_url_thumbnail(): + # Check we get expected data for each public collection + # and nothing we don't expect + non_public_fields = ( + "oid", + "modified", + "crawlCount", + "pageCount", + "totalSize", + "tags", + "access", + "homeUrlPageId", + ) + non_public_image_fields = ("originalFilename", "userid", "userName", "created") + + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") + assert r.status_code == 200 + collections = r.json()["collections"] + assert len(collections) == 2 + + for coll in collections: + assert coll["id"] in (_public_coll_id, _second_public_coll_id) + assert coll["oid"] + assert coll["access"] == "public" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + if coll["id"] == _public_coll_id: + assert coll["allowPublicDownload"] is False + + assert coll["caption"] == CAPTION + + assert coll["homeUrl"] + assert coll["homeUrlTs"] + + thumbnail = coll["thumbnail"] + assert thumbnail + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] + assert thumbnail["mime"] + + for field in NON_PUBLIC_IMAGE_FIELDS: + assert field not in thumbnail + + if coll["id"] == _second_public_coll_id: + assert coll["description"] + assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] + + +def test_get_public_collection(default_org_id): + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _public_coll_id + assert coll["oid"] == default_org_id + assert coll["access"] == "public" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + assert coll["caption"] == CAPTION + + assert coll["homeUrl"] + assert coll["homeUrlTs"] + + assert coll["allowPublicDownload"] is False + + thumbnail = coll["thumbnail"] + assert thumbnail + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] + assert thumbnail["mime"] + + for field in NON_PUBLIC_IMAGE_FIELDS: + assert field not in thumbnail + + # Invalid org slug - don't reveal whether org exists or not, use + # same exception as if collection doesn't exist + r = requests.get( + f"{API_PREFIX}/public/orgs/doesntexist/collections/{_public_coll_id}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + # 
Invalid collection id + random_uuid = uuid4() + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{random_uuid}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + # Collection isn't public + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{ _coll_id}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + +def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): + # Make second public coll unlisted + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + json={ + "access": "unlisted", + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Verify single public collection GET endpoint works for unlisted collection + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_second_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _second_public_coll_id + assert coll["oid"] == default_org_id + assert coll["access"] == "unlisted" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + +def test_get_public_collection_unlisted_org_profile_disabled( + admin_auth_headers, default_org_id +): + # Disable org profile + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": False, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Verify we can still get public details for unlisted collection + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_second_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _second_public_coll_id + assert coll["oid"] == default_org_id + assert coll["access"] == "unlisted" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + # Re-enable org profile + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": True, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + +def test_delete_thumbnail(crawler_auth_headers, default_org_id): + r = requests.delete( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["deleted"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json().get("thumbnail") is None + + r = requests.delete( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}/thumbnail", + headers=crawler_auth_headers, + ) + assert r.status_code == 404 + assert r.json()["detail"] == "thumbnail_not_found" + + +def 
test_unset_collection_home_url( + crawler_auth_headers, default_org_id, crawler_crawl_id +): + # Unset home url + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url", + headers=crawler_auth_headers, + json={"pageId": None}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Check that fields were set in collection as expected + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data.get("homeUrl") is None + assert data.get("homeUrlTs") is None + assert data.get("homeUrlPageId") is None + + +def test_download_streaming_public_collection(crawler_auth_headers, default_org_id): + # Check that download is blocked if allowPublicDownload is False + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 403 + + # Set allowPublicDownload to True and then check downloading works + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + json={ + "allowPublicDownload": True, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + with TemporaryFile() as fh: + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 200 + for chunk in r.iter_content(): + fh.write(chunk) + + fh.seek(0) + with ZipFile(fh, "r") as zip_file: + contents = zip_file.namelist() + + assert len(contents) == 2 + for filename in contents: + assert filename.endswith(".wacz") or filename == "datapackage.json" + assert zip_file.getinfo(filename).compress_type == ZIP_STORED + + +def test_download_streaming_public_collection_profile_disabled( + admin_auth_headers, default_org_id +): + # Disable org public profile and ensure download still works for public collection + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": False, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + with TemporaryFile() as fh: + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 200 + for chunk in r.iter_content(): + fh.write(chunk) + + fh.seek(0) + with ZipFile(fh, "r") as zip_file: + contents = zip_file.namelist() + + assert len(contents) == 2 + for filename in contents: + assert filename.endswith(".wacz") or filename == "datapackage.json" + assert zip_file.getinfo(filename).compress_type == ZIP_STORED + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index fb7543d0a2..3fb1c1c44b 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -232,6 +232,49 @@ def test_get_upload_replay_json_admin( assert "files" not in data +def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): + # Give time for pages to finish being uploaded + time.sleep(10) + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] > 0 + + pages = data["items"] + for page 
in pages: + assert page["id"] + assert page["oid"] + assert page["crawl_id"] == upload_id + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + + page_id = pages[0]["id"] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages/{page_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + page = r.json() + + assert page["id"] == page_id + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + + assert page["notes"] == [] + assert page.get("userid") is None + assert page.get("modified") is None + assert page.get("approved") is None + + def test_replace_upload( admin_auth_headers, default_org_id, uploads_collection_id, upload_id ): diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml index 132d3bf8fe..f47dd2acfd 100644 --- a/chart/app-templates/background_job.yaml +++ b/chart/app-templates/background_job.yaml @@ -8,7 +8,7 @@ metadata: btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 0 + ttlSecondsAfterFinished: 90 backoffLimit: 3 template: spec: @@ -38,6 +38,9 @@ spec: - name: OID value: {{ oid }} + - name: CRAWL_TYPE + value: {{ crawl_type }} + envFrom: - configMapRef: name: backend-env-config
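The CRAWL_TYPE value templated into background_job.yaml above is consumed by the backend job entrypoint together with OID. A minimal sketch of that consumer, assuming an entrypoint function along these lines and an org_ops.get_org_by_id lookup helper (neither is shown in this diff):

# Hypothetical consumer of the OID / CRAWL_TYPE env vars set in background_job.yaml.
# Function and helper names here are illustrative, not part of this change.
import os
from uuid import UUID

async def run_re_add_org_pages_job(org_ops, page_ops) -> int:
    oid = UUID(os.environ["OID"])
    # An empty CRAWL_TYPE (no filter) behaves the same as unset:
    # re-add pages for both crawls and uploads.
    crawl_type = os.environ.get("CRAWL_TYPE") or None

    org = await org_ops.get_org_by_id(oid)  # assumed lookup helper
    await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type)
    return 0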
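For reference, a worked example of the index-4 lookup in get_crawl_type_from_pages_route (pages.py above). It assumes the pages routes are served under an /api prefix at runtime, as the API_PREFIX used by the tests suggests, which is why the type segment sits at position 4 of the route path:

# Sketch of the split performed in get_crawl_type_from_pages_route, assuming
# the runtime route path includes the /api prefix.
route_path = "/api/orgs/{oid}/uploads/all/pages/reAdd"
parts = route_path.split("/")
# parts == ["", "api", "orgs", "{oid}", "uploads", "all", "pages", "reAdd"]
assert parts[4] == "uploads"  # mapped to crawl_type == "upload"
# ".../crawls/all/pages/reAdd" yields "crawls" -> crawl_type == "crawl";
# ".../all-crawls/all/pages/reAdd" matches neither, so crawl_type stays None
# and pages are re-added for both crawls and uploads.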
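FilePreparer and MIN_UPLOAD_PART_SIZE now come from models.py rather than being defined locally in uploads.py, so callers only change their import. A rough usage sketch based on the removed class body; the prefix, filename, and storage ref below are placeholders:

# Illustrative use of the relocated FilePreparer (values are made up).
from btrixcloud.models import FilePreparer, StorageRef

prep = FilePreparer("uploads/", "my upload.wacz")   # sanitizes name, appends random suffix
prep.add_chunk(b"first chunk of the streamed WACZ")  # updates running size and sha256 digest
crawl_file = prep.get_crawl_file(StorageRef(name="default"))
print(crawl_file.filename, crawl_file.hash, crawl_file.size)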