
Backend work for public collections: thumbnails, url list, upload pages, and so on #2198

Merged
merged 57 commits on Dec 23, 2024
Commits
57 commits
9c7c920
Add list endpoint to get sorted list of URLs in collection
tw4l Nov 26, 2024
2041be5
Add endpoint to set or update collection home url
tw4l Nov 27, 2024
02a7d1e
Fixups
tw4l Nov 27, 2024
d75c4af
Use updated response for /home-urls endpoint
tw4l Nov 27, 2024
867fc7a
Add tests
tw4l Nov 27, 2024
52404c6
WIP: Add collection thumbnail upload
tw4l Nov 28, 2024
130ee51
Add collection thumbnail upload test
tw4l Nov 28, 2024
3b26828
WIP: Only share subset of coll and image data for public endpoint
tw4l Nov 28, 2024
ff64933
Fix test that was checking wrong collection
tw4l Nov 28, 2024
0ee13d8
Change path for thumbnail endpoint
tw4l Dec 2, 2024
ce52e87
Add endpoint to delete collection thumbnail
tw4l Dec 2, 2024
5e880d9
Add caption field to collections
tw4l Dec 2, 2024
355c7f4
Calculate and store earliest and latest dates in collection
tw4l Dec 2, 2024
571c3f7
Add comment to get CI to run
tw4l Dec 2, 2024
d904dc0
Fix test
tw4l Dec 2, 2024
99229e0
Add pages to database for uploads
tw4l Dec 2, 2024
af9265a
WIP: Add all-crawls/ and uploads/ versions of GET pages endpoints
tw4l Dec 2, 2024
1ad1b27
Add fallbacks if pages have no id or a non-UUID id
tw4l Dec 2, 2024
34ebaed
Move pages test later to give it more time
tw4l Dec 2, 2024
e14c807
Delete upload pages when deleted or replaced
tw4l Dec 2, 2024
594dedd
Remove asserts for upload pages for optional fields
tw4l Dec 2, 2024
f00cc9f
Remove outdated comment
tw4l Dec 2, 2024
19cfca6
Filter (re-)add all pages endpoint by crawl type in path
tw4l Dec 3, 2024
4fd6e48
Move re-adding all pages in org to background job
tw4l Dec 3, 2024
7271dbf
Add migration to add upload files for all orgs in bg jobs
tw4l Dec 3, 2024
0ddbcd0
Add API endpoint to GET single public collection
tw4l Dec 3, 2024
34ea143
Recalculate collection dates after adding upload pages
tw4l Dec 3, 2024
60522c1
Rename type_filter to crawl_type
tw4l Dec 3, 2024
67b52af
Recalculate collection stats after adding upload pages
tw4l Dec 3, 2024
2b5ba8e
Reduce per-page print logging
tw4l Dec 3, 2024
2e72abe
Include upload pages in collection pageCount
tw4l Dec 3, 2024
b96346b
Return 404 for GET public collection if org isn't public
tw4l Dec 3, 2024
b260386
Add oid to PublicCollOut model
tw4l Dec 3, 2024
de99071
Modify public collections endpoint paths
tw4l Dec 3, 2024
cabed6d
Fix endpoint path in test
tw4l Dec 3, 2024
5fbf3f3
Only fetch org once, not per-crawl
tw4l Dec 4, 2024
ea86db4
Add counts and size to PublicCollOut
tw4l Dec 4, 2024
90052bc
Enforce max thumbnail file size of 2MB
tw4l Dec 4, 2024
921c6bb
Use proper thumbnail as test data
tw4l Dec 4, 2024
a4ca989
Allow unlisted collections in single public collection GET
tw4l Dec 9, 2024
af0d798
Add access field to PublicCollOut
tw4l Dec 9, 2024
72afd60
Don't expect thumbnail for unlisted collection
tw4l Dec 9, 2024
902e49e
Fix thumbnail-related 500 error with private collections list
tw4l Dec 10, 2024
57825a2
Add defaultThumbnailName field to collections
tw4l Dec 10, 2024
35474a4
Add fallback values for access in coll out models
tw4l Dec 11, 2024
7e8583a
Add ability to unset home url via /home-url endpoint
tw4l Dec 11, 2024
4b9a60d
Add public collection download endpoint
tw4l Dec 11, 2024
7cf0b62
Implement and enforce Collection.allowPublicDownload
tw4l Dec 11, 2024
7ac420d
Fix linting
tw4l Dec 11, 2024
d00cc39
Make sure coll out models return allowPublicDownload as bool
tw4l Dec 11, 2024
4482b38
Make migration idempotent - don't readd existing upload pages
tw4l Dec 17, 2024
73813dd
Reformat migration
tw4l Dec 17, 2024
2dce950
Fix typing
tw4l Dec 17, 2024
979884e
Allow getting and downloading public collections if org profile disabled
tw4l Dec 18, 2024
320b218
URL decode urlPrefix in /urls endpoint
tw4l Dec 18, 2024
fb0ad7b
Do URL decoding inside list urls method
tw4l Dec 18, 2024
f882409
Escape special characters in url_prefix regex
tw4l Dec 18, 2024
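
The final three commits (320b218, fb0ad7b, f882409) harden the `urlPrefix` filter on the collection `/urls` endpoint: the prefix is URL-decoded inside the list method, and regex special characters are escaped before the prefix is used in a query. A minimal sketch of that pattern (the function name and query shape below are illustrative, not the actual Browsertrix code):

```python
import re
from urllib.parse import unquote


def url_prefix_query(url_prefix: str) -> dict:
    """Illustrative sketch: build a Mongo query matching page URLs that
    start with url_prefix, per commits fb0ad7b and f882409.

    The prefix arrives percent-encoded, so decode it first, then escape
    regex metacharacters (e.g. '?', '+', '.') so they match literally.
    """
    decoded = unquote(url_prefix)
    escaped = re.escape(decoded)
    return {"url": {"$regex": f"^{escaped}"}}


# e.g. "https%3A%2F%2Fexample.com%2Fpath%3Fq%3D1" decodes to
# "https://example.com/path?q=1" and yields the literal-prefix regex
# "^https://example\.com/path\?q=1"
```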
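Earlier in the series, 90052bc enforces a 2 MB cap on uploaded collection thumbnails. A hedged sketch of that size check, assuming the thumbnail arrives as a streamed request body (the helper name and error detail string are illustrative):

```python
from fastapi import HTTPException, Request

# 2 MB cap from commit 90052bc ("Enforce max thumbnail file size of 2MB")
THUMBNAIL_MAX_SIZE = 2 * 1024 * 1024


async def read_thumbnail_capped(request: Request) -> bytes:
    """Illustrative: collect a streamed thumbnail upload, rejecting bodies
    that exceed the 2 MB cap before buffering the whole file."""
    chunks: list[bytes] = []
    total = 0
    async for chunk in request.stream():
        total += len(chunk)
        if total > THUMBNAIL_MAX_SIZE:
            raise HTTPException(status_code=400, detail="thumbnail_too_large")
        chunks.append(chunk)
    return b"".join(chunks)
```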
68 changes: 62 additions & 6 deletions backend/btrixcloud/background_jobs.py
@@ -1,7 +1,6 @@
"""k8s background jobs"""

import asyncio
import os
from datetime import datetime
from typing import Optional, Tuple, Union, List, Dict, TYPE_CHECKING, cast
from uuid import UUID
@@ -22,6 +21,7 @@
DeleteReplicaJob,
DeleteOrgJob,
RecalculateOrgStatsJob,
ReAddOrgPagesJob,
PaginatedBackgroundJobResponse,
AnyJob,
StorageRef,
@@ -286,8 +286,6 @@ async def create_delete_org_job(
try:
job_id = await self.crawl_manager.run_delete_org_job(
oid=str(org.id),
backend_image=os.environ.get("BACKEND_IMAGE", ""),
pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""),
existing_job_id=existing_job_id,
)
if existing_job_id:
@@ -331,8 +329,6 @@ async def create_recalculate_org_stats_job(
try:
job_id = await self.crawl_manager.run_recalculate_org_stats_job(
oid=str(org.id),
backend_image=os.environ.get("BACKEND_IMAGE", ""),
pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""),
existing_job_id=existing_job_id,
)
if existing_job_id:
@@ -366,6 +362,52 @@ async def create_recalculate_org_stats_job(
print(f"warning: recalculate org stats job could not be started: {exc}")
return None

async def create_re_add_org_pages_job(
self,
oid: UUID,
crawl_type: Optional[str] = None,
existing_job_id: Optional[str] = None,
):
"""Create job to (re)add all pages in an org, optionally filtered by crawl type"""

try:
job_id = await self.crawl_manager.run_re_add_org_pages_job(
oid=str(oid),
crawl_type=crawl_type,
existing_job_id=existing_job_id,
)
if existing_job_id:
readd_pages_job = await self.get_background_job(existing_job_id, oid)
previous_attempt = {
"started": readd_pages_job.started,
"finished": readd_pages_job.finished,
}
if readd_pages_job.previousAttempts:
readd_pages_job.previousAttempts.append(previous_attempt)
else:
readd_pages_job.previousAttempts = [previous_attempt]
readd_pages_job.started = dt_now()
readd_pages_job.finished = None
readd_pages_job.success = None
else:
readd_pages_job = ReAddOrgPagesJob(
id=job_id,
oid=oid,
crawl_type=crawl_type,
started=dt_now(),
)

await self.jobs.find_one_and_update(
{"_id": job_id}, {"$set": readd_pages_job.to_dict()}, upsert=True
)

return job_id
# pylint: disable=broad-exception-caught
except Exception as exc:
# pylint: disable=raise-missing-from
print(f"warning: re-add org pages job could not be started: {exc}")
return None

async def job_finished(
self,
job_id: str,
@@ -411,7 +453,11 @@ async def job_finished
async def get_background_job(
self, job_id: str, oid: Optional[UUID] = None
) -> Union[
CreateReplicaJob, DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob
CreateReplicaJob,
DeleteReplicaJob,
DeleteOrgJob,
RecalculateOrgStatsJob,
ReAddOrgPagesJob,
]:
"""Get background job"""
query: dict[str, object] = {"_id": job_id}
@@ -435,6 +481,9 @@ def _get_job_by_type_from_data(self, data: dict[str, object]):
if data["type"] == BgJobType.RECALCULATE_ORG_STATS:
return RecalculateOrgStatsJob.from_dict(data)

if data["type"] == BgJobType.READD_ORG_PAGES:
return ReAddOrgPagesJob.from_dict(data)

return DeleteOrgJob.from_dict(data)

async def list_background_jobs(
@@ -575,6 +624,13 @@ async def retry_background_job(
existing_job_id=job_id,
)

if job.type == BgJobType.READD_ORG_PAGES:
await self.create_re_add_org_pages_job(
org.id,
job.crawl_type,
existing_job_id=job_id,
)

return {"success": True}

async def retry_failed_background_jobs(
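The job handling above depends on a ReAddOrgPagesJob model added elsewhere in this PR (models.py is not shown in this excerpt). Inferred from how the job is constructed and mutated in create_re_add_org_pages_job, it presumably looks something like the sketch below; the actual field types and the BgJobType value may differ, and the to_dict()/from_dict() helpers are likely inherited from a shared job base model:

```python
from datetime import datetime
from typing import List, Optional
from uuid import UUID

from pydantic import BaseModel


class ReAddOrgPagesJob(BaseModel):
    """Sketch of the background-job record, inferred from usage above."""

    type: str = "readd-org-pages"  # assumed value of BgJobType.READD_ORG_PAGES
    id: str
    oid: UUID
    crawl_type: Optional[str] = None

    started: datetime
    finished: Optional[datetime] = None
    success: Optional[bool] = None
    previousAttempts: Optional[List[dict]] = None
```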
24 changes: 6 additions & 18 deletions backend/btrixcloud/basecrawls.py
@@ -1,6 +1,5 @@
""" base crawl type """

import os
from datetime import timedelta
from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
from uuid import UUID
@@ -29,6 +28,7 @@
UpdatedResponse,
DeletedResponseQuota,
CrawlSearchValuesResponse,
PRESIGN_DURATION_SECONDS,
)
from .pagination import paginated_format, DEFAULT_PAGE_SIZE
from .utils import dt_now, date_to_str
@@ -47,11 +47,6 @@
CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object
StorageOps = EventWebhookOps = BackgroundJobOps = object

# Presign duration must be less than 604800 seconds (one week),
# so set this one minute short of a week.
PRESIGN_MINUTES_MAX = 10079
PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX


# ============================================================================
# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines
@@ -93,16 +88,8 @@ def __init__(
self.background_job_ops = background_job_ops
self.page_ops = cast(PageOps, None)

presign_duration_minutes = int(
os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT
)

self.presign_duration_seconds = (
min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60
)

# renew when <25% of time remaining
self.expire_at_duration_seconds = int(self.presign_duration_seconds * 0.75)
self.expire_at_duration_seconds = int(PRESIGN_DURATION_SECONDS * 0.75)

def set_page_ops(self, page_ops):
"""set page ops reference"""
@@ -336,8 +323,9 @@ async def delete_crawls(
status_code=400, detail=f"Error Stopping Crawl: {exc}"
)

await self.page_ops.delete_crawl_pages(crawl_id, org.id)

if type_ == "crawl":
await self.page_ops.delete_crawl_pages(crawl_id, org.id)
await self.delete_all_crawl_qa_files(crawl_id, org)

crawl_size = await self._delete_crawl_files(crawl, org)
@@ -382,7 +370,7 @@ async def _delete_crawl_files(
size = 0
for file_ in crawl.files:
size += file_.size
if not await self.storage_ops.delete_crawl_file_object(org, file_):
if not await self.storage_ops.delete_file_object(org, file_):
raise HTTPException(status_code=400, detail="file_deletion_error")
# Not replicating QA run WACZs yet
if not isinstance(crawl, QARun):
@@ -474,7 +462,7 @@ async def resolve_signed_urls(
):
exp = now + delta
presigned_url = await self.storage_ops.get_presigned_url(
org, file_, self.presign_duration_seconds
org, file_, PRESIGN_DURATION_SECONDS
)

prefix = "files"
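The basecrawls.py changes drop the per-instance PRESIGN_DURATION_MINUTES handling in favor of a shared PRESIGN_DURATION_SECONDS constant imported alongside the model classes, while keeping the rule that signed URLs are refreshed once less than 25% of their lifetime remains. A small sketch of that arithmetic, assuming the constant keeps the old default of one minute short of a week:

```python
from datetime import datetime, timedelta, timezone
from typing import Optional

# Assumed default, mirroring the removed PRESIGN_MINUTES_MAX = 10079:
# presigned URLs must live less than 604800 s (one week).
PRESIGN_DURATION_SECONDS = 10079 * 60  # 604740 seconds

# Re-sign once less than 25% of the lifetime remains, matching
# expire_at_duration_seconds = int(PRESIGN_DURATION_SECONDS * 0.75)
EXPIRE_AT_DURATION_SECONDS = int(PRESIGN_DURATION_SECONDS * 0.75)


def needs_resign(signed_at: datetime, now: Optional[datetime] = None) -> bool:
    """Return True once a URL signed at `signed_at` is past 75% of its lifetime."""
    now = now or datetime.now(timezone.utc)
    return now >= signed_at + timedelta(seconds=EXPIRE_AT_DURATION_SECONDS)
```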