From 39f5acb28efc6645538e3a7881128f0beccab397 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 20 Sep 2024 12:32:20 -0400 Subject: [PATCH 01/69] Add back custom storage endpoints --- backend/btrixcloud/storages.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 1e58521717..88b27a1c89 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -845,10 +845,6 @@ async def clear_all_presigned_urls(user: User = Depends(user_dep)): return {"success": True} - # pylint: disable=unreachable, fixme - # todo: enable when ready to support custom storage - return storage_ops - @router.post( "/customStorage", tags=["organizations"], response_model=AddedResponseName ) From 814f4e4f8dd199cf2b002bca8275ee21b62291ad Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 20 Sep 2024 12:59:39 -0400 Subject: [PATCH 02/69] Flush out tests for setting custom storage --- backend/btrixcloud/storages.py | 4 +++ backend/test/test_org.py | 64 +++++++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 88b27a1c89..ffb26ac9e2 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -861,6 +861,10 @@ async def remove_custom_storage( ): return await storage_ops.remove_custom_storage(name, org) + # TODO: Modify this to make it easier to change just the primary or replica locations + # without needing to explicitly pass the secrets for what we're not changing every time + # - Maybe make it a PATCH + # - Maybe split out into two endpoints @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_refs( storage: OrgStorageRefs, diff --git a/backend/test/test_org.py b/backend/test/test_org.py index 755e3711b2..b384617d06 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -13,6 +13,9 @@ invite_email = "test-user@example.com" +CUSTOM_PRIMARY_STORAGE_NAME = "custom-primary" +CUSTOM_REPLICA_STORAGE_NAME = "custom-replica" + def test_ensure_only_one_default_org(admin_auth_headers): r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers) @@ -231,8 +234,7 @@ def test_create_org_duplicate_slug(admin_auth_headers, non_default_org_id, slug) assert data["detail"] == "duplicate_org_slug" -# disable until storage customization is enabled -def _test_change_org_storage(admin_auth_headers): +def test_change_org_storage(admin_auth_headers): # change to invalid storage r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", @@ -251,16 +253,70 @@ def _test_change_org_storage(admin_auth_headers): assert r.status_code == 400 - # change to valid storage + # add custom storages + r = requests.post( + f"{API_PREFIX}/orgs/{new_oid}/customStorage", + headers=admin_auth_headers, + json={ + "name": CUSTOM_PRIMARY_STORAGE_NAME, + "access_key": "ADMIN", + "secret_key": "PASSW0RD", + "bucket": CUSTOM_PRIMARY_STORAGE_NAME, + "endpoint_url": "http://local-minio.default:9000/", + }, + ) + assert r.status_code == 200 + data = r.json() + assert data["added"] + assert data["name"] == CUSTOM_PRIMARY_STORAGE_NAME + + r = requests.post( + f"{API_PREFIX}/orgs/{new_oid}/customStorage", + headers=admin_auth_headers, + json={ + "name": CUSTOM_REPLICA_STORAGE_NAME, + "access_key": "ADMIN", + "secret_key": "PASSW0RD", + "bucket": CUSTOM_REPLICA_STORAGE_NAME, + "endpoint_url": "http://local-minio.default:9000/", + }, + ) + assert r.status_code == 200 + data 
= r.json() + assert data["added"] + assert data["name"] == CUSTOM_REPLICA_STORAGE_NAME + + # set org to use custom storages moving forward r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, - json={"storage": {"name": "alt-storage", "custom": False}}, + json={ + "storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}, + "storageReplicas": [{"name": CUSTOM_REPLICA_STORAGE_NAME, "custom": True}], + }, ) assert r.status_code == 200 assert r.json()["updated"] + # check org was updated as expected + r = requests.get( + f"{API_PREFIX}/orgs/{new_oid}/storage", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + storage = data["storage"] + assert storage["name"] == CUSTOM_PRIMARY_STORAGE_NAME + assert storage["custom"] + + replicas = data["storageReplicas"] + assert len(replicas) == 1 + replica = replicas[0] + assert replica["name"] == CUSTOM_REPLICA_STORAGE_NAME + assert replica["custom"] + def test_remove_user_from_org(admin_auth_headers, default_org_id): # Add new user to org From da403946120e2e7c09ec752194d21b77c8b0150d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 20 Sep 2024 14:12:12 -0400 Subject: [PATCH 03/69] Fix test issue with bucket not existing for now --- backend/btrixcloud/storages.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index ffb26ac9e2..d00af58df2 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -171,7 +171,7 @@ async def add_custom_storage( self, storagein: S3StorageIn, org: Organization ) -> dict: """Add new custom storage""" - name = "!" + slug_from_name(storagein.name) + name = slug_from_name(storagein.name) if name in org.customStorages: raise HTTPException(status_code=400, detail="storage_already_exists") @@ -312,6 +312,12 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None key += filename data = b"" + # create bucket if it doesn't yet exist + # TODO: Remove this, useful for dev but in real cases we want to + # fail if bucket doesn't exist/has invalid credentials + resp = await client.create_bucket(Bucket=bucket) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + resp = await client.put_object(Bucket=bucket, Key=key, Body=data) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 @@ -865,6 +871,8 @@ async def remove_custom_storage( # without needing to explicitly pass the secrets for what we're not changing every time # - Maybe make it a PATCH # - Maybe split out into two endpoints + # - Add endpoint to reset to default so we don't have to pass secrets in POST request + # to remove custom storage? 
@router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_refs( storage: OrgStorageRefs, From fb13d7e37aedf7671b74e0334d6f912489e394ab Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 23 Sep 2024 12:04:13 -0400 Subject: [PATCH 04/69] Add additional tests --- backend/test/test_org.py | 53 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/backend/test/test_org.py b/backend/test/test_org.py index b384617d06..edd7d2a6c8 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -234,8 +234,8 @@ def test_create_org_duplicate_slug(admin_auth_headers, non_default_org_id, slug) assert data["detail"] == "duplicate_org_slug" -def test_change_org_storage(admin_auth_headers): - # change to invalid storage +def test_change_storage_invalid(admin_auth_headers): + # try to change to invalid storage r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, @@ -244,7 +244,6 @@ def test_change_org_storage(admin_auth_headers): assert r.status_code == 400 - # change to invalid storage r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, @@ -253,6 +252,8 @@ def test_change_org_storage(admin_auth_headers): assert r.status_code == 400 + +def test_add_custom_storage(admin_auth_headers): # add custom storages r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/customStorage", @@ -318,6 +319,52 @@ def test_change_org_storage(admin_auth_headers): assert replica["custom"] +def test_remove_custom_storage(admin_auth_headers): + # Try to remove in-use storages, verify we get expected 400 response + r = requests.delete( + f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_PRIMARY_STORAGE_NAME}", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "storage_in_use" + + r = requests.delete( + f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_REPLICA_STORAGE_NAME}", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "storage_in_use" + + # Unset replica storage from org + r = requests.post( + f"{API_PREFIX}/orgs/{new_oid}/storage", + headers=admin_auth_headers, + json={"storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}}, + ) + + # Delete no longer used replica storage location + r = requests.delete( + f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_REPLICA_STORAGE_NAME}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["deleted"] + + # Check org + r = requests.get( + f"{API_PREFIX}/orgs/{new_oid}/storage", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + storage = data["storage"] + assert storage["name"] == CUSTOM_PRIMARY_STORAGE_NAME + assert storage["custom"] + + assert data["storageReplicas"] == [] + + def test_remove_user_from_org(admin_auth_headers, default_org_id): # Add new user to org r = requests.post( From f9406018c2ae7ad650c6c9cfefb503af75d08a56 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 14:36:29 -0400 Subject: [PATCH 05/69] Fix custom storage so it works as expected - Set access_endpoint_url to the endpoint url with bucket so that we can generate a presigned URL as expected - Make adding bucket in verify_storage_upload a backup routine after first exception is raised --- backend/btrixcloud/storages.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/storages.py 
b/backend/btrixcloud/storages.py index d00af58df2..714f1696ab 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -188,7 +188,8 @@ async def add_custom_storage( region=storagein.region, endpoint_url=endpoint_url, endpoint_no_bucket_url=endpoint_no_bucket_url, - access_endpoint_url=storagein.access_endpoint_url or storagein.endpoint_url, + access_endpoint_url=storagein.access_endpoint_url or endpoint_url, + use_access_for_presign=True, ) try: @@ -312,14 +313,16 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None key += filename data = b"" - # create bucket if it doesn't yet exist - # TODO: Remove this, useful for dev but in real cases we want to - # fail if bucket doesn't exist/has invalid credentials - resp = await client.create_bucket(Bucket=bucket) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - - resp = await client.put_object(Bucket=bucket, Key=key, Body=data) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + try: + resp = await client.put_object(Bucket=bucket, Key=key, Body=data) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + except: + # create bucket if it doesn't yet exist and then try again + resp = await client.create_bucket(Bucket=bucket) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + + resp = await client.put_object(Bucket=bucket, Key=key, Body=data) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 def resolve_internal_access_path(self, path): """Resolve relative path for internal access to minio bucket""" From 8f8cb68b5b872f05cc3c881fb5507cdcdafc3148 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 15:09:09 -0400 Subject: [PATCH 06/69] Actually unset custom replica storage before deleting --- backend/test/test_org.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/test/test_org.py b/backend/test/test_org.py index edd7d2a6c8..a632006781 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -339,7 +339,10 @@ def test_remove_custom_storage(admin_auth_headers): r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, - json={"storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}}, + json={ + "storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}, + "storageReplicas": [], + }, ) # Delete no longer used replica storage location From 33393f6d71fc7fa73717fe47f0102512b7ba7c73 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 16:20:07 -0400 Subject: [PATCH 07/69] Add TODO where custom storage deletion is failing --- backend/btrixcloud/crawlmanager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index bbadb2424c..6b2542579e 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -306,6 +306,9 @@ async def remove_org_storage(self, storage: StorageRef, oid: str) -> bool: storage_secret = storage.get_storage_secret_name(oid) storage_label = f"btrix.storage={storage_secret}" + # TODO: This is causing deletion of custom storage to fail + # even when it hasn't been used - why has this label been applied + # when no crawls or profiles have been created yet? 
if await self.has_custom_jobs_with_label("crawljobs", storage_label): raise HTTPException(status_code=400, detail="storage_in_use") From bb86c49f31ab9033c6f0569dd272cebb079eee53 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 18:00:04 -0400 Subject: [PATCH 08/69] Fix check for whether storage label is in use --- backend/btrixcloud/crawlmanager.py | 3 --- backend/btrixcloud/k8sapi.py | 5 ++++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 6b2542579e..bbadb2424c 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -306,9 +306,6 @@ async def remove_org_storage(self, storage: StorageRef, oid: str) -> bool: storage_secret = storage.get_storage_secret_name(oid) storage_label = f"btrix.storage={storage_secret}" - # TODO: This is causing deletion of custom storage to fail - # even when it hasn't been used - why has this label been applied - # when no crawls or profiles have been created yet? if await self.has_custom_jobs_with_label("crawljobs", storage_label): raise HTTPException(status_code=400, detail="storage_in_use") diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index bfaaabb656..148f656604 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -328,7 +328,7 @@ async def has_custom_jobs_with_label(self, plural, label) -> bool: """return true/false if any crawljobs or profilejobs match given label""" try: - await self.custom_api.list_namespaced_custom_object( + resp = await self.custom_api.list_namespaced_custom_object( group="btrix.cloud", version="v1", namespace=self.namespace, @@ -336,6 +336,9 @@ async def has_custom_jobs_with_label(self, plural, label) -> bool: label_selector=label, limit=1, ) + matching_items = resp.get("items", []) + if not matching_items: + return False return True # pylint: disable=broad-exception-caught except Exception: From c45b2090b2381e15eb3d4c8ad914e70fed503f95 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 18:08:29 -0400 Subject: [PATCH 09/69] Remove todo on endpoint that's fine --- backend/btrixcloud/storages.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 714f1696ab..fe01973f38 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -870,12 +870,6 @@ async def remove_custom_storage( ): return await storage_ops.remove_custom_storage(name, org) - # TODO: Modify this to make it easier to change just the primary or replica locations - # without needing to explicitly pass the secrets for what we're not changing every time - # - Maybe make it a PATCH - # - Maybe split out into two endpoints - # - Add endpoint to reset to default so we don't have to pass secrets in POST request - # to remove custom storage? 
@router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_refs( storage: OrgStorageRefs, From 21b4867ad6d5ef7a7ad34fb1746e0df416c9b917 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 24 Sep 2024 18:13:20 -0400 Subject: [PATCH 10/69] Add todos re: tasks necessary to change storage --- backend/btrixcloud/storages.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index fe01973f38..4c4f8d1dd5 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -261,6 +261,21 @@ async def update_storage_refs( org.storage = storage_refs.storage org.storageReplicas = storage_refs.storageReplicas + # TODO: Account for replication if there's stored content, + # we'll need to move it to the new bucket and update paths in the + # database accordingly (if any updates are needed, at minimum should + # probably re-generate presigned URLs?) + # If a replica location is added, we should replicate everything in + # primary storage into it + # If a replica location is removed, should we wait for content to + # be deleted before removing it, or assume that happens manually as + # necessary? + + # We'll also need to make sure any running crawl, bg, or profile jobs + # that use this storage are completed first + + # We can set the org to read-only while handling these details + await self.org_ops.update_storage_refs(org) return {"updated": True} From b60d7cca9332dea671d38a9c90210d811cc446cc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 09:15:42 -0400 Subject: [PATCH 11/69] Check that no crawls are running before updating storage --- backend/btrixcloud/orgs.py | 9 +++++++++ backend/btrixcloud/storages.py | 27 ++++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index ac939391f1..518f94a1ea 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -993,6 +993,15 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: "publicCollectionsCount": public_collections_count, } + async def is_crawl_running(self, oid: UUID) -> bool: + """Return boolean indicating whether any crawls are currently running in org""" + workflows_running_count = await self.crawls_db.count_documents( + {"oid": org.id, "state": {"$in": RUNNING_STATES}} + ) + if workflows_running_count > 0: + return True + return False + async def get_all_org_slugs(self) -> dict[str, list[str]]: """Return list of all org slugs.""" slugs = await self.orgs.distinct("slug", {}) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 4c4f8d1dd5..48cae1ff03 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -258,25 +258,26 @@ async def update_storage_refs( except: raise HTTPException(status_code=400, detail="invalid_storage_ref") + if await self.org_ops.is_crawl_running(org.id): + raise HTTPException(status_code=400, detail="crawl_running") + org.storage = storage_refs.storage org.storageReplicas = storage_refs.storageReplicas - # TODO: Account for replication if there's stored content, - # we'll need to move it to the new bucket and update paths in the - # database accordingly (if any updates are needed, at minimum should - # probably re-generate presigned URLs?) 
- # If a replica location is added, we should replicate everything in - # primary storage into it - # If a replica location is removed, should we wait for content to - # be deleted before removing it, or assume that happens manually as - # necessary? + await self.org_ops.update_storage_refs(org) - # We'll also need to make sure any running crawl, bg, or profile jobs - # that use this storage are completed first + # TODO: Handle practical consequences of changing buckets + # - If previous primary bucket(s) had files stored, copy or move those + # into new storage and make necessary updates (e.g. regenerate presigned + # URLs?) + # - If replica location is added, replicate everything in primary + # to new replica storage location + # - If replica location is removed, start jobs to delete content? + # (or do we want to handle that manually?) # We can set the org to read-only while handling these details - - await self.org_ops.update_storage_refs(org) + # Think through timing and/or how to communicate status of jobs to + # user, since background jobs don't block return {"updated": True} From bd91ce430ffc265b055079c69962217b71c08d82 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 10:08:07 -0400 Subject: [PATCH 12/69] Start adding post-storage update logic --- backend/btrixcloud/orgs.py | 52 ++++++++++++++++++++++++++++-- backend/btrixcloud/storages.py | 59 +++++++++++++++++++++++++++------- 2 files changed, 96 insertions(+), 15 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 518f94a1ea..ba32c23b44 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -503,6 +503,36 @@ async def update_storage_refs(self, org: Organization) -> bool: res = await self.orgs.find_one_and_update({"_id": org.id}, {"$set": set_dict}) return res is not None + async def update_file_storage_refs( + self, org: Organization, previous_storage: StorageRef, new_storage: StorageRef + ) -> bool: + """Update storage refs for all crawl and profile files in given org""" + res = await self.crawls_db.update_many( + {"_id": org.id, "files.$.storage": previous_storage}, + {"$set": {"files.$.storage": new_storage}}, + ) + if not res: + return False + + res = await self.profiles_db.update_many( + {"_id": org.id, "resource.storage": previous_storage}, + {"$set": {"resource.storage": new_storage}}, + ) + if not res: + return False + + return True + + async def unset_file_presigned_urls(self, org: Organization) -> bool: + """Unset all presigned URLs for files in org""" + res = await self.crawls_db.update_many( + {"_id": org.id}, {"$set": {"files.$.presignedUrl": None}} + ) + if not res: + return False + + return True + async def update_subscription_data( self, update: SubscriptionUpdate ) -> Optional[Organization]: @@ -993,13 +1023,29 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: "publicCollectionsCount": public_collections_count, } - async def is_crawl_running(self, oid: UUID) -> bool: + async def is_crawl_running(self, org: Organization) -> bool: """Return boolean indicating whether any crawls are currently running in org""" - workflows_running_count = await self.crawls_db.count_documents( + running_count = await self.crawls_db.count_documents( {"oid": org.id, "state": {"$in": RUNNING_STATES}} ) - if workflows_running_count > 0: + if running_count > 0: + return True + return False + + async def has_files_stored(self, org: Organization) -> bool: + """Return boolean indicating whether any files are stored on org""" + crawl_count = await 
self.crawls_db.count_documents( + {"_id": org.id, "files.1": {"$exists": True}}, + ) + if crawl_count > 0: + return True + + profile_count = await self.profiles_db.count_documents( + {"_id": org.id, "resource": {"$exists": True}}, + ) + if profile_count > 0: return True + return False async def get_all_org_slugs(self) -> dict[str, list[str]]: diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 48cae1ff03..42fa72b63e 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -261,26 +261,61 @@ async def update_storage_refs( if await self.org_ops.is_crawl_running(org.id): raise HTTPException(status_code=400, detail="crawl_running") + prev_storage = org.storage + prev_storage_replicas = org.storageReplicas + org.storage = storage_refs.storage org.storageReplicas = storage_refs.storageReplicas await self.org_ops.update_storage_refs(org) - # TODO: Handle practical consequences of changing buckets - # - If previous primary bucket(s) had files stored, copy or move those - # into new storage and make necessary updates (e.g. regenerate presigned - # URLs?) - # - If replica location is added, replicate everything in primary - # to new replica storage location - # - If replica location is removed, start jobs to delete content? - # (or do we want to handle that manually?) - - # We can set the org to read-only while handling these details - # Think through timing and/or how to communicate status of jobs to - # user, since background jobs don't block + asyncio.create_task( + self.run_post_storage_update_tasks( + OrgStorageRefs(storage=prev_storage, storageReplicas=prev_storage_refs), + storage_refs, + org, + ) + ) return {"updated": True} + async def run_post_storage_update_tasks( + self, + prev_storage_refs: StorageRef, + new_storage_refs: StorageRef, + org: Organization, + ): + """Handle tasks necessary after changing org storage""" + if not await self.org_ops.has_files_stored(org): + return + + if new_storage_refs.storage != prev_storage_refs.storage: + await self.org_ops.update_read_only(org, True, "Updating storage") + + # TODO: Copy files from from previous to new primary storage + # (Ideally block on this, otherwise unset read-only on completion in + # operator?) 
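# [Editor's sketch -- not from the patches] A minimal version of the
# bucket-to-bucket copy this TODO describes, assuming both buckets are
# reachable through a single aiobotocore S3 client (same endpoint and
# credentials). Cross-endpoint moves would instead need a download/re-upload
# or an external bulk tool, which is roughly what the background job added
# later in this series delegates to:
async def copy_bucket_contents(client, src_bucket: str, dest_bucket: str) -> None:
    # page through every object in the source bucket and issue a
    # server-side CopyObject into the destination, preserving keys
    paginator = client.get_paginator("list_objects_v2")
    async for page in paginator.paginate(Bucket=src_bucket):
        for obj in page.get("Contents", []):
            await client.copy_object(
                Bucket=dest_bucket,
                Key=obj["Key"],
                CopySource={"Bucket": src_bucket, "Key": obj["Key"]},
            )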
+ + await self.org_ops.update_file_storage_refs( + org, prev_storage_refs.storage, new_storage_refs.storage + ) + + await self.org_ops.unset_file_presigned_urls(org) + + await self.org_ops.update_read_only(org, False) + + if new_storage_refs.storageReplicas != prev_storage_refs.storageReplicas: + # If replica location added, kick off background jobs to replicate + # primary storage to new replica storage location (non-blocking) + + # If replica location is removed, start jobs to delete files from + # removed replica location (non-blocking) + + # If we also changed primary storage in this update, we should make + # sure all files are successfully copied before doing anything to + # the replicas + pass + def get_available_storages(self, org: Organization) -> List[StorageRef]: """return a list of available default + custom storages""" refs: List[StorageRef] = [] From b7fededcae76ef8e56c4052dc9a76e4454c8cb5d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 10:59:49 -0400 Subject: [PATCH 13/69] WIP: Add background job to copy old s3 bucket to new --- backend/btrixcloud/background_jobs.py | 79 ++++++++++++++++++++++++++- backend/btrixcloud/crawlmanager.py | 41 +++++++++++++- backend/btrixcloud/main.py | 2 + backend/btrixcloud/models.py | 15 ++++- backend/btrixcloud/storages.py | 36 +++++++++--- 5 files changed, 162 insertions(+), 11 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 49f82d12e1..471dbcfd19 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -24,6 +24,7 @@ RecalculateOrgStatsJob, ReAddOrgPagesJob, OptimizePagesJob, + CopyBucketJob, PaginatedBackgroundJobResponse, AnyJob, StorageRef, @@ -429,7 +430,7 @@ async def create_re_add_org_pages_job( return job_id # pylint: disable=broad-exception-caught except Exception as exc: - # pylint: disable=raise-missing-from + # pylint: disable=raise-missing-from print(f"warning: re-add org pages job could not be started: {exc}") return None @@ -473,6 +474,67 @@ async def create_optimize_crawl_pages_job( print(f"warning: optimize pages job could not be started: {exc}") return None + async def create_copy_bucket_job( + self, + org: Organization, + prev_storage_ref: StorageRef, + new_storage_ref: StorageRef, + existing_job_id: Optional[str] = None, + ) -> str: + """Start background job to copy entire s3 bucket and return job id""" + prev_storage = self.storage_ops.get_org_storage_by_ref(org, prev_storage_ref) + prev_endpoint, prev_bucket = self.strip_bucket(prev_storage.endpoint_url) + + new_storage = self.storage_ops.get_org_storage_by_ref(org, new_storage_ref) + new_endpoint, new_bucket = self.strip_bucket(new_storage.endpoint_url) + + job_type = BgJobType.COPY_BUCKET.value + + try: + job_id = await self.crawl_manager.run_copy_bucket_job( + oid=str(org.id), + job_type=job_type, + prev_storage=prev_storage_ref, + prev_endpoint=prev_endpoint, + prev_bucket=prev_bucket, + new_storage=new_storage_ref, + new_endpoint=new_endpoint, + new_bucket=new_bucket, + job_id_prefix=f"{job_type}-{object_id}", + existing_job_id=existing_job_id, + ) + if existing_job_id: + copy_job = await self.get_background_job(existing_job_id, org.id) + previous_attempt = { + "started": copy_job.started, + "finished": copy_job.finished, + } + if copy_job.previousAttempts: + copy_job.previousAttempts.append(previous_attempt) + else: + copy_job.previousAttempts = [previous_attempt] + copy_job.started = dt_now() + copy_job.finished = None + copy_job.success = None 
+ else: + copy_job = CopyBucketJob( + id=job_id, + oid=org.id, + started=dt_now(), + prev_storage=prev_storage_ref, + new_storage=new_storage_ref, + ) + + await self.jobs.find_one_and_update( + {"_id": job_id}, {"$set": copy_job.to_dict()}, upsert=True + ) + + return job_id + # pylint: disable=broad-exception-caught + except Exception as exc: + print(f"warning: copy bucket job could not be started for org {org.id}") + return "" + async def job_finished( self, job_id: str, @@ -528,6 +590,9 @@ async def get_background_job( DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob, + CopyBucketJob, + DeleteOrgJob, + RecalculateOrgStatsJob, ReAddOrgPagesJob, OptimizePagesJob, ]: @@ -559,6 +624,9 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): if data["type"] == BgJobType.OPTIMIZE_PAGES: return OptimizePagesJob.from_dict(data) + if data["type"] == BgJobType.COPY_BUCKET: + return CopyBucketJob.from_dict(data) + return DeleteOrgJob.from_dict(data) async def list_background_jobs( @@ -736,6 +804,15 @@ async def retry_org_background_job( ) return {"success": True} + if job.type == BgJobType.COPY_BUCKET: + await self.create_copy_bucket_job( + org, + job.prev_storage, + job.new_storage, + existing_job_id=job_id, + ) + return {"success": True} + return {"success": False} async def retry_failed_org_background_jobs( diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index bbadb2424c..ede0b8480a 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -124,7 +124,6 @@ async def run_delete_org_job( existing_job_id: Optional[str] = None, ) -> str: """run job to delete org and all of its data""" - if existing_job_id: job_id = existing_job_id else: @@ -210,6 +209,46 @@ async def _run_bg_job_with_ops_classes( await self.create_from_yaml(data, namespace=DEFAULT_NAMESPACE) + async def run_copy_bucket_job( + self, + oid: str, + job_type: str, + prev_storage: StorageRef, + prev_endpoint: str, + prev_bucket: str, + new_storage: StorageRef, + new_endpoint: str, + new_bucket: str, + job_id_prefix: Optional[str] = None, + existing_job_id: Optional[str] = None, + ): + """run job to copy entire contents of one s3 bucket to another""" + if existing_job_id: + job_id = existing_job_id + else: + if not job_id_prefix: + job_id_prefix = job_type + + # ensure name is <=63 characters + job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}" + + params = { + "id": job_id, + "oid": oid, + "job_type": job_type, + "prev_secret_name": prev_storage.get_storage_secret_name(oid), + "prev_endpoint": prev_endpoint, + "prev_bucket": prev_bucket, + "new_secret_name": new_storage.get_storage_secret_name(oid), + "new_endpoint": new_endpoint, + "new_bucket": new_bucket, + "BgJobType": BgJobType, + } + + data = self.templates.env.get_template("copy_job.yaml").render(params) + + await self.create_from_yaml(data) + return job_id async def create_crawl_job( diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 71dd62da95..ad08a0ec24 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -266,6 +266,8 @@ def main() -> None: org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops, page_ops) + storage_ops.set_ops(background_job_ops) + user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops) background_job_ops.set_ops(base_crawl_ops, profiles) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index fa333f832f..e7a7ca2399 100644 --- a/backend/btrixcloud/models.py +++ 
b/backend/btrixcloud/models.py @@ -2570,6 +2570,7 @@ class BgJobType(str, Enum): RECALCULATE_ORG_STATS = "recalculate-org-stats" READD_ORG_PAGES = "readd-org-pages" OPTIMIZE_PAGES = "optimize-pages" + COPY_BUCKET = "copy-bucket" # ============================================================================ @@ -2588,7 +2589,7 @@ class BackgroundJob(BaseMongoModel): # ============================================================================ class CreateReplicaJob(BackgroundJob): - """Model for tracking create of replica jobs""" + """Model for tracking creation of replica jobs""" type: Literal[BgJobType.CREATE_REPLICA] = BgJobType.CREATE_REPLICA file_path: str @@ -2639,15 +2640,25 @@ class OptimizePagesJob(BackgroundJob): type: Literal[BgJobType.OPTIMIZE_PAGES] = BgJobType.OPTIMIZE_PAGES +# ============================================================================ +class CopyBucketJob(BackgroundJob): + """Model for tracking job to copy entire s3 bucket""" + + type: Literal[BgJobType.COPY_BUCKET] = BgJobType.COPY_BUCKET + prev_storage: StorageRef + new_storage: StorageRef + + # ============================================================================ # Union of all job types, for response model AnyJob = RootModel[ Union[ + BackgroundJob, CreateReplicaJob, DeleteReplicaJob, - BackgroundJob, DeleteOrgJob, + CopyBucketJob, RecalculateOrgStatsJob, ReAddOrgPagesJob, OptimizePagesJob, diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 42fa72b63e..3c90eae966 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -23,6 +23,7 @@ import zlib import json import os +import time from datetime import datetime, timedelta from zipfile import ZipInfo @@ -63,8 +64,9 @@ if TYPE_CHECKING: from .orgs import OrgOps from .crawlmanager import CrawlManager + from .background_jobs import BackgroundJobOps else: - OrgOps = CrawlManager = object + OrgOps = CrawlManager = BackgroundJobOps = object CHUNK_SIZE = 1024 * 256 @@ -104,6 +106,8 @@ def __init__(self, org_ops, crawl_manager, mdb) -> None: default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default") self.frontend_origin = f"{frontend_origin}.{default_namespace}" + self.base_crawl_ops = cast(BackgroundJobOps, None) with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh: storage_list = json.loads(fh.read()) @@ -148,6 +152,10 @@ async def init_index(self): "signedAt", expireAfterSeconds=self.expire_at_duration_seconds ) + def set_ops(self, background_job_ops: BackgroundJobOps) -> None: + """Set background job ops""" + self.background_job_ops = background_job_ops + def _create_s3_storage(self, storage: dict[str, str]) -> S3Storage: """create S3Storage object""" endpoint_url = storage["endpoint_url"] @@ -281,8 +289,8 @@ async def update_storage_refs( async def run_post_storage_update_tasks( self, - prev_storage_refs: StorageRef, - new_storage_refs: StorageRef, + prev_storage_refs: OrgStorageRefs, + new_storage_refs: OrgStorageRefs, org: Organization, ): """Handle tasks necessary after changing org storage""" @@ -292,14 +300,28 @@ async def run_post_storage_update_tasks( if new_storage_refs.storage != prev_storage_refs.storage: await self.org_ops.update_read_only(org, True, "Updating storage") - # TODO: Copy files from previous to new primary storage - # (Ideally block on this, otherwise unset read-only on completion in - # operator?) + # Create the background job to copy files + job_id = await self.background_job_ops.create_copy_bucket_job( + org, prev_storage_refs.storage, new_storage_refs.storage + ) + + # Block until copy job is complete + # TODO: Handle in operator instead? + while True: + bg_job = await self.background_job_ops.get_background_job( + job_id, org.id + ) + if bg_job.success: + break + if bg_job.success is False: + # TODO: Handle failure case + pass + + time.sleep(5) await self.org_ops.update_file_storage_refs( org, prev_storage_refs.storage, new_storage_refs.storage ) - await self.org_ops.unset_file_presigned_urls(org) await self.org_ops.update_read_only(org, False) From b15c329bf603e72178aaa9b9d13e8ceb3f4601a2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 11:06:58 -0400 Subject: [PATCH 14/69] WIP: Start adding logic to handle replica location updates --- backend/btrixcloud/storages.py | 37 +++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 3c90eae966..b01760a3c1 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -314,7 +314,8 @@ async def run_post_storage_update_tasks( if bg_job.success: break if bg_job.success is False: - # TODO: Handle failure case + # TODO: Handle failure case, including partial failure + # (i.e. some files but not all copied) pass time.sleep(5) @@ -327,16 +328,32 @@ async def run_post_storage_update_tasks( await self.org_ops.update_read_only(org, False) if new_storage_refs.storageReplicas != prev_storage_refs.storageReplicas: + # TODO: If we changed primary storage in this update, make sure that + # all files are successfully copied to new primary before doing + # anything with the replicas - this may be simple or complex depending + # on final approach taken to handle above + + # Replicate files to any new replica locations + for replica_storage in new_storage_refs.storageReplicas: + if replica_storage not in prev_storage_refs.storageReplicas: + # TODO: Kick off background jobs to replicate primary + # storage to new replica location + print( + f"Not yet implemented: Replicate files to {replica_storage.name}", + flush=True, + ) - # If replica location added, kick off background jobs to replicate - # primary storage to new replica storage location (non-blocking) - - # If replica location is removed, start jobs to delete files from - # removed replica location (non-blocking) + # Delete files from previous replica locations that are no longer + # being used + for replica_storage in prev_storage_refs.storageReplicas: + if replica_storage not in new_storage_refs.storageReplicas: + # TODO: Kick off background jobs to delete replicas + # (may be easier to just delete all files from bucket + # in one rclone command) + print( + f"Not yet implemented: Delete files from {replica_storage.name}", + flush=True, + ) - # If we also changed primary storage in this update, we should make - # sure all files are successfully copied before doing anything to - # the replicas - pass From 311c01116b50ad337a3c215fb19d74860ddca268 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 11:08:22 -0400 Subject: [PATCH 15/69] Add additional note --- backend/btrixcloud/storages.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py 
b/backend/btrixcloud/storages.py index b01760a3c1..f3ed3a8468 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -293,7 +293,11 @@ async def run_post_storage_update_tasks( new_storage_refs: OrgStorageRefs, org: Organization, ): - """Handle tasks necessary after changing org storage""" + """Handle tasks necessary after changing org storage + + This is a good candidate for a background job with access to ops + classes, which may kick off other background jobs as needed. + """ if not await self.org_ops.has_files_stored(org): return From d991900de07ee1e8ed13a4859167f5fcdb30f390 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 11:55:05 -0400 Subject: [PATCH 16/69] Fix argument --- backend/btrixcloud/storages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index f3ed3a8468..99056bef59 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -266,7 +266,7 @@ async def update_storage_refs( except: raise HTTPException(status_code=400, detail="invalid_storage_ref") - if await self.org_ops.is_crawl_running(org.id): + if await self.org_ops.is_crawl_running(org): raise HTTPException(status_code=400, detail="crawl_running") prev_storage = org.storage From 933c0435c3b1939a0108760b40631efb67a0c8a1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 12:16:23 -0400 Subject: [PATCH 17/69] Fix another argument --- backend/btrixcloud/storages.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 99056bef59..1bbe07fec7 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -279,7 +279,9 @@ async def update_storage_refs( asyncio.create_task( self.run_post_storage_update_tasks( - OrgStorageRefs(storage=prev_storage, storageReplicas=prev_storage_refs), + OrgStorageRefs( + storage=prev_storage, storageReplicas=prev_storage_replicas + ), storage_refs, org, ) From e77bfe62b16ff25aac98dfe2f956c672eeb70da3 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 12:20:07 -0400 Subject: [PATCH 18/69] Fixups --- backend/btrixcloud/background_jobs.py | 2 +- backend/btrixcloud/storages.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 471dbcfd19..857c0993b9 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -500,7 +500,7 @@ async def create_copy_bucket_job( new_storage=new_storage_ref, new_endpoint=new_endpoint, new_bucket=new_bucket, - job_id_prefix=f"{job_type}-{object_id}", + job_id_prefix=job_type, existing_job_id=existing_job_id, ) if existing_job_id: diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 1bbe07fec7..38fad45391 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -72,7 +72,7 @@ # ============================================================================ -# pylint: disable=broad-except,raise-missing-from +# pylint: disable=broad-except,raise-missing-from,too-many-public-methods class StorageOps: """All storage handling, download/upload operations""" @@ -106,7 +106,7 @@ def __init__(self, org_ops, crawl_manager, mdb) -> None: default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default") self.frontend_origin = f"{frontend_origin}.{default_namespace}" - self.base_crawl_ops = 
cast(BackgroundJobOps, None) + self.background_job_ops = cast(BackgroundJobOps, None) with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh: storage_list = json.loads(fh.read()) From b4115a297709ed81745c5d634414728c921e0c42 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 13:11:37 -0400 Subject: [PATCH 19/69] Fix linting --- backend/btrixcloud/background_jobs.py | 6 ++++-- backend/btrixcloud/basecrawls.py | 2 +- backend/btrixcloud/colls.py | 1 + backend/btrixcloud/crawlconfigs.py | 2 +- backend/btrixcloud/crawlmanager.py | 2 +- backend/btrixcloud/crawls.py | 4 ++-- backend/btrixcloud/db.py | 4 ++-- backend/btrixcloud/emailsender.py | 2 +- backend/btrixcloud/invites.py | 1 + backend/btrixcloud/k8sapi.py | 2 +- .../btrixcloud/migrations/migration_0032_dupe_org_names.py | 2 +- backend/btrixcloud/operator/baseoperator.py | 2 +- backend/btrixcloud/operator/crawls.py | 2 +- backend/btrixcloud/operator/cronjobs.py | 2 +- backend/btrixcloud/orgs.py | 2 +- backend/btrixcloud/pages.py | 4 ++-- backend/btrixcloud/profiles.py | 4 ++-- backend/btrixcloud/storages.py | 2 +- backend/btrixcloud/subs.py | 2 ++ backend/btrixcloud/uploads.py | 2 +- backend/btrixcloud/users.py | 2 +- backend/btrixcloud/webhooks.py | 4 ++-- 22 files changed, 31 insertions(+), 25 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 857c0993b9..9a68901fcd 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -532,7 +532,9 @@ async def create_copy_bucket_job( return job_id # pylint: disable=broad-exception-caught except Exception as exc: - print(f"warning: copy bucket job could not be started for org {org.id}") + print( + f"warning: copy bucket job could not be started for org {org.id}: {exc}" + ) return "" async def job_finished( @@ -850,7 +852,7 @@ async def retry_all_failed_background_jobs( # ============================================================================ -# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-positional-arguments def init_background_jobs_api( app, mdb, email, user_manager, org_ops, crawl_manager, storage_ops, user_dep ): diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 48ed89402f..b0b77bafbb 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -49,7 +49,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines, too-many-branches +# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines, too-many-branches, too-many-positional-arguments class BaseCrawlOps: """operations that apply to all crawls""" diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index ff9cc70959..352b5b670c 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -66,6 +66,7 @@ # ============================================================================ +# pylint: disable=too-many-positional-arguments class CollectionOps: """ops for working with named collections of crawls""" diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 1b1e236589..6c45ec6024 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -76,7 +76,7 @@ class CrawlConfigOps: """Crawl Config Operations""" 
- # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods + # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-positional-arguments user_manager: UserManager org_ops: OrgOps diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index ede0b8480a..7e0fc5c639 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -21,7 +21,7 @@ # ============================================================================ -# pylint: disable=too-many-public-methods +# pylint: disable=too-many-public-methods, too-many-positional-arguments class CrawlManager(K8sAPI): """abstract crawl manager""" diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 3718a34bed..7661897ae6 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -72,7 +72,7 @@ # ============================================================================ -# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods +# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-positional-arguments class CrawlOps(BaseCrawlOps): """Crawl Ops""" @@ -1115,7 +1115,7 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): # ============================================================================ -# pylint: disable=too-many-arguments, too-many-locals, too-many-statements +# pylint: disable=too-many-arguments, too-many-locals, too-many-statements, too-many-positional-arguments def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args): """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, duplicate-code diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 22df847515..1567bbcfa4 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -87,7 +87,7 @@ async def ping_db(mdb) -> None: # ============================================================================ async def update_and_prepare_db( - # pylint: disable=R0913 + # pylint: disable=R0913, too-many-positional-arguments mdb: AsyncIOMotorDatabase, user_manager: UserManager, org_ops: OrgOps, @@ -223,7 +223,7 @@ async def drop_indexes(mdb): # ============================================================================ -# pylint: disable=too-many-arguments +# pylint: disable=too-many-arguments, too-many-positional-arguments async def create_indexes( org_ops, crawl_ops, diff --git a/backend/btrixcloud/emailsender.py b/backend/btrixcloud/emailsender.py index 7651e8dc3f..c82682db2e 100644 --- a/backend/btrixcloud/emailsender.py +++ b/backend/btrixcloud/emailsender.py @@ -17,7 +17,7 @@ from .utils import is_bool, get_origin -# pylint: disable=too-few-public-methods, too-many-instance-attributes +# pylint: disable=too-few-public-methods, too-many-instance-attributes, too-many-positional-arguments class EmailSender: """SMTP Email Sender""" diff --git a/backend/btrixcloud/invites.py b/backend/btrixcloud/invites.py index fdab122d22..e5f84ca785 100644 --- a/backend/btrixcloud/invites.py +++ b/backend/btrixcloud/invites.py @@ -27,6 +27,7 @@ # ============================================================================ +# pylint: disable=too-many-positional-arguments class InviteOps: """invite users (optionally to an org), send emails and delete invites""" diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 
148f656604..b0a901a824 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -22,7 +22,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes +# pylint: disable=too-many-instance-attributes, too-many-positional-arguments class K8sAPI: """K8S API accessors""" diff --git a/backend/btrixcloud/migrations/migration_0032_dupe_org_names.py b/backend/btrixcloud/migrations/migration_0032_dupe_org_names.py index d2c06781e0..c1f417a4d5 100644 --- a/backend/btrixcloud/migrations/migration_0032_dupe_org_names.py +++ b/backend/btrixcloud/migrations/migration_0032_dupe_org_names.py @@ -54,7 +54,7 @@ async def migrate_up(self): orgs_db, org_name_set, org_slug_set, name, org_dict.get("_id") ) - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-positional-arguments async def update_org_name_and_slug( self, orgs_db, diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py index b63ff9f4ae..ff3bb5ea54 100644 --- a/backend/btrixcloud/operator/baseoperator.py +++ b/backend/btrixcloud/operator/baseoperator.py @@ -136,7 +136,7 @@ async def async_init(self) -> None: print("Auto-Resize Enabled", self.enable_auto_resize) -# pylint: disable=too-many-instance-attributes, too-many-arguments +# pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-positional-arguments # ============================================================================ class BaseOperator: """BaseOperator""" diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 70bcec66a0..9d9fae5743 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -78,7 +78,7 @@ # pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements -# pylint: disable=invalid-name, too-many-lines, too-many-return-statements +# pylint: disable=invalid-name, too-many-lines, too-many-return-statements, too-many-positional-arguments # ============================================================================ class CrawlOperator(BaseOperator): """CrawlOperator Handler""" diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 9a411431e5..b811d8e26b 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -51,7 +51,7 @@ def get_finished_response( status=status, ) - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-positional-arguments async def make_new_crawljob( self, cid: UUID, diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index ba32c23b44..27750f8e41 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -111,7 +111,7 @@ # ============================================================================ -# pylint: disable=too-many-public-methods, too-many-instance-attributes, too-many-locals, too-many-arguments +# pylint: disable=too-many-public-methods, too-many-instance-attributes, too-many-locals, too-many-arguments, too-many-positional-arguments class OrgOps: """Organization API operations""" diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 189c2108d9..b80ff4b8d3 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -52,7 +52,7 @@ # ============================================================================ -# pylint: 
disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods +# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods, too-many-positional-arguments class PageOps: """crawl pages""" @@ -1013,7 +1013,7 @@ async def process_finished_crawls(): # ============================================================================ -# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-positional-arguments def init_pages_api( app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep ) -> PageOps: diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index 06010551d3..2693e79fe0 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -49,7 +49,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes, too-many-arguments +# pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-positional-arguments class ProfileOps: """Profile management""" @@ -500,7 +500,7 @@ async def calculate_org_profile_file_storage(self, oid: UUID) -> int: # ============================================================================ -# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments +# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments, too-many-positional-arguments def init_profiles_api( mdb, org_ops: OrgOps, diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 38fad45391..ad68301092 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -72,7 +72,7 @@ # ============================================================================ -# pylint: disable=broad-except,raise-missing-from,too-many-public-methods +# pylint: disable=broad-except,raise-missing-from,too-many-public-methods, too-many-positional-arguments class StorageOps: """All storage handling, download/upload operations""" diff --git a/backend/btrixcloud/subs.py b/backend/btrixcloud/subs.py index fd4a359686..2b4abe0a3f 100644 --- a/backend/btrixcloud/subs.py +++ b/backend/btrixcloud/subs.py @@ -56,6 +56,8 @@ class SubOps: """API for managing subscriptions. 
Only enabled if billing is enabled""" + # pylint: disable=too-many-positional-arguments + org_ops: OrgOps user_manager: UserManager diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 80771f3c17..ac5e262193 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -247,7 +247,7 @@ def read(self, size: Optional[int] = CHUNK_SIZE) -> bytes: # ============================================================================ -# pylint: disable=too-many-arguments, too-many-locals, invalid-name +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, too-many-positional-arguments def init_uploads_api(app, user_dep, *args): """uploads api""" diff --git a/backend/btrixcloud/users.py b/backend/btrixcloud/users.py index e41a583b81..ae4e808103 100644 --- a/backend/btrixcloud/users.py +++ b/backend/btrixcloud/users.py @@ -327,7 +327,7 @@ async def request_verify( user.email, token, dict(request.headers) if request else None ) - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-positional-arguments async def create_user( self, name: str, diff --git a/backend/btrixcloud/webhooks.py b/backend/btrixcloud/webhooks.py index 1222ccea17..4259308627 100644 --- a/backend/btrixcloud/webhooks.py +++ b/backend/btrixcloud/webhooks.py @@ -40,7 +40,7 @@ class EventWebhookOps: """Event webhook notification management""" - # pylint: disable=invalid-name, too-many-arguments, too-many-locals + # pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-positional-arguments org_ops: OrgOps crawl_ops: CrawlOps @@ -556,7 +556,7 @@ async def create_collection_deleted_notification( ) -# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme +# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-positional-arguments def init_event_webhooks_api(mdb, org_ops, app): """init event webhooks system""" # pylint: disable=invalid-name From e1007cac5318d7609a9e99eb88ed5b41ca4fcda2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 25 Sep 2024 13:17:06 -0400 Subject: [PATCH 20/69] More linting fixes --- backend/btrixcloud/background_jobs.py | 2 +- backend/btrixcloud/crawlconfigs.py | 2 +- backend/btrixcloud/storages.py | 2 +- backend/btrixcloud/subs.py | 2 +- backend/btrixcloud/uploads.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 9a68901fcd..4613645fb5 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -57,7 +57,7 @@ class BackgroundJobOps: migration_jobs_scale: int - # pylint: disable=too-many-locals, too-many-arguments, invalid-name + # pylint: disable=too-many-locals, too-many-arguments, too-many-positional-arguments, invalid-name def __init__(self, mdb, email, user_manager, org_ops, crawl_manager, storage_ops): self.jobs = mdb["jobs"] diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 6c45ec6024..2160c943ae 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1265,7 +1265,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID): # ============================================================================ -# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments +# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments,too-many-positional-arguments def 
init_crawl_config_api( app, dbclient, diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index ad68301092..d6bce1f0e9 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -412,7 +412,7 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None try: resp = await client.put_object(Bucket=bucket, Key=key, Body=data) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - except: + except Exception: # create bucket if it doesn't yet exist and then try again resp = await client.create_bucket(Bucket=bucket) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 diff --git a/backend/btrixcloud/subs.py b/backend/btrixcloud/subs.py index 2b4abe0a3f..95c17dd158 100644 --- a/backend/btrixcloud/subs.py +++ b/backend/btrixcloud/subs.py @@ -334,7 +334,7 @@ async def get_billing_portal_url( return SubscriptionPortalUrlResponse() -# pylint: disable=invalid-name,too-many-arguments +# pylint: disable=invalid-name,too-many-arguments,too-many-positional-arguments def init_subs_api( app, mdb, diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index ac5e262193..ab11ead663 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -47,7 +47,7 @@ async def get_upload( return UploadedCrawl.from_dict(res) # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-function-args - # pylint: disable=too-many-arguments, too-many-locals, duplicate-code, invalid-name + # pylint: disable=too-many-positional-arguments, too-many-locals, duplicate-code, invalid-name async def upload_stream( self, stream, From eeaff5b1540c96301869a2ba5c22ca020b0013a3 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 10:51:46 -0400 Subject: [PATCH 21/69] Refactor, separate storage and replica updates --- backend/btrixcloud/background_jobs.py | 19 +-- backend/btrixcloud/models.py | 13 +- backend/btrixcloud/orgs.py | 18 ++- backend/btrixcloud/storages.py | 167 ++++++++++++++------------ backend/test/test_org.py | 25 ++-- 5 files changed, 140 insertions(+), 102 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 4613645fb5..b07e8d3386 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -500,7 +500,7 @@ async def create_copy_bucket_job( new_storage=new_storage_ref, new_endpoint=new_endpoint, new_bucket=new_bucket, - job_id_prefix=job_type, + job_id_prefix=f"{job_type}-{org.id}", existing_job_id=existing_job_id, ) if existing_job_id: @@ -562,6 +562,9 @@ async def job_finished( await self.handle_delete_replica_job_finished( cast(DeleteReplicaJob, job) ) + if job_type == BgJobType.COPY_BUCKET: + org = await self.orgs_ops.get_org_by_id(oid) + await self.org_ops.update_read_only(org, False) else: print( f"Background job {job.id} failed, sending email to superuser", @@ -611,22 +614,22 @@ async def get_background_job( def _get_job_by_type_from_data(self, data: dict[str, object]): """convert dict to proper background job type""" - if data["type"] == BgJobType.CREATE_REPLICA: + if data["type"] == BgJobType.CREATE_REPLICA.value: return CreateReplicaJob.from_dict(data) - if data["type"] == BgJobType.DELETE_REPLICA: + if data["type"] == BgJobType.DELETE_REPLICA.value: return DeleteReplicaJob.from_dict(data) - if data["type"] == BgJobType.RECALCULATE_ORG_STATS: + if data["type"] == BgJobType.RECALCULATE_ORG_STATS.value: return RecalculateOrgStatsJob.from_dict(data) -
if data["type"] == BgJobType.READD_ORG_PAGES: + if data["type"] == BgJobType.READD_ORG_PAGES.value: return ReAddOrgPagesJob.from_dict(data) - if data["type"] == BgJobType.OPTIMIZE_PAGES: + if data["type"] == BgJobType.OPTIMIZE_PAGES.value: return OptimizePagesJob.from_dict(data) - if data["type"] == BgJobType.COPY_BUCKET: + if data["type"] == BgJobType.COPY_BUCKET.value: return CopyBucketJob.from_dict(data) return DeleteOrgJob.from_dict(data) @@ -640,7 +643,7 @@ async def list_background_jobs( job_type: Optional[str] = None, sort_by: Optional[str] = None, sort_direction: Optional[int] = -1, - ) -> Tuple[List[BackgroundJob], int]: + ) -> Tuple[List[Union[CreateReplicaJob, DeleteReplicaJob, CopyBucketJob]], int]: """List all background jobs""" # pylint: disable=duplicate-code # Zero-index page for query diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index e7a7ca2399..78ee7b5b56 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1633,12 +1633,17 @@ class OrgPublicCollections(BaseModel): # ============================================================================ -class OrgStorageRefs(BaseModel): - """Input model for setting primary storage + optional replicas""" +class OrgStorageRef(BaseModel): + """Input model for setting primary storage""" storage: StorageRef - storageReplicas: List[StorageRef] = [] + +# ============================================================================ +class OrgStorageReplicaRefs(BaseModel): + """Input model for setting replica storages""" + + storageReplicas: List[StorageRef] # ============================================================================ @@ -2644,7 +2649,7 @@ class OptimizePagesJob(BackgroundJob): class CopyBucketJob(BackgroundJob): """Model for tracking job to copy entire s3 bucket""" - type: Literal[BgJobType.CREATE_REPLICA] = BgJobType.CREATE_REPLICA + type: Literal[BgJobType.COPY_BUCKET] = BgJobType.COPY_BUCKET prev_storage: StorageRef new_storage: StorageRef diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 27750f8e41..0294519927 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -496,9 +496,17 @@ async def update_slug_and_name(self, org: Organization) -> bool: ) return res is not None - async def update_storage_refs(self, org: Organization) -> bool: - """Update storage + replicas for given org""" - set_dict = org.dict(include={"storage": True, "storageReplicas": True}) + async def update_storage_refs( + self, org: Organization, replicas: bool = False + ) -> bool: + """Update storage or replica refs for given org""" + include = {} + if replicas: + include["storageReplicas"] = True + else: + include["storage"] = True + + set_dict = org.dict(include=include) res = await self.orgs.find_one_and_update({"_id": org.id}, {"$set": set_dict}) return res is not None @@ -1035,13 +1043,13 @@ async def is_crawl_running(self, org: Organization) -> bool: async def has_files_stored(self, org: Organization) -> bool: """Return boolean indicating whether any files are stored on org""" crawl_count = await self.crawls_db.count_documents( - {"_id": org.id, "files.1": {"$exists": True}}, + {"oid": org.id, "files": {"$exists": True, "$ne": []}}, ) if crawl_count > 0: return True profile_count = await self.profiles_db.count_documents( - {"_id": org.id, "resource": {"$exists": True}}, + {"oid": org.id, "resource": {"$exists": True}}, ) if profile_count > 0: return True diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 
d6bce1f0e9..821c764dae 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -47,7 +47,8 @@ StorageRef, S3Storage, S3StorageIn, - OrgStorageRefs, + OrgStorageRef, + OrgStorageReplicaRefs, DeletedResponse, UpdatedResponse, AddedResponseName, @@ -250,116 +251,117 @@ async def remove_custom_storage( return {"deleted": True} - async def update_storage_refs( + async def update_storage_ref( self, storage_refs: OrgStorageRefs, org: Organization, ) -> dict[str, bool]: """update storage for org""" + storage_ref = storage_refs.storage try: - self.get_org_storage_by_ref(org, storage_refs.storage) - - for replica in storage_refs.storageReplicas: - self.get_org_storage_by_ref(org, replica) - + self.get_org_storage_by_ref(org, storage_ref) except: raise HTTPException(status_code=400, detail="invalid_storage_ref") if await self.org_ops.is_crawl_running(org): raise HTTPException(status_code=400, detail="crawl_running") - prev_storage = org.storage - prev_storage_replicas = org.storageReplicas + if org.storage == storage_ref: + raise HTTPException(status_code=400, detail="identical_storage_ref") + + # TODO: Check that no CreateReplicaJobs are running - org.storage = storage_refs.storage - org.storageReplicas = storage_refs.storageReplicas + prev_storage = org.storage + org.storage = storage_ref await self.org_ops.update_storage_refs(org) - asyncio.create_task( - self.run_post_storage_update_tasks( - OrgStorageRefs( - storage=prev_storage, storageReplicas=prev_storage_replicas - ), - storage_refs, - org, - ) + # TODO: Consider running into asyncio task + await self.run_post_storage_update_tasks( + prev_storage_ref, + storage_ref, + org, ) return {"updated": True} async def run_post_storage_update_tasks( self, - prev_storage_refs: OrgStorageRefs, - new_storage_refs: OrgStorageRefs, + prev_storage_ref: StorageRef, + new_storage_ref: StorageRef, org: Organization, ): - """Handle tasks necessary after changing org storage - - This is a good candidate for a background job with access to ops - classes, which may kick off other background jobs as needed. - """ + """Handle tasks necessary after changing org storage""" if not await self.org_ops.has_files_stored(org): + print(f"No files stored", flush=True) return - if new_storage_refs.storage != prev_storage_refs.storage: + if new_storage_ref != prev_storage_ref: await self.org_ops.update_read_only(org, True, "Updating storage") # Create the background job to copy files - job_id = await self.background_job_ops.create_copy_bucket_job( - org, prev_storage_refs.storage, new_storage_refs.storage + await self.background_job_ops.create_copy_bucket_job( + org, prev_storage_ref, new_storage_ref ) - # Block until copy job is complete - # TODO: Handle in operator instead? - while True: - bg_job = await self.background_job_ops.get_background_job( - job_id, org.id - ) - if bg_job.success: - break - if bg_job.success is False: - # TODO: Handle failure case, including partial failure - # (i.e. 
some files but not all copied) - pass - - time.sleep(5) - await self.org_ops.update_file_storage_refs( - org, prev_storage_refs.storage, new_storage_refs.storage + org, prev_storage_ref, new_storage_ref ) await self.org_ops.unset_file_presigned_urls(org) - await self.org_ops.update_read_only(org, False) + async def update_storage_replica_refs( + self, + storage_refs: OrgStorageReplicaRefs, + org: Organization, + ) -> dict[str, bool]: + """update storage for org""" - if new_storage_refs.storageReplicas != prev_storage_refs.storageReplicas: - # TODO: If we changed primary storage in this update, make sure that - # are files are successfully copied to new primary before doing - # anything with the replicas - this may be simple or complex depending - # on final approach taken to handle above + replicas = storage_refs.storageReplicas - # Replicate files to any new replica locations - for replica_storage in new_storage_refs.storageReplicas: - if replica_storage not in prev_storage_refs.storageReplicas: - # TODO: Kick off background jobs to replicate primary - # storage to new replica location - print( - "Not yet implemented: Replicate files to {replica_storage.name}", - flush=True, - ) + try: + for replica in replicas: + self.get_org_storage_by_ref(org, replica) + except: + raise HTTPException(status_code=400, detail="invalid_storage_ref") - # Delete files from previous replica locations that are no longer - # being used - for replica_storage in prev_storage_refs.storageReplicas: - if replica_storage not in new_storage_refs.storageReplicas: - # TODO: Kick off background jobs to delete replicas - # (may be easier to just delete all files from bucket - # in one rclone command) - print( - "Not yet implemented: Delete files from {replica_storage.name}", - flush=True, - ) + if await self.org_ops.is_crawl_running(org): + raise HTTPException(status_code=400, detail="crawl_running") + + if org.storageReplicas == replicas: + raise HTTPException(status_code=400, detail="identical_storage_ref") + + # TODO: Check that no CreateReplicaJobs are running + + prev_storage_replicas = org.storageReplicas + org.storageReplicas = replicas + + await self.org_ops.update_storage_refs(org, replicas=True) + + # Replicate files to any new replica locations + for replica_storage in replicas: + if replica_storage not in prev_storage_replicas: + # TODO: Kick off background jobs to replicate primary + # storage to new replica location + print( + "Not yet implemented: Replicate files to {replica_storage.name}", + flush=True, + ) + + # Delete files from previous replica locations that are no longer + # being used + for replica_storage in prev_storage_replicas: + if replica_storage not in replicas: + # TODO: Kick off background jobs to delete replicas + # (may be easier to just delete all files from bucket + # in one rclone command - if so, will need to handle + # updating files in db as well) + print( + "Not yet implemented: Delete files from {replica_storage.name}", + flush=True, + ) + + return {"updated": True} def get_available_storages(self, org: Organization) -> List[StorageRef]: """return a list of available default + custom storages""" @@ -923,7 +925,9 @@ def get_storage_refs( """get storage refs for an org""" return OrgStorageRefs(storage=org.storage, storageReplicas=org.storageReplicas) - @router.get("/allStorages", tags=["organizations"], response_model=List[StorageRef]) + @router.get( + "/all-storages", tags=["organizations"], response_model=List[StorageRef] + ) def get_available_storages(org: Organization = 
Depends(org_owner_dep)): return storage_ops.get_available_storages(org) @@ -951,7 +955,7 @@ async def clear_all_presigned_urls(user: User = Depends(user_dep)): return {"success": True} @router.post( - "/customStorage", tags=["organizations"], response_model=AddedResponseName + "/custom-storage", tags=["organizations"], response_model=AddedResponseName ) async def add_custom_storage( storage: S3StorageIn, org: Organization = Depends(org_owner_dep) @@ -959,7 +963,7 @@ async def add_custom_storage( return await storage_ops.add_custom_storage(storage, org) @router.delete( - "/customStorage/{name}", tags=["organizations"], response_model=DeletedResponse + "/custom-storage/{name}", tags=["organizations"], response_model=DeletedResponse ) async def remove_custom_storage( name: str, org: Organization = Depends(org_owner_dep) @@ -967,10 +971,21 @@ async def remove_custom_storage( return await storage_ops.remove_custom_storage(name, org) @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) - async def update_storage_refs( + async def update_storage_ref( + storage: OrgStorageRefs, + org: Organization = Depends(org_owner_dep), + ): + return await storage_ops.update_storage_ref(storage, org) + + return storage_ops + + @router.post( + "/storage-replicas", tags=["organizations"], response_model=UpdatedResponse + ) + async def update_storage_replica_refs( storage: OrgStorageRefs, org: Organization = Depends(org_owner_dep), ): - return await storage_ops.update_storage_refs(storage, org) + return await storage_ops.update_storage_replica_refs(storage, org) return storage_ops diff --git a/backend/test/test_org.py b/backend/test/test_org.py index a632006781..0bf1e7b037 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -256,7 +256,7 @@ def test_change_storage_invalid(admin_auth_headers): def test_add_custom_storage(admin_auth_headers): # add custom storages r = requests.post( - f"{API_PREFIX}/orgs/{new_oid}/customStorage", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage", headers=admin_auth_headers, json={ "name": CUSTOM_PRIMARY_STORAGE_NAME, @@ -272,7 +272,7 @@ def test_add_custom_storage(admin_auth_headers): assert data["name"] == CUSTOM_PRIMARY_STORAGE_NAME r = requests.post( - f"{API_PREFIX}/orgs/{new_oid}/customStorage", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage", headers=admin_auth_headers, json={ "name": CUSTOM_REPLICA_STORAGE_NAME, @@ -287,19 +287,27 @@ def test_add_custom_storage(admin_auth_headers): assert data["added"] assert data["name"] == CUSTOM_REPLICA_STORAGE_NAME - # set org to use custom storages moving forward + # set org to use custom storage moving forward r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", headers=admin_auth_headers, json={ "storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}, - "storageReplicas": [{"name": CUSTOM_REPLICA_STORAGE_NAME, "custom": True}], }, ) assert r.status_code == 200 assert r.json()["updated"] + # set org to use custom storage replica moving forward + r = requests.post( + f"{API_PREFIX}/orgs/{new_oid}/storage-replicas", + headers=admin_auth_headers, + json={ + "storageReplicas": [{"name": CUSTOM_REPLICA_STORAGE_NAME, "custom": True}], + }, + ) + # check org was updated as expected r = requests.get( f"{API_PREFIX}/orgs/{new_oid}/storage", @@ -322,14 +330,14 @@ def test_add_custom_storage(admin_auth_headers): def test_remove_custom_storage(admin_auth_headers): # Try to remove in-use storages, verify we get expected 400 response r = requests.delete( - 
f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_PRIMARY_STORAGE_NAME}", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage/{CUSTOM_PRIMARY_STORAGE_NAME}", headers=admin_auth_headers, ) assert r.status_code == 400 assert r.json()["detail"] == "storage_in_use" r = requests.delete( - f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_REPLICA_STORAGE_NAME}", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage/{CUSTOM_REPLICA_STORAGE_NAME}", headers=admin_auth_headers, ) assert r.status_code == 400 @@ -337,17 +345,16 @@ def test_remove_custom_storage(admin_auth_headers): # Unset replica storage from org r = requests.post( - f"{API_PREFIX}/orgs/{new_oid}/storage", + f"{API_PREFIX}/orgs/{new_oid}/storage-replicas", headers=admin_auth_headers, json={ - "storage": {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True}, "storageReplicas": [], }, ) # Delete no longer used replica storage location r = requests.delete( - f"{API_PREFIX}/orgs/{new_oid}/customStorage/{CUSTOM_REPLICA_STORAGE_NAME}", + f"{API_PREFIX}/orgs/{new_oid}/custom-storage/{CUSTOM_REPLICA_STORAGE_NAME}", headers=admin_auth_headers, ) assert r.status_code == 200 From 84a58118a0878105d9e491f12b547367c93ddd28 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 10:59:25 -0400 Subject: [PATCH 22/69] More refactoring --- backend/btrixcloud/storages.py | 58 +++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 821c764dae..79164b09e3 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -265,7 +265,10 @@ async def update_storage_ref( raise HTTPException(status_code=400, detail="invalid_storage_ref") if await self.org_ops.is_crawl_running(org): - raise HTTPException(status_code=400, detail="crawl_running") + raise HTTPException(status_code=403, detail="crawl_running") + + if org.readOnly: + raise HTTPException(status_code=403, detail="org_set_to_read_only") if org.storage == storage_ref: raise HTTPException(status_code=400, detail="identical_storage_ref") @@ -277,7 +280,7 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) - # TODO: Consider running into asyncio task + # TODO: Consider running into asyncio task or background job await self.run_post_storage_update_tasks( prev_storage_ref, storage_ref, @@ -294,21 +297,20 @@ async def run_post_storage_update_tasks( ): """Handle tasks necessary after changing org storage""" if not await self.org_ops.has_files_stored(org): - print(f"No files stored", flush=True) + print(f"No files stored, no updates to do", flush=True) return - if new_storage_ref != prev_storage_ref: - await self.org_ops.update_read_only(org, True, "Updating storage") + await self.org_ops.update_read_only(org, True, "Updating storage") - # Create the background job to copy files - await self.background_job_ops.create_copy_bucket_job( - org, prev_storage_ref, new_storage_ref - ) + # Create the background job to copy files + await self.background_job_ops.create_copy_bucket_job( + org, prev_storage_ref, new_storage_ref + ) - await self.org_ops.update_file_storage_refs( - org, prev_storage_ref, new_storage_ref - ) - await self.org_ops.unset_file_presigned_urls(org) + await self.org_ops.update_file_storage_refs( + org, prev_storage_ref, new_storage_ref + ) + await self.org_ops.unset_file_presigned_urls(org) async def update_storage_replica_refs( self, @@ -326,7 +328,10 @@ async def update_storage_replica_refs( raise HTTPException(status_code=400, 
detail="invalid_storage_ref") if await self.org_ops.is_crawl_running(org): - raise HTTPException(status_code=400, detail="crawl_running") + raise HTTPException(status_code=403, detail="crawl_running") + + if org.readOnly: + raise HTTPException(status_code=403, detail="org_set_to_read_only") if org.storageReplicas == replicas: raise HTTPException(status_code=400, detail="identical_storage_ref") @@ -338,6 +343,26 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) + # TODO: Consider running in asyncio task or background job + await self.run_post_storage_replica_update_tasks( + prev_storage_replicas, replicas, org + ) + + return {"updated": True} + + async def run_post_storage_replica_update_tasks( + self, + prev_storage_refs: List[StorageRef], + new_storage_refs: List[StorageRef], + org: Organization, + ): + """Handle tasks necessary after updating org replica storages""" + if not await self.org_ops.has_files_stored(org): + print(f"No files stored, no updates to do", flush=True) + return + + await self.org_ops.update_read_only(org, True, "Updating storage replicas") + # Replicate files to any new replica locations for replica_storage in replicas: if replica_storage not in prev_storage_replicas: @@ -361,7 +386,10 @@ async def update_storage_replica_refs( flush=True, ) - return {"updated": True} + # TODO: Unset read only once all tasks are complete + # May need to handle this in the operators or a background job + # depending on how post-update tasks are run + await self.org_ops.update_read_only(org, False) def get_available_storages(self, org: Organization) -> List[StorageRef]: """return a list of available default + custom storages""" From aede4384967fdc7e78b021f1f9c51cb119f3f107 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:00:01 -0400 Subject: [PATCH 23/69] Make post-update task methods private --- backend/btrixcloud/storages.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 79164b09e3..d4b0df5534 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -281,7 +281,7 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) # TODO: Consider running into asyncio task or background job - await self.run_post_storage_update_tasks( + await self._run_post_storage_update_tasks( prev_storage_ref, storage_ref, org, @@ -289,7 +289,7 @@ async def update_storage_ref( return {"updated": True} - async def run_post_storage_update_tasks( + async def _run_post_storage_update_tasks( self, prev_storage_ref: StorageRef, new_storage_ref: StorageRef, @@ -344,13 +344,13 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) # TODO: Consider running in asyncio task or background job - await self.run_post_storage_replica_update_tasks( + await self._run_post_storage_replica_update_tasks( prev_storage_replicas, replicas, org ) return {"updated": True} - async def run_post_storage_replica_update_tasks( + async def _run_post_storage_replica_update_tasks( self, prev_storage_refs: List[StorageRef], new_storage_refs: List[StorageRef], From a0e6e2e76f79a0225b472f6852e750f336f7d67a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:04:58 -0400 Subject: [PATCH 24/69] Check if any bg jobs running before changing storage --- backend/btrixcloud/storages.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git 
a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index d4b0df5534..2aea46dfd1 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -264,16 +264,20 @@ async def update_storage_ref( except: raise HTTPException(status_code=400, detail="invalid_storage_ref") + if org.storage == storage_ref: + raise HTTPException(status_code=400, detail="identical_storage_ref") + if await self.org_ops.is_crawl_running(org): raise HTTPException(status_code=403, detail="crawl_running") if org.readOnly: raise HTTPException(status_code=403, detail="org_set_to_read_only") - if org.storage == storage_ref: - raise HTTPException(status_code=400, detail="identical_storage_ref") - - # TODO: Check that no CreateReplicaJobs are running + _, jobs_running_count = await self.background_job_ops.list_background_jobs( + org=org, success=None + ) + if jobs_running_count > 0: + raise HTTPException(status_code=403, detail="background_jobs_running") prev_storage = org.storage org.storage = storage_ref @@ -327,16 +331,20 @@ async def update_storage_replica_refs( except: raise HTTPException(status_code=400, detail="invalid_storage_ref") + if org.storageReplicas == replicas: + raise HTTPException(status_code=400, detail="identical_storage_ref") + if await self.org_ops.is_crawl_running(org): raise HTTPException(status_code=403, detail="crawl_running") if org.readOnly: raise HTTPException(status_code=403, detail="org_set_to_read_only") - if org.storageReplicas == replicas: - raise HTTPException(status_code=400, detail="identical_storage_ref") - - # TODO: Check that no CreateReplicaJobs are running + _, jobs_running_count = await self.background_job_ops.list_background_jobs( + org=org, success=None + ) + if jobs_running_count > 0: + raise HTTPException(status_code=403, detail="background_jobs_running") prev_storage_replicas = org.storageReplicas org.storageReplicas = replicas From 2826ee72e980beb2808c5308903d136193e2c189 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:05:40 -0400 Subject: [PATCH 25/69] Check bg job finished as well --- backend/btrixcloud/storages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 2aea46dfd1..e3a7aa5c28 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -274,7 +274,7 @@ async def update_storage_ref( raise HTTPException(status_code=403, detail="org_set_to_read_only") _, jobs_running_count = await self.background_job_ops.list_background_jobs( - org=org, success=None + org=org, success=None, finished=None ) if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") @@ -341,7 +341,7 @@ async def update_storage_replica_refs( raise HTTPException(status_code=403, detail="org_set_to_read_only") _, jobs_running_count = await self.background_job_ops.list_background_jobs( - org=org, success=None + org=org, success=None, finished=None ) if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") From 271f956e8551c01909c657340cbccf2a70be25d9 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:13:34 -0400 Subject: [PATCH 26/69] Fixups --- backend/btrixcloud/background_jobs.py | 3 +-- backend/btrixcloud/models.py | 9 +++++++++ backend/btrixcloud/storages.py | 28 +++++++++++++-------------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 
b07e8d3386..b508693d8d 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -16,7 +16,6 @@ from .models import ( BaseFile, Organization, - BackgroundJob, BgJobType, CreateReplicaJob, DeleteReplicaJob, @@ -563,7 +562,7 @@ async def job_finished( cast(DeleteReplicaJob, job) ) if job_type == BgJobType.COPY_BUCKET: - org = await self.orgs_ops.get_org_by_id(oid) + org = await self.org_ops.get_org_by_id(oid) await self.org_ops.update_read_only(org, False) else: print( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 78ee7b5b56..d972dd7244 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1646,6 +1646,15 @@ class OrgStorageReplicaRefs(BaseModel): storageReplicas: List[StorageRef] +# ============================================================================ +class OrgStorageRefs(BaseModel): + """Model for org storage references""" + + storage: StorageRef + + storageReplicas: List[StorageRef] = [] + + # ============================================================================ class S3StorageIn(BaseModel): """Custom S3 Storage input model""" diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index e3a7aa5c28..389b8f4c88 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -23,7 +23,6 @@ import zlib import json import os -import time from datetime import datetime, timedelta from zipfile import ZipInfo @@ -47,6 +46,7 @@ StorageRef, S3Storage, S3StorageIn, + OrgStorageRefs, OrgStorageRef, OrgStorageReplicaRefs, DeletedResponse, @@ -253,7 +253,7 @@ async def remove_custom_storage( async def update_storage_ref( self, - storage_refs: OrgStorageRefs, + storage_refs: OrgStorageRef, org: Organization, ) -> dict[str, bool]: """update storage for org""" @@ -279,7 +279,7 @@ async def update_storage_ref( if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") - prev_storage = org.storage + prev_storage_ref = org.storage org.storage = storage_ref await self.org_ops.update_storage_refs(org) @@ -301,7 +301,7 @@ async def _run_post_storage_update_tasks( ): """Handle tasks necessary after changing org storage""" if not await self.org_ops.has_files_stored(org): - print(f"No files stored, no updates to do", flush=True) + print("No files stored, no updates to do", flush=True) return await self.org_ops.update_read_only(org, True, "Updating storage") @@ -360,20 +360,20 @@ async def update_storage_replica_refs( async def _run_post_storage_replica_update_tasks( self, - prev_storage_refs: List[StorageRef], - new_storage_refs: List[StorageRef], + prev_replica_refs: List[StorageRef], + new_replica_refs: List[StorageRef], org: Organization, ): """Handle tasks necessary after updating org replica storages""" if not await self.org_ops.has_files_stored(org): - print(f"No files stored, no updates to do", flush=True) + print("No files stored, no updates to do", flush=True) return await self.org_ops.update_read_only(org, True, "Updating storage replicas") # Replicate files to any new replica locations - for replica_storage in replicas: - if replica_storage not in prev_storage_replicas: + for replica_storage in new_replica_refs: + if replica_storage not in prev_replica_refs: # TODO: Kick off background jobs to replicate primary # storage to new replica location print( @@ -383,8 +383,8 @@ async def _run_post_storage_replica_update_tasks( # Delete files from previous replica locations that are no longer # being used - for 
replica_storage in prev_storage_replicas: - if replica_storage not in replicas: + for replica_storage in prev_replica_refs: + if replica_storage not in new_replica_refs: # TODO: Kick off background jobs to delete replicas # (may be easier to just delete all files from bucket # in one rclone command - if so, will need to handle @@ -1008,18 +1008,16 @@ async def remove_custom_storage( @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_ref( - storage: OrgStorageRefs, + storage: OrgStorageRef, org: Organization = Depends(org_owner_dep), ): return await storage_ops.update_storage_ref(storage, org) - return storage_ops - @router.post( "/storage-replicas", tags=["organizations"], response_model=UpdatedResponse ) async def update_storage_replica_refs( - storage: OrgStorageRefs, + storage: OrgStorageReplicaRefs, org: Organization = Depends(org_owner_dep), ): return await storage_ops.update_storage_replica_refs(storage, org) From 66ed5db6fee96d0d068151c6733c2ed3ff56a566 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:29:57 -0400 Subject: [PATCH 27/69] Storage update improvements --- backend/btrixcloud/orgs.py | 44 ++++++++++++++++++++++++++++++++-- backend/btrixcloud/storages.py | 34 ++++++++++---------------- 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 0294519927..eb7308e32b 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -516,14 +516,14 @@ async def update_file_storage_refs( ) -> bool: """Update storage refs for all crawl and profile files in given org""" res = await self.crawls_db.update_many( - {"_id": org.id, "files.$.storage": previous_storage}, + {"oid": org.id, "files.$.storage": previous_storage}, {"$set": {"files.$.storage": new_storage}}, ) if not res: return False res = await self.profiles_db.update_many( - {"_id": org.id, "resource.storage": previous_storage}, + {"oid": org.id, "resource.storage": previous_storage}, {"$set": {"resource.storage": new_storage}}, ) if not res: @@ -531,6 +531,46 @@ async def update_file_storage_refs( return True + async def add_file_replica_storage_refs( + self, org: Organization, new_storage: StorageRef + ) -> bool: + """Add replica storage ref for all files in given org""" + res = await self.crawls_db.update_many( + {"oid": org.id}, + {"$push": {"files.$.replicas": new_storage}}, + ) + if not res: + return False + + res = await self.profiles_db.update_many( + {"oid": org.id}, + {"$push": {"resource.replicas": new_storage}}, + ) + if not res: + return False + + return True + + async def remove_file_replica_storage_refs( + self, org: Organization, new_storage: StorageRef + ) -> bool: + """Remove replica storage ref from all files in given org""" + res = await self.crawls_db.update_many( + {"oid": org.id}, + {"$pull": {"files.$.replicas": new_storage}}, + ) + if not res: + return False + + res = await self.profiles_db.update_many( + {"oid": org.id}, + {"$pull": {"resource.replicas": new_storage}}, + ) + if not res: + return False + + return True + async def unset_file_presigned_urls(self, org: Organization) -> bool: """Unset all presigned URLs for files in org""" res = await self.crawls_db.update_many( diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 389b8f4c88..24df3cad01 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -284,7 +284,7 @@ async def update_storage_ref( await 
self.org_ops.update_storage_refs(org) - # TODO: Consider running into asyncio task or background job + # TODO: Run in asyncio task or background job? await self._run_post_storage_update_tasks( prev_storage_ref, storage_ref, @@ -306,7 +306,6 @@ async def _run_post_storage_update_tasks( await self.org_ops.update_read_only(org, True, "Updating storage") - # Create the background job to copy files await self.background_job_ops.create_copy_bucket_job( org, prev_storage_ref, new_storage_ref ) @@ -314,6 +313,7 @@ async def _run_post_storage_update_tasks( await self.org_ops.update_file_storage_refs( org, prev_storage_ref, new_storage_ref ) + await self.org_ops.unset_file_presigned_urls(org) async def update_storage_replica_refs( @@ -351,7 +351,7 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) - # TODO: Consider running in asyncio task or background job + # TODO: Run in asyncio task or background job? await self._run_post_storage_replica_update_tasks( prev_storage_replicas, replicas, org ) @@ -369,35 +369,27 @@ async def _run_post_storage_replica_update_tasks( print("No files stored, no updates to do", flush=True) return - await self.org_ops.update_read_only(org, True, "Updating storage replicas") + # TODO: Determine if we need to set read-only for replica operations + # (likely not?) + # await self.org_ops.update_read_only(org, True, "Updating storage replicas") # Replicate files to any new replica locations for replica_storage in new_replica_refs: if replica_storage not in prev_replica_refs: - # TODO: Kick off background jobs to replicate primary - # storage to new replica location - print( - "Not yet implemented: Replicate files to {replica_storage.name}", - flush=True, + await self.background_job_ops.create_copy_bucket_job( + org, org.storage, replica_storage ) + await self.org_ops.add_file_replica_storage_refs(org, replica_storage) # Delete files from previous replica locations that are no longer # being used for replica_storage in prev_replica_refs: if replica_storage not in new_replica_refs: - # TODO: Kick off background jobs to delete replicas - # (may be easier to just delete all files from bucket - # in one rclone command - if so, will need to handle - # updating files in db as well) - print( - "Not yet implemented: Delete files from {replica_storage.name}", - flush=True, - ) + # TODO: Background job to delete files with rclone delete? 
- # TODO: Unset read only once all tasks are complete - # May need to handle this in the operators or a background job - # depending on how post-update tasks are run - await self.org_ops.update_read_only(org, False) + await self.org_ops.remove_file_replica_storage_refs( + org, replica_storage + ) def get_available_storages(self, org: Organization) -> List[StorageRef]: """return a list of available default + custom storages""" From c4772f766f698d9b239292dbf88f4d55431e3467 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 11:57:53 -0400 Subject: [PATCH 28/69] Fixup --- backend/btrixcloud/orgs.py | 2 +- backend/btrixcloud/storages.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index eb7308e32b..ec4e3ef194 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -574,7 +574,7 @@ async def remove_file_replica_storage_refs( async def unset_file_presigned_urls(self, org: Organization) -> bool: """Unset all presigned URLs for files in org""" res = await self.crawls_db.update_many( - {"_id": org.id}, {"$set": {"files.$.presignedUrl": None}} + {"oid": org.id}, {"$set": {"files.$.presignedUrl": None}} ) if not res: return False diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 24df3cad01..fbefb0d516 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -381,12 +381,10 @@ async def _run_post_storage_replica_update_tasks( ) await self.org_ops.add_file_replica_storage_refs(org, replica_storage) - # Delete files from previous replica locations that are no longer - # being used + # Delete no-longer-used replica storage refs from files + # TODO: Determine if we want to delete files from the buckets as well for replica_storage in prev_replica_refs: if replica_storage not in new_replica_refs: - # TODO: Background job to delete files with rclone delete? - await self.org_ops.remove_file_replica_storage_refs( org, replica_storage ) From eef99516163466bf9d33b180eb8a9f82b9f23e27 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:03:41 -0400 Subject: [PATCH 29/69] Remove another todo --- backend/btrixcloud/storages.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index fbefb0d516..bd55e28494 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -369,10 +369,6 @@ async def _run_post_storage_replica_update_tasks( print("No files stored, no updates to do", flush=True) return - # TODO: Determine if we need to set read-only for replica operations - # (likely not?) 
- # await self.org_ops.update_read_only(org, True, "Updating storage replicas") - # Replicate files to any new replica locations for replica_storage in new_replica_refs: if replica_storage not in prev_replica_refs: From d3d0f49f20aa695644fbf326fc458c77bc6eb8e2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:30:40 -0400 Subject: [PATCH 30/69] More fixups --- backend/btrixcloud/background_jobs.py | 9 +++ backend/btrixcloud/orgs.py | 19 +++--- backend/btrixcloud/storages.py | 5 +- chart/app-templates/copy_job.yaml | 96 +++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 chart/app-templates/copy_job.yaml diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index b508693d8d..62281e0114 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -639,6 +639,7 @@ async def list_background_jobs( page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, success: Optional[bool] = None, + running: Optional[bool] = None, job_type: Optional[str] = None, sort_by: Optional[str] = None, sort_direction: Optional[int] = -1, @@ -657,6 +658,12 @@ async def list_background_jobs( if success in (True, False): query["success"] = success + if running: + query["success"] = None + + if running is False: + query["success"] = {"$in": [True, False]} + if job_type: query["type"] = job_type @@ -975,6 +982,7 @@ async def list_background_jobs( pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, success: Optional[bool] = None, + running: Optional[bool] = None, jobType: Optional[str] = None, sortBy: Optional[str] = None, sortDirection: Optional[int] = -1, @@ -985,6 +993,7 @@ async def list_background_jobs( page_size=pageSize, page=page, success=success, + running=running, job_type=jobType, sort_by=sortBy, sort_direction=sortDirection, diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index ec4e3ef194..ddec09746c 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -516,15 +516,15 @@ async def update_file_storage_refs( ) -> bool: """Update storage refs for all crawl and profile files in given org""" res = await self.crawls_db.update_many( - {"oid": org.id, "files.$.storage": previous_storage}, - {"$set": {"files.$.storage": new_storage}}, + {"oid": org.id, "files.$.storage": dict(previous_storage)}, + {"$set": {"files.$.storage": dict(new_storage)}}, ) if not res: return False res = await self.profiles_db.update_many( - {"oid": org.id, "resource.storage": previous_storage}, - {"$set": {"resource.storage": new_storage}}, + {"oid": org.id, "resource.storage": dict(previous_storage)}, + {"$set": {"resource.storage": dict(new_storage)}}, ) if not res: return False @@ -537,14 +537,14 @@ async def add_file_replica_storage_refs( """Add replica storage ref for all files in given org""" res = await self.crawls_db.update_many( {"oid": org.id}, - {"$push": {"files.$.replicas": new_storage}}, + {"$push": {"files.$.replicas": dict(new_storage)}}, ) if not res: return False res = await self.profiles_db.update_many( {"oid": org.id}, - {"$push": {"resource.replicas": new_storage}}, + {"$push": {"resource.replicas": dict(new_storage)}}, ) if not res: return False @@ -557,14 +557,14 @@ async def remove_file_replica_storage_refs( """Remove replica storage ref from all files in given org""" res = await self.crawls_db.update_many( {"oid": org.id}, - {"$pull": {"files.$.replicas": new_storage}}, + {"$pull": {"files.$.replicas": dict(new_storage)}}, ) if not res: return 
False res = await self.profiles_db.update_many( {"oid": org.id}, - {"$pull": {"resource.replicas": new_storage}}, + {"$pull": {"resource.replicas": dict(new_storage)}}, ) if not res: return False @@ -574,7 +574,8 @@ async def remove_file_replica_storage_refs( async def unset_file_presigned_urls(self, org: Organization) -> bool: """Unset all presigned URLs for files in org""" res = await self.crawls_db.update_many( - {"oid": org.id}, {"$set": {"files.$.presignedUrl": None}} + {"oid": org.id, "files.$.presignedUrl": {"$ne": None}}, + {"$set": {"files.$.presignedUrl": None}}, ) if not res: return False diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index bd55e28494..afe64a5344 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -217,6 +217,7 @@ async def add_custom_storage( "STORE_ENDPOINT_NO_BUCKET_URL": storage.endpoint_no_bucket_url, "STORE_ACCESS_KEY": storage.access_key, "STORE_SECRET_KEY": storage.secret_key, + "STORE_REGION": storage.region, } await self.crawl_manager.add_org_storage( @@ -274,7 +275,7 @@ async def update_storage_ref( raise HTTPException(status_code=403, detail="org_set_to_read_only") _, jobs_running_count = await self.background_job_ops.list_background_jobs( - org=org, success=None, finished=None + org=org, running=True ) if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") @@ -341,7 +342,7 @@ async def update_storage_replica_refs( raise HTTPException(status_code=403, detail="org_set_to_read_only") _, jobs_running_count = await self.background_job_ops.list_background_jobs( - org=org, success=None, finished=None + org=org, running=True ) if jobs_running_count > 0: raise HTTPException(status_code=403, detail="background_jobs_running") diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml new file mode 100644 index 0000000000..f59b0da6ad --- /dev/null +++ b/chart/app-templates/copy_job.yaml @@ -0,0 +1,96 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ id }}" + labels: + role: "background-job" + job_type: {{ job_type }} + btrix.org: {{ oid }} + +spec: + ttlSecondsAfterFinished: 0 + backoffLimit: 3 + template: + spec: + restartPolicy: Never + priorityClassName: bg-job + podFailurePolicy: + rules: + - action: FailJob + onExitCodes: + containerName: rclone + operator: NotIn + values: [0] + containers: + - name: rclone + image: rclone/rclone:latest + env: + + - name: RCLONE_CONFIG_PREV_TYPE + value: "s3" + + - name: RCLONE_CONFIG_PREV_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: "{{ prev_secret_name }}" + key: STORE_ACCESS_KEY + + - name: RCLONE_CONFIG_PREV_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: "{{ prev_secret_name }}" + key: STORE_SECRET_KEY + + - name: RCLONE_CONFIG_PREV_REGION + valueFrom: + secretKeyRef: + name: "{{ prev_secret_name }}" + key: STORE_REGION + + - name: RCLONE_CONFIG_PREV_PROVIDER + valueFrom: + secretKeyRef: + name: "{{ prev_secret_name }}" + key: STORE_S3_PROVIDER + + - name: RCLONE_CONFIG_PREV_ENDPOINT + value: "{{ prev_endpoint }}" + + - name: RCLONE_CONFIG_NEW_TYPE + value: "s3" + + - name: RCLONE_CONFIG_NEW_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: "{{ new_secret_name }}" + key: STORE_ACCESS_KEY + + - name: RCLONE_CONFIG_NEW_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: "{{ new_secret_name }}" + key: STORE_SECRET_KEY + + - name: RCLONE_CONFIG_NEW_REGION + valueFrom: + secretKeyRef: + name: "{{ new_secret_name }}" + key: STORE_REGION + + - name: 
RCLONE_CONFIG_NEW_PROVIDER + valueFrom: + secretKeyRef: + name: "{{ new_secret_name }}" + key: STORE_S3_PROVIDER + + - name: RCLONE_CONFIG_NEW_ENDPOINT + value: "{{ new_endpoint }}" + + command: ["rclone", "-vv", "copy", "--checksum", "prev:{{ prev_bucket }}", "new:{{ new_bucket }}"] + resources: + limits: + memory: "200Mi" + + requests: + memory: "200Mi" + cpu: "50m" From 8cbf9760f5798ffff2f429f94fc4a35256fe34c9 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:48:29 -0400 Subject: [PATCH 31/69] Add provider to s3storage for rclone --- backend/btrixcloud/models.py | 2 ++ backend/btrixcloud/storages.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index d972dd7244..76a5d2da24 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1669,6 +1669,7 @@ class S3StorageIn(BaseModel): bucket: str access_endpoint_url: Optional[str] = None region: str = "" + provider: str = "Other" # ============================================================================ @@ -1683,6 +1684,7 @@ class S3Storage(BaseModel): secret_key: str access_endpoint_url: str region: str = "" + provider: str = "Other" # ============================================================================ diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index afe64a5344..2a29be2484 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -174,6 +174,7 @@ def _create_s3_storage(self, storage: dict[str, str]) -> S3Storage: endpoint_url=endpoint_url, endpoint_no_bucket_url=endpoint_no_bucket_url, access_endpoint_url=access_endpoint_url, + provider=storage.get("provider", "Other"), ) async def add_custom_storage( @@ -198,7 +199,7 @@ async def add_custom_storage( endpoint_url=endpoint_url, endpoint_no_bucket_url=endpoint_no_bucket_url, access_endpoint_url=storagein.access_endpoint_url or endpoint_url, - use_access_for_presign=True, + provider=storagein.provider, ) try: @@ -218,6 +219,7 @@ async def add_custom_storage( "STORE_ACCESS_KEY": storage.access_key, "STORE_SECRET_KEY": storage.secret_key, "STORE_REGION": storage.region, + "STORE_PROVIDER": storage.provider, } await self.crawl_manager.add_org_storage( From 66253459aa43ccc5a0223611558e9a649fbc50f2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 12:50:55 -0400 Subject: [PATCH 32/69] Fix typo --- backend/btrixcloud/storages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 2a29be2484..04d4cbeb72 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -219,7 +219,7 @@ async def add_custom_storage( "STORE_ACCESS_KEY": storage.access_key, "STORE_SECRET_KEY": storage.secret_key, "STORE_REGION": storage.region, - "STORE_PROVIDER": storage.provider, + "STORE_S3_PROVIDER": storage.provider, } await self.crawl_manager.add_org_storage( From 703e7d8e75f6fa560a973bf12f1f210395ca8d9d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 30 Sep 2024 17:49:10 -0400 Subject: [PATCH 33/69] Make API endpoints that change storage superuser-only for now --- backend/btrixcloud/storages.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 04d4cbeb72..b4b83b5f61 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -983,23 +983,37 @@ async def 
clear_all_presigned_urls(user: User = Depends(user_dep)): "/custom-storage", tags=["organizations"], response_model=AddedResponseName ) async def add_custom_storage( - storage: S3StorageIn, org: Organization = Depends(org_owner_dep) + storage: S3StorageIn, + org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + return await storage_ops.add_custom_storage(storage, org) @router.delete( "/custom-storage/{name}", tags=["organizations"], response_model=DeletedResponse ) async def remove_custom_storage( - name: str, org: Organization = Depends(org_owner_dep) + name: str, + org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + return await storage_ops.remove_custom_storage(name, org) @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) async def update_storage_ref( storage: OrgStorageRef, org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + return await storage_ops.update_storage_ref(storage, org) @router.post( @@ -1008,7 +1022,11 @@ async def update_storage_ref( async def update_storage_replica_refs( storage: OrgStorageReplicaRefs, org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + return await storage_ops.update_storage_replica_refs(storage, org) return storage_ops From 11f6b8eca97419c001ea35a5075187eec96d6412 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 30 Sep 2024 18:03:02 -0400 Subject: [PATCH 34/69] Add typing for init_storages_api, import Callable --- backend/btrixcloud/storages.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index b4b83b5f61..9b4c6a0bf0 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -12,6 +12,7 @@ TYPE_CHECKING, Any, cast, + Callable, ) from urllib.parse import urlsplit from contextlib import asynccontextmanager @@ -931,7 +932,7 @@ def _parse_json(line) -> dict: # ============================================================================ def init_storages_api( - org_ops: OrgOps, crawl_manager: CrawlManager, app: APIRouter, mdb, user_dep + org_ops: OrgOps, crawl_manager: CrawlManager, app: APIRouter, mdb, user_dep: Callable ) -> StorageOps: """API for updating storage for an org""" From 3f12cb3983d01d3ced0a11dd135571215cd0446e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 1 Oct 2024 11:24:24 -0400 Subject: [PATCH 35/69] Fix StorageOps in operator main --- backend/btrixcloud/main_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index c1d79c1617..d7e2948e93 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -9,7 +9,6 @@ from .ops import init_ops from .utils import register_exit_handler - app_root = FastAPI() @@ -26,6 +25,7 @@ def main(): ) sys.exit(1) + ( org_ops, crawl_config_ops, From 94df4184de7539e106f2e6f0a370ab7573017087 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 1 Oct 2024 16:09:15 -0400 Subject: [PATCH 36/69] Always use oid prefix in s3 storage Previously, files in a default bucket were prefixed with the oid 
but were not in custom storages. This commit removes that distinction to simplify copying files between buckets and avoid unnecessary filepath manipulation. The CopyBucketJob now only copies an organization's directory rather than the entire bucket to prevent accidentally copying another organization's data. --- backend/btrixcloud/models.py | 7 ++----- chart/app-templates/copy_job.yaml | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 76a5d2da24..8bdec2ece3 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -693,11 +693,8 @@ def get_storage_secret_name(self, oid: str) -> str: return f"storage-cs-{self.name}-{oid[:12]}" def get_storage_extra_path(self, oid: str) -> str: - """return extra path added to the endpoint - using oid for default storages, no extra path for custom""" - if not self.custom: - return oid + "/" - return "" + """return extra oid path added to the endpoint""" + return oid + "/" # ============================================================================ diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index f59b0da6ad..971a10e675 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -86,7 +86,7 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "copy", "--checksum", "prev:{{ prev_bucket }}", "new:{{ new_bucket }}"] + command: ["rclone", "-vv", "copy", "--checksum", "prev:{{ prev_bucket }}/{{ oid }}", "new:{{ new_bucket }}/{{ oid }}"] resources: limits: memory: "200Mi" From 75f064a83e347b92e09cdd6c128712c814c9bcf8 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 10 Oct 2024 15:20:33 -0400 Subject: [PATCH 37/69] Post-rebase fixups and remove create bucket fallback Creating a bucket that didn't yet exist during the verification stage for adding custom storages was useful for testing, but it is an anti-pattern in production, so we remove that fallback here.
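To make the resulting behavior concrete, a minimal sketch of the simplified verification path follows, under stated assumptions: the client is an aiobotocore-style S3 client as used elsewhere in storages.py, the standalone function shape is illustrative (the real code splits this between verify_storage_upload and the error handling in add_custom_storage), and the 400 error detail mirrors the test expectation added later in this series:

    # Sketch only, not the exact implementation; see the diff below.
    from fastapi import HTTPException

    async def verify_custom_storage(client, bucket: str, key: str) -> None:
        """Verify write access by uploading an empty test object"""
        try:
            # The create_bucket fallback is gone: a missing bucket or bad
            # credentials now fails verification instead of being papered over
            resp = await client.put_object(Bucket=bucket, Key=key, Body=b"")
            assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200
        except Exception:
            # pylint: disable=raise-missing-from
            raise HTTPException(
                status_code=400,
                detail="Could not verify custom storage. Check credentials are valid?",
            )

Failing fast here means a mistyped bucket or endpoint surfaces immediately to the API caller rather than quietly creating a new, unmanaged bucket.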
--- backend/btrixcloud/background_jobs.py | 6 ++++-- backend/btrixcloud/crawlmanager.py | 7 +++++-- backend/btrixcloud/storages.py | 18 +++++------------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 62281e0114..6656428310 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -302,9 +302,11 @@ async def create_delete_org_job( self, org: Organization, existing_job_id: Optional[str] = None, - ) -> Optional[str]: + ) -> str: """Create background job to delete org and its data""" + job_type = BgJobType.DELETE_ORG.value + try: job_id = await self.crawl_manager.run_delete_org_job( oid=str(org.id), @@ -339,7 +341,7 @@ async def create_delete_org_job( except Exception as exc: # pylint: disable=raise-missing-from print(f"warning: delete org job could not be started: {exc}") - return None + return "" async def create_recalculate_org_stats_job( self, diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 7e0fc5c639..4044c78f00 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -127,7 +127,9 @@ async def run_delete_org_job( if existing_job_id: job_id = existing_job_id else: - job_id = f"delete-org-{oid}-{secrets.token_hex(5)}" + job_id_prefix = f"delete-org-{oid}" + # ensure name is <=63 characters + job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( job_id, job_type=BgJobType.DELETE_ORG.value, oid=oid @@ -209,6 +211,8 @@ async def _run_bg_job_with_ops_classes( await self.create_from_yaml(data, namespace=DEFAULT_NAMESPACE) + return job_id + async def run_copy_bucket_job( self, oid: str, @@ -242,7 +246,6 @@ async def run_copy_bucket_job( "new_secret_name": new_storage.get_storage_secret_name(oid), "new_endpoint": new_endpoint, "new_bucket": new_bucket, - "BgJobType": BgJobType, } data = self.templates.env.get_template("copy_job.yaml").render(params) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 9b4c6a0bf0..dc768b4f3b 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -288,7 +288,7 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) - # TODO: Run in asyncio task or background job? + # Run in background jobs (1 to copy files, 1 for db updates) await self._run_post_storage_update_tasks( prev_storage_ref, storage_ref, @@ -355,7 +355,7 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) - # TODO: Run in asyncio task or background job? + # Run in background job? or just kick off background jobs? 
await self._run_post_storage_replica_update_tasks( prev_storage_replicas, replicas, org ) @@ -382,7 +382,7 @@ async def _run_post_storage_replica_update_tasks( await self.org_ops.add_file_replica_storage_refs(org, replica_storage) # Delete no-longer-used replica storage refs from files - # TODO: Determine if we want to delete files from the buckets as well + # Determine if we want to delete files from the buckets as well for replica_storage in prev_replica_refs: if replica_storage not in new_replica_refs: await self.org_ops.remove_file_replica_storage_refs( org, replica_storage ) @@ -437,16 +437,8 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None key += filename data = b"" - try: - resp = await client.put_object(Bucket=bucket, Key=key, Body=data) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - except Exception: - # create bucket if it doesn't yet exist and then try again - resp = await client.create_bucket(Bucket=bucket) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - - resp = await client.put_object(Bucket=bucket, Key=key, Body=data) - assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + resp = await client.put_object(Bucket=bucket, Key=key, Body=data) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 def resolve_internal_access_path(self, path): """Resolve relative path for internal access to minio bucket""" From 989144bd6c2e11ef937bd68a6cb972d54614a187 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 15 Oct 2024 14:36:58 -0400 Subject: [PATCH 38/69] Create extra test buckets in CI --- .github/workflows/k3d-ci.yaml | 5 +++++ .github/workflows/microk8s-ci.yaml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/k3d-ci.yaml b/.github/workflows/k3d-ci.yaml index 9bcb10b3aa..17cf9a564a 100644 --- a/.github/workflows/k3d-ci.yaml +++ b/.github/workflows/k3d-ci.yaml @@ -116,6 +116,11 @@ jobs: - name: Wait for all pods to be ready run: kubectl wait --for=condition=ready pod --all --timeout=240s + - name: Create Extra Test Buckets + run: | + kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-primary && + kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-replica + - name: Run Tests timeout-minutes: 30 run: pytest -vv ./backend/test/test_*.py diff --git a/.github/workflows/microk8s-ci.yaml b/.github/workflows/microk8s-ci.yaml index 07a3a2124e..0111915ac5 100644 --- a/.github/workflows/microk8s-ci.yaml +++ b/.github/workflows/microk8s-ci.yaml @@ -68,6 +68,11 @@ jobs: - name: Wait for all pods to be ready run: sudo microk8s kubectl wait --for=condition=ready pod --all --timeout=240s + - name: Create Extra Test Buckets + run: | + sudo microk8s kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-primary && + sudo microk8s kubectl exec -i deployment/local-minio -c minio -- mkdir /data/custom-replica + - name: Run Tests run: pytest -vv ./backend/test/test_*.py From c1e5f377e1003285d6f53a88b75342ce3b58820b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 15 Oct 2024 14:40:16 -0400 Subject: [PATCH 39/69] Add test for non-verified custom storage --- backend/test/test_org.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/backend/test/test_org.py b/backend/test/test_org.py index 0bf1e7b037..c8faf67760 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -253,6 +253,24 @@ def test_change_storage_invalid(admin_auth_headers): assert r.status_code == 400 +def test_add_custom_storage_doesnt_verify(admin_auth_headers): + # verify that custom storage that can't
be verified with + # a test file isn't added + r = requests.post( + f"{API_PREFIX}/orgs/{new_oid}/custom-storage", + headers=admin_auth_headers, + json={ + "name": "custom-bucket-doesnt-exist", + "access_key": "ADMIN", + "secret_key": "PASSW0RD", + "bucket": "custom-bucket-doesnt-exist", + "endpoint_url": "http://local-minio.default:9000/", + }, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "Could not verify custom storage. Check credentials are valid?" + + def test_add_custom_storage(admin_auth_headers): # add custom storages r = requests.post( From ad6e061fbdc0c51f4e89b94375cd191bf8ed03a6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 15 Oct 2024 16:18:42 -0400 Subject: [PATCH 40/69] Refactor to move updates to FastAPI background tasks --- backend/btrixcloud/models.py | 7 ++++ backend/btrixcloud/orgs.py | 63 ++++++++--------------------- backend/btrixcloud/storages.py | 74 +++++++++++++++++++++------------- 3 files changed, 70 insertions(+), 74 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 8bdec2ece3..7e0351104c 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2691,6 +2691,13 @@ class UpdatedResponse(BaseModel): updated: bool +# ============================================================================ +class UpdatedResponseId(UpdatedResponse): + """Response for API endpoints that return updated + id""" + + id: Optional[str] = None + + # ============================================================================ class SuccessResponse(BaseModel): """Response for API endpoints that return success""" diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index ddec09746c..f5e5b8fda4 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -513,74 +513,43 @@ async def update_storage_refs( async def update_file_storage_refs( self, org: Organization, previous_storage: StorageRef, new_storage: StorageRef - ) -> bool: + ) -> None: """Update storage refs for all crawl and profile files in given org""" - res = await self.crawls_db.update_many( + await self.crawls_db.update_many( {"oid": org.id, "files.$.storage": dict(previous_storage)}, {"$set": {"files.$.storage": dict(new_storage)}}, ) - if not res: - return False - res = await self.profiles_db.update_many( + await self.profiles_db.update_many( {"oid": org.id, "resource.storage": dict(previous_storage)}, {"$set": {"resource.storage": dict(new_storage)}}, ) - if not res: - return False - return True - - async def add_file_replica_storage_refs( - self, org: Organization, new_storage: StorageRef - ) -> bool: + async def add_or_remove_file_replica_storage_refs( + self, org: Organization, storage_ref: StorageRef, remove=False + ) -> None: """Add replica storage ref for all files in given org""" - res = await self.crawls_db.update_many( - {"oid": org.id}, - {"$push": {"files.$.replicas": dict(new_storage)}}, - ) - if not res: - return False - - res = await self.profiles_db.update_many( - {"oid": org.id}, - {"$push": {"resource.replicas": dict(new_storage)}}, - ) - if not res: - return False - - return True + if remove: + verb = "$pull" + else: + verb = "$push" - async def remove_file_replica_storage_refs( - self, org: Organization, new_storage: StorageRef - ) -> bool: - """Remove replica storage ref from all files in given org""" - res = await self.crawls_db.update_many( + await self.crawls_db.update_many( {"oid": org.id}, - {"$pull": {"files.$.replicas": dict(new_storage)}}, + {verb: {"files.$.replicas": 
dict(storage_ref)}}, ) - if not res: - return False - res = await self.profiles_db.update_many( + await self.profiles_db.update_many( {"oid": org.id}, - {"$pull": {"resource.replicas": dict(new_storage)}}, + {verb: {"resource.replicas": dict(storage_ref)}}, ) - if not res: - return False - - return True - async def unset_file_presigned_urls(self, org: Organization) -> bool: + async def unset_file_presigned_urls(self, org: Organization) -> None: """Unset all presigned URLs for files in org""" - res = await self.crawls_db.update_many( + await self.crawls_db.update_many( {"oid": org.id, "files.$.presignedUrl": {"$ne": None}}, {"$set": {"files.$.presignedUrl": None}}, ) - if not res: - return False - - return True async def update_subscription_data( self, update: SubscriptionUpdate diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index dc768b4f3b..8d1266f599 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -13,6 +13,7 @@ Any, cast, Callable, + Union, ) from urllib.parse import urlsplit from contextlib import asynccontextmanager @@ -28,7 +29,7 @@ from datetime import datetime, timedelta from zipfile import ZipInfo -from fastapi import Depends, HTTPException, APIRouter +from fastapi import Depends, HTTPException, APIRouter, BackgroundTasks from stream_zip import stream_zip, NO_COMPRESSION_64, Method from remotezip import RemoteZip from aiobotocore.config import AioConfig @@ -52,6 +53,7 @@ OrgStorageReplicaRefs, DeletedResponse, UpdatedResponse, + UpdatedResponseId, AddedResponseName, PRESIGN_DURATION_SECONDS, PresignedUrl, @@ -259,7 +261,8 @@ async def update_storage_ref( self, storage_refs: OrgStorageRef, org: Organization, - ) -> dict[str, bool]: + background_tasks: BackgroundTasks, + ) -> dict[str, Union[bool, Optional[str]]]: """update storage for org""" storage_ref = storage_refs.storage @@ -288,32 +291,35 @@ async def update_storage_ref( await self.org_ops.update_storage_refs(org) - # Run in background jobs (1 to copy files, 1 for db updates) - await self._run_post_storage_update_tasks( + if not await self.org_ops.has_files_stored(org): + print("No files stored, no updates to do", flush=True) + return {"updated": True, "id": None} + + await self.org_ops.update_read_only(org, True, "Updating storage") + + job_id = await self.background_job_ops.create_copy_bucket_job( + org, prev_storage_ref, storage_ref + ) + + # This runs only two update_many mongo commands, so should be safe to run + # in a FastAPI background task rather than requiring a full Browsertrix + # Background job + background_tasks.add_task( + self._run_post_storage_update_tasks, + org, prev_storage_ref, storage_ref, - org, ) - return {"updated": True} + return {"updated": True, "id": job_id} async def _run_post_storage_update_tasks( self, + org: Organization, prev_storage_ref: StorageRef, new_storage_ref: StorageRef, - org: Organization, ): """Handle tasks necessary after changing org storage""" - if not await self.org_ops.has_files_stored(org): - print("No files stored, no updates to do", flush=True) - return - - await self.org_ops.update_read_only(org, True, "Updating storage") - - await self.background_job_ops.create_copy_bucket_job( - org, prev_storage_ref, new_storage_ref - ) - await self.org_ops.update_file_storage_refs( org, prev_storage_ref, new_storage_ref ) @@ -324,6 +330,7 @@ async def update_storage_replica_refs( self, storage_refs: OrgStorageReplicaRefs, org: Organization, + background_tasks: BackgroundTasks, ) -> dict[str, bool]: """update storage for 
org""" @@ -355,18 +362,25 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) - # Run in background job? or just kick off background jobs? - await self._run_post_storage_replica_update_tasks( - prev_storage_replicas, replicas, org + # This only kicks off background jobs and runs a few update_many mongo + # commands, so it might be fine to keep as a FastAPI background job. + # Consider moving to a Browsertrix background job, but that may make + # retrying difficult as the job which would be retried also kicks off + # other background jobs which may have been successful already + background_tasks.add_task( + self._run_post_storage_replica_update_tasks, + org, + prev_storage_replicas, + replicas, ) return {"updated": True} async def _run_post_storage_replica_update_tasks( self, + org: Organization, prev_replica_refs: List[StorageRef], new_replica_refs: List[StorageRef], - org: Organization, ): """Handle tasks necessary after updating org replica storages""" if not await self.org_ops.has_files_stored(org): @@ -379,14 +393,16 @@ async def _run_post_storage_replica_update_tasks( await self.background_job_ops.create_copy_bucket_job( org, org.storage, replica_storage ) - await self.org_ops.add_file_replica_storage_refs(org, replica_storage) + await self.org_ops.add_or_remove_file_replica_storage_refs( + org, replica_storage + ) # Delete no-longer-used replica storage refs from files # Determine if we want to delete files from the buckets as well for replica_storage in prev_replica_refs: if replica_storage not in new_replica_refs: - await self.org_ops.remove_file_replica_storage_refs( - org, replica_storage + await self.org_ops.add_or_remove_file_replica_storage_refs( + org, replica_storage, remove=True ) def get_available_storages(self, org: Organization) -> List[StorageRef]: @@ -998,28 +1014,32 @@ async def remove_custom_storage( return await storage_ops.remove_custom_storage(name, org) - @router.post("/storage", tags=["organizations"], response_model=UpdatedResponse) + @router.post("/storage", tags=["organizations"], response_model=UpdatedResponseId) async def update_storage_ref( storage: OrgStorageRef, + background_tasks: BackgroundTasks, org: Organization = Depends(org_owner_dep), user: User = Depends(user_dep), ): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await storage_ops.update_storage_ref(storage, org) + return await storage_ops.update_storage_ref(storage, org, background_tasks) @router.post( "/storage-replicas", tags=["organizations"], response_model=UpdatedResponse ) async def update_storage_replica_refs( storage: OrgStorageReplicaRefs, + background_tasks: BackgroundTasks, org: Organization = Depends(org_owner_dep), user: User = Depends(user_dep), ): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await storage_ops.update_storage_replica_refs(storage, org) + return await storage_ops.update_storage_replica_refs( + storage, org, background_tasks + ) return storage_ops From 958a72194061d11218de3440f0e186cd54184f98 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 15 Oct 2024 18:07:26 -0400 Subject: [PATCH 41/69] Include default replicas in /storage response if no org replicas --- backend/btrixcloud/storages.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 8d1266f599..fa8d81b772 100644 --- a/backend/btrixcloud/storages.py +++ 
b/backend/btrixcloud/storages.py @@ -957,7 +957,11 @@ def get_storage_refs( org: Organization = Depends(org_owner_dep), ): """get storage refs for an org""" - return OrgStorageRefs(storage=org.storage, storageReplicas=org.storageReplicas) + if org.storageReplicas: + replica_refs = org.storageReplicas + else: + replica_refs = storage_ops.default_replicas + return OrgStorageRefs(storage=org.storage, storageReplicas=replica_refs) @router.get( "/all-storages", tags=["organizations"], response_model=List[StorageRef] From e8df4e4cad57e9e80fc8355bf5d6ec81318bca93 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 15:10:44 -0400 Subject: [PATCH 42/69] Fix unsetting of presigned URLs --- backend/btrixcloud/orgs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index f5e5b8fda4..25fef00707 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -547,8 +547,8 @@ async def add_or_remove_file_replica_storage_refs( async def unset_file_presigned_urls(self, org: Organization) -> None: """Unset all presigned URLs for files in org""" await self.crawls_db.update_many( - {"oid": org.id, "files.$.presignedUrl": {"$ne": None}}, - {"$set": {"files.$.presignedUrl": None}}, + {"oid": org.id}, + {"$set": {"files.$[].presignedUrl": None}}, ) async def update_subscription_data( From 82c4ef5bf52be0b3a3eb3c7ea19a111e464678b7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 15:46:23 -0400 Subject: [PATCH 43/69] Add --progress flag to rclone copy command --- chart/app-templates/copy_job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index 971a10e675..f727a6deea 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -86,7 +86,7 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "copy", "--checksum", "prev:{{ prev_bucket }}/{{ oid }}", "new:{{ new_bucket }}/{{ oid }}"] + command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}/{{ oid }}", "new:{{ new_bucket }}/{{ oid }}"] resources: limits: memory: "200Mi" From 7946d1a6a487e26f6c7ff0030cf43f28e58df698 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 20:14:55 -0400 Subject: [PATCH 44/69] Increase ttl seconds after finished for testing on dev --- chart/app-templates/copy_job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index f727a6deea..bdb09b4db7 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -8,7 +8,7 @@ metadata: btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 0 + ttlSecondsAfterFinished: 300 backoffLimit: 3 template: spec: From cc93be6f5de783789ac2b4a5e0deb3adce21aa72 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 20:37:26 -0400 Subject: [PATCH 45/69] Ensure there are no double slashes between bucket name and oid --- backend/btrixcloud/background_jobs.py | 4 ++++ chart/app-templates/copy_job.yaml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 6656428310..eb1d4248f4 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -489,6 +489,10 @@ async def create_copy_bucket_job( new_storage = 
self.storage_ops.get_org_storage_by_ref(org, new_storage_ref) new_endpoint, new_bucket = self.strip_bucket(new_storage.endpoint_url) + # Ensure buckets terminate with trailing slash + prev_bucket = os.path.join(prev_bucket, "") + new_bucket = os.path.join(new_bucket, "") + job_type = BgJobType.COPY_BUCKET.value try: diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index bdb09b4db7..8c2514ab26 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -86,7 +86,7 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}/{{ oid }}", "new:{{ new_bucket }}/{{ oid }}"] + command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] resources: limits: memory: "200Mi" From da320f277b8cb0d6c30399ea14fba64ceeedc6df Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 20:53:40 -0400 Subject: [PATCH 46/69] Increase memory limit/request for copy job to 500Mi --- chart/app-templates/copy_job.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index 8c2514ab26..3d8e5a4a03 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -89,8 +89,8 @@ spec: command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] resources: limits: - memory: "200Mi" + memory: "500Mi" requests: - memory: "200Mi" + memory: "500Mi" cpu: "50m" From e1f3ebde3ed0624c0058232d4ec5c493182fd432 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 21:00:44 -0400 Subject: [PATCH 47/69] Reduce copy job ttlSecondsAfterFinished to 60 --- chart/app-templates/copy_job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index 3d8e5a4a03..501f7ef749 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -8,7 +8,7 @@ metadata: btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 300 + ttlSecondsAfterFinished: 60 backoffLimit: 3 template: spec: From e7ae3308d27ae8da233ba200e9dc4a4211622cd2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 16 Oct 2024 23:08:22 -0400 Subject: [PATCH 48/69] Add storage tag to API endpoints --- backend/btrixcloud/storages.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index fa8d81b772..b5c75f5501 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -952,7 +952,9 @@ def init_storages_api( router = org_ops.router org_owner_dep = org_ops.org_owner_dep - @router.get("/storage", tags=["organizations"], response_model=OrgStorageRefs) + @router.get( + "/storage", tags=["organizations", "storage"], response_model=OrgStorageRefs + ) def get_storage_refs( org: Organization = Depends(org_owner_dep), ): @@ -964,7 +966,9 @@ def get_storage_refs( return OrgStorageRefs(storage=org.storage, storageReplicas=replica_refs) @router.get( - "/all-storages", tags=["organizations"], response_model=List[StorageRef] + "/all-storages", + tags=["organizations", "storage"], + response_model=List[StorageRef], ) def get_available_storages(org: Organization = Depends(org_owner_dep)): return storage_ops.get_available_storages(org) 
@@ -993,7 +997,9 @@ async def clear_all_presigned_urls(user: User = Depends(user_dep)): return {"success": True} @router.post( - "/custom-storage", tags=["organizations"], response_model=AddedResponseName + "/custom-storage", + tags=["organizations", "storage"], + response_model=AddedResponseName, ) async def add_custom_storage( storage: S3StorageIn, @@ -1006,7 +1012,9 @@ async def add_custom_storage( return await storage_ops.add_custom_storage(storage, org) @router.delete( - "/custom-storage/{name}", tags=["organizations"], response_model=DeletedResponse + "/custom-storage/{name}", + tags=["organizations", "storage"], + response_model=DeletedResponse, ) async def remove_custom_storage( name: str, @@ -1018,7 +1026,9 @@ async def remove_custom_storage( return await storage_ops.remove_custom_storage(name, org) - @router.post("/storage", tags=["organizations"], response_model=UpdatedResponseId) + @router.post( + "/storage", tags=["organizations", "storage"], response_model=UpdatedResponseId + ) async def update_storage_ref( storage: OrgStorageRef, background_tasks: BackgroundTasks, @@ -1031,7 +1041,9 @@ async def update_storage_ref( return await storage_ops.update_storage_ref(storage, org, background_tasks) @router.post( - "/storage-replicas", tags=["organizations"], response_model=UpdatedResponse + "/storage-replicas", + tags=["organizations", "storage"], + response_model=UpdatedResponse, ) async def update_storage_replica_refs( storage: OrgStorageReplicaRefs, From 7e074d0a815baef80b40cdce80dca5bacd954a75 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 09:45:45 -0400 Subject: [PATCH 49/69] Add flags to rclone to reduce memory usage, set limit to 350Mi --- chart/app-templates/copy_job.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index 501f7ef749..cae8a3a449 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -86,11 +86,11 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "--progress", "copy", "--checksum", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] + command: ["rclone", "-vv", "--progress", "copy", "--checksum", "--use-mmap", "--transfers=2", "--checkers=2", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] resources: limits: - memory: "500Mi" + memory: "350Mi" requests: - memory: "500Mi" + memory: "350Mi" cpu: "50m" From 74978939548463864d2bfde92bbab804bfc437cc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 10:18:51 -0400 Subject: [PATCH 50/69] Fix positional operator in storage ref update --- backend/btrixcloud/orgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 25fef00707..c5dc61cd58 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -536,7 +536,7 @@ async def add_or_remove_file_replica_storage_refs( await self.crawls_db.update_many( {"oid": org.id}, - {verb: {"files.$.replicas": dict(storage_ref)}}, + {verb: {"files.$[].replicas": dict(storage_ref)}}, ) await self.profiles_db.update_many( From 0c861e164d01f5e3f9a9c50d069cfc086ac19b9b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 10:27:03 -0400 Subject: [PATCH 51/69] One more positional operator fix --- backend/btrixcloud/orgs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/orgs.py 
b/backend/btrixcloud/orgs.py index c5dc61cd58..907361f9a1 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -516,8 +516,8 @@ async def update_file_storage_refs( ) -> None: """Update storage refs for all crawl and profile files in given org""" await self.crawls_db.update_many( - {"oid": org.id, "files.$.storage": dict(previous_storage)}, - {"$set": {"files.$.storage": dict(new_storage)}}, + {"oid": org.id}, + {"$set": {"files.$[].storage": dict(new_storage)}}, ) await self.profiles_db.update_many( From 12eb105f3174359d8ffbfa7b8c20c38c13ab6146 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 10:34:17 -0400 Subject: [PATCH 52/69] Update docstrings and comments --- backend/btrixcloud/storages.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index b5c75f5501..60ffdee217 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -236,7 +236,7 @@ async def add_custom_storage( async def remove_custom_storage( self, name: str, org: Organization ) -> dict[str, bool]: - """remove custom storage""" + """Remove custom storage from org""" if org.storage.custom and org.storage.name == name: raise HTTPException(status_code=400, detail="storage_in_use") @@ -263,7 +263,7 @@ async def update_storage_ref( org: Organization, background_tasks: BackgroundTasks, ) -> dict[str, Union[bool, Optional[str]]]: - """update storage for org""" + """Update storage for org""" storage_ref = storage_refs.storage try: @@ -301,9 +301,6 @@ async def update_storage_ref( org, prev_storage_ref, storage_ref ) - # This runs only two update_many mongo commands, so should be safe to run - # in a FastAPI background task rather than requiring a full Browsertrix - # Background job background_tasks.add_task( self._run_post_storage_update_tasks, org, @@ -332,7 +329,7 @@ async def update_storage_replica_refs( org: Organization, background_tasks: BackgroundTasks, ) -> dict[str, bool]: - """update storage for org""" + """Update replica storage for org""" replicas = storage_refs.storageReplicas @@ -362,11 +359,6 @@ async def update_storage_replica_refs( await self.org_ops.update_storage_refs(org, replicas=True) - # This only kicks off background jobs and runs a few update_many mongo - # commands, so it might be fine to keep as a FastAPI background job. 
- # Consider moving to a Browsertrix background job, but that may make - # retrying difficult as the job which would be retried also kicks off - # other background jobs which may have been successful already background_tasks.add_task( self._run_post_storage_replica_update_tasks, org, From 8d50dd0712c8540b9a621abc0bd9c61d62f6cbde Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:16:55 -0400 Subject: [PATCH 53/69] Make all-storages response valid JSON with response model --- backend/btrixcloud/models.py | 7 +++++++ backend/btrixcloud/storages.py | 7 ++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 7e0351104c..3f95a9a19d 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1652,6 +1652,13 @@ class OrgStorageRefs(BaseModel): storageReplicas: List[StorageRef] = [] +# ============================================================================ +class OrgAllStorages(BaseModel): + """Response model for listing all available storages""" + + allStorages: List[StorageRef] + + # ============================================================================ class S3StorageIn(BaseModel): """Custom S3 Storage input model""" diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 60ffdee217..9c9c283613 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -51,6 +51,7 @@ OrgStorageRefs, OrgStorageRef, OrgStorageReplicaRefs, + OrgAllStorages, DeletedResponse, UpdatedResponse, UpdatedResponseId, @@ -397,14 +398,14 @@ async def _run_post_storage_replica_update_tasks( org, replica_storage, remove=True ) - def get_available_storages(self, org: Organization) -> List[StorageRef]: + def get_available_storages(self, org: Organization) -> Dict[str, List[StorageRef]]: """return a list of available default + custom storages""" refs: List[StorageRef] = [] for name in self.default_storages: refs.append(StorageRef(name=name, custom=False)) for name in org.customStorages: refs.append(StorageRef(name=name, custom=True)) - return refs + return {"allStorages": refs} @asynccontextmanager async def get_s3_client( @@ -960,7 +961,7 @@ def get_storage_refs( @router.get( "/all-storages", tags=["organizations", "storage"], - response_model=List[StorageRef], + response_model=OrgAllStorages, ) def get_available_storages(org: Organization = Depends(org_owner_dep)): return storage_ops.get_available_storages(org) From 9c391dbcd4f39b4703904249f89a9542fb749e60 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:38:45 -0400 Subject: [PATCH 54/69] Add admin docs for storage --- .../docs/docs/deploy/admin/org-storage.md | 74 +++++++++++++++++++ frontend/docs/mkdocs.yml | 1 + 2 files changed, 75 insertions(+) create mode 100644 frontend/docs/docs/deploy/admin/org-storage.md diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md new file mode 100644 index 0000000000..b8bf11a2ad --- /dev/null +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -0,0 +1,74 @@ +# Org Storage + +This guide covers configuring storage for an organization in Browsertrix. + +By default, all organizations will use the default storage and default replica locations (if any are configured) set in the Helm chart. + +The Browsertrix API supports adding custom storage locations per organization and configuring the organization to use custom storages for primary and/or replica storage. 
These endpoints are available to superusers only. + +## Adding Custom Storage + +The first step in configuring custom storage for an organization is to add the S3 buckets to the organization, e.g.: + +```sh +curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/custom-storage --data '{"name": "new-custom-storage", "access_key": "<access-key>", "secret_key": "<secret-key>", "bucket": "new-custom-storage", "endpoint_url": "https://s3-provider.example.com/"}' +``` + +Verify that the custom storage has been added to the organization by checking the `/all-storages` API endpoint: + +```sh +curl -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/all-storages +``` + +The storage reference for our new custom storage should be present in the returned JSON, e.g.: + +```json +{ + "allStorages": [ + {"name": "default", "custom": false}, + {"name": "default-replica", "custom": false}, + {"name": "new-custom-storage", "custom": true} + ] +} +``` + +The custom storage is now ready to be configured. + + +## Updating Org Storage + +Each organization has one primary storage location. It is possible to configure the organization to use any of the storage options listed in the `/all-storages` API endpoint as primary storage, e.g.: + +```sh +curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/storage --data '{"storage": {"name": "new-custom-storage", "custom": true}}' +``` + +If any crawls, uploads, or browser profiles have been created on the organization, modifying the primary storage will disable archiving on the organization while files are migrated from the previous to the new storage location. Archiving is re-enabled when the migration completes. + +At this time, all files are copied from the prior storage location to the new storage location, and are not automatically deleted from their source location. + + +## Updating Org Replica Storage + +Each organization can have any number of replica storage locations. These locations serve as replicas of the content in the primary storage location, and are most commonly used as backups. + +It is possible to configure the organization to use any of the storage options listed in the `/all-storages` API endpoint as replica storage, e.g.: + +```sh +curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/storage --data '{"storageReplicas": [{"name": "default-replica", "custom": false}, {"name": "new-custom-storage", "custom": true}]}' +``` + +If any crawls, uploads, or browser profiles have been created on the organization, adding a new replica location will result in a background job to replicate all of the organization's files from primary storage to the new replica location. Unlike with updating primary storage, this process will not disable archiving on the organization. + +If any crawls, uploads, or browser profiles have been created on the organization, removing a previously used replica location will result in database updates to reflect that the prior replica location is no longer available. At this time, no files are automatically deleted from the prior replica location.
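+The storage updates above can also be scripted. A minimal sketch for the primary-storage case with Python `requests` (illustrative only; `<token>` and `<org-id>` are placeholders, and polling uses the generic background jobs API):
+
+```python
+# Sketch: switch primary storage, then poll the background copy job.
+import time
+
+import requests
+
+API = "https://app.browsertrix.com/api"
+HEADERS = {"Authorization": "Bearer <token>", "Content-type": "application/json"}
+ORG_ID = "<org-id>"
+
+r = requests.post(
+    f"{API}/orgs/{ORG_ID}/storage",
+    headers=HEADERS,
+    json={"storage": {"name": "new-custom-storage", "custom": True}},
+)
+r.raise_for_status()
+job_id = r.json().get("id")
+
+# A job id is only returned when files must be copied; the org remains
+# read-only until the copy job finishes.
+while job_id:
+    job = requests.get(f"{API}/orgs/{ORG_ID}/jobs/{job_id}", headers=HEADERS).json()
+    if job.get("finished"):
+        print("copy job success:", job.get("success"))
+        break
+    time.sleep(10)
+```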
+ +## Removing Custom Storage + +It is also possible to remove a custom storage from an organization, referencing the name of the storage to be deleted in the API endpoint path, e.g.: + +```sh +curl -X DELETE -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/custom-storage/new-custom-storage +``` + +The custom storage location to be deleted must not be in use on the organization, or else the endpoint will return `400`. Default storage locations shared between organizations cannot be deleted with this endpoint. diff --git a/frontend/docs/mkdocs.yml b/frontend/docs/mkdocs.yml index bc7d16a203..5982e7d094 100644 --- a/frontend/docs/mkdocs.yml +++ b/frontend/docs/mkdocs.yml @@ -86,6 +86,7 @@ nav: - Administration: - deploy/admin/upgrade-notes.md - deploy/admin/org-import-export.md + - deploy/admin/org-storage.md - Development: - develop/index.md - develop/local-dev-setup.md From b824bab0da2a24b9fa781bf6b4a84d6fb180c7ea Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:42:31 -0400 Subject: [PATCH 55/69] Fix API endpoint path in docs example --- frontend/docs/docs/deploy/admin/org-storage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md index b8bf11a2ad..a9282dc9af 100644 --- a/frontend/docs/docs/deploy/admin/org-storage.md +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -55,7 +55,7 @@ Each organization can have any number of replica storage locations. These locati It is possible to configure the organization to use any of the storage options listed in the `/all-storages` API endpoint as replica storage, e.g.: ```sh -curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/storage --data '{"storageReplicas": [{"name": "default-replica", "custom": false}, {"name": "new-custom-storage", "custom": true}]}' +curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/storage-replicas --data '{"storageReplicas": [{"name": "default-replica", "custom": false}, {"name": "new-custom-storage", "custom": true}]}' ``` If any crawls, uploads, or browser profiles have been created on the organization, adding a new replica location will result in a background job to replicate all of the organization's files from primary storage to the new replica location. Unlike with updating primary storage, this process will not disable archiving on the organization.
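A note on the database side of these storage changes: MongoDB's positional operator `$` updates only the first array element matched by the query, while the all-positional operator `$[]` updates every element. That distinction is what the presigned-URL and storage-ref fixes earlier in this series hinge on. A short Motor sketch of the `$[]` form (connection URI, database, and collection names are illustrative):

```python
# Sketch: push a replica ref onto every entry of each crawl's `files`
# array; with "$" instead of "$[]", only the first matched element
# would be updated.
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


async def main() -> None:
    crawls = AsyncIOMotorClient("mongodb://localhost:27017")["btrix-demo"]["crawls"]
    await crawls.update_many(
        {"oid": "example-org-id"},
        {"$push": {"files.$[].replicas": {"name": "custom-replica", "custom": True}}},
    )


asyncio.run(main())
```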
From f986efbc11f91fb1ac2e72991b81b907abe5821e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:43:38 -0400 Subject: [PATCH 56/69] Docs typo fix --- frontend/docs/docs/deploy/admin/org-storage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md index a9282dc9af..2c3b613565 100644 --- a/frontend/docs/docs/deploy/admin/org-storage.md +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -45,7 +45,7 @@ curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer From: Tessa Walsh Date: Thu, 17 Oct 2024 11:54:21 -0400 Subject: [PATCH 57/69] Add provider field note --- frontend/docs/docs/deploy/admin/org-storage.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md index 2c3b613565..7f0867b3a7 100644 --- a/frontend/docs/docs/deploy/admin/org-storage.md +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -14,6 +14,8 @@ curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/custom-storage --data '{"name": "new-custom-storage", "access_key": "<access-key>", "secret_key": "<secret-key>", "bucket": "new-custom-storage", "endpoint_url": "https://s3-provider.example.com/"}' ``` +If desired, the `provider` field can be set to any of the [values supported by rclone for S3 providers](https://rclone.org/s3/#s3-provider). By default, this field is set to "Other", which should work for nearly all S3 storage providers. This value is used solely for migrations from rclone when updating org storage and/or replica locations. + Verify that the custom storage has been added to the organization by checking the `/all-storages` API endpoint: From 23f388714194979e4890906db0b4831477e99b32 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:54:58 -0400 Subject: [PATCH 58/69] Docs language cleanup --- frontend/docs/docs/deploy/admin/org-storage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/docs/docs/deploy/admin/org-storage.md b/frontend/docs/docs/deploy/admin/org-storage.md index 7f0867b3a7..ad5563f77d 100644 --- a/frontend/docs/docs/deploy/admin/org-storage.md +++ b/frontend/docs/docs/deploy/admin/org-storage.md @@ -14,7 +14,7 @@ curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer <token>" https://app.browsertrix.com/api/orgs/<org-id>/custom-storage --data '{"name": "new-custom-storage", "access_key": "<access-key>", "secret_key": "<secret-key>", "bucket": "new-custom-storage", "endpoint_url": "https://s3-provider.example.com/"}' ``` -If desired, the `provider` field can be set to any of the [values supported by rclone for S3 providers](https://rclone.org/s3/#s3-provider). By default, this field is set to "Other", which should work for nearly all S3 storage providers. This value is used solely for migrations from rclone when updating org storage and/or replica locations. +If desired, the `provider` field can be set to any of the [values supported by rclone for S3 providers](https://rclone.org/s3/#s3-provider). By default, this field is set to "Other", which should work for nearly all S3 storage providers. This value is used solely for migrating existing files with rclone when updating org storage and/or replica locations.
Verify that the custom storage has been added to the organization by checking the `/all-storages` API endpoint: From a695d839899f040a7db65a87ee918bcdf806667f Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 11:58:23 -0400 Subject: [PATCH 59/69] Check /all-storages in backend tests --- backend/test/test_org.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/backend/test/test_org.py b/backend/test/test_org.py index c8faf67760..78011ac1b6 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -268,7 +268,10 @@ def test_add_custom_storage_doesnt_verify(admin_auth_headers): }, ) assert r.status_code == 400 - assert r.json()["detail"] == "Could not verify custom storage. Check credentials are valid?" + assert ( + r.json()["detail"] + == "Could not verify custom storage. Check credentials are valid?" + ) def test_add_custom_storage(admin_auth_headers): @@ -305,6 +308,16 @@ def test_add_custom_storage(admin_auth_headers): assert data["added"] assert data["name"] == CUSTOM_REPLICA_STORAGE_NAME + # verify custom storages are now available on org + r = requests.get( + f"{API_PREFIX}/orgs/{new_oid}/all-storages", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + all_storages = r.json()["allStorages"] + assert {"name": CUSTOM_PRIMARY_STORAGE_NAME, "custom": True} in all_storages + assert {"name": CUSTOM_REPLICA_STORAGE_NAME, "custom": True} in all_storages + # set org to use custom storage moving forward r = requests.post( f"{API_PREFIX}/orgs/{new_oid}/storage", From e7958d3132ed003cd1e46a87334e70846939f72c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 17 Oct 2024 22:45:20 -0400 Subject: [PATCH 60/69] Add API endpoint for background job progress --- backend/btrixcloud/background_jobs.py | 58 +++++++++++++++++++++++++++ backend/btrixcloud/crawlmanager.py | 16 ++++++++ backend/btrixcloud/models.py | 8 ++++ chart/app-templates/copy_job.yaml | 5 ++- 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index eb1d4248f4..ad3b245812 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -30,6 +30,7 @@ User, SuccessResponse, SuccessResponseId, + JobProgress, ) from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import dt_now @@ -639,6 +640,52 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): return DeleteOrgJob.from_dict(data) + async def get_job_progress(self, job_id: str) -> JobProgress: + """Return progress of background job for supported types""" + job = await self.get_background_job(job_id) + + if job.type != BgJobType.COPY_BUCKET: + raise HTTPException(status_code=403, detail="job_type_not_supported") + + if job.success is False: + raise HTTPException(status_code=400, detail="job_failed") + + if job.finished: + return JobProgress(percentage=1.0) + + log_tail = await self.crawl_manager.tail_background_job(job_id) + if not log_tail: + raise HTTPException(status_code=400, detail="job_log_not_available") + + lines = log_tail.splitlines() + reversed_lines = list(reversed(lines)) + + progress = JobProgress(percentage=0.0) + + # Parse lines in reverse order until we find one with latest stats + for line in reversed_lines: + try: + if "ETA" not in line: + continue + + stats_groups = line.split(",") + for group in stats_groups: + group = group.strip() + if "%" in group: + progress.percentage = float(group.strip("%")) / 100 + if "ETA" in group: + eta_str = 
group.strip("ETA ") + # Split on white space to remove byte mark rclone sometimes + # adds to end of stats line + eta_list = eta_str.split(" ") + progress.eta = eta_list[0] + + break + except: + continue + + return progress + async def list_background_jobs( self, org: Optional[Organization] = None, @@ -894,6 +941,17 @@ async def get_org_background_job( """Retrieve information for background job""" return await ops.get_background_job(job_id, org.id) + @router.get( + "/{job_id}/progress", + response_model=JobProgress, + ) + async def get_job_progress( + job_id: str, + org: Organization = Depends(org_crawl_dep), + ): + """Return progress information for background job""" + return await ops.get_job_progress(job_id) + @app.get("/orgs/all/jobs/{job_id}", response_model=AnyJob, tags=["jobs"]) async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)): """Get background job from any org""" diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 4044c78f00..90aed035c6 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -436,6 +436,22 @@ async def delete_crawl_config_by_id(self, cid: str) -> None: """Delete all crawl configs by id""" await self._delete_crawl_configs(f"btrix.crawlconfig={cid}") + async def tail_background_job(self, job_id: str) -> str: + """Tail running background job pod""" + pods = await self.core_api.list_namespaced_pod( + namespace=self.namespace, + label_selector=f"batch.kubernetes.io/job-name={job_id}", + ) + + if not pods.items: + return "" + + pod_name = pods.items[0].metadata.name + + return await self.core_api.read_namespaced_pod_log( + pod_name, self.namespace, tail_lines=10 + ) + # ======================================================================== # Internal Methods async def _delete_crawl_configs(self, label) -> None: diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 3f95a9a19d..6306d5cab4 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2686,6 +2686,14 @@ class CopyBucketJob(BackgroundJob): ] +# ============================================================================ +class JobProgress(BaseModel): + """Model for reporting background job progress""" + + percentage: float + eta: Optional[str] = None + + # ============================================================================ ### GENERIC RESPONSE MODELS ### diff --git a/chart/app-templates/copy_job.yaml b/chart/app-templates/copy_job.yaml index cae8a3a449..61a8febcde 100644 --- a/chart/app-templates/copy_job.yaml +++ b/chart/app-templates/copy_job.yaml @@ -3,12 +3,13 @@ kind: Job metadata: name: "{{ id }}" labels: + job-id: "{{ id }}" role: "background-job" job_type: {{ job_type }} btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 60 + ttlSecondsAfterFinished: 30 backoffLimit: 3 template: spec: @@ -86,7 +87,7 @@ spec: - name: RCLONE_CONFIG_NEW_ENDPOINT value: "{{ new_endpoint }}" - command: ["rclone", "-vv", "--progress", "copy", "--checksum", "--use-mmap", "--transfers=2", "--checkers=2", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] + command: ["rclone", "-v", "--stats-one-line", "--stats", "10s", "copy", "--checksum", "--use-mmap", "--transfers=2", "--checkers=2", "prev:{{ prev_bucket }}{{ oid }}", "new:{{ new_bucket }}{{ oid }}"] resources: limits: memory: "350Mi" From da14020710ee7c2b9e96b09abed00ec0a11190e5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 18 Oct 2024 01:09:14 -0400 Subject: [PATCH 61/69] Fix 
linting --- backend/btrixcloud/background_jobs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index ad3b245812..58379f0112 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -681,6 +681,7 @@ async def get_job_progress(self, job_id: str) -> JobProgress: progress.eta = eta_list[0] break + # pylint: disable=bare-except except: continue @@ -947,6 +948,7 @@ async def get_org_background_job( ) async def get_job_progress( job_id: str, + # pylint: disable=unused-argument org: Organization = Depends(org_crawl_dep), ): """Return progress information for background job""" return await ops.get_job_progress(job_id) From d17075f78331b2c121ccb5ccbec7aed2764b241c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Dec 2024 16:56:00 -0500 Subject: [PATCH 62/69] Format post-rebase with Black --- backend/btrixcloud/main_op.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index d7e2948e93..9c8ea34587 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -25,7 +25,6 @@ def main(): ) sys.exit(1) - ( org_ops, crawl_config_ops, From 46172e3d85aa569b9ec1b7bacd27a003d5b03aee Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 24 Jan 2025 09:59:26 -0500 Subject: [PATCH 63/69] Format with Black --- backend/btrixcloud/background_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 58379f0112..d59ebc0efe 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -432,7 +432,7 @@ async def create_re_add_org_pages_job( return job_id # pylint: disable=broad-exception-caught except Exception as exc: - # pylint: disable=raise-missing-from + # pylint: disable=raise-missing-from print(f"warning: re-add org pages job could not be started: {exc}") return None From 698b692cd5f280623d75a9cd5983067b5235211c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 24 Jan 2025 10:03:02 -0500 Subject: [PATCH 64/69] Fix linting --- backend/btrixcloud/background_jobs.py | 2 -- backend/btrixcloud/db.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index d59ebc0efe..4bd1652ad8 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -306,8 +306,6 @@ async def create_delete_org_job( ) -> str: """Create background job to delete org and its data""" - job_type = BgJobType.DELETE_ORG.value - try: job_id = await self.crawl_manager.run_delete_org_job( oid=str(org.id), diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 1567bbcfa4..dafd99f59d 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -131,7 +131,7 @@ async def update_and_prepare_db( # ============================================================================ -# pylint: disable=too-many-locals, too-many-arguments +# pylint: disable=too-many-locals, too-many-arguments, too-many-positional-arguments async def run_db_migrations( mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops ): From b74017cdfffbd603bab8705a6c5e2652401ded37 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 24 Feb 2025 14:42:21 -0500 Subject: [PATCH 65/69] Fix linting --- backend/btrixcloud/background_jobs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git
a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 4bd1652ad8..de19499828 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -31,6 +31,7 @@ SuccessResponse, SuccessResponseId, JobProgress, + BackgroundJob, ) from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import dt_now @@ -44,7 +45,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes +# pylint: disable=too-many-instance-attributes, too-many-lines, too-many-return-statements, too-many-public-methods class BackgroundJobOps: """k8s background job management""" @@ -807,6 +808,7 @@ async def retry_org_background_job( self, job: BackgroundJob, org: Organization ) -> Dict[str, Union[bool, Optional[str]]]: """Retry background job specific to one org""" + # pylint: disable=too-many-return-statements if job.type == BgJobType.CREATE_REPLICA: job = cast(CreateReplicaJob, job) file = await self.get_replica_job_file(job, org) @@ -872,7 +874,7 @@ async def retry_org_background_job( org, job.prev_storage, job.new_storage, - existing_job_id=job_id, + existing_job_id=job.id, ) return {"success": True} From 0d44854b355ece21c102e6f0d919869159c2119d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 24 Feb 2025 14:48:29 -0500 Subject: [PATCH 66/69] Cast job to CopyBucketJob --- backend/btrixcloud/background_jobs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index de19499828..786d4f97fb 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -870,6 +870,7 @@ async def retry_org_background_job( return {"success": True} if job.type == BgJobType.COPY_BUCKET: + job = cast(CopyBucketJob, job) await self.create_copy_bucket_job( org, job.prev_storage, From d220318d6c3cd25ac5c84f2f02d09e1f0b52e597 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 24 Feb 2025 14:53:11 -0500 Subject: [PATCH 67/69] Fix type errors from rebase --- backend/btrixcloud/background_jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 786d4f97fb..4236c685b5 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -567,8 +567,8 @@ async def job_finished( await self.handle_delete_replica_job_finished( cast(DeleteReplicaJob, job) ) - if job_type == BgJobType.COPY_BUCKET: - org = await self.org_ops.get_org_by_id(oid) + if job_type == BgJobType.COPY_BUCKET and job.oid: + org = await self.org_ops.get_org_by_id(job.oid) await self.org_ops.update_read_only(org, False) else: print( From 6eafcbf4e0c2a8c97afbbc1f374cb496de2f594a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 22 Apr 2025 12:17:55 -0400 Subject: [PATCH 68/69] Reformat --- backend/btrixcloud/storages.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 9c9c283613..98afa4a3fa 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -933,7 +933,11 @@ def _parse_json(line) -> dict: # ============================================================================ def init_storages_api( - org_ops: OrgOps, crawl_manager: CrawlManager, app: APIRouter, mdb, user_dep: Callable + org_ops: OrgOps, + crawl_manager: CrawlManager, + app: APIRouter, + mdb, + user_dep: Callable, 
) -> StorageOps: """API for updating storage for an org""" From 3a65fee1ba84a72fe647a61b6478b30dfc8d1fce Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 22 Apr 2025 12:23:56 -0400 Subject: [PATCH 69/69] Fix linting --- backend/btrixcloud/storages.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 98afa4a3fa..3e866894ac 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -78,6 +78,7 @@ # ============================================================================ # pylint: disable=broad-except,raise-missing-from,too-many-public-methods, too-many-positional-arguments +# pylint: disable=too-many-lines,too-many-instance-attributes class StorageOps: """All storage handling, download/upload operations""" @@ -932,6 +933,7 @@ def _parse_json(line) -> dict: # ============================================================================ +# pylint: disable=too-many-locals def init_storages_api( org_ops: OrgOps, crawl_manager: CrawlManager,