Skip to content

Add pageCount to crawls and uploads and use in frontend for page counts #2315

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .migrations import BaseMigration


CURR_DB_VERSION = "0039"
CURR_DB_VERSION = "0040"


# ============================================================================
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
Migration 0040 -- archived item pageCount
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0040"


class Migration(BaseMigration):
    """Backfill ``pageCount`` on archived items created before the field existed."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        # page_ops is injected by the migration runner; may be absent in
        # contexts (e.g. tests) where page operations are unavailable.
        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration up.

        Calculate and store pageCount for archived items that don't have it yet
        """
        # Without page_ops we cannot recount pages — log and bail out.
        if self.page_ops is None:
            print(
                "Unable to set pageCount for archived items, missing page_ops",
                flush=True,
            )
            return

        crawls_mdb = self.mdb["crawls"]

        # {"pageCount": None} matches docs where the field is missing or null.
        async for item in crawls_mdb.find({"pageCount": None}):
            item_id = item["_id"]
            try:
                await self.page_ops.set_archived_item_page_count(item_id)
            # Best-effort per item: one failure must not abort the migration.
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Error saving pageCount for archived item {item_id}: {err}",
                    flush=True,
                )
5 changes: 5 additions & 0 deletions backend/btrixcloud/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,8 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel):

reviewStatus: ReviewStatus = None

pageCount: Optional[int] = 0

filePageCount: Optional[int] = 0
errorPageCount: Optional[int] = 0

Expand Down Expand Up @@ -872,6 +874,7 @@ class CrawlOut(BaseMongoModel):
lastQAState: Optional[str] = None
lastQAStarted: Optional[datetime] = None

pageCount: Optional[int] = 0
filePageCount: Optional[int] = 0
errorPageCount: Optional[int] = 0

Expand Down Expand Up @@ -1914,6 +1917,8 @@ class OrgMetrics(BaseModel):
crawlCount: int
uploadCount: int
pageCount: int
crawlPageCount: int
uploadPageCount: int
profileCount: int
workflowsRunningCount: int
maxConcurrentCrawls: int
Expand Down
1 change: 1 addition & 0 deletions backend/btrixcloud/operator/crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -1503,6 +1503,7 @@ async def do_crawl_finished_tasks(
)

if state in SUCCESSFUL_STATES and crawl.oid:
await self.page_ops.set_archived_item_page_count(crawl.id)
await self.org_ops.inc_org_bytes_stored(
crawl.oid, status.filesAddedSize, "crawl"
)
Expand Down
11 changes: 9 additions & 2 deletions backend/btrixcloud/orgs.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,10 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]:
archived_item_count = 0
crawl_count = 0
upload_count = 0

page_count = 0
crawl_page_count = 0
upload_page_count = 0

async for item_data in self.crawls_db.find({"oid": org.id}):
item = BaseCrawl.from_dict(item_data)
Expand All @@ -948,10 +951,12 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]:
archived_item_count += 1
if item.type == "crawl":
crawl_count += 1
crawl_page_count += item.pageCount or 0
if item.type == "upload":
upload_count += 1
if item.stats:
page_count += item.stats.done
upload_page_count += item.pageCount or 0
if item.pageCount:
page_count += item.pageCount

profile_count = await self.profiles_db.count_documents({"oid": org.id})
workflows_running_count = await self.crawls_db.count_documents(
Expand All @@ -975,6 +980,8 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]:
"crawlCount": crawl_count,
"uploadCount": upload_count,
"pageCount": page_count,
"crawlPageCount": crawl_page_count,
"uploadPageCount": upload_page_count,
"profileCount": profile_count,
"workflowsRunningCount": workflows_running_count,
"maxConcurrentCrawls": max_concurrent_crawls,
Expand Down
10 changes: 10 additions & 0 deletions backend/btrixcloud/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
if pages_buffer:
await self._add_pages_to_db(crawl_id, pages_buffer)

await self.set_archived_item_page_count(crawl_id)

print(f"Added pages for crawl {crawl_id} to db", flush=True)
# pylint: disable=broad-exception-caught, raise-missing-from
except Exception as err:
Expand Down Expand Up @@ -661,6 +663,14 @@ def get_crawl_type_from_pages_route(self, request: Request):

return crawl_type

async def set_archived_item_page_count(self, crawl_id: str):
    """Store archived item page count in crawl document"""
    # list_pages returns (pages, total); only the total is needed here.
    total = (await self.list_pages(crawl_id))[1]

    update = {"$set": {"pageCount": total}}
    await self.crawls.find_one_and_update({"_id": crawl_id}, update)


# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
Expand Down
8 changes: 8 additions & 0 deletions backend/test/test_run_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,14 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
)
assert r.status_code == 403

# Check that pageCount was stored on crawl
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["pageCount"] > 0


def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):
note_text = "testing"
Expand Down
8 changes: 8 additions & 0 deletions backend/test/test_uploads.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,14 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
assert page.get("modified") is None
assert page.get("approved") is None

# Check that pageCount was stored on upload
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["pageCount"] > 0


def test_replace_upload(
admin_auth_headers, default_org_id, uploads_collection_id, upload_id
Expand Down
19 changes: 18 additions & 1 deletion frontend/src/features/archived-items/archived-item-list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,24 @@ export class ArchivedItemListItem extends BtrixElement {
</btrix-table-cell>
<btrix-table-cell class="tabular-nums">
${isUpload
? notApplicable
? html`<sl-tooltip
hoist
@click=${this.onTooltipClick}
content=${msg(
str`${this.localize.number(
this.item.pageCount ? +this.item.pageCount : 0,
)}`,
)}
>
<div class="min-w-4">
${this.localize.number(
this.item.pageCount ? +this.item.pageCount : 0,
{
notation: "compact",
},
)}
</div>
</sl-tooltip>`
: html`<sl-tooltip
hoist
@click=${this.onTooltipClick}
Expand Down
3 changes: 2 additions & 1 deletion frontend/src/features/archived-items/crawl-list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,13 @@ export class CrawlListItem extends BtrixElement {
</btrix-table-cell>
<btrix-table-cell>
${this.safeRender((crawl) => {
const pagesComplete = +(crawl.stats?.done || 0);
const pagesFound = +(crawl.stats?.found || 0);
if (crawl.finished) {
const pagesComplete = crawl.pageCount ? +crawl.pageCount : 0;
return `${this.localize.number(pagesComplete, { notation: "compact" })} ${pluralOf("pages", pagesComplete)}`;
}

const pagesComplete = +(crawl.stats?.done || 0);
return `${this.localize.number(pagesComplete, { notation: "compact" })} / ${this.localize.number(pagesFound, { notation: "compact" })} ${pluralOf("pages", pagesFound)}`;
})}
</btrix-table-cell>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,7 @@ export class ArchivedItemDetail extends BtrixElement {
? html`${this.item.fileSize
? html`${this.localize.bytes(this.item.fileSize || 0, {
unitDisplay: "narrow",
})}${this.item.stats
})}${this.item.stats?.done
? html`<span>,</span
><span
class="tracking-tighter${this.isActive
Expand All @@ -873,7 +873,18 @@ export class ArchivedItemDetail extends BtrixElement {
<span
>${pluralOf("pages", +this.item.stats.found)}</span
>`
: ""}`
: html`<span>,</span
><span>
${this.localize.number(
this.item.pageCount ? +this.item.pageCount : 0,
)}
</span>
<span
>${pluralOf(
"pages",
this.item.pageCount ? +this.item.pageCount : 0,
)}</span
>`}`
: html`<span class="text-0-400">${msg("Unknown")}</span>`}`
: html`<sl-skeleton class="h-[16px] w-24"></sl-skeleton>`}
</btrix-desc-list-item>
Expand Down
25 changes: 24 additions & 1 deletion frontend/src/pages/org/dashboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ type Metrics = {
crawlCount: number;
uploadCount: number;
pageCount: number;
crawlPageCount: number;
uploadPageCount: number;
profileCount: number;
workflowsRunningCount: number;
maxConcurrentCrawls: number;
Expand Down Expand Up @@ -236,10 +238,31 @@ export class Dashboard extends BtrixElement {
pluralLabel: msg("Crawl Workflows Waiting"),
iconProps: { name: "hourglass-split", color: "violet" },
})}
<sl-divider
style="--spacing:var(--sl-spacing-small)"
></sl-divider>
${this.renderStat({
value: metrics.pageCount,
value: metrics.crawlPageCount,
singleLabel: msg("Page Crawled"),
pluralLabel: msg("Pages Crawled"),
iconProps: {
name: "file-richtext-fill",
color: this.colors.crawls,
},
})}
${this.renderStat({
value: metrics.uploadPageCount,
singleLabel: msg("Page Uploaded"),
pluralLabel: msg("Pages Uploaded"),
iconProps: {
name: "file-richtext-fill",
color: this.colors.uploads,
},
})}
${this.renderStat({
value: metrics.pageCount,
singleLabel: msg("Page Total"),
pluralLabel: msg("Pages Total"),
iconProps: { name: "file-richtext-fill" },
})}
</dl>
Expand Down
1 change: 1 addition & 0 deletions frontend/src/types/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ type ArchivedItemBase = {
activeQAStats: { done: number; found: number } | null;
lastQAState: CrawlState | null;
lastQAStarted: string | null;
pageCount?: number;
filePageCount?: number;
errorPageCount?: number;
};
Expand Down
1 change: 0 additions & 1 deletion frontend/webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ const childProcess = require("child_process");
const fs = require("fs");
const path = require("path");


const CopyPlugin = require("copy-webpack-plugin");
const ForkTsCheckerWebpackPlugin = require("fork-ts-checker-webpack-plugin");
const HtmlWebpackPlugin = require("html-webpack-plugin");
Expand Down
Loading
Loading