Skip to content

Minimise db changes when crawling sources #293

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 18, 2025
Merged
2 changes: 1 addition & 1 deletion MPCAutofill/MPCAutofill/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@
PATREON_URL = env("PATREON_URL", default="")

# Sentry
if len(sys.argv) >= 2 and sys.argv[1] != "runserver":
if len(sys.argv) >= 2 and sys.argv[1] != "runserver" and env("DJANGO_DEBUG", default=False) is False:
sentry_sdk.init(
dsn="https://4d29db1957fb9b3153aaba66e776b01f@o4505848489246720.ingest.sentry.io/4505848491540480",
integrations=[DjangoIntegration()],
Expand Down
2 changes: 1 addition & 1 deletion MPCAutofill/cardpicker/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class AdminTag(admin.ModelAdmin[Tag]):

@admin.register(Card)
class AdminCard(admin.ModelAdmin[Card]):
    """Admin configuration for `Card`: which columns the change list shows and which fields are searchable."""

    # `date_created` is the field previously exposed here as `date`; only one `list_display` may be
    # declared, because a second assignment in the class body silently replaces the first.
    list_display = ("identifier", "name", "source", "dpi", "date_created", "tags")
    search_fields = ("identifier", "name")


Expand Down
3 changes: 2 additions & 1 deletion MPCAutofill/cardpicker/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ class CardSearch(Document):
searchq_precise = fields.TextField(attr="searchq", analyzer=precise_analyser)
searchq_keyword = fields.KeywordField(attr="searchq")
card_type = fields.KeywordField()
date = fields.DateField()
date_created = fields.DateField()
date_modified = fields.DateField()
language = fields.TextField(analyzer=precise_analyser) # case insensitivity is one less thing which can go wrong
tags = fields.KeywordField() # all elasticsearch fields support arrays by default

Expand Down
3 changes: 0 additions & 3 deletions MPCAutofill/cardpicker/management/commands/update_database.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import time
from typing import Any, Optional

from django.core import management
from django.core.management.base import BaseCommand

from cardpicker.models import Source
Expand All @@ -23,6 +22,4 @@ def handle(self, *args: Any, **kwargs: str) -> None:
drive: Optional[str] = kwargs.get("drive", None)
t0 = time.time()
update_database(source_key=drive)
management.call_command("search_index", "--rebuild", "-f")
print("")
log_hours_minutes_seconds_elapsed(t0)
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.2.14 on 2025-05-16 09:08

from django.db import migrations


class Migration(migrations.Migration):
    """Rename the `date` field on the `card` model to `date_created`."""

    dependencies = [
        ("cardpicker", "0039_remove_card_searchq_keyword"),
    ]

    operations = [
        # A rename only - this migration neither adds nor removes a field.
        migrations.RenameField(
            model_name="card",
            old_name="date",
            new_name="date_created",
        ),
    ]
23 changes: 23 additions & 0 deletions MPCAutofill/cardpicker/migrations/0041_card_date_modified.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 4.2.14 on 2025-05-16 21:15

import datetime

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add `date_modified` to the `card` model and initialise it from `date_created` for existing rows."""

    dependencies = [
        ("cardpicker", "0040_rename_date_card_date_created"),
    ]

    operations = [
        # The field is declared with a callable default (`datetime.datetime.now`)...
        migrations.AddField(
            model_name="card",
            name="date_modified",
            field=models.DateTimeField(default=datetime.datetime.now),
        ),
        # ...and every row in `cardpicker_card` is then overwritten so that it starts out with
        # `date_modified` equal to `date_created`. The reverse direction is deliberately a no-op.
        migrations.RunSQL(
            sql="UPDATE cardpicker_card SET date_modified = date_created", reverse_sql=migrations.RunSQL.noop
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 4.2.14 on 2025-05-18 02:22

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Replace the `card_id` field on `projectmember` with a nullable `card` foreign key to `cardpicker.card`."""

    dependencies = [
        ("cardpicker", "0041_card_date_modified"),
    ]

    operations = [
        # `projectmember_unique` is dropped first and re-created as the last operation, this time
        # over the new `card` field.
        migrations.RemoveConstraint(
            model_name="projectmember",
            name="projectmember_unique",
        ),
        # NOTE(review): the old field is removed before the new one is added and no operation in this
        # migration copies values across, so existing `card_id` values look like they are discarded -
        # confirm that this is intended.
        migrations.RemoveField(
            model_name="projectmember",
            name="card_id",
        ),
        # SET_NULL + null=True: the foreign key is cleared, rather than the project member being
        # deleted, when the referenced card is deleted.
        migrations.AddField(
            model_name="projectmember",
            name="card",
            field=models.ForeignKey(
                blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to="cardpicker.card"
            ),
        ),
        migrations.AddConstraint(
            model_name="projectmember",
            constraint=models.UniqueConstraint(fields=("card", "project", "slot", "face"), name="projectmember_unique"),
        ),
    ]
33 changes: 24 additions & 9 deletions MPCAutofill/cardpicker/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,8 @@ class Card(models.Model):
dpi = models.IntegerField(default=0)
searchq = models.CharField(max_length=200)
extension = models.CharField(max_length=200)
date = models.DateTimeField(default=datetime.now)
date_created = models.DateTimeField(default=datetime.now)
date_modified = models.DateTimeField(default=datetime.now)
size = models.IntegerField()
tags = ArrayField(models.CharField(max_length=20), default=list, blank=True) # null=True is just for admin panel
language = models.CharField(max_length=5)
Expand All @@ -205,7 +206,7 @@ def __str__(self) -> str:
f"{self.name} "
f"[Type: {self.card_type}, "
f"Identifier: {self.identifier}, "
f"Uploaded: {self.date.strftime('%d/%m/%Y')}, "
f"Uploaded: {self.date_created.strftime('%d/%m/%Y')}, "
f"Priority: {self.priority}]"
)

Expand All @@ -225,7 +226,8 @@ def serialise(self) -> SerialisedCard:
dpi=self.dpi,
searchq=self.searchq,
extension=self.extension,
date=dateformat.format(self.date, DATE_FORMAT),
dateCreated=dateformat.format(self.date_created, DATE_FORMAT),
dateModified=dateformat.format(self.date_modified, DATE_FORMAT),
size=self.size,
downloadLink=self.get_download_link() or "",
smallThumbnailUrl=self.get_small_thumbnail_url() or "",
Expand Down Expand Up @@ -361,8 +363,18 @@ def set_project_members(self, records: dict[str, dict[str, list[dict[str, Any]]]
if (card_identifier := record.get("card_identifier"), None) is not None:
card_identifiers.add(card_identifier)

card_identifiers_to_pk: dict[str, Card] = {
x.identifier: x for x in Card.objects.filter(identifier__in=card_identifiers)
}
members: list[ProjectMember] = [
ProjectMember(card_id=value.get("card_identifier", None), slot=value["slot"], query=query, face=face)
ProjectMember(
card=card_identifiers_to_pk[card_identifier]
if (card_identifier := value.get("card_identifier", None)) is not None
else None,
slot=value["slot"],
query=query,
face=face,
)
for face in Faces
if (face_members := records.get(face, None)) is not None
for query, values in face_members.items()
Expand All @@ -388,19 +400,22 @@ def __str__(self) -> str:


class ProjectMember(models.Model):
    """
    One entry of a `Project`: the card (if any) chosen for a given `query` in a given `slot` and `face`.

    The card is referenced through a nullable foreign key, so a member can exist without a card.
    """

    # SET_NULL: deleting the card clears this link rather than deleting the member.
    card = models.ForeignKey(to=Card, on_delete=models.SET_NULL, null=True, blank=True)
    project = models.ForeignKey(to=Project, on_delete=models.CASCADE)
    query = models.CharField(max_length=200)
    slot = models.IntegerField()
    face = models.CharField(max_length=5, choices=Faces.choices, default=Faces.FRONT)

    class Meta:
        constraints = [models.UniqueConstraint(fields=["card", "project", "slot", "face"], name="projectmember_unique")]

    def to_dict(self) -> dict[str, Any]:
        """
        Return this member as a plain dictionary.

        `card_identifier` holds the linked card's `identifier`, or `None` when no card is linked.
        """
        card = self.card
        return {
            "card_identifier": card.identifier if card is not None else None,
            "query": self.query,
            "slot": self.slot,
            "face": self.face,
        }


__all__ = [
Expand Down
14 changes: 10 additions & 4 deletions MPCAutofill/cardpicker/schema_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,12 @@ class SourceType(str, Enum):

class Card(BaseModel):
cardType: CardType
date: str
dateCreated: str
"""Created date - formatted by backend"""

dateModified: str
"""Modified date - formatted by backend"""

downloadLink: str
dpi: int
extension: str
Expand All @@ -258,7 +261,8 @@ class Card(BaseModel):
def from_dict(obj: Any) -> "Card":
assert isinstance(obj, dict)
cardType = CardType(obj.get("cardType"))
date = from_str(obj.get("date"))
dateCreated = from_str(obj.get("dateCreated"))
dateModified = from_str(obj.get("dateModified"))
downloadLink = from_str(obj.get("downloadLink"))
dpi = from_int(obj.get("dpi"))
extension = from_str(obj.get("extension"))
Expand All @@ -279,7 +283,8 @@ def from_dict(obj: Any) -> "Card":
sourceType = from_union([SourceType, from_none], obj.get("sourceType"))
return Card(
cardType,
date,
dateCreated,
dateModified,
downloadLink,
dpi,
extension,
Expand All @@ -303,7 +308,8 @@ def from_dict(obj: Any) -> "Card":
def to_dict(self) -> dict:
result: dict = {}
result["cardType"] = to_enum(CardType, self.cardType)
result["date"] = from_str(self.date)
result["dateCreated"] = from_str(self.dateCreated)
result["dateModified"] = from_str(self.dateModified)
result["downloadLink"] = from_str(self.downloadLink)
result["dpi"] = from_int(self.dpi)
result["extension"] = from_str(self.extension)
Expand Down
4 changes: 2 additions & 2 deletions MPCAutofill/cardpicker/search/search_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,8 @@ def retrieve_cardback_identifiers(search_settings: SearchSettings) -> list[str]:
def get_new_cards_paginator(source: Source) -> Paginator[QuerySet[Card]]:
    """
    Build a paginator over the cards of `source` that were created within the last `NEW_CARDS_DAYS` days.

    The window is `[now - NEW_CARDS_DAYS days, now)`, evaluated against `date_created`. Cards are ordered
    newest first, with `name` as the tie-breaker, and split into pages of `NEW_CARDS_PAGE_SIZE`.
    """
    now = timezone.now()
    window_start = now - dt.timedelta(days=NEW_CARDS_DAYS)
    cards = Card.objects.filter(source=source, date_created__lt=now, date_created__gte=window_start).order_by(
        "-date_created", "name"
    )
    return Paginator(cards, NEW_CARDS_PAGE_SIZE)  # type: ignore  # TODO: `_SupportsPagination`


Expand Down
1 change: 1 addition & 0 deletions MPCAutofill/cardpicker/sources/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class Image:
name: str
size: int
created_time: dt.datetime
modified_time: dt.datetime
height: int
folder: Folder

Expand Down
7 changes: 5 additions & 2 deletions MPCAutofill/cardpicker/sources/source_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from tqdm import tqdm

from django.db.models import TextChoices
from django.utils.dateparse import parse_datetime
from django.utils.timezone import now
from django.utils.translation import gettext_lazy

from cardpicker.schema_types import SourceType as SchemaSourceType
Expand Down Expand Up @@ -151,7 +153,7 @@ def get_all_images_inside_folder(folder: Folder) -> list[Image]:
"mimeType contains 'image/jpeg') and "
f"'{folder.id}' in parents",
fields="nextPageToken, files("
"id, name, trashed, size, parents, createdTime, imageMediaMetadata"
"id, name, trashed, size, parents, createdTime, modifiedTime, imageMediaMetadata"
")",
pageSize=500,
pageToken=page_token,
Expand All @@ -167,7 +169,8 @@ def get_all_images_inside_folder(folder: Folder) -> list[Image]:
Image(
id=item["id"],
name=item["name"],
created_time=item["createdTime"],
created_time=parse_datetime(item["createdTime"]) or now(),
modified_time=parse_datetime(item["modifiedTime"]) or now(),
folder=folder,
height=item["imageMediaMetadata"]["height"],
size=int(item["size"]),
Expand Down
61 changes: 56 additions & 5 deletions MPCAutofill/cardpicker/sources/update_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from django.db import transaction

from cardpicker.constants import DEFAULT_LANGUAGE, MAX_SIZE_MB
from cardpicker.documents import CardSearch
from cardpicker.models import Card, CardTypes, Source
from cardpicker.search.sanitisation import to_searchable
from cardpicker.sources.api import Folder, Image
Expand Down Expand Up @@ -112,7 +113,8 @@ def transform_images_into_objects(source: Source, images: list[Image], tags: Tag
dpi=dpi,
searchq=searchable_name, # search-friendly card name
extension=extension,
date=image.created_time,
date_created=image.created_time,
date_modified=image.modified_time,
size=image.size,
tags=list(extracted_tags),
language=(language or DEFAULT_LANGUAGE).alpha_2.upper(),
Expand Down Expand Up @@ -143,10 +145,59 @@ def transform_images_into_objects(source: Source, images: list[Image], tags: Tag
def bulk_sync_objects(source: Source, cards: list[Card]) -> None:
    """
    Bring the stored cards for `source` in line with the freshly crawled `cards`, touching only what changed.

    Incoming and stored cards are matched on `identifier`:
      * identifiers only present in `cards` are created,
      * identifiers present on both sides are updated, but only if the incoming `date_modified` is newer,
      * identifiers only present in the database are deleted.
    Each of the three groups is mirrored to the `CardSearch` index straight after its database write.

    :param source: The source whose cards are being synchronised.
    :param cards: Unsaved `Card` objects describing the current contents of `source`.
    """
    print(f"Synchronising objects to database for source {TEXT_BOLD}{source.name}{TEXT_END}...", end="", flush=True)
    t0 = time.time()

    incoming = {card.identifier: card for card in cards}
    existing = {card.identifier: card for card in Card.objects.filter(source=source)}

    created: list[Card] = []
    updated: list[Card] = []
    for identifier, card in incoming.items():
        stored = existing.get(identifier)
        if stored is None:
            created.append(card)
        elif card.date_modified > stored.date_modified:
            card.pk = stored.pk  # this must be explicitly set for bulk_update.
            updated.append(card)
    deleted = [stored for identifier, stored in existing.items() if identifier not in incoming]
    deleted_ids = [card.identifier for card in deleted]

    # the index calls sit inside the atomic block, so an exception raised by any of them
    # also rolls back the database writes made so far.
    with transaction.atomic():
        if created:
            Card.objects.bulk_create(created)
            CardSearch().update(created, action="index")
        if updated:
            Card.objects.bulk_update(
                updated,
                # update every field except for `identifier`
                [
                    "card_type",
                    "name",
                    "priority",
                    "source",
                    "source_verbose",
                    "folder_location",
                    "dpi",
                    "searchq",
                    "extension",
                    "date_created",
                    "date_modified",
                    "size",
                    "tags",
                    "language",
                ],
                batch_size=1000,
            )
            # as per this thread https://github.com/django-es/django-elasticsearch-dsl/issues/224#issuecomment-551955511
            # action type "index" is used for indexing new objects as well as updating existing objects
            CardSearch().update(updated, action="index")
        if deleted:
            # scoped to `source` to mirror how `existing` was loaded above.
            Card.objects.filter(source=source, identifier__in=deleted_ids).delete()
            CardSearch().update(deleted, action="delete")
    print(
        f" and done! That took {TEXT_BOLD}{(time.time() - t0):.2f}{TEXT_END} seconds.\n"
        f"Created {TEXT_BOLD}{len(created)}{TEXT_END}, "
        f"updated {TEXT_BOLD}{len(updated)}{TEXT_END}, "
        f"and deleted {TEXT_BOLD}{len(deleted_ids)}{TEXT_END} cards."
    )


def update_database_for_source(source: Source, source_type: Type[SourceType], root_folder: Folder, tags: Tags) -> None:
Expand Down
Loading