Skip to content

fix: PLT-802: _update_tasks_states job optimization #7804

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions label_studio/core/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,7 @@
DATA_MANAGER_ACTIONS = {}
DATA_MANAGER_CUSTOM_FILTER_EXPRESSIONS = 'data_manager.functions.custom_filter_expressions'
DATA_MANAGER_PREPROCESS_FILTER = 'data_manager.functions.preprocess_filter'
# Dotted path to the callable that bulk-updates `is_labeled` on tasks.
# It is resolved with `load_func(settings.BULK_UPDATE_IS_LABELED)` in
# `bulk_update_stats_project_tasks` and invoked as `func(task_ids, project)`.
BULK_UPDATE_IS_LABELED = 'tasks.functions.bulk_update_is_labeled_by_overlap'
USER_LOGIN_FORM = 'users.forms.LoginForm'
PROJECT_MIXIN = 'projects.mixins.ProjectMixin'
TASK_MIXIN = 'tasks.mixins.TaskMixin'
Expand Down
2 changes: 1 addition & 1 deletion label_studio/projects/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def get_unique_ids_list(tasks_queryset):

elif isinstance(tasks_queryset, QuerySet):
# It's a Django QuerySet
return list(tasks_queryset.values_list('id', flat=True))
return list(tasks_queryset.values_list('id', flat=True).iterator(chunk_size=1000))

else:
raise ValueError(f'Unsupported type for tasks_queryset: {type(tasks_queryset)}')
Expand Down
44 changes: 44 additions & 0 deletions label_studio/tasks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,47 @@
updated_count += len(batch_list)

return updated_count


def bulk_update_is_labeled_by_overlap(tasks_ids, project):
    """Recompute ``is_labeled`` for a set of tasks from their overlap counters.

    A task is marked ``is_labeled=True`` when its completed-annotation count
    is greater than or equal to its ``overlap`` value, and ``is_labeled=False``
    otherwise. The completed count is ``total_annotations``, plus
    ``cancelled_annotations`` when the project's ``skip_queue`` equals
    ``SkipQueue.IGNORE_SKIPPED``. IDs are processed in slices of
    ``settings.BATCH_SIZE``; each slice issues two UPDATE queries restricted
    to tasks of ``project``.

    :param tasks_ids: List or QuerySet of task IDs
    :param project: Project instance
    :return: None
    :raises ValueError: If ``project._can_use_overlap()`` is falsy
    """
    from django.db.models import F, Q
    from django.db.models.lookups import GreaterThanOrEqual

    # Empty input: nothing to do (checked before the overlap capability check).
    if not tasks_ids:
        return

    if not project._can_use_overlap():
        raise ValueError('Project cannot use overlap-based calculation')

    # A QuerySet is materialized so that len() and slicing below work on a plain list.
    if hasattr(tasks_ids, 'values_list'):
        tasks_ids = list(tasks_ids)

    # Cancelled (skipped) annotations count as completed only in IGNORE_SKIPPED mode.
    if project.skip_queue == project.SkipQueue.IGNORE_SKIPPED:
        done_count = F('total_annotations') + F('cancelled_annotations')
    else:
        done_count = F('total_annotations')
    reached_overlap = Q(GreaterThanOrEqual(done_count, F('overlap')))

    chunk_size = settings.BATCH_SIZE
    for offset in range(0, len(tasks_ids), chunk_size):
        chunk = tasks_ids[offset : offset + chunk_size]
        # Base queryset for this slice; filter()/exclude() below each derive a fresh query from it.
        chunk_tasks = Task.objects.filter(id__in=chunk, project=project)

        # Tasks that reached their overlap become labeled ...
        chunk_tasks.filter(reached_overlap).update(is_labeled=True)
        # ... and every other task in the slice becomes unlabeled.
        chunk_tasks.exclude(reached_overlap).update(is_labeled=False)
40 changes: 39 additions & 1 deletion label_studio/tasks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1376,7 +1376,7 @@
task.save()


def bulk_update_stats_project_tasks(tasks, project=None):
def deprecated_bulk_update_stats_project_tasks(tasks, project=None):
"""bulk Task update accuracy
ex: after change settings
apply several update queries size of batch
Expand Down Expand Up @@ -1426,5 +1426,43 @@
)


def bulk_update_stats_project_tasks(tasks, project=None):
    """Bulk-refresh task state for ``tasks``, dispatching on a feature flag.

    When the flag ``fflag_fix_back_plt_802_update_is_labeled_20062025_short``
    is set, the tasks are reduced to a list of IDs and handed to the callable
    configured in ``settings.BULK_UPDATE_IS_LABELED``. Otherwise the call is
    forwarded unchanged to ``deprecated_bulk_update_stats_project_tasks``.

    :param tasks: QuerySet or list of Task objects
    :param project: Project instance (optional; when omitted on the flagged
        path it is taken from the task whose ID comes first in the ID list)
    :return: None on the flagged path; otherwise whatever the deprecated
        implementation returns
    """
    from core.feature_flags import flag_set

    # Resolved up front, before the flag is consulted, exactly as callers have come to expect.
    is_labeled_updater = load_func(settings.BULK_UPDATE_IS_LABELED)

    new_path_enabled = flag_set('fflag_fix_back_plt_802_update_is_labeled_20062025_short', user='auto')
    if not new_path_enabled:
        # Flag off: keep the legacy behaviour.
        return deprecated_bulk_update_stats_project_tasks(tasks, project)

    from projects.functions.utils import get_unique_ids_list

    ids = get_unique_ids_list(tasks)
    if not ids:
        # No tasks, nothing to update.
        return None

    if project is None:
        # No project supplied: look it up through the first task ID.
        project = Task.objects.get(id=ids[0]).project

    is_labeled_updater(ids, project)
    return None

Check warning on line 1464 in label_studio/tasks/models.py

View check run for this annotation

Codecov / codecov/patch

label_studio/tasks/models.py#L1464

Added line #L1464 was not covered by tests


# Filter for finished annotations: not cancelled and with a non-null result.
Q_finished_annotations = Q(was_cancelled=False) & Q(result__isnull=False)
# The same condition expressed through the `annotations__` lookup prefix
# (presumably for querysets of tasks that traverse to their annotations — confirm against callers).
Q_task_finished_annotations = Q(annotations__was_cancelled=False) & Q(annotations__result__isnull=False)
Loading