From 8b0ebcb6cd6e937ec39186d7f40ddfbcd7aba80d Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 5 Jun 2025 15:59:23 -0700 Subject: [PATCH 1/3] feat: add SPEND_LOG_CLEANUP_BATCH_SIZE --- litellm/constants.py | 1 + .../db_transaction_queue/spend_log_cleanup.py | 68 +++++++++++++------ 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/litellm/constants.py b/litellm/constants.py index 48ea7d1a691d..c4adb09dc6ab 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -734,6 +734,7 @@ PROMETHEUS_EMIT_BUDGET_METRICS_JOB_NAME = "prometheus_emit_budget_metrics" SPEND_LOG_CLEANUP_JOB_NAME = "spend_log_cleanup" SPEND_LOG_RUN_LOOPS = int(os.getenv("SPEND_LOG_RUN_LOOPS", 500)) +SPEND_LOG_CLEANUP_BATCH_SIZE = int(os.getenv("SPEND_LOG_CLEANUP_BATCH_SIZE", 1000)) DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = int( os.getenv("DEFAULT_CRON_JOB_LOCK_TTL_SECONDS", 60) ) # 1 minute diff --git a/litellm/proxy/db/db_transaction_queue/spend_log_cleanup.py b/litellm/proxy/db/db_transaction_queue/spend_log_cleanup.py index ed76e5eddf59..02fa84bae309 100644 --- a/litellm/proxy/db/db_transaction_queue/spend_log_cleanup.py +++ b/litellm/proxy/db/db_transaction_queue/spend_log_cleanup.py @@ -1,11 +1,16 @@ +import asyncio from datetime import datetime, timedelta, timezone from typing import Optional -from litellm.proxy.utils import PrismaClient -from litellm.litellm_core_utils.duration_parser import duration_in_seconds + from litellm._logging import verbose_proxy_logger from litellm.caching import RedisCache -from litellm.constants import SPEND_LOG_CLEANUP_JOB_NAME, SPEND_LOG_RUN_LOOPS -import asyncio +from litellm.constants import ( + SPEND_LOG_CLEANUP_BATCH_SIZE, + SPEND_LOG_CLEANUP_JOB_NAME, + SPEND_LOG_RUN_LOOPS, +) +from litellm.litellm_core_utils.duration_parser import duration_in_seconds +from litellm.proxy.utils import PrismaClient class SpendLogCleanup: @@ -16,21 +21,26 @@ class SpendLogCleanup: """ def __init__(self, general_settings=None, redis_cache: Optional[RedisCache] = None): - self.batch_size = 1000 + self.batch_size = SPEND_LOG_CLEANUP_BATCH_SIZE self.retention_seconds: Optional[int] = None from litellm.proxy.proxy_server import general_settings as default_settings + self.general_settings = general_settings or default_settings from litellm.proxy.proxy_server import proxy_logging_obj pod_lock_manager = proxy_logging_obj.db_spend_update_writer.pod_lock_manager self.pod_lock_manager = pod_lock_manager - verbose_proxy_logger.info(f"SpendLogCleanup initialized with batch size: {self.batch_size}") + verbose_proxy_logger.info( + f"SpendLogCleanup initialized with batch size: {self.batch_size}" + ) def _should_delete_spend_logs(self) -> bool: """ Determines if logs should be deleted based on the max retention period in settings. 
""" - retention_setting = self.general_settings.get("maximum_spend_logs_retention_period") + retention_setting = self.general_settings.get( + "maximum_spend_logs_retention_period" + ) verbose_proxy_logger.info(f"Checking retention setting: {retention_setting}") if retention_setting is None: @@ -41,7 +51,9 @@ def _should_delete_spend_logs(self) -> bool: if isinstance(retention_setting, int): retention_setting = str(retention_setting) self.retention_seconds = duration_in_seconds(retention_setting) - verbose_proxy_logger.info(f"Retention period set to {self.retention_seconds} seconds") + verbose_proxy_logger.info( + f"Retention period set to {self.retention_seconds} seconds" + ) return True except ValueError as e: verbose_proxy_logger.error( @@ -49,7 +61,9 @@ def _should_delete_spend_logs(self) -> bool: ) return False - async def _delete_old_logs(self, prisma_client: PrismaClient, cutoff_date: datetime) -> int: + async def _delete_old_logs( + self, prisma_client: PrismaClient, cutoff_date: datetime + ) -> int: """ Helper method to delete old logs in batches. Returns the total number of logs deleted. @@ -58,7 +72,9 @@ async def _delete_old_logs(self, prisma_client: PrismaClient, cutoff_date: datet run_count = 0 while True: if run_count > SPEND_LOG_RUN_LOOPS: - verbose_proxy_logger.info("Max logs deleted - 1,00,000, rest of the logs will be deleted in next run") + verbose_proxy_logger.info( + "Max logs deleted - 1,00,000, rest of the logs will be deleted in next run" + ) break # Step 1: Find logs to delete logs_to_delete = await prisma_client.db.litellm_spendlogs.find_many( @@ -68,7 +84,9 @@ async def _delete_old_logs(self, prisma_client: PrismaClient, cutoff_date: datet verbose_proxy_logger.info(f"Found {len(logs_to_delete)} logs in this batch") if not logs_to_delete: - verbose_proxy_logger.info(f"No more logs to delete. Total deleted: {total_deleted}") + verbose_proxy_logger.info( + f"No more logs to delete. Total deleted: {total_deleted}" + ) break request_ids = [log.request_id for log in logs_to_delete] @@ -80,7 +98,7 @@ async def _delete_old_logs(self, prisma_client: PrismaClient, cutoff_date: datet total_deleted += len(logs_to_delete) run_count += 1 - + # Add a small sleep to prevent overwhelming the database await asyncio.sleep(0.1) @@ -96,11 +114,15 @@ async def cleanup_old_spend_logs(self, prisma_client: PrismaClient) -> None: verbose_proxy_logger.info(f"Cleanup job triggered at {datetime.now()}") if not self._should_delete_spend_logs(): - verbose_proxy_logger.info("Skipping cleanup — invalid or missing retention setting.") + verbose_proxy_logger.info( + "Skipping cleanup — invalid or missing retention setting." 
+            )
             return
 
         if self.retention_seconds is None:
-            verbose_proxy_logger.error("Retention seconds is None, cannot proceed with cleanup")
+            verbose_proxy_logger.error(
+                "Retention seconds is None, cannot proceed with cleanup"
+            )
             return
 
         # If we have a pod lock manager, try to acquire the lock
@@ -108,14 +130,20 @@ async def cleanup_old_spend_logs(self, prisma_client: PrismaClient) -> None:
                 lock_acquired = await self.pod_lock_manager.acquire_lock(
                     cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME,
                 )
-                verbose_proxy_logger.info(f"Lock acquisition attempt: {'successful' if lock_acquired else 'failed'} at {datetime.now()}")
-
+                verbose_proxy_logger.info(
+                    f"Lock acquisition attempt: {'successful' if lock_acquired else 'failed'} at {datetime.now()}"
+                )
+
                 if not lock_acquired:
                     verbose_proxy_logger.info("Another pod is already running cleanup")
                     return
 
-            cutoff_date = datetime.now(timezone.utc) - timedelta(seconds=float(self.retention_seconds))
-            verbose_proxy_logger.info(f"Deleting logs older than {cutoff_date.isoformat()}")
+            cutoff_date = datetime.now(timezone.utc) - timedelta(
+                seconds=float(self.retention_seconds)
+            )
+            verbose_proxy_logger.info(
+                f"Deleting logs older than {cutoff_date.isoformat()}"
+            )
 
             # Perform the actual deletion
             total_deleted = await self._delete_old_logs(prisma_client, cutoff_date)
@@ -127,5 +155,7 @@ async def cleanup_old_spend_logs(self, prisma_client: PrismaClient) -> None:
         finally:
             # Always release the lock if we have a pod lock manager
             if self.pod_lock_manager and self.pod_lock_manager.redis_cache:
-                await self.pod_lock_manager.release_lock(cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME)
+                await self.pod_lock_manager.release_lock(
+                    cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME
+                )
                 verbose_proxy_logger.info("Released cleanup lock")

From ceffd7c6d0d4b8f1cf613d829b828cad5c18b984 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Thu, 5 Jun 2025 16:00:06 -0700
Subject: [PATCH 2/3] docs update

---
 docs/my-website/docs/proxy/config_settings.md     |  3 ++-
 docs/my-website/docs/proxy/spend_logs_deletion.md | 12 +++++++-----
 docs/my-website/docs/proxy/ui_logs.md             |  4 +++-
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md
index 0c226ffca64d..3c9b7f1ac1d7 100644
--- a/docs/my-website/docs/proxy/config_settings.md
+++ b/docs/my-website/docs/proxy/config_settings.md
@@ -650,4 +650,5 @@ router_settings:
 | USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption
 | USE_PRISMA_MIGRATE | Flag to use prisma migrate instead of prisma db push. Recommended for production environments.
 | WEBHOOK_URL | URL for receiving webhooks from external services
-| SPEND_LOG_RUN_LOOPS | Constant for setting how many runs of 1000 batch deletes should spend_log_cleanup task run
\ No newline at end of file
+| SPEND_LOG_RUN_LOOPS | Maximum number of batch deletes the spend_log_cleanup task runs per cleanup. Default is 500 |
+| SPEND_LOG_CLEANUP_BATCH_SIZE | Number of logs deleted per batch during cleanup. Default is 1000 |
diff --git a/docs/my-website/docs/proxy/spend_logs_deletion.md b/docs/my-website/docs/proxy/spend_logs_deletion.md
index 5b980e61eaca..3738df5eaad2 100644
--- a/docs/my-website/docs/proxy/spend_logs_deletion.md
+++ b/docs/my-website/docs/proxy/spend_logs_deletion.md
@@ -71,18 +71,20 @@ If Redis is enabled, LiteLLM uses it to make sure only one instance runs the cle
 Once cleanup starts:
 
 - It calculates the cutoff date using the configured retention period
-- Deletes logs older than the cutoff in **batches of 1000**
+- Deletes logs older than the cutoff in batches (default size `1000`)
 - Adds a short delay between batches to avoid overloading the database
 
 ### Default settings:
-- **Batch size**: 1000 logs
+- **Batch size**: 1000 logs (configurable via `SPEND_LOG_CLEANUP_BATCH_SIZE`)
 - **Max batches per run**: 500
-- **Max deletions per run**: 500,000 logs
+- **Max deletions per run**: max batches × batch size (500,000 logs at the defaults)
 
-You can change the number of batches using an environment variable:
+You can change the cleanup parameters using environment variables:
 
 ```bash
 SPEND_LOG_RUN_LOOPS=200
+# optional: change batch size from the default 1000
+SPEND_LOG_CLEANUP_BATCH_SIZE=2000
 ```
 
-This would allow up to 200,000 logs to be deleted in one run.
+With these example values, up to 400,000 logs (200 batches × 2,000 per batch) could be deleted in one run.
diff --git a/docs/my-website/docs/proxy/ui_logs.md b/docs/my-website/docs/proxy/ui_logs.md
index bca50a2165bb..cd2ee982232c 100644
--- a/docs/my-website/docs/proxy/ui_logs.md
+++ b/docs/my-website/docs/proxy/ui_logs.md
@@ -69,7 +69,9 @@ general_settings:
 
 You can control how many logs are deleted per run using this environment variable:
 
-`SPEND_LOG_RUN_LOOPS=200 # Deletes up to 200,000 logs in one run (batch size = 1000)`
+`SPEND_LOG_RUN_LOOPS=200 # Deletes up to 200,000 logs in one run at the default batch size`
+
+Set `SPEND_LOG_CLEANUP_BATCH_SIZE` to control how many logs are deleted per batch (default `1000`).
 
 For detailed architecture and how it works, see [Spend Logs Deletion](../proxy/spend_logs_deletion).

From b944767efbb8bd6539747fb5ae4fe6efa51939a6 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Thu, 5 Jun 2025 16:42:38 -0700
Subject: [PATCH 3/3] test: test_cleanup_batch_size_env_var

---
 .../proxy/test_spend_log_cleanup.py | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/test_litellm/proxy/test_spend_log_cleanup.py b/tests/test_litellm/proxy/test_spend_log_cleanup.py
index 3c66d9baef10..709ebd662d34 100644
--- a/tests/test_litellm/proxy/test_spend_log_cleanup.py
+++ b/tests/test_litellm/proxy/test_spend_log_cleanup.py
@@ -142,3 +142,24 @@ async def test_cleanup_old_spend_logs_no_retention_period():
 
     mock_prisma_client.db.litellm_spendlogs.find_many.assert_not_called()
     mock_prisma_client.db.litellm_spendlogs.delete.assert_not_called()
+
+
+def test_cleanup_batch_size_env_var(monkeypatch):
+    """Ensure batch size is configurable via environment variable"""
+    import importlib
+
+    import litellm.constants as constants_module
+    import litellm.proxy.db.db_transaction_queue.spend_log_cleanup as cleanup_module
+
+    # Set env var and reload modules to pick up new value
+    monkeypatch.setenv("SPEND_LOG_CLEANUP_BATCH_SIZE", "25")
+    importlib.reload(constants_module)
+    importlib.reload(cleanup_module)
+
+    cleaner = cleanup_module.SpendLogCleanup(general_settings={})
+    assert cleaner.batch_size == 25
+
+    # Remove env var and reload to restore default for other tests
+    monkeypatch.delenv("SPEND_LOG_CLEANUP_BATCH_SIZE", raising=False)
+    importlib.reload(constants_module)
+    importlib.reload(cleanup_module)
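
Taken together, the two settings bound each cleanup run at `SPEND_LOG_RUN_LOOPS × SPEND_LOG_CLEANUP_BATCH_SIZE` deleted rows. A minimal sketch of tuning both together; the values here are hypothetical and chosen only for illustration, not recommendations:

```bash
# Hypothetical tuning example: both variables default as set in litellm/constants.py.
export SPEND_LOG_CLEANUP_BATCH_SIZE=2000   # rows deleted per batch (default: 1000)
export SPEND_LOG_RUN_LOOPS=200             # max delete batches per cleanup run (default: 500)
# Upper bound per run: 200 * 2000 = 400,000 rows; logs beyond the bound
# are picked up by the next scheduled cleanup run.
```

Smaller batches keep individual deletes light on the spend logs table but lengthen the run; larger batches finish sooner at the cost of heavier per-statement deletes.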