Skip to content

Commit 30577ac

Browse files
committed
Update vllm models
1 parent 09f18e0 commit 30577ac

File tree

3 files changed

+8
-8
lines changed

3 files changed

+8
-8
lines changed

compose.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ services:
145145
# Needs 80GB of GPU memory. A g6.12xlarge EC2 instance should work.
146146
# Docs: https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html
147147
extends: common-base
148-
image: vllm/vllm-openai:gptoss # TODO: once stabilized, revert to main releases
148+
image: vllm/vllm-openai:v0.10.2
149149
environment:
150150
- VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 # needed for NVIDIA A10 and A100 chips
151151
command:
@@ -154,7 +154,7 @@ services:
154154
# If you update anything here that could affect NLP results, consider updating the
155155
# task_version of any tasks that use this docker.
156156
- --model=openai/gpt-oss-120b
157-
- --revision=bc75b44b8a2a116a0e4c6659bcd1b7969885f423
157+
- --revision=b5c939de8f754692c1647ca79fbf85e8c1e70f8a
158158
- --tensor-parallel-size=4
159159
shm_size: 32G
160160
healthcheck:
@@ -174,11 +174,11 @@ services:
174174
devices:
175175
- capabilities: [gpu]
176176

177-
llama4-scout: # WIP, have not gotten it to run successfully yet
177+
llama4-scout:
178178
# Needs 80GB of GPU memory. A g6.12xlarge EC2 instance should work.
179179
# Docs: https://docs.vllm.ai/projects/recipes/en/latest/Llama/Llama4-Scout.html
180180
extends: common-base
181-
image: vllm/vllm-openai:v0.10.0
181+
image: vllm/vllm-openai:v0.10.2
182182
environment:
183183
- HF_TOKEN
184184
- HUGGING_FACE_HUB_TOKEN
@@ -188,8 +188,8 @@ services:
188188
- --port=8087
189189
# If you update anything here that could affect NLP results, consider updating the
190190
# task_version of any tasks that use this docker.
191-
- --model=nvidia/Llama-4-Scout-17B-16E-Instruct-FP8
192-
- --revision=d1cf1e9db03b67e10422f97f38c8b546dec14789
191+
- --model=nvidia/Llama-4-Scout-17B-16E-Instruct-FP4
192+
- --revision=5588d5387bb37753ee29cde60e76910efc7fb4a9
193193
- --tensor-parallel-size=4
194194
shm_size: 32G
195195
healthcheck:

cumulus_etl/nlp/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ class Llama4ScoutModel(Model):
298298
AZURE_ID = "Llama-4-Scout-17B-16E-Instruct"
299299
BEDROCK_ID = "us.meta.llama4-scout-17b-instruct-v1:0"
300300
COMPOSE_ID = "llama4-scout"
301-
VLLM_INFO = ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", "LLAMA4_SCOUT", 8087)
301+
VLLM_INFO = ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", "LLAMA4_SCOUT", 8087)
302302

303303

304304
class ClaudeSonnet45Model(Model):

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
name = "cumulus-etl"
33
requires-python = ">= 3.11"
44
dependencies = [
5-
"aiobotocore < 2.22.0", # FIXME: temp hotfix for dependency version madness - remove later
5+
"s3fs[boto3]", # specify this early to resolve some botocore/aiobotocore dependency madness
66
"ctakesclient >= 5.1",
77
"cumulus-fhir-support >= 1.6",
88
"delta-spark >= 4, < 5",

0 commit comments

Comments (0)