
Commit 5057eb0

Merge pull request #428 from smart-on-fhir/mikix/vllm
Convert our previous hugging face support from TGI to vLLM
2 parents 6ba1875 + 2cdbf2b commit 5057eb0

File tree

7 files changed: +179 −231 lines changed

compose.yaml

Lines changed: 13 additions & 11 deletions
```diff
@@ -40,7 +40,6 @@ services:
       - AZURE_OPENAI_API_KEY
       - AZURE_OPENAI_ENDPOINT
       # Internal environment variobles
-      - CUMULUS_HUGGING_FACE_URL=http://llama2:8086/
       - URL_CTAKES_REST=http://ctakes-covid:8080/ctakes-web-rest/service/analyze
       - URL_CNLP_NEGATION=http://cnlpt-negation:8000/negation/process
       - URL_CNLP_TERM_EXISTS=http://cnlpt-term-exists:8000/termexists/process
@@ -60,6 +59,7 @@ services:
   cumulus-etl-gpu:
     extends: cumulus-etl-base
     environment:
+      - CUMULUS_LLAMA2_URL=http://llama2:8086/v1
       - URL_CNLP_NEGATION=http://cnlpt-negation-gpu:8000/negation/process
       - URL_CNLP_TERM_EXISTS=http://cnlpt-term-exists-gpu:8000/termexists/process
     profiles:
@@ -140,21 +140,23 @@ services:
   # This is a WIP llama2 setup, currently suitable for running in a g5.xlarge AWS instance.
   llama2:
     extends: common-base
-    image: ghcr.io/huggingface/text-generation-inference:1.0.1
+    image: vllm/vllm-openai:v0.10.0
     environment:
+      - HF_TOKEN
+      - HUGGING_FACE_HUB_TOKEN
+    command:
       # If you update anything here that could affect NLP results, consider updating the
       # task_version of any tasks that use this docker.
-      - HUGGING_FACE_HUB_TOKEN
-      - MODEL_ID=meta-llama/Llama-2-13b-chat-hf
-      - QUANTIZE=bitsandbytes-nf4  # 4bit
-      - PORT=8086
-      - REVISION=0ba94ac9b9e1d5a0037780667e8b219adde1908c
+      - --download-dir=/data
+      - --model=meta-llama/Llama-2-13b-chat-hf
+      - --port=8086
+      - --quantization=bitsandbytes  # 4bit
+      - --revision=a2cb7a712bb6e5e736ca7f8cd98167f81a0b5bd8
     healthcheck:
-      # There's no curl or wget inside this container, but there is python3!
-      test: ["CMD", "python3", "-c", "import socket; socket.create_connection(('localhost', 8086))"]
+      test: ["CMD", "wget", "localhost:8086/health", "--output-document=/dev/null"]
       start_period: 20m  # give plenty of time for startup, since we may be downloading a model
     volumes:
-      - hf-data:/data
+      - vllm-data:/data
     profiles:
       - hf-test
     networks:
@@ -257,4 +259,4 @@ networks:
 
 volumes:
   ctakes-overrides:
-  hf-data:
+  vllm-data:
```
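With this change, the `llama2` service exposes vLLM's standard OpenAI-compatible HTTP API instead of TGI's custom one, so any OpenAI-style client can talk to it. A minimal sketch, assuming the container is up (`docker compose up llama2 --wait`) and port 8086 is reachable from the host; the clinical note text here is invented:

```python
# Sketch: query the vLLM container through its OpenAI-compatible API.
# An empty api_key is fine for this local server.
import asyncio

import openai


async def main() -> None:
    client = openai.AsyncClient(base_url="http://localhost:8086/v1", api_key="")
    response = await client.chat.completions.create(
        model="meta-llama/Llama-2-13b-chat-hf",
        messages=[
            {"role": "system", "content": "Reply with a short summary of the note."},
            {"role": "user", "content": "55yo M with stable angina, started on aspirin."},
        ],
        temperature=0,
    )
    print(response.choices[0].message.content)


asyncio.run(main())
```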

cumulus_etl/etl/studies/hftest/hf_tasks.py

Lines changed: 19 additions & 35 deletions
```diff
@@ -1,11 +1,9 @@
 """Define tasks for the hftest study"""
 
-import cumulus_fhir_support as cfs
-import httpx
 import pyarrow
 import rich.progress
 
-from cumulus_etl import common, errors, nlp
+from cumulus_etl import common, nlp
 from cumulus_etl.etl import tasks
 
 
@@ -17,48 +15,35 @@ class HuggingFaceTestTask(tasks.BaseNlpTask):
     task_version = 0
     # Task Version History:
     # ** 0 **
-    # This is fluid until we actually promote this to a real task - feel free to update without bumping the version.
-    # container: ghcr.io/huggingface/text-generation-inference
-    # container reversion: 09eca6422788b1710c54ee0d05dd6746f16bb681
+    # This is fluid until we actually promote this to a real task - feel free to update without
+    # bumping the version.
+    # container: vllm/vllm-openai
+    # container revision: v0.10.0
     # container properties:
-    #   QUANTIZE=bitsandbytes-nf4
+    #   QUANTIZE=bitsandbytes
     # model: meta-llama/Llama-2-13b-chat-hf
-    # model revision: 0ba94ac9b9e1d5a0037780667e8b219adde1908c
+    # model revision: a2cb7a712bb6e5e736ca7f8cd98167f81a0b5bd8
     # system prompt:
-    #   "You will be given a clinical note, and you should reply with a short summary of that note."
+    #   "You will be given a clinical note, and you should reply with a short summary of that
+    #   note."
     # user prompt: a clinical note
 
     @classmethod
     async def init_check(cls) -> None:
-        try:
-            raw_info = await nlp.hf_info()
-        except cfs.NetworkError:
-            errors.fatal(
-                "Llama2 NLP server is unreachable.\n Try running 'docker compose up llama2 --wait'.",
-                errors.SERVICE_MISSING,
-            )
-
-        # Sanity check a few of the properties, to make sure we don't accidentally get pointed at an unexpected model.
-        expected_info_present = (
-            raw_info.get("model_id") == "meta-llama/Llama-2-13b-chat-hf"
-            and raw_info.get("model_sha") == "0ba94ac9b9e1d5a0037780667e8b219adde1908c"
-            and raw_info.get("sha") == "09eca6422788b1710c54ee0d05dd6746f16bb681"
-        )
-        if not expected_info_present:
-            errors.fatal(
-                "LLama2 NLP server is using an unexpected model setup.",
-                errors.SERVICE_MISSING,
-            )
+        await nlp.Llama2Model().check()
 
     async def read_entries(self, *, progress: rich.progress.Progress = None) -> tasks.EntryIterator:
         """Passes clinical notes through HF and returns any symptoms found"""
-        http_client = httpx.AsyncClient(timeout=300)
+        client = nlp.Llama2Model()
 
         async for _, docref, clinical_note in self.read_notes(progress=progress):
             timestamp = common.datetime_now().isoformat()
 
             # If you change this prompt, consider updating task_version.
-            system_prompt = "You will be given a clinical note, and you should reply with a short summary of that note."
+            system_prompt = (
+                "You will be given a clinical note, "
+                "and you should reply with a short summary of that note."
+            )
             user_prompt = clinical_note
 
             summary = await nlp.cache_wrapper(
@@ -67,18 +52,17 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> tasks.EntryIterator:
                 clinical_note,
                 lambda x: x,  # from file: just store the string
                 lambda x: x,  # to file: just read it back
-                nlp.llama2_prompt,
+                client.prompt,
                 system_prompt,
                 user_prompt,
-                client=http_client,
             )
 
             # Debugging
-            # logging.warning("\n\n\n\n" "**********************************************************")
+            # logging.warning("\n\n\n\n" "********************************************************")
             # logging.warning(user_prompt)
-            # logging.warning("==========================================================")
+            # logging.warning("========================================================")
             # logging.warning(summary)
-            # logging.warning("**********************************************************")
+            # logging.warning("********************************************************")
 
             yield {
                 "id": docref["id"],  # just copy the docref
```

cumulus_etl/nlp/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 """Support code for NLP servers"""
 
 from .extract import TransformerModel, ctakes_extract, ctakes_httpx_client, list_polarity
-from .huggingface import hf_info, hf_prompt, llama2_prompt
+from .openai import Llama2Model
 from .utils import cache_wrapper, get_docref_info, is_docref_valid
 from .watcher import (
     check_ctakes,
```

cumulus_etl/nlp/huggingface.py

Lines changed: 0 additions & 91 deletions
This file was deleted.

cumulus_etl/nlp/openai.py

Lines changed: 69 additions & 0 deletions
```diff
@@ -0,0 +1,69 @@
+"""Abstraction layer for Hugging Face's inference API"""
+
+import abc
+import os
+
+import openai
+
+from cumulus_etl import errors
+
+
+class OpenAIModel(abc.ABC):
+    COMPOSE_ID = None
+    MODEL_NAME = None
+
+    @property
+    @abc.abstractmethod
+    def url(self) -> str:
+        """The OpenAI compatible URL to talk to (where's the server?)"""
+
+    @property
+    @abc.abstractmethod
+    def api_key(self) -> str:
+        """The API key to use (empty string for local servers)"""
+
+    def __init__(self):
+        self.client = openai.AsyncClient(base_url=self.url, api_key=self.api_key)
+
+    async def check(self) -> None:
+        try:
+            models = self.client.models.list()
+            names = {model.id async for model in models}
+        except openai.APIError:
+            errors.fatal(
+                f"NLP server '{self.COMPOSE_ID}' is unreachable.\n"
+                f"Try running 'docker compose up {self.COMPOSE_ID} --wait'.",
+                errors.SERVICE_MISSING,
+            )
+
+        if self.MODEL_NAME not in names:
+            errors.fatal(
+                f"NLP server '{self.COMPOSE_ID}' is using an unexpected model setup.",
+                errors.SERVICE_MISSING,
+            )
+
+    async def prompt(self, system: str, user: str) -> str:
+        response = await self.client.responses.create(
+            model=self.MODEL_NAME,
+            instructions=system,
+            input=user,
+            temperature=0,
+        )
+        return response.output_text.strip()
+
+
+class LocalModel(OpenAIModel, abc.ABC):
+    @property
+    def api_key(self) -> str:
+        return ""
+
+
+class Llama2Model(LocalModel):
+    COMPOSE_ID = "llama2"
+    MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
+
+    @property
+    def url(self) -> str:
+        # 8000 and 8080 are both used as defaults in ctakesclient (cnlp & ctakes respectively).
+        # 8086 is used as a joking reference to Hugging Face (HF = 86).
+        return os.environ.get("CUMULUS_LLAMA2_URL") or "http://localhost:8086/v1"
```
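The `LocalModel` split makes adding another self-hosted model mostly declarative. A sketch of a hypothetical subclass; the model choice, compose service name, port, and environment variable below are invented and not part of this commit:

```python
# Hypothetical example of plugging a second local model into this hierarchy.
import os

from cumulus_etl.nlp.openai import LocalModel


class MistralModel(LocalModel):
    COMPOSE_ID = "mistral"  # invented compose service name
    MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # invented model choice

    @property
    def url(self) -> str:
        # Invented env var and port, following the Llama2Model pattern above.
        return os.environ.get("CUMULUS_MISTRAL_URL") or "http://localhost:8087/v1"
```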

docs/chart-review.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -64,7 +64,6 @@ docker compose run --rm \
 
 The above command will take all the DiagnosticReports and DocumentReferences
 in Group `67890` from the EHR,
-mark the notes with the default NLP dictionary,
 anonymize the notes with `philter`,
 and then push the results to your Label Studio project number `3`.
 
```
