Skip to content

Commit 0c3720d

Browse files
authored
[CI] Update CI docker for latest ml_dtypes (#3162)
This PR updates the CI docker images so that the latest packages of `ml_dtypes` can be installed.
1 parent 56a5ff9 commit 0c3720d

File tree

22 files changed

+43
-45
lines changed

22 files changed

+43
-45
lines changed

.pylintrc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[MESSAGES CONTROL]
2+
disable=too-many-positional-arguments,duplicate-code

ci/jenkinsfile.groovy

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@
1717

1818
import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
1919

20-
run_cpu = "bash ci/bash.sh mlcaidev/ci-cpu:4d61e5d -e GPU cpu -e MLC_CI_SETUP_DEPS 1"
21-
run_cuda = "bash ci/bash.sh mlcaidev/ci-cu121:4d61e5d -e GPU cuda-12.1 -e MLC_CI_SETUP_DEPS 1"
22-
run_rocm = "bash ci/bash.sh mlcaidev/ci-rocm57:4d61e5d -e GPU rocm-5.7 -e MLC_CI_SETUP_DEPS 1"
20+
run_cpu = "bash ci/bash.sh mlcaidev/ci-cpu:26d65cc -e GPU cpu -e MLC_CI_SETUP_DEPS 1"
21+
run_cuda = "bash ci/bash.sh mlcaidev/ci-cu128:26d65cc -e GPU cuda-12.8 -e MLC_CI_SETUP_DEPS 1"
22+
// run_rocm = "bash ci/bash.sh mlcaidev/ci-rocm57:26d65cc -e GPU rocm-5.7 -e MLC_CI_SETUP_DEPS 1"
2323

2424
pkg_cpu = "bash ci/bash.sh mlcaidev/package-rocm61:5b6f876 -e GPU cpu -e MLC_CI_SETUP_DEPS 1"
2525
pkg_cuda = "bash ci/bash.sh mlcaidev/package-cu128:5b6f876 -e GPU cuda-12.8 -e MLC_CI_SETUP_DEPS 1"
@@ -123,10 +123,10 @@ stage('Build') {
123123
ws(per_exec_ws('mlc-llm-build-cuda')) {
124124
init_git(true)
125125
sh(script: "ls -alh", label: 'Show work directory')
126-
sh(script: "${pkg_cuda} conda env export --name py38", label: 'Checkout version')
127-
sh(script: "${pkg_cuda} -j 8 -v \$HOME/.ccache /ccache conda run -n py38 ./ci/task/build_lib.sh", label: 'Build MLC LLM runtime')
128-
sh(script: "${pkg_cuda} -j 8 conda run -n py38 ./ci/task/build_wheel.sh", label: 'Build MLC LLM wheel')
129-
sh(script: "${pkg_cuda} -j 1 conda run -n py38 ./ci/task/build_clean.sh", label: 'Clean up after build')
126+
sh(script: "${pkg_cuda} conda env export --name py312", label: 'Checkout version')
127+
sh(script: "${pkg_cuda} -j 8 -v \$HOME/.ccache /ccache conda run -n py312 ./ci/task/build_lib.sh", label: 'Build MLC LLM runtime')
128+
sh(script: "${pkg_cuda} -j 8 conda run -n py312 ./ci/task/build_wheel.sh", label: 'Build MLC LLM wheel')
129+
sh(script: "${pkg_cuda} -j 1 conda run -n py312 ./ci/task/build_clean.sh", label: 'Clean up after build')
130130
sh(script: "ls -alh ./wheels/", label: 'Build artifact')
131131
pack_lib('mlc_wheel_cuda', 'wheels/*.whl')
132132
}
@@ -165,10 +165,10 @@ stage('Build') {
165165
ws(per_exec_ws('mlc-llm-build-vulkan')) {
166166
init_git(true)
167167
sh(script: "ls -alh", label: 'Show work directory')
168-
sh(script: "${pkg_cpu} conda env export --name py38", label: 'Checkout version')
169-
sh(script: "${pkg_cpu} -j 8 conda run -n py38 ./ci/task/build_lib.sh", label: 'Build MLC LLM runtime')
170-
sh(script: "${pkg_cpu} -j 8 conda run -n py38 ./ci/task/build_wheel.sh", label: 'Build MLC LLM wheel')
171-
sh(script: "${pkg_cpu} -j 1 conda run -n py38 ./ci/task/build_clean.sh", label: 'Clean up after build')
168+
sh(script: "${pkg_cpu} conda env export --name py312", label: 'Checkout version')
169+
sh(script: "${pkg_cpu} -j 8 conda run -n py312 ./ci/task/build_lib.sh", label: 'Build MLC LLM runtime')
170+
sh(script: "${pkg_cpu} -j 8 conda run -n py312 ./ci/task/build_wheel.sh", label: 'Build MLC LLM wheel')
171+
sh(script: "${pkg_cpu} -j 1 conda run -n py312 ./ci/task/build_clean.sh", label: 'Clean up after build')
172172
sh(script: "ls -alh ./wheels/", label: 'Build artifact')
173173
pack_lib('mlc_wheel_vulkan', 'wheels/*.whl')
174174
}

ci/task/pylint.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ if [[ -n ${MLC_CI_SETUP_DEPS:-} ]]; then
1010
echo "MLC_CI_SETUP_DEPS=1 start setup deps"
1111
# TVM Unity is a dependency to this testing
1212
pip install --quiet --pre -U --no-index -f https://mlc.ai/wheels mlc-ai-nightly-cpu
13+
pip install requests
1314
pip install --quiet --pre -U cuda-python
1415
fi
1516

ci/task/test_model_compile.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ elif [[ ${GPU} == ios ]]; then
3131
elif [[ ${GPU} == android* ]]; then
3232
TARGET=android
3333
pip install --pre -U --no-index -f https://mlc.ai/wheels mlc-ai-nightly-cpu
34-
source /android_env_vars.sh
3534
else
3635
TARGET=vulkan
3736
pip install --pre -U --no-index -f https://mlc.ai/wheels mlc-ai-nightly-cpu

python/mlc_llm/cli/delivery.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ def _main( # pylint: disable=too-many-locals, too-many-arguments
282282
failed_cases: List[Tuple[str, str]] = []
283283
delivered_log = _get_current_log(log)
284284
for task_index, task in enumerate(delivery_diff.tasks, 1):
285-
logger.info(
285+
logger.info( # pylint: disable=logging-not-lazy
286286
bold("[{task_index}/{total_tasks}] Processing model: ").format(
287287
task_index=task_index,
288288
total_tasks=len(delivery_diff.tasks),

python/mlc_llm/cli/lib_delivery.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def _main( # pylint: disable=too-many-locals
125125
"""Compile the model libs in the spec and save them to the binary_libs_dir."""
126126
failed_cases: List[Any] = []
127127
for task_index, task in enumerate(spec["tasks"], 1):
128-
logger.info(
128+
logger.info( # pylint: disable=logging-not-lazy
129129
bold("[{task_index}/{total_tasks}] Processing model: ").format(
130130
task_index=task_index,
131131
total_tasks=len(spec["tasks"]),

python/mlc_llm/compiler_pass/attach_logit_processor.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,7 @@ def _apply_logit_bias_inplace(
7272
def _get_apply_logit_bias_inplace(target: tvm.target.Target):
7373
tx = 1024 # default
7474
max_num_threads_per_block = get_max_num_threads_per_block(target)
75-
if max_num_threads_per_block < tx:
76-
tx = max_num_threads_per_block
75+
tx = min(tx, max_num_threads_per_block)
7776
check_thread_limits(target, bdx=tx, bdy=1, bdz=1, gdz=1)
7877

7978
@T.prim_func
@@ -157,8 +156,7 @@ def _apply_penalty_inplace( # pylint: disable=too-many-arguments,too-many-local
157156
def _get_apply_penalty_inplace(target: tvm.target.Target):
158157
tx = 1024 # default
159158
max_num_threads_per_block = get_max_num_threads_per_block(target)
160-
if max_num_threads_per_block < tx:
161-
tx = max_num_threads_per_block
159+
tx = min(tx, max_num_threads_per_block)
162160
check_thread_limits(target, bdx=tx, bdy=1, bdz=1, gdz=1)
163161

164162
@T.prim_func
@@ -248,8 +246,7 @@ def _apply_bitmask_inplace(
248246
def _get_apply_bitmask_inplace(target: tvm.target.Target):
249247
tx = 1024 # default
250248
max_num_threads_per_block = get_max_num_threads_per_block(target)
251-
if max_num_threads_per_block < tx:
252-
tx = max_num_threads_per_block
249+
tx = min(tx, max_num_threads_per_block)
253250
check_thread_limits(target, bdx=tx, bdy=1, bdz=1, gdz=1)
254251

255252
@T.prim_func

python/mlc_llm/contrib/embeddings/embeddings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,8 @@ def _tokenize_queries(self, queries: List[str]) -> Tuple[np.ndarray, np.ndarray]
170170
tokens = engine_utils.process_prompts(queries, self.tokenizer.encode) # type: ignore
171171
max_query_length = max(len(token_seq) for token_seq in tokens)
172172

173-
token_inputs = np.zeros((len(tokens), max_query_length), dtype=np.int32)
174-
attention_mask = np.zeros((len(tokens), max_query_length), dtype=np.int32)
173+
token_inputs: np.ndarray = np.zeros((len(tokens), max_query_length), dtype=np.int32)
174+
attention_mask: np.ndarray = np.zeros((len(tokens), max_query_length), dtype=np.int32)
175175

176176
for i, token_seq in enumerate(tokens):
177177
token_inputs[i, : len(token_seq)] = token_seq

python/mlc_llm/json_ffi/engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def create( # pylint: disable=too-many-arguments,too-many-locals
194194
),
195195
request_id=request_id,
196196
)
197-
for response in chatcmpl_generator:
197+
for response in chatcmpl_generator: # pylint: disable=use-yield-from
198198
yield response
199199

200200

python/mlc_llm/model/phi/phi_model.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,7 @@ def __post_init__(self):
135135
)
136136
if self.prefill_chunk_size == 0:
137137
self.prefill_chunk_size = self.context_window_size
138-
if self.prefill_chunk_size > self.context_window_size:
139-
self.prefill_chunk_size = self.context_window_size
138+
self.prefill_chunk_size = min(self.prefill_chunk_size, self.context_window_size)
140139
if self.n_head_kv == 0 or self.n_head_kv is None:
141140
self.n_head_kv = self.n_head
142141
if self.n_inner == 0 or self.n_inner is None:

0 commit comments

Comments (0)