
Commit 9d8e9a3

[Feature] add V1 Engine LoRA support (vllm-project#801)
What this PR does / why we need it?
According to RFC vllm-project#396 and vllm-project#448, this PR adds the relevant code to support LoRA in the V1 Engine.

Does this PR introduce any user-facing change?
The following OpenAI-compatible HTTP APIs will be supported:
/v1/load_lora_adapter
/v1/unload_lora_adapter

How was this patch tested?
git clone https://github.com/vllm-project/vllm.git
cd vllm/examples/offline_inference/ && python3 multilora_inference.py

Signed-off-by: jesse <szxfml@gmail.com>
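For reference, the two endpoints can be exercised with plain HTTP requests once a server is running with LoRA enabled. A minimal sketch, assuming a local OpenAI-compatible server on port 8000; the lora_name/lora_path payload fields and the adapter path are illustrative, not taken from this diff:

# Minimal sketch of calling the dynamic LoRA endpoints named in the PR description.
# Assumes a vLLM OpenAI-compatible server on localhost:8000 started with LoRA
# enabled; the adapter path below is a placeholder.
import requests

BASE_URL = "http://localhost:8000"

# Register an adapter under a name (payload fields are assumed).
resp = requests.post(f"{BASE_URL}/v1/load_lora_adapter",
                     json={
                         "lora_name": "sql_lora",
                         "lora_path": "/path/to/sql_lora_adapter",
                     })
print(resp.status_code, resp.text)

# Later, unload the adapter by the same name.
resp = requests.post(f"{BASE_URL}/v1/unload_lora_adapter",
                     json={"lora_name": "sql_lora"})
print(resp.status_code, resp.text)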
1 parent cdece86 commit 9d8e9a3

8 files changed: +1272 −33 lines

tests/singlecard/test_baichuan.py

Lines changed: 110 additions & 0 deletions
# SPDX-License-Identifier: Apache-2.0

import pytest
import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest

MODEL_PATH = "baichuan-inc/Baichuan-7B"

PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
        PROMPT_TEMPLATE.format(
            query=
            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
        ),
        PROMPT_TEMPLATE.format(
            query=
            "Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
        ),
    ]
    print(prompts)
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts


def test_baichuan_lora(baichuan_lora_files):
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
                   enable_lora=True,
                   max_loras=4,
                   max_lora_rank=64,
                   trust_remote_code=True)

    expected_lora_output = [
        "SELECT count(*) FROM singer",
        "SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'",  # noqa: E501
        "SELECT name , country , age FROM singer ORDER BY age ASC",
    ]

    output1 = do_sample(llm, baichuan_lora_files, lora_id=1)
    for i in range(len(expected_lora_output)):
        assert output1[i] == expected_lora_output[i]
    output2 = do_sample(llm, baichuan_lora_files, lora_id=2)
    for i in range(len(expected_lora_output)):
        assert output2[i] == expected_lora_output[i]


@pytest.mark.parametrize("fully_sharded", [True, False])
def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
                                           num_gpus_available, fully_sharded):
    if num_gpus_available < 4:
        pytest.skip(f"Not enough GPUs for tensor parallelism {4}")

    llm_tp1 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       max_lora_rank=64,
                       trust_remote_code=True,
                       fully_sharded_loras=fully_sharded)
    output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)

    del llm_tp1
    cleanup_dist_env_and_memory()

    llm_tp2 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       max_lora_rank=64,
                       tensor_parallel_size=2,
                       trust_remote_code=True,
                       fully_sharded_loras=fully_sharded)
    output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)

    del llm_tp2
    cleanup_dist_env_and_memory()

    assert output_tp1 == output_tp2

    llm_tp4 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       max_lora_rank=64,
                       tensor_parallel_size=4,
                       trust_remote_code=True,
                       fully_sharded_loras=fully_sharded)
    output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)

    del llm_tp4
    cleanup_dist_env_and_memory()

    assert output_tp1 == output_tp4
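For context, the pattern this test file exercises is one shared base model with a per-request adapter: LoRARequest bundles an adapter name, an integer adapter id, and a local adapter path, and is passed alongside the prompts to llm.generate. A minimal offline sketch of the same pattern, assuming a placeholder adapter path (not part of this diff):

# Minimal offline sketch mirroring do_sample above; the adapter path is a placeholder.
import vllm
from vllm.lora.request import LoRARequest

llm = vllm.LLM("baichuan-inc/Baichuan-7B",
               enable_lora=True,
               max_loras=4,
               max_lora_rank=64,
               trust_remote_code=True)

# LoRARequest(name, id, path): the integer id identifies the adapter
# within the engine; passing lora_request=None runs the base model.
req = LoRARequest("sql_adapter", 1, "/path/to/sql_lora_adapter")
outputs = llm.generate(["Write a SQL query counting singers."],
                       vllm.SamplingParams(temperature=0, max_tokens=64),
                       lora_request=req)
print(outputs[0].outputs[0].text)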
Lines changed: 133 additions & 0 deletions
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
import pytest
4+
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
5+
VllmConfig)
6+
from vllm.lora.request import LoRARequest
7+
from vllm.sampling_params import SamplingParams
8+
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
9+
from vllm.v1.engine.processor import Processor
10+
11+
12+
def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
13+
sql_lora_files):
14+
"""
15+
Test that we properly resolve the range of allowed token ids for lora
16+
adapters that define additional tokens.
17+
"""
18+
19+
# Setup a base model compatible with the sql_lora_files adapter and
20+
# a known number of tokens in the base model.
21+
model_config = ModelConfig(
22+
model=llama_2_7b_base_huggingface_id,
23+
tokenizer=llama_2_7b_base_huggingface_id,
24+
tokenizer_mode="auto",
25+
)
26+
27+
vllm_config = VllmConfig(
28+
model_config=model_config,
29+
cache_config=CacheConfig(),
30+
device_config=DeviceConfig(),
31+
lora_config=LoRAConfig(),
32+
)
33+
34+
tokenizer = init_tokenizer_from_configs(
35+
model_config=vllm_config.model_config,
36+
scheduler_config=vllm_config.scheduler_config,
37+
lora_config=vllm_config.lora_config)
38+
processor = Processor(vllm_config, tokenizer)
39+
40+
lora_request = LoRARequest("1", 1, str(sql_lora_files))
41+
request_id = "1"
42+
prompt = "a prompt"
43+
44+
# tokens added in the lora adapter should not raise an error
45+
lora_token_ids = [32000, 32001, 32002, 32003]
46+
processor.process_inputs(
47+
request_id,
48+
prompt,
49+
params=SamplingParams(allowed_token_ids=lora_token_ids),
50+
lora_request=lora_request)
51+
52+
# tokens in the base model should not raise an error
53+
base_token_ids = [1000, 1001, 1002, 1003]
54+
processor.process_inputs(
55+
request_id,
56+
prompt,
57+
params=SamplingParams(allowed_token_ids=base_token_ids),
58+
lora_request=lora_request)
59+
60+
# tokens not in the lora adapter should raise an error
61+
invalid_token_ids = [35000, 35001, 35002, 35003]
62+
with pytest.raises(ValueError):
63+
processor.process_inputs(
64+
request_id,
65+
prompt,
66+
params=SamplingParams(allowed_token_ids=invalid_token_ids),
67+
lora_request=lora_request)
68+
69+
# tokens in the lora adapter with no lora request should raise an error
70+
with pytest.raises(ValueError):
71+
processor.process_inputs(
72+
request_id,
73+
prompt,
74+
params=SamplingParams(allowed_token_ids=lora_token_ids),
75+
)
76+
77+
78+
def test_allowed_token_ids_with_lora_adapter_no_vocab(
79+
qwen25vl_base_huggingface_id, qwen25vl_lora_files):
80+
"""
81+
Test that we properly resolve the range of allowed token ids for lora
82+
adapters that do not define additional tokens.
83+
"""
84+
85+
# Setup a base model compatible with the qwen25vl_lora_files adapter and
86+
# a known number of tokens in the base model.
87+
model_config = ModelConfig(
88+
model=qwen25vl_base_huggingface_id,
89+
tokenizer=qwen25vl_base_huggingface_id,
90+
tokenizer_mode="auto",
91+
)
92+
93+
vllm_config = VllmConfig(
94+
model_config=model_config,
95+
cache_config=CacheConfig(),
96+
device_config=DeviceConfig(),
97+
lora_config=LoRAConfig(),
98+
)
99+
100+
tokenizer = init_tokenizer_from_configs(
101+
model_config=vllm_config.model_config,
102+
scheduler_config=vllm_config.scheduler_config,
103+
lora_config=vllm_config.lora_config)
104+
processor = Processor(vllm_config, tokenizer)
105+
106+
lora_request = LoRARequest("1", 1, str(qwen25vl_lora_files))
107+
request_id = "1"
108+
prompt = "a prompt"
109+
110+
# tokens in the base model should not raise an error
111+
base_token_ids = [1000, 1001, 1002, 1003]
112+
processor.process_inputs(
113+
request_id,
114+
prompt,
115+
params=SamplingParams(allowed_token_ids=base_token_ids),
116+
lora_request=lora_request)
117+
118+
# tokens in the base model with no lora request should not raise an error
119+
base_token_ids = [1000, 1001, 1002, 1003]
120+
processor.process_inputs(
121+
request_id,
122+
prompt,
123+
params=SamplingParams(allowed_token_ids=base_token_ids),
124+
)
125+
126+
# tokens not in the base model should raise an error
127+
invalid_token_ids = [200000, 200001, 200002, 200003]
128+
with pytest.raises(ValueError):
129+
processor.process_inputs(
130+
request_id,
131+
prompt,
132+
params=SamplingParams(allowed_token_ids=invalid_token_ids),
133+
lora_request=lora_request)
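The behaviour these tests pin down is that allowed_token_ids must fall inside the base vocabulary plus, when a LoRA request with extra tokens is attached, the adapter's added vocabulary. A simplified, standalone sketch of that check, where the function name and arguments are illustrative assumptions rather than vLLM's actual Processor internals:

# Illustrative-only sketch of the validation these tests exercise;
# the real check lives in vLLM's V1 Processor and differs in detail.
def validate_allowed_token_ids(allowed_token_ids: list[int],
                               base_vocab_size: int,
                               lora_extra_vocab_size: int = 0) -> None:
    # With a LoRA request, ids added by the adapter (appended after the
    # base vocabulary) are also acceptable; without one, only base ids are.
    upper_bound = base_vocab_size + lora_extra_vocab_size
    for token_id in allowed_token_ids:
        if not (0 <= token_id < upper_bound):
            raise ValueError(
                f"allowed_token_ids contains out-of-vocabulary id {token_id} "
                f"(effective vocab size: {upper_bound})")

# e.g. a base vocab of 32000 with 4 adapter-added tokens accepts
# ids 32000-32003 only when the LoRA request is attached.
validate_allowed_token_ids([32000, 32001], 32000, lora_extra_vocab_size=4)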
Lines changed: 128 additions & 0 deletions
# SPDX-License-Identifier: Apache-2.0

import pytest
from vllm.lora.models import LoRAModel
from vllm.lora.peft_helper import PEFTHelper
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.utils import WeightsMapper

lora_lst = [
    "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
]
BAICHUAN_LORA_MODULES = [
    "W_pack",
    "o_proj",
    "gate_up_proj",
    "down_proj",
]


@pytest.mark.parametrize("lora_name", lora_lst)
def test_load_checkpoints(
    lora_name,
    baichuan_lora_files,
    baichuan_zero_lora_files,
    baichuan_regex_lora_files,
    chatglm3_lora_files,
):
    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
    expected_lora_modules: list[str] = []
    for module in BAICHUAN_LORA_MODULES:
        if module in packed_modules_mapping:
            expected_lora_modules.extend(packed_modules_mapping[module])
        else:
            expected_lora_modules.append(module)
    if lora_name == "baichuan7B":
        peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
                                                max_position_embeddings=4096)
        # For the baichuan7B model, load its LoRA,
        # and the test should pass.
        LoRAModel.from_local_checkpoint(
            baichuan_lora_files,
            expected_lora_modules,
            peft_helper=peft_helper,
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
            embedding_padding_modules=embed_padding_modules)
    elif lora_name == "baichuan7B-zero":
        # Test that the target_modules contain a prefix
        # such as "model.layers.0.self_atten.W_pack", and
        # the test should pass.
        peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
                                                max_position_embeddings=4096)
        LoRAModel.from_local_checkpoint(
            baichuan_zero_lora_files,
            expected_lora_modules,
            peft_helper=peft_helper,
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
            embedding_padding_modules=embed_padding_modules)
    elif lora_name == "baichuan7B-zero-regex":
        # Test that `target_modules` may be given as regular expressions,
        # such as `model\\..*(W_pack|o_proj)`, and the test should pass.
        peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
                                                max_position_embeddings=4096)
        LoRAModel.from_local_checkpoint(
            baichuan_regex_lora_files,
            expected_lora_modules,
            peft_helper=peft_helper,
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
            embedding_padding_modules=embed_padding_modules)
    else:
        # For the baichuan7B model, load chatglm3-6b's LoRA,
        # and the test should raise the following error.
        expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
        peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
                                                max_position_embeddings=4096)
        with pytest.raises(ValueError, match=expected_error):
            LoRAModel.from_local_checkpoint(
                chatglm3_lora_files,
                expected_lora_modules,
                peft_helper=peft_helper,
                lora_model_id=1,
                device="cpu",
                embedding_modules=embedding_modules,
                embedding_padding_modules=embed_padding_modules)


def test_lora_weights_mapping(baichuan_lora_files):

    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
    expected_lora_modules: list[str] = []
    for module in BAICHUAN_LORA_MODULES:
        if module in packed_modules_mapping:
            expected_lora_modules.extend(packed_modules_mapping[module])
        else:
            expected_lora_modules.append(module)

    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.": "language_model.model.",
        },
        orig_to_new_substr={
            ".layers.": ".baichuan_layers.",
        },
    )
    peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
                                            max_position_embeddings=4096)
    lora_model = LoRAModel.from_local_checkpoint(
        baichuan_lora_files,
        expected_lora_modules,
        peft_helper=peft_helper,
        lora_model_id=1,
        device="cpu",
        embedding_modules=embedding_modules,
        embedding_padding_modules=embed_padding_modules,
        weights_mapper=hf_to_vllm_mapper,
    )
    for name in lora_model.loras:
        assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
        assert ".baichuan_layers." in name
