System Info
- Platform: Modal (Debian Slim, Python 3.12)
- GPU: NVIDIA H100
- Key Packages: transformers==4.53.2, torch==2.7.1, accelerate==1.9.0
- Image definition:
modal.Image.debian_slim(python_version="3.12")
    .run_commands("pip install --upgrade pip")
    .pip_install("torch", "transformers>=4.40.0", "accelerate", "fastapi[standard]",
                 "sentencepiece", "bitsandbytes")
    .run_commands("mkdir -p /models")
Task
Run inference with "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" on an H100 GPU
Error
Traceback (most recent call last):
File "/pkg/modal/_runtime/container_io_manager.py", line 772, in handle_input_exception
yield
File "/pkg/modal/_container_entrypoint.py", line 222, in run_input_async
value = await res
^^^^^^^^^
File "/root/llm/modal_app.py", line 87, in inference
outputs = self.model["generator"].generate(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/transformers/generation/utils.py", line 2625, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/transformers/generation/utils.py", line 3599, in _sample
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/transformers_modules/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/e434a23f91ba5b4923cf6c9d9a238eb4a08e3a11/modeling_deepseek.py", line 1728, in prepare_inputs_for_generation
max_cache_length = past_key_values.get_max_length()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'DynamicCache' object has no attribute 'get_max_length'. Did you mean: 'get_seq_length'?
Traceback (most recent call last):
File "/pkg/modal/_runtime/container_io_manager.py", line 772, in handle_input_exception
yield
File "/pkg/modal/_container_entrypoint.py", line 205, in run_input_async
async for value in res:
File "/pkg/modal/_runtime/asgi.py", line 226, in fn
app_task.result() # consume/raise exceptions if there are any!
^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/fastapi/applications.py", line 1054, in __call__
await super().__call__(scope, receive, send)
File "/usr/local/lib/python3.12/site-packages/starlette/applications.py", line 113, in __call__
await self.middleware_stack(scope, receive, send)
File "/usr/local/lib/python3.12/site-packages/starlette/middleware/errors.py", line 186, in __call__
raise exc
File "/usr/local/lib/python3.12/site-packages/starlette/middleware/errors.py", line 164, in __call__
await self.app(scope, receive, _send)
File "/usr/local/lib/python3.12/site-packages/starlette/middleware/cors.py", line 85, in __call__
await self.app(scope, receive, send)
File "/usr/local/lib/python3.12/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
File "/usr/local/lib/python3.12/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
raise exc
File "/usr/local/lib/python3.12/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
await app(scope, receive, sender)
File "/usr/local/lib/python3.12/site-packages/starlette/routing.py", line 716, in __call__
await self.middleware_stack(scope, receive, send)
File "/usr/local/lib/python3.12/site-packages/starlette/routing.py", line 736, in app
await route.handle(scope, receive, send)
File "/usr/local/lib/python3.12/site-packages/starlette/routing.py", line 290, in handle
await self.app(scope, receive, send)
File "/usr/local/lib/python3.12/site-packages/starlette/routing.py", line 78, in app
await wrap_app_handling_exceptions(app, request)(scope, receive, send)
File "/usr/local/lib/python3.12/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
raise exc
File "/usr/local/lib/python3.12/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
await app(scope, receive, sender)
File "/usr/local/lib/python3.12/site-packages/starlette/routing.py", line 75, in app
response = await f(request)
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/fastapi/routing.py", line 302, in app
raw_response = await run_endpoint_function(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/fastapi/routing.py", line 215, in run_endpoint_function
return await run_in_threadpool(dependant.call, **values)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/starlette/concurrency.py", line 38, in run_in_threadpool
return await anyio.to_thread.run_sync(func)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/anyio/to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/anyio/_backends/_asyncio.py", line 2470, in run_sync_in_worker_thread
return await future
^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/anyio/_backends/_asyncio.py", line 967, in run
result = context.run(func, *args)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/llm/modal_app.py", line 131, in run_generator_inference
output = modalGenatorValidator.inference.remote(messages)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pkg/modal/_object.py", line 285, in wrapped
return await method(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pkg/modal/_functions.py", line 1570, in remote
return await self._call_function(args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pkg/modal/_functions.py", line 1525, in _call_function
return await invocation.run_function()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pkg/modal/_functions.py", line 285, in run_function
return await _process_result(item.result, item.data_format, self.stub, self.client)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pkg/modal/_utils/function_utils.py", line 506, in _process_result
raise exc_with_hints(exc)
File "<ta-01K0N5C94X71R8EYC4AK213MK9>:/root/llm/modal_app.py", line 87, in inference
File "<ta-01K0N5C94X71R8EYC4AK213MK9>:/usr/local/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
File "<ta-01K0N5C94X71R8EYC4AK213MK9>:/usr/local/lib/python3.12/site-packages/transformers/generation/utils.py", line 2625, in generate
File "<ta-01K0N5C94X71R8EYC4AK213MK9>:/usr/local/lib/python3.12/site-packages/transformers/generation/utils.py", line 3599, in _sample
File "<ta-01K0N5C94X71R8EYC4AK213MK9>:/root/.cache/huggingface/modules/transformers_modules/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/e434a23f91ba5b4923cf6c9d9a238eb4a08e3a11/modeling_deepseek.py", line 1728, in prepare_inputs_for_generation
AttributeError: 'DynamicCache' object has no attribute 'get_max_length'
Expected Behaviour
Successful text generation. (I cannot verify whether this works locally, as I have no access to a CUDA GPU at home.)
Reproduction Steps
- Deploy the above image on Modal platform with a H100 GPU
- Load the model:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(
    'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct',
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct',
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
).cuda()
- Call "model.generate()" with any input:
messages = [
    { "role": "system", "content": "..." },
    { "role": "user", "content": "..." }
]
input_tensor = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)
outputs = model.generate(
    input_tensor,
    do_sample=False,
    top_k=50, top_p=0.95,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=1000
)
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
Additional Context
- The error indicates that DynamicCache.get_max_length() no longer exists in the installed transformers version (4.53.2); the "Did you mean: 'get_seq_length'?" hint comes from Python's AttributeError suggestion, not from transformers itself
- The failure occurs even though the image satisfies the transformers>=4.40.0 constraint, because pip resolves it to the latest release
- The model's bundled modeling_deepseek.py (loaded via trust_remote_code=True) still calls the removed get_max_length() method
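A possible workaround (untested, and assuming the remote modeling code only needs the old behaviour, where get_max_length() returned None for a DynamicCache because it has no maximum length) would be to patch the method back onto DynamicCache before the first generate() call:

# Hypothetical compatibility shim, not an official transformers API: older
# transformers releases exposed DynamicCache.get_max_length(), which returned
# None for a dynamic (unbounded) cache. Restoring that behaviour should let the
# model's bundled modeling_deepseek.py run unchanged on transformers 4.53.x.
from transformers.cache_utils import DynamicCache

if not hasattr(DynamicCache, "get_max_length"):
    DynamicCache.get_max_length = lambda self: None  # old behaviour: no max cache length

Alternatively, pinning transformers in the image to an older release that still ships get_max_length() should avoid the error, though I have not verified the exact version cutoff.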