Commit b01d4d5

Merge branch 'develop' into mm_structred_output
2 parents 2e8f219 + 6b10c19 commit b01d4d5

File tree

6 files changed: +78 −20 lines


build.sh

Lines changed: 4 additions & 1 deletion
@@ -187,7 +187,10 @@ function version_info() {
     fastdeploy_git_commit_id=$(git rev-parse HEAD)
     paddle_version=$(${python} -c "import paddle; print(paddle.__version__)")
     paddle_git_commit_id=$(${python} -c "import paddle; print(paddle.__git_commit__)")
-    cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
+    cuda_version="nvcc-not-installed"
+    if command -v nvcc &> /dev/null; then
+        cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
+    fi
     cxx_version=$(g++ --version | head -n 1 | grep -Po "(?<=\) )[\d.]+")

     echo "fastdeploy GIT COMMIT ID: $fastdeploy_git_commit_id" > $output_file

fastdeploy/engine/config.py

Lines changed: 61 additions & 2 deletions
@@ -17,6 +17,7 @@
 import json
 import os
 from datetime import datetime
+from dataclasses import dataclass
 from typing import Any, Dict, List, Literal, Optional

 from fastdeploy import envs
@@ -467,7 +468,63 @@ def print(self):
         llm_logger.info("Parallel Configuration Information :")
         for k, v in self.__dict__.items():
             llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
-        llm_logger.info("==================")
+        llm_logger.info(
+            "=============================================================")
+
+
+@dataclass
+class CommitConfig:
+    """
+    Configuration for tracking version information from version.txt
+
+    Attributes:
+        fastdeploy_commit: Full FastDeploy git commit hash
+        paddle_version: PaddlePaddle version string
+        paddle_commit: PaddlePaddle git commit hash
+        cuda_version: CUDA version string
+        compiler_version: CXX compiler version string
+    """
+    fastdeploy_commit: str = ""
+    paddle_version: str = ""
+    paddle_commit: str = ""
+    cuda_version: str = ""
+    compiler_version: str = ""
+
+    def __post_init__(self):
+        """Automatically load version info when initialized"""
+        self._load_from_version_file()
+
+    def _load_from_version_file(self, file_path: str = "fastdeploy/version.txt"):
+        """Internal method to load version info from file"""
+        try:
+            with open(file_path, 'r') as f:
+                for line in f:
+                    line = line.strip()
+                    if line.startswith("fastdeploy GIT COMMIT ID:"):
+                        self.fastdeploy_commit = line.split(":")[1].strip()
+                    elif line.startswith("Paddle version:"):
+                        self.paddle_version = line.split(":")[1].strip()
+                    elif line.startswith("Paddle GIT COMMIT ID:"):
+                        self.paddle_commit = line.split(":")[1].strip()
+                    elif line.startswith("CUDA version:"):
+                        self.cuda_version = line.split(":")[1].strip()
+                    elif line.startswith("CXX compiler version:"):
+                        self.compiler_version = line.split(":")[1].strip()
+        except FileNotFoundError:
+            llm_logger.info(f"Warning: Version file not found at {file_path}")
+        except Exception as e:
+            llm_logger.info(f"Warning: Could not read version file - {str(e)}")
+
+    def print(self):
+        """
+        print all config
+
+        """
+        llm_logger.info("Fasedeploy Commit Information :")
+        for k, v in self.__dict__.items():
+            llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
+        llm_logger.info(
+            "=============================================================")


 class Config:
@@ -502,6 +559,7 @@ def __init__(
         cache_config: CacheConfig,
         scheduler_config: SchedulerConfig,
         parallel_config: ParallelConfig,
+        commit_config: CommitConfig = CommitConfig(),
         model_name_or_path: str = None,
         tokenizer: str = None,
         tensor_parallel_size: int = 8,
@@ -559,6 +617,7 @@ def __init__(
         self.cache_config = cache_config
         self.scheduler_config = scheduler_config
         self.parallel_config = parallel_config
+        self.commit_config = commit_config
         self.model_name_or_path = model_name_or_path
         self.tokenizer = tokenizer
         self.max_num_batched_tokens = max_num_batched_tokens
@@ -756,7 +815,7 @@ def print(self, file=None):
             if k == "generation_config" and v is not None:
                 for gck, gcv in v.to_dict().items():
                     llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv))
-            elif k in ["cache_config", "model_config", "scheduler_config", "scheduler_config", "parallel_config", "speculative_config"]:
+            elif k in ["cache_config", "model_config", "scheduler_config", "parallel_config", "commit_config", "speculative_config"]:
                 v.print()
             else:
                 llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
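For context, the version_info() function in build.sh above writes exactly the `key: value` lines this parser expects. A minimal sketch of that round trip, with illustrative values (not taken from any real build):

# Illustrative version.txt contents; the prefixes match what
# CommitConfig._load_from_version_file looks for (values are made up).
sample = """\
fastdeploy GIT COMMIT ID: b01d4d5
Paddle version: 3.0.0
Paddle GIT COMMIT ID: 0123abc
CUDA version: 12.4
CXX compiler version: 12.2.0
"""

# Same parsing rule as the class: text after the first colon, stripped.
# (split(":")[1] and partition(":")[2] agree here because none of these
# values contain a colon themselves.)
info = {}
for line in sample.splitlines():
    key, _, value = line.partition(":")
    info[key.strip()] = value.strip()

assert info["fastdeploy GIT COMMIT ID"] == "b01d4d5"
assert info["CUDA version"] == "12.4"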

fastdeploy/model_executor/layers/attention/append_attn_backend.py

Lines changed: 5 additions & 5 deletions
@@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -108,12 +108,12 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,

         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
+        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
+            fd_config.parallel_config.expert_parallel_rank
         if self.device_id is None:
-            self.device_id = self.rank
+            self.device_id = device_id
         else:
-            device_ids = self.device_id.split(",")
-            rank_index = self.rank % len(device_ids)
-            self.device_id = self.device_id[rank_index]
+            self.device_id = self.device_id.split(",")[device_id]

     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attntion metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/flash_attn_backend.py

Lines changed: 5 additions & 6 deletions
@@ -100,7 +100,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate = self.speculative_method is not None
         self.speculate_max_draft_token_num = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.tensor_parallel_rank

         # pd_disaggregation
         self.use_pd_disaggregation: int = int(
@@ -110,13 +110,12 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,

         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
-
+        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
+            fd_config.parallel_config.expert_parallel_rank
         if self.device_id is None:
-            self.device_id = self.rank
+            self.device_id = device_id
         else:
-            device_ids = self.device_id.split(",")
-            rank_index = self.rank % len(device_ids)
-            self.device_id = self.device_id[rank_index]
+            self.device_id = self.device_id.split(",")[device_id]

     def get_attntion_meta(self):
         """get_attntion_meta"""

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 2 additions & 5 deletions
@@ -109,7 +109,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -135,13 +135,10 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
             os.getenv("FLAGS_use_pd_disaggregation", 0))
         self.start_layer_index: int = fd_config.model_config.start_layer_index
         self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
-
         if self.device_id is None:
             self.device_id = self.rank
         else:
-            device_ids = self.device_id.split(",")
-            rank_index = self.rank % len(device_ids)
-            self.device_id = self.device_id[rank_index]
+            self.device_id = self.device_id.split(",")[self.rank]

     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attention metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/xpu_attn_backend.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         # self.use_speculate = self.speculate_method is not None
         # self.speculate_max_draft_token_num = fd_config.parallel_config.speculate_max_draft_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
