From c74453d8ca1a9725e7cab7ba32be264be5d6f365 Mon Sep 17 00:00:00 2001
From: liam
Date: Thu, 13 Feb 2025 16:26:31 +0800
Subject: [PATCH 1/3] :memo: add doc support and fix bug in qwen2

---
 .github/workflows/book-ci.yml      | 32 ++++++++++++++++++++
 .github/workflows/deploy.yml       | 48 ++++++++++++++++++++++++++++++
 .gitignore                         |  1 +
 book.toml                          | 18 +++++++++++
 doc/README.md                      | 31 +++++++++++++++++++
 doc/SUMMARY.md                     | 14 +++++++++
 doc/basic/note1.md                 |  1 +
 doc/basic/note2.md                 |  1 +
 doc/zh/api/server/README.md        |  2 ++
 ktransformers/operators/experts.py |  2 ++
 ktransformers/operators/linear.py  | 32 ++++++++++++--------
 11 files changed, 170 insertions(+), 12 deletions(-)
 create mode 100644 .github/workflows/book-ci.yml
 create mode 100644 .github/workflows/deploy.yml
 create mode 100644 book.toml
 create mode 100644 doc/README.md
 create mode 100644 doc/SUMMARY.md
 create mode 100644 doc/basic/note1.md
 create mode 100644 doc/basic/note2.md
 create mode 100644 doc/zh/api/server/README.md

diff --git a/.github/workflows/book-ci.yml b/.github/workflows/book-ci.yml
new file mode 100644
index 00000000..f09f18ab
--- /dev/null
+++ b/.github/workflows/book-ci.yml
@@ -0,0 +1,32 @@
+name: Book-CI
+
+on:
+  push:
+    branches:
+      - main
+      - server_support
+
+  pull_request:
+    branches:
+      - main
+      - server_support
+jobs:
+  test:
+    name: test
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        run: |
+          rustup set profile minimal
+          rustup toolchain install stable
+          rustup default stable
+      - name: Setup mdBook
+        uses: peaceiris/actions-mdbook@v2
+        with:
+          mdbook-version: "latest"
+      # - name: Run tests
+      #   run: mdbook test
\ No newline at end of file
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 00000000..f9f83419
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,48 @@
+name: Deploy
+
+on:
+  push:
+    branches:
+      - main
+      - server_support
+
+  pull_request:
+    branches:
+      - main
+      - server_support
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: write
+
+jobs:
+  deploy:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        run: |
+          rustup set profile minimal
+          rustup toolchain install stable
+          rustup default stable
+      - name: Setup mdBook
+        uses: peaceiris/actions-mdbook@v2
+        with:
+          mdbook-version: "latest"
+      - run: mdbook build
+      # - name: Copy Assets
+      #   run: |
+      #     chmod +x ci/copy-assets.sh
+      #     ci/copy-assets.sh ${{ matrix.os }}
+      - name: Deploy
+        uses: peaceiris/actions-gh-pages@v3
+        if: ${{ github.ref == 'refs/heads/main' }} or || github.ref == 'refs/heads/server_support'
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./book
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index d45e9564..1631d012 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,4 @@ img/
 tmp1.txt
 test_65_300_1536.txt
 test.txt
+book
diff --git a/book.toml b/book.toml
new file mode 100644
index 00000000..c88d9b70
--- /dev/null
+++ b/book.toml
@@ -0,0 +1,18 @@
+[book]
+authors = ["kvcache-ai"]
+language = "zh-CN"
+title = "KTransformers"
+src = "doc"
+
+[output.html]
+git-repository-url = "https://github.com/kvcache-ai/ktransformers"
+edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"
+
+[output.html.playground]
+editable = true
+copy-js = true
+# line-numbers = true
+
+[output.html.fold]
+enable = true
+level = 0
\ No newline at end of file
diff --git a/doc/README.md b/doc/README.md
new file mode 100644
index 00000000..01834974
--- /dev/null
+++ b/doc/README.md
@@ -0,0 +1,31 @@
+<div align="center">
+  <!-- KTransformers logo banner; the original <picture>/<img> markup was lost in extraction -->
+</div>
+
+## 🎉 Introduction
+
+KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 Transformers experience with advanced kernel optimizations and placement/parallelism strategies.
+
+KTransformers is a flexible, Python-centric framework designed with extensibility at its core.
+By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible
+interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI.
+
+Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
+
+## 🔥 Updates
+
+* **Feb 10, 2025**: Support DeepSeek-R1 and V3 on a single GPU (24GB VRAM) or multiple GPUs with 382GB of DRAM, for up to a 3~28x speedup. The detailed tutorial is [here](./doc/en/DeepseekR1_V3_tutorial.md).
+* **Aug 28, 2024**: Support 1M context with the InternLM2.5-7B-Chat-1M model, using 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md).
+* **Aug 28, 2024**: Decrease DeepSeek-V2's required VRAM from 21GB to 11GB.
+* **Aug 15, 2024**: Update the detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU.
+* **Aug 14, 2024**: Support llamafile as a linear backend.
+* **Aug 12, 2024**: Support multiple GPUs; support new models: Mixtral 8\*7B and 8\*22B; support q2k, q3k, and q5k dequantization on GPU.
+* **Aug 9, 2024**: Support Windows native.
\ No newline at end of file
diff --git a/doc/SUMMARY.md b/doc/SUMMARY.md
new file mode 100644
index 00000000..449e0f61
--- /dev/null
+++ b/doc/SUMMARY.md
@@ -0,0 +1,14 @@
+# KTransformers
+
+[Introduction](./README.md)
+# DeepSeek
+- [DeepseekR1_V3_tutorial](en/DeepseekR1_V3_tutorial.md)
+- [deepseek-v2-injection](en/deepseek-v2-injection.md)
+- [Makefile_usage](en/makefile_usage.md)
+# Server
+- [Server](zh/api/server/README.md)
+  - [Server](zh/api/server/server.md)
+  - [Website](zh/api/server/website.md)
+  - [Tabby](zh/api/server/tabby.md)
+# FAQ
+- [FAQ](en/FAQ.md)
\ No newline at end of file
diff --git a/doc/basic/note1.md b/doc/basic/note1.md
new file mode 100644
index 00000000..daa3dba7
--- /dev/null
+++ b/doc/basic/note1.md
@@ -0,0 +1 @@
+# basic-first20
diff --git a/doc/basic/note2.md b/doc/basic/note2.md
new file mode 100644
index 00000000..b73e982e
--- /dev/null
+++ b/doc/basic/note2.md
@@ -0,0 +1 @@
+# basic-data_structure
diff --git a/doc/zh/api/server/README.md b/doc/zh/api/server/README.md
new file mode 100644
index 00000000..a0f47f47
--- /dev/null
+++ b/doc/zh/api/server/README.md
@@ -0,0 +1,2 @@
+# Server
+Still Under Construction... (May have bugs and lack of documentation)
\ No newline at end of file
diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py
index 274a3cad..ecfbca0d 100644
--- a/ktransformers/operators/experts.py
+++ b/ktransformers/operators/experts.py
@@ -576,6 +576,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()
 
         shared_expert_output = self.shared_expert(hidden_states)
+        tmp = self.shared_expert_gate(hidden_states)
+        print("shared_expert_gate shape ", tmp.shape)
         shared_expert_output = (
             F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
         )
diff --git a/ktransformers/operators/linear.py b/ktransformers/operators/linear.py
index 9e35e8d7..305f2666 100644
--- a/ktransformers/operators/linear.py
+++ b/ktransformers/operators/linear.py
@@ -54,15 +54,15 @@ def __init__(
         self.has_bias = False
         self.dtype = torch.get_default_dtype()
 
-        # if orig_module is not None:
-        #     self.in_features = orig_module.in_features
-        #     self.out_features = orig_module.out_features
-        # else:
-        shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
-        if len(shape) == 1:
-            print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
-        self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
-        self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]
+        if orig_module is not None:
+            self.in_features = orig_module.in_features
+            self.out_features = orig_module.out_features
+        else:
+            shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
+            if len(shape) == 1:
+                print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
+            self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
+            self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]
 
     @abstractmethod
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -136,12 +136,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
         if device is None: device = self.device
         if w is None: w = self.load_weight(device=device)
+        # else: self.out_features = w.shape[0], self.in_features = w.shape[1]
 
         if isinstance(w, nn.Parameter):
-            self.w = w.to(dtype=self.dtype).T
+            try:
+                self.w = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
+            except:
+                self.w = w.to(dtype=self.dtype).T
             self.has_bias = False
         elif isinstance(w, tuple):
-            self.w = w[0].to(dtype=self.dtype).T
+            try:
+                self.w = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
+            except:
+                self.w = w[0].to(dtype=self.dtype).T
             self.bias = w[1].to(dtype=self.dtype)
             self.has_bias = True
         else:
@@ -187,7 +194,8 @@ def __init__(
     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
         if device is None: device = self.device
         assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
-        if w is None: w = self.load_weight(device=device)
+        if w is None:
+            w = self.load_weight(device=device)
 
         if isinstance(w, nn.Parameter):
             # pad weight
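
The two `linear.py` hunks above work together: `__init__` now trusts the wrapped module's `in_features`/`out_features` and only falls back to the GGUF tensor-info shape when no module is supplied, while `load()` first tries to `view()` the weight back to the canonical `(out_features, in_features)` layout before transposing, because a GGUF tensor can come back flattened (the Qwen2 `shared_expert_gate` case this patch targets). A minimal sketch of the idea, with hypothetical helper names and an explicit size check standing in for the patch's bare `try/except`:

```python
from typing import Optional, Tuple

import torch
from torch import nn


def infer_features(orig_module: Optional[nn.Linear],
                   gguf_shape: Tuple[int, ...]) -> Tuple[int, int]:
    # Mirrors the __init__ fallback: trust the original module's dimensions
    # when it is available, else fall back to the GGUF tensor-info shape.
    if orig_module is not None:
        return orig_module.in_features, orig_module.out_features
    if len(gguf_shape) == 1:
        print("Warning: orig_module is not set and the GGUF shape is 1-D; "
              "in_features/out_features may be unreliable")
    return gguf_shape[0], gguf_shape[-1]


def prepare_weight(w: torch.Tensor, out_features: int, in_features: int,
                   dtype: torch.dtype = torch.float16) -> torch.Tensor:
    # Mirrors the load() fallback: restore the canonical (out, in) layout
    # when the element count allows it, otherwise trust the stored layout.
    w = w.to(dtype=dtype)
    if w.numel() == out_features * in_features:
        # A flattened gate weight becomes (out, in), then (in, out) after
        # .T, which is the orientation an x @ w forward pass expects.
        return w.view(out_features, in_features).T
    return w.T
```

The explicit `numel()` check makes the fallback condition visible; the patch's `try/except` reaches the same fallback whenever `view()` rejects the shape.
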
From ad2c52d72a996732bdcd3f6bfe2afbdaa4a2b19e Mon Sep 17 00:00:00 2001
From: liam
Date: Thu, 13 Feb 2025 17:16:27 +0800
Subject: [PATCH 2/3] :memo: update doc

---
 .github/workflows/book-ci.yml |  4 ++--
 .github/workflows/deploy.yml  |  3 ++-
 doc/SUMMARY.md                | 16 +++++++++++-----
 doc/en/V3-success.md          | 10 ++++++++++
 4 files changed, 25 insertions(+), 8 deletions(-)
 create mode 100644 doc/en/V3-success.md

diff --git a/.github/workflows/book-ci.yml b/.github/workflows/book-ci.yml
index f09f18ab..61f5f56c 100644
--- a/.github/workflows/book-ci.yml
+++ b/.github/workflows/book-ci.yml
@@ -4,12 +4,12 @@ on:
   push:
     branches:
       - main
-      - server_support
+      # - server_support
 
   pull_request:
     branches:
       - main
-      - server_support
+      # - server_support
 jobs:
   test:
     name: test
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index f9f83419..4a8de5ee 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -42,7 +42,8 @@ jobs:
       #     ci/copy-assets.sh ${{ matrix.os }}
       - name: Deploy
         uses: peaceiris/actions-gh-pages@v3
-        if: ${{ github.ref == 'refs/heads/main' }} or || github.ref == 'refs/heads/server_support'
+        # or || github.ref == 'refs/heads/server_support'
+        if: ${{ github.ref == 'refs/heads/main' }}
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ./book
\ No newline at end of file
diff --git a/doc/SUMMARY.md b/doc/SUMMARY.md
index 449e0f61..76450185 100644
--- a/doc/SUMMARY.md
+++ b/doc/SUMMARY.md
@@ -2,13 +2,19 @@
 
 [Introduction](./README.md)
 # DeepSeek
-- [DeepseekR1_V3_tutorial](en/DeepseekR1_V3_tutorial.md)
-- [deepseek-v2-injection](en/deepseek-v2-injection.md)
-- [Makefile_usage](en/makefile_usage.md)
+- [Deepseek-R1/V3 Tutorial](en/DeepseekR1_V3_tutorial.md)
+- [Deepseek-V2 Injection](en/deepseek-v2-injection.md)
+- [Injection Tutorial](en/injection_tutorial.md)
+
 # Server
 - [Server](zh/api/server/README.md)
-  - [Server](zh/api/server/server.md)
+  - [Server](en/api/server/server.md)
   - [Website](zh/api/server/website.md)
   - [Tabby](zh/api/server/tabby.md)
+# For Developer
+- [For Developer](en/injection_tutorial.md)
+
 # FAQ
-- [FAQ](en/FAQ.md)
+- [FAQ](en/FAQ.md)
+# V3 Reproduction
+- [Success List](en/V3-success.md)
\ No newline at end of file
diff --git a/doc/en/V3-success.md b/doc/en/V3-success.md
new file mode 100644
index 00000000..af69f277
--- /dev/null
+++ b/doc/en/V3-success.md
@@ -0,0 +1,10 @@
+## Hello everyone, here are the successfully reproduced environment configurations for your reference:
+### Case 1
+- Configuration: L40S 48GB + dual EPYC 9654 (192 cores) + 768GB DDR5, 12-channel
+- Performance: prefill 108 tokens/s, decode 10.8 tokens/s
+- Version used: compiled from main branch source
+### Case 2
+- Configuration: dual Xeon 6430 processors (32 cores each, 64 cores/128 threads total), 480GB DDR5 memory, single 4090 24GB graphics card
+- Performance: approximately 6-8 tokens per second
+# NOTE
+If any other configurations run successfully, please feel free to let us know; we will keep updating this list for everyone to reference when reproducing. (It has also been found to work on the 2080, AMD hardware, etc.) (doge :) https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2
\ No newline at end of file

From 8d5ebe49abb32626c0c857e0e3a4cb5fad063c24 Mon Sep 17 00:00:00 2001
From: liam
Date: Thu, 13 Feb 2025 17:25:12 +0800
Subject: [PATCH 3/3] :memo: :zap: fix some debug output and update doc

---
 .github/workflows/deploy.yml       | 4 ++--
 doc/SUMMARY.md                     | 7 +++----
 doc/en/V3-success.md               | 5 +++--
 doc/zh/api/server/README.md        | 2 --
 ktransformers/operators/experts.py | 2 --
 5 files changed, 8 insertions(+), 12 deletions(-)
 delete mode 100644 doc/zh/api/server/README.md

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 4a8de5ee..dd406dfe 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -4,12 +4,12 @@ on:
   push:
     branches:
       - main
-      - server_support
+      # - server_support
 
   pull_request:
     branches:
       - main
-      - server_support
+      # - server_support
 
 defaults:
   run:
diff --git a/doc/SUMMARY.md b/doc/SUMMARY.md
index 76450185..bf5579f1 100644
--- a/doc/SUMMARY.md
+++ b/doc/SUMMARY.md
@@ -7,12 +7,11 @@
 - [Injection Tutorial](en/injection_tutorial.md)
 
 # Server
-- [Server](zh/api/server/README.md)
   - [Server](en/api/server/server.md)
-  - [Website](zh/api/server/website.md)
-  - [Tabby](zh/api/server/tabby.md)
+  - [Website](en/api/server/website.md)
+  - [Tabby](en/api/server/tabby.md)
 # For Developer
-- [For Developer](en/injection_tutorial.md)
+- [Makefile Usage](en/makefile_usage.md)
 
 # FAQ
 - [FAQ](en/FAQ.md)
diff --git a/doc/en/V3-success.md b/doc/en/V3-success.md
index af69f277..fed16640 100644
--- a/doc/en/V3-success.md
+++ b/doc/en/V3-success.md
@@ -6,5 +6,6 @@
 ### Case 2
 - Configuration: dual Xeon 6430 processors (32 cores each, 64 cores/128 threads total), 480GB DDR5 memory, single 4090 24GB graphics card
 - Performance: approximately 6-8 tokens per second
-# NOTE
-If any other configurations run successfully, please feel free to let us know; we will keep updating this list for everyone to reference when reproducing. (It has also been found to work on the 2080, AMD hardware, etc.) (doge :) https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2
+## NOTE
+If any other configurations run successfully, please feel free to let us know; we will keep updating this list for everyone to reference when reproducing. (It has also been found to work on the 2080, AMD hardware, etc.) (doge :)
+[click here](https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2)
\ No newline at end of file
diff --git a/doc/zh/api/server/README.md b/doc/zh/api/server/README.md
deleted file mode 100644
index a0f47f47..00000000
--- a/doc/zh/api/server/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# Server
-Still Under Construction... (May have bugs and lack of documentation)
\ No newline at end of file
diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py
index ecfbca0d..274a3cad 100644
--- a/ktransformers/operators/experts.py
+++ b/ktransformers/operators/experts.py
@@ -576,8 +576,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()
 
         shared_expert_output = self.shared_expert(hidden_states)
-        tmp = self.shared_expert_gate(hidden_states)
-        print("shared_expert_gate shape ", tmp.shape)
         shared_expert_output = (
             F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
         )
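
A closing note on the `experts.py` hunk just above: in Qwen2-MoE-style models the shared expert runs for every token, and a learned gate decides how much of its output to blend with the routed experts; the temporary print that patch 1 added was only verifying the gate's shape. A self-contained sketch of that gating, under illustrative sizes (these stand-ins are not the ktransformers classes):

```python
import torch
from torch import nn

# Illustrative stand-ins for the modules the patched forward() touches;
# these are not the ktransformers classes, and the sizes are made up.
hidden_size = 8
shared_expert = nn.Linear(hidden_size, hidden_size)  # always-on shared expert
shared_expert_gate = nn.Linear(hidden_size, 1)       # one gating logit per token

hidden_states = torch.randn(4, hidden_size)          # (tokens, hidden)
shared_expert_output = shared_expert(hidden_states)

# The debug print added in patch 1 (and removed here) inspected this gate
# output's shape: (tokens, 1), which broadcasts across the hidden dimension.
gate = torch.sigmoid(shared_expert_gate(hidden_states))  # same as F.sigmoid
shared_expert_output = gate * shared_expert_output
```
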