Skip to content

Commit b0d3a63

Browse files
committed
Merge branch 'develop' of https://github.com/hong19860320/FastDeploy into hongming/fix_xpu_doc
2 parents ef72873 + 4a5db82 commit b0d3a63

File tree

3 files changed

+7
-9
lines changed

3 files changed

+7
-9
lines changed

custom_ops/gpu_ops/per_token_quant_fp8.cu

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,11 @@ __global__ void quant_per_token_per_block(const T *input,
5050
max_value_thread = max(abs(load_vec_float[vid]), max_value_thread);
5151
}
5252
// get max value per warp
53-
max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 16), max_value_thread);
54-
max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 8), max_value_thread);
55-
max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 4), max_value_thread);
56-
max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 2), max_value_thread);
57-
max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 1), max_value_thread);
58-
// broadcast max_value
59-
max_value_thread = __shfl_sync(0xFFFFFFFF, max_value_thread, 0);
53+
max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 16), max_value_thread);
54+
max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 8), max_value_thread);
55+
max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 4), max_value_thread);
56+
max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 2), max_value_thread);
57+
max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 1), max_value_thread);
6058
max_value_thread = max(max_value_thread, epsilon);
6159
float scale_to_store = max_value_thread / MAX_VALUE;
6260
// quant

docs/get_started/installation/kunlunxin_xpu.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,4 +218,4 @@ for chunk in response:
218218
print('\n')
219219
```
220220

221-
For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create).
221+
For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md).

docs/zh/get_started/installation/kunlunxin_xpu.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,4 +223,4 @@ for chunk in response:
223223
print('\n')
224224
```
225225

226-
OpenAI 协议的更多说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create)
226+
OpenAI 协议的更多说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create);与 OpenAI 协议的区别可参考 [兼容 OpenAI 协议的服务化部署](../../online_serving/README.md)。

0 commit comments

Comments
 (0)