We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent bd6e2b8 · commit 709ab81 · Copy full SHA for 709ab81
torchtitan/models/deepseek-v3/model/moe.py
@@ -114,13 +114,11 @@ def __init__(
114
self.num_experts = num_experts
115
self.top_k = top_k
116
self.use_sigmoid = use_sigmoid
117
- self.route_sclaing_factor
+ self.route_sclaing_factor = route_sclaing_factor
118
119
self.weight = nn.Parameter(
120
torch.empty((self.n_routed_experts, self.gating_dim))
121
)
122
- # TODO: is this needed? This is not "Complementary Sequence-Wise Auxiliary Loss"
123
- # self.e_score_correction_bias = nn.Parameter(torch.rand((self.num_experts)))
124
125
def forward(
126
self, x: torch.Tensor, expert_bias: torch.Tensor = None
0 commit comments