@@ -1149,10 +1149,11 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo
1149
1149
amax = ax ;
1150
1150
amax_i = i ;
1151
1151
}
1152
- Laux [i ] = k_heap -> mid_k ;
1153
1152
sumlx += w * x [i ] * kmin ;
1154
1153
suml2 += w * kmin * kmin ;
1155
1154
}
1155
+ memset (Laux , k_heap -> mid_k , n );
1156
+ memset (L , k_heap -> mid_k , n );
1156
1157
1157
1158
const bool neg_scale = signed_scale && fast ? (x [amax_i ] < 0.0f ) != (k_heap -> kmax < 0 ) : false;
1158
1159
@@ -1163,57 +1164,49 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo
1163
1164
float best_suml2 ;
1164
1165
if (suml2 != 0.0f ) {
1165
1166
best = sumlx * sumlx ;
1166
- best_sumlx = neg_scale ? - sumlx : sumlx ;
1167
- best_suml2 = suml2 != 0.0f ? suml2 : 1.0f ;
1167
+ best_sumlx = sumlx ; // can't change the sign of kmin
1168
+ best_suml2 = suml2 ;
1168
1169
} else {
1169
1170
best = 0.0f ;
1170
1171
best_sumlx = 0.0f ;
1171
1172
best_suml2 = 1.0f ;
1172
1173
}
1173
- {
1174
- float sumlx_p = neg_scale ? - sumlx : sumlx ;
1175
- float suml2_p = suml2 ;
1176
- int best_p_i = -2 ; // not consecutive with 0..n_frac
1177
- int i = 0 ;
1178
- while (k_heap -> n > 0 ) {
1179
- struct fraction frac = k_heap_pop (k_heap );
1180
- const int ii = frac .i ;
1181
- const float w = weights ? weights [ii ] : x [ii ] * x [ii ];
1182
- sumlx_p += w * frac .numer ;
1183
- suml2_p += w * frac .denom ;
1184
- const float current = sumlx_p * sumlx_p ;
1185
- Laux [ii ] += (x [ii ] < 0.0f ) != neg_scale ? -1 : 1 ;
1186
- if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p ) {
1187
- best = current ;
1188
- best_sumlx = neg_scale ? - sumlx_p : sumlx_p ;
1189
- best_suml2 = suml2_p ;
1190
- if (i == best_p_i + 1 ) {
1191
- // reduce copies for consecutive bests
1192
- L [ii ] += (x [ii ] < 0.0f ) != neg_scale ? -1 : 1 ;
1193
- } else {
1194
- for (int j = 0 ; j < n ; ++ j ) {
1195
- L [j ] = Laux [j ];
1196
- }
1197
- }
1198
- best_p_i = i ;
1174
+ float sumlx_p = neg_scale ? - sumlx : sumlx ;
1175
+ float suml2_p = suml2 ;
1176
+ int best_p_i = -1 ; // consecutive with 0..n_frac
1177
+ for (int i = 0 ; k_heap -> n > 0 ; ++ i ) {
1178
+ struct fraction frac = k_heap_pop (k_heap );
1179
+ const int ii = frac .i ;
1180
+ const float w = weights ? weights [ii ] : x [ii ] * x [ii ];
1181
+ sumlx_p += w * frac .numer ;
1182
+ suml2_p += w * frac .denom ;
1183
+ const float current = sumlx_p * sumlx_p ;
1184
+ Laux [ii ] += (x [ii ] < 0.0f ) != neg_scale ? -1 : 1 ;
1185
+ if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p ) {
1186
+ best = current ;
1187
+ best_sumlx = neg_scale ? - sumlx_p : sumlx_p ;
1188
+ best_suml2 = suml2_p ;
1189
+ if (i == best_p_i + 1 ) {
1190
+ // reduce copies for consecutive bests
1191
+ L [ii ] += (x [ii ] < 0.0f ) != neg_scale ? -1 : 1 ;
1192
+ } else {
1193
+ memcpy (L , Laux , n );
1199
1194
}
1195
+ best_p_i = i ;
1200
1196
}
1201
1197
}
1202
1198
1203
1199
// Non-linear mappings are usually not symmetric, so try negating the scale
1204
1200
// This is the same as above, but keeping the old best if the new best is not better.
1205
1201
if (signed_scale && !fast ) {
1206
- for (int i = 0 ; i < n ; ++ i ) {
1207
- Laux [i ] = k_heap -> mid_k ;
1208
- }
1202
+ memset (Laux , k_heap -> mid_k , n );
1209
1203
1210
1204
k_heap_set_x (k_heap , x , n , true);
1211
1205
1212
1206
float sumlx_n = - sumlx ;
1213
1207
float suml2_n = suml2 ;
1214
1208
int best_n_i = -2 ; // not consecutive with 0..n_frac
1215
- int i = 0 ;
1216
- while (k_heap -> n > 0 ) {
1209
+ for (int i = 0 ; k_heap -> n > 0 ; ++ i ) {
1217
1210
struct fraction frac = k_heap_pop (k_heap );
1218
1211
const int ii = frac .i ;
1219
1212
const float w = weights ? weights [ii ] : x [ii ] * x [ii ];
@@ -1229,13 +1222,10 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo
1229
1222
// reduce copies for consecutive bests
1230
1223
L [ii ] += x [ii ] >= 0.0f ? -1 : 1 ;
1231
1224
} else {
1232
- for (int j = 0 ; j < n ; ++ j ) {
1233
- L [j ] = Laux [j ];
1234
- }
1225
+ memcpy (L , Laux , n );
1235
1226
}
1236
1227
best_n_i = i ;
1237
1228
}
1238
- ++ i ;
1239
1229
}
1240
1230
}
1241
1231
0 commit comments