@@ -4,10 +4,11 @@ import 'dart:math';

 import 'package:ffi/ffi.dart';
 import 'package:ensemble_llama/ensemble_llama_cpp.dart';
+import 'package:ensemble_llama/src/ensemble_llama_base.dart' show ContextParams;

 // 4294967295 (32 bit unsigned)
 // -1 (32 bit signed)
-const _int32Max = 0xFFFFFFFF;
+const int32Max = 0xFFFFFFFF;

 extension on llama_context_params {
   // Sets most of the context parameters, such as int, double, bool.
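Aside, not part of the diff: the two comments above the renamed constant describe the same 32-bit pattern read two ways. A quick standalone Dart check of that reinterpretation:

void main() {
  const int32Max = 0xFFFFFFFF;
  print(int32Max);              // 4294967295 when read as unsigned
  print(int32Max.toSigned(32)); // -1 when the same bits are read as signed
}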
@@ -37,42 +38,20 @@ extension on llama_context_params {
   }
 }

-class ContextParams {
-  final int seed;
-  final int contextSizeTokens;
-  final int batchSizeTokens;
-  final int gpuLayers;
-  final int cudaMainGpu;
-  // final List<double> cudaTensorSplits;
-  final double ropeFreqBase;
-  final double ropeFreqScale;
-  final bool useLessVram;
-  final bool cudaUseMulMatQ;
-  final bool useFloat16KVCache;
-  final bool calculateAllLogits;
-  final bool loadOnlyVocabSkipTensors;
-  final bool useMmap;
-  final bool useMlock;
-  final bool willUseEmbedding;
-
-  const ContextParams({
-    this.seed = _int32Max,
-    this.contextSizeTokens = 512,
-    this.batchSizeTokens = 512,
-    this.gpuLayers = 0,
-    this.cudaMainGpu = 0,
-    // this.cudaTensorSplits = const [0.0],
-    this.ropeFreqBase = 10000.0,
-    this.ropeFreqScale = 1.0,
-    this.useLessVram = false,
-    this.cudaUseMulMatQ = true,
-    this.useFloat16KVCache = true,
-    this.calculateAllLogits = false,
-    this.loadOnlyVocabSkipTensors = false,
-    this.useMmap = true,
-    this.useMlock = false,
-    this.willUseEmbedding = false,
-  }) : assert(seed <= _int32Max);
+class Model {
+  final int _rawPointer;
+  const Model._(this._rawPointer);
+  Pointer<llama_model> get _ffiPointer =>
+      Pointer.fromAddress(_rawPointer).cast<llama_model>();
+  @override
+  String toString() => "Model{$_rawPointer}";
+}
+
+class Context {
+  final int _rawPointer;
+  const Context._(this._rawPointer);
+  Pointer<llama_context> get _ffiPointer =>
+      Pointer.fromAddress(_rawPointer).cast<llama_context>();
 }

 class LogMessage {
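The new Model and Context wrappers hold the native address as a plain int and rebuild a typed Pointer on demand, which keeps these handles isolate-message-friendly: only the int needs to cross the port. A minimal sketch of that round trip, where llama_model is a hypothetical stand-in for the generated binding type:

import 'dart:ffi';

// Hypothetical stand-in for the ffigen-generated llama_model opaque type.
final class llama_model extends Opaque {}

class Model {
  final int _rawPointer;
  const Model._(this._rawPointer);
  Pointer<llama_model> get _ffiPointer =>
      Pointer.fromAddress(_rawPointer).cast<llama_model>();
}

void main() {
  // Only the int address is stored; fromAddress/cast reconstruct an
  // equivalent typed pointer on demand (Pointer equality is by address).
  final original = Pointer<llama_model>.fromAddress(0x1000);
  final model = Model._(original.address);
  assert(model._ffiPointer == original);
}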
@@ -97,7 +76,7 @@ class LogMessage {
 }

 sealed class ControlMessage {
-  final id = Random().nextInt(_int32Max);
+  final id = Random().nextInt(int32Max);
   ControlMessage();
 }

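Presumably the random 32-bit id on each ControlMessage is echoed back on the matching ResponseMessage, so replies arriving over a single port can be paired with the requests that produced them. A hypothetical sketch of that correlation; Request, Response, and the pending map are illustrative names, not this library's API:

import 'dart:math';

const int32Max = 0xFFFFFFFF;

// Illustrative stand-ins for ControlMessage/ResponseMessage.
class Request {
  final int id = Random().nextInt(int32Max);
}

class Response {
  final int id;
  final Object? err;
  const Response(this.id, {this.err});
  void throwIfErr() {
    if (err != null) throw err!;
  }
}

void main() {
  final pending = <int, Request>{};
  final req = Request();
  pending[req.id] = req; // remember the request until its reply arrives

  final res = Response(req.id); // the worker echoes the id back
  assert(identical(pending.remove(res.id), req));
  res.throwIfErr(); // rethrows any error carried back across the boundary
}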
@@ -145,7 +124,7 @@ class FreeContextCtl extends ControlMessage {
 sealed class ResponseMessage {
   final int id;
   final Object? err;
-  const ResponseMessage(this.id, {this.err}) : assert(id <= _int32Max);
+  const ResponseMessage(this.id, {this.err}) : assert(id <= int32Max);
   void throwIfErr() {
     if (err != null) {
       throw err!;
@@ -191,22 +170,6 @@ class EntryArgs {
   const EntryArgs({required this.log, required this.response});
 }

-class Model {
-  final int _rawPointer;
-  const Model._(this._rawPointer);
-  Pointer<llama_model> get _ffiPointer =>
-      Pointer.fromAddress(_rawPointer).cast<llama_model>();
-  @override
-  String toString() => "Model{$_rawPointer}";
-}
-
-class Context {
-  final int _rawPointer;
-  const Context._(this._rawPointer);
-  Pointer<llama_context> get _ffiPointer =>
-      Pointer.fromAddress(_rawPointer).cast<llama_context>();
-}
-
 class _Allocations<E> {
   final Map<E, Set<Pointer>> _map = {};

@@ -263,9 +226,6 @@ void _onControl(ControlMessage ctl) {
   final params = libllama.llama_context_default_params()
     ..setSimpleFrom(ctl.params);

-  // TODO: can't do this until we track contexts to manage memory allocation
-  // pc.tensor_split
-
   params.progress_callback = Pointer.fromFunction(_onModelLoadProgress);
   final idPointer = calloc.allocate<Uint32>(sizeOf<Uint32>());
   allocs.add(idPointer);
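For reference, calloc here is package:ffi's zero-initializing allocator; unlike Dart objects, the memory it returns must be freed explicitly, which is presumably why the pointer is registered in allocs. A standalone sketch of the allocate/free pairing, with an arbitrary example value:

import 'dart:ffi';
import 'package:ffi/ffi.dart';

void main() {
  // Allocate 4 zeroed bytes; calloc<Uint32>() is the equivalent shorthand.
  final idPointer = calloc.allocate<Uint32>(sizeOf<Uint32>());
  idPointer.value = 42; // arbitrary value for the sketch
  print(idPointer.value);
  calloc.free(idPointer); // native memory is not garbage collected
}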