
Commit c3f5e24

Move main dev test into main2.dart
1 parent d096a39 commit c3f5e24

4 files changed (+82, -83 lines)

example/main2.dart

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+import 'dart:io';
+
+import 'package:ensemble_llama/ensemble_llama.dart';
+
+void main() async {
+  var llama = await Llama.create();
+  llama.log.listen((msg) {
+    final msgText = msg.toString();
+    if (!msgText.contains("llama_model_loader: - tensor")) {
+      print(msgText);
+    }
+  });
+
+  final params = ContextParams(gpuLayers: 1, useMmap: false);
+  final model = await llama.loadModel(
+    "/Users/vczf/models/default/ggml-model-f16.gguf",
+    params: params,
+    progressCallback: (p) => stdout.write("."),
+  );
+
+  print(model);
+
+  final ctx = await llama.newContext(model, params);
+  await llama.freeContext(ctx);
+
+  await llama.freeModel(model);
+  llama.dispose();
+}
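
The moved test walks the full lifecycle in order: create, loadModel, newContext, freeContext, freeModel, dispose (the model path is machine specific). A minimal variant, sketched below, pairs each acquisition with its release inside try/finally so the worker is torn down even if loading throws; the wrapper and placeholder path are illustrative, assuming only the API surface the example itself uses:

    import 'dart:io';

    import 'package:ensemble_llama/ensemble_llama.dart';

    void main() async {
      final llama = await Llama.create();
      try {
        final params = ContextParams(gpuLayers: 1, useMmap: false);
        final model = await llama.loadModel(
          "/path/to/model.gguf", // placeholder; the test hardcodes a local GGUF file
          params: params,
          progressCallback: (p) => stdout.write("."),
        );
        final ctx = await llama.newContext(model, params);
        await llama.freeContext(ctx);
        await llama.freeModel(model);
      } finally {
        // Dispose even if loadModel or newContext throws.
        llama.dispose();
      }
    }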

lib/src/ensemble_llama_base.dart

Lines changed: 36 additions & 24 deletions
@@ -1,31 +1,43 @@
-import 'dart:io';
 import 'dart:isolate';
 
 import 'package:ensemble_llama/src/llama_cpp_isolate_wrapper.dart';
 
-void main() async {
-  var llama = await Llama.create();
-  llama.log.listen((msg) {
-    final msgText = msg.toString();
-    if (!msgText.contains("llama_model_loader: - tensor")) {
-      print(msgText);
-    }
-  });
-
-  final params = ContextParams(gpuLayers: 1, useMmap: false);
-  final model = await llama.loadModel(
-    "/Users/vczf/models/default/ggml-model-f16.gguf",
-    params: params,
-    progressCallback: (p) => stdout.write("."),
-  );
-
-  print(model);
-
-  final ctx = await llama.newContext(model, params);
-  await llama.freeContext(ctx);
-
-  await llama.freeModel(model);
-  llama.dispose();
+class ContextParams {
+  final int seed;
+  final int contextSizeTokens;
+  final int batchSizeTokens;
+  final int gpuLayers;
+  final int cudaMainGpu;
+  // final List<double> cudaTensorSplits;
+  final double ropeFreqBase;
+  final double ropeFreqScale;
+  final bool useLessVram;
+  final bool cudaUseMulMatQ;
+  final bool useFloat16KVCache;
+  final bool calculateAllLogits;
+  final bool loadOnlyVocabSkipTensors;
+  final bool useMmap;
+  final bool useMlock;
+  final bool willUseEmbedding;
+
+  const ContextParams({
+    this.seed = int32Max,
+    this.contextSizeTokens = 512,
+    this.batchSizeTokens = 512,
+    this.gpuLayers = 0,
+    this.cudaMainGpu = 0,
+    // this.cudaTensorSplits = const [0.0],
+    this.ropeFreqBase = 10000.0,
+    this.ropeFreqScale = 1.0,
+    this.useLessVram = false,
+    this.cudaUseMulMatQ = true,
+    this.useFloat16KVCache = true,
+    this.calculateAllLogits = false,
+    this.loadOnlyVocabSkipTensors = false,
+    this.useMmap = true,
+    this.useMlock = false,
+    this.willUseEmbedding = false,
+  }) : assert(seed <= int32Max);
 }
 
 class Llama {
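
With ContextParams now living in ensemble_llama_base.dart, it is part of the package's public API (the example above imports it via package:ensemble_llama/ensemble_llama.dart). A small construction sketch; the overridden values are illustrative, not recommendations:

    import 'package:ensemble_llama/ensemble_llama.dart';

    void main() {
      // Every field has a default, so override only what you need.
      const params = ContextParams(
        contextSizeTokens: 2048,
        gpuLayers: 1,
        useMmap: false,
      );
      print(params.seed); // int32Max (0xFFFFFFFF) by default
    }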

lib/src/llama_cpp_isolate_wrapper.dart

Lines changed: 18 additions & 58 deletions
@@ -4,10 +4,11 @@ import 'dart:math';
 
 import 'package:ffi/ffi.dart';
 import 'package:ensemble_llama/ensemble_llama_cpp.dart';
+import 'package:ensemble_llama/src/ensemble_llama_base.dart' show ContextParams;
 
 // 4294967295 (32 bit unsigned)
 // -1 (32 bit signed)
-const _int32Max = 0xFFFFFFFF;
+const int32Max = 0xFFFFFFFF;
 
 extension on llama_context_params {
   // Sets most of the context parameters, such as int, double, bool.
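
Dropping the underscore is what lets ContextParams, now defined in a separate library, keep int32Max as its default seed: Dart privacy is per library, so _int32Max was invisible outside this file. The two readings recorded in the comment can be verified in a standalone sketch (not part of the commit):

    void main() {
      const int32Max = 0xFFFFFFFF;
      print(int32Max);              // 4294967295 (32 bit unsigned)
      print(int32Max.toSigned(32)); // -1 (32 bit signed)
    }
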
@@ -37,42 +38,20 @@ extension on llama_context_params {
   }
 }
 
-class ContextParams {
-  final int seed;
-  final int contextSizeTokens;
-  final int batchSizeTokens;
-  final int gpuLayers;
-  final int cudaMainGpu;
-  // final List<double> cudaTensorSplits;
-  final double ropeFreqBase;
-  final double ropeFreqScale;
-  final bool useLessVram;
-  final bool cudaUseMulMatQ;
-  final bool useFloat16KVCache;
-  final bool calculateAllLogits;
-  final bool loadOnlyVocabSkipTensors;
-  final bool useMmap;
-  final bool useMlock;
-  final bool willUseEmbedding;
-
-  const ContextParams({
-    this.seed = _int32Max,
-    this.contextSizeTokens = 512,
-    this.batchSizeTokens = 512,
-    this.gpuLayers = 0,
-    this.cudaMainGpu = 0,
-    // this.cudaTensorSplits = const [0.0],
-    this.ropeFreqBase = 10000.0,
-    this.ropeFreqScale = 1.0,
-    this.useLessVram = false,
-    this.cudaUseMulMatQ = true,
-    this.useFloat16KVCache = true,
-    this.calculateAllLogits = false,
-    this.loadOnlyVocabSkipTensors = false,
-    this.useMmap = true,
-    this.useMlock = false,
-    this.willUseEmbedding = false,
-  }) : assert(seed <= _int32Max);
+class Model {
+  final int _rawPointer;
+  const Model._(this._rawPointer);
+  Pointer<llama_model> get _ffiPointer =>
+      Pointer.fromAddress(_rawPointer).cast<llama_model>();
+  @override
+  String toString() => "Model{$_rawPointer}";
+}
+
+class Context {
+  final int _rawPointer;
+  const Context._(this._rawPointer);
+  Pointer<llama_context> get _ffiPointer =>
+      Pointer.fromAddress(_rawPointer).cast<llama_context>();
 }
 
 class LogMessage {
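
Model and Context hold the native object as a raw int address rather than a Pointer, so the handles can be posted between the public API isolate and the llama.cpp worker isolate (dart:ffi Pointer objects are rejected in isolate messages, while a plain int is sendable); _ffiPointer rebuilds the typed pointer on demand. A standalone sketch of that round trip, using package:ffi's calloc purely for illustration:

    import 'dart:ffi';
    import 'package:ffi/ffi.dart';

    void main() {
      // Keep only the raw address, as Model/Context do with _rawPointer.
      final Pointer<Uint32> p = calloc<Uint32>();
      final int raw = p.address;

      // The int can travel through a SendPort; rebuild a typed pointer
      // from it on the other side, mirroring the _ffiPointer getters.
      final q = Pointer.fromAddress(raw).cast<Uint32>();
      q.value = 42;
      print(p.value); // 42, same native memory
      calloc.free(p);
    }
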
@@ -97,7 +76,7 @@ class LogMessage {
 }
 
 sealed class ControlMessage {
-  final id = Random().nextInt(_int32Max);
+  final id = Random().nextInt(int32Max);
   ControlMessage();
 }
 
@@ -145,7 +124,7 @@ class FreeContextCtl extends ControlMessage {
 sealed class ResponseMessage {
   final int id;
   final Object? err;
-  const ResponseMessage(this.id, {this.err}) : assert(id <= _int32Max);
+  const ResponseMessage(this.id, {this.err}) : assert(id <= int32Max);
   void throwIfErr() {
     if (err != null) {
       throw err!;
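
The id <= int32Max assert matches where the id appears to end up: _onControl below allocates a Uint32 for it before loading the model, and ControlMessage draws its ids from Random().nextInt(int32Max), so both sides stay within 32 unsigned bits.
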
@@ -191,22 +170,6 @@ class EntryArgs {
   const EntryArgs({required this.log, required this.response});
 }
 
-class Model {
-  final int _rawPointer;
-  const Model._(this._rawPointer);
-  Pointer<llama_model> get _ffiPointer =>
-      Pointer.fromAddress(_rawPointer).cast<llama_model>();
-  @override
-  String toString() => "Model{$_rawPointer}";
-}
-
-class Context {
-  final int _rawPointer;
-  const Context._(this._rawPointer);
-  Pointer<llama_context> get _ffiPointer =>
-      Pointer.fromAddress(_rawPointer).cast<llama_context>();
-}
-
 class _Allocations<E> {
   final Map<E, Set<Pointer>> _map = {};
 
@@ -263,9 +226,6 @@ void _onControl(ControlMessage ctl) {
   final params = libllama.llama_context_default_params()
     ..setSimpleFrom(ctl.params);
 
-  // TODO: can't do this until we track contexts to manage memory allocation
-  // pc.tensor_split
-
   params.progress_callback = Pointer.fromFunction(_onModelLoadProgress);
   final idPointer = calloc.allocate<Uint32>(sizeOf<Uint32>());
   allocs.add(idPointer);
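
Pointer.fromFunction, used above to register the progress callback, accepts only a top-level or static Dart function and yields a pointer callable from native code. A standalone sketch of the mechanism; the typedef is illustrative, and the real signature belongs to the generated llama.cpp bindings:

    import 'dart:ffi';

    typedef ProgressNative = Void Function(Float, Pointer<Void>);

    // Must be top-level or static to be usable with Pointer.fromFunction.
    void onProgress(double progress, Pointer<Void> userData) {}

    void main() {
      final Pointer<NativeFunction<ProgressNative>> cb =
          Pointer.fromFunction<ProgressNative>(onProgress);
      print(cb); // a native-callable function pointer
    }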

test/ensemble_llama_test.dart

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-// import 'package:ensemble_llama/ensemble_llama.dart' as llama;
 import 'package:test/test.dart';
 
 void main() {

0 commit comments