Skip to content

Commit 9959d59

Browse files
committed
WIP: sqlite3: Add decoder
See sqlite3.{go,jq} for TODO Related to #27
1 parent 1a4b332 commit 9959d59

File tree

7 files changed

+339
-0
lines changed

7 files changed

+339
-0
lines changed

format/all/all.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
_ "github.com/wader/fq/format/png"
2727
_ "github.com/wader/fq/format/protobuf"
2828
_ "github.com/wader/fq/format/raw"
29+
_ "github.com/wader/fq/format/sqlite3"
2930
_ "github.com/wader/fq/format/tar"
3031
_ "github.com/wader/fq/format/tiff"
3132
_ "github.com/wader/fq/format/vorbis"

format/format.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ const (
8888
WAV = "wav"
8989
WEBP = "webp"
9090
ZIP = "zip"
91+
92+
SQLITE3 = "sqlite3"
9193
)
9294

9395
// below are data types used to communicate between formats <FormatName>In/Out

format/sqlite3/sqlite3.go

Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
package sqlite3
2+
3+
// https://www.sqlite.org/fileformat.html
4+
// https://sqlite.org/schematab.html
5+
6+
// TODO: page overflow
7+
// TODO: format version
8+
// TODO: text encoding
9+
// TODO: table/column names
10+
// TODO: assert version and schema version?
11+
// TODO: ptrmap
12+
// TODO: how to represent NULL serials
13+
14+
// CREATE TABLE sqlite_schema(
15+
// type text,
16+
// name text,
17+
// tbl_name text,
18+
// rootpage integer,
19+
// sql text
20+
// );
21+
// > A table with the name "sqlite_sequence" that is used to keep track of the maximum historical INTEGER PRIMARY KEY for a table using AUTOINCREMENT.
22+
// CREATE TABLE sqlite_sequence(name,seq);
23+
// > Tables with names of the form "sqlite_statN" where N is an integer. Such tables store database statistics gathered by the ANALYZE command and used by the query planner to help determine the best algorithm to use for each query.
24+
// CREATE TABLE sqlite_stat1(tbl,idx,stat);
25+
// Only if compiled with SQLITE_ENABLE_STAT2:
26+
// CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample);
27+
// Only if compiled with SQLITE_ENABLE_STAT3:
28+
// CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample);
29+
// Only if compiled with SQLITE_ENABLE_STAT4:
30+
// CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample);
31+
// TODO: sqlite_autoindex_TABLE_N index
32+
33+
import (
34+
"embed"
35+
36+
"github.com/wader/fq/format"
37+
"github.com/wader/fq/format/registry"
38+
"github.com/wader/fq/internal/num"
39+
"github.com/wader/fq/pkg/decode"
40+
"github.com/wader/fq/pkg/scalar"
41+
)
42+
43+
//go:embed *.jq
44+
var sqlite3FS embed.FS
45+
46+
func init() {
47+
registry.MustRegister(decode.Format{
48+
Name: format.SQLITE3,
49+
Description: "SQLite v3 database",
50+
Groups: []string{format.PROBE},
51+
DecodeFn: sqlite3Decode,
52+
Files: sqlite3FS,
53+
})
54+
}
55+
56+
const (
57+
bTreeIndexInterior = 0x02
58+
bTreeTableInterior = 0x05
59+
bTreeIndexLeaf = 0x0a
60+
bTreeTableLeaf = 0x0d
61+
)
62+
63+
var bTreeTypeMap = scalar.UToScalar{
64+
bTreeIndexInterior: scalar.S{Sym: "index_interior", Description: "Index interior b-tree page"},
65+
bTreeTableInterior: scalar.S{Sym: "table_interior", Description: "Table interior b-tree page"},
66+
bTreeIndexLeaf: scalar.S{Sym: "index_leaf", Description: "Index leaf b-tree page"},
67+
bTreeTableLeaf: scalar.S{Sym: "table_leaf", Description: "Table leaf b-tree page"},
68+
}
69+
70+
const (
71+
textEncodingUTF8 = 1
72+
textEncodingUTF16LE = 2
73+
textEncodingUTF16BE = 3
74+
)
75+
76+
var textEncodingMap = scalar.UToSymStr{
77+
textEncodingUTF8: "utf8",
78+
textEncodingUTF16LE: "utf16le",
79+
textEncodingUTF16BE: "utf16be",
80+
}
81+
82+
var versionMap = scalar.UToSymStr{
83+
1: "legacy",
84+
2: "wal",
85+
}
86+
87+
// TODO: all bits if nine bytes?
88+
// TODO: two complement on bit read count
89+
func varintDecode(d *decode.D) int64 {
90+
var n uint64
91+
for i := 0; i < 9; i++ {
92+
v := d.U8()
93+
n = n<<7 | v&0b0111_1111
94+
if v&0b1000_0000 == 0 {
95+
break
96+
}
97+
}
98+
return num.TwosComplement(64, n)
99+
}
100+
101+
func sqlite3DecodeSerialType(d *decode.D, typ int64) {
102+
switch typ {
103+
case 0:
104+
d.FieldValueStr("value", "NULL", scalar.Description("null"))
105+
case 1:
106+
d.FieldS8("value", scalar.Description("8-bit integer"))
107+
case 2:
108+
d.FieldS16("value", scalar.Description("16-bit integer"))
109+
case 3:
110+
d.FieldS24("value", scalar.Description("24-bit integer"))
111+
case 4:
112+
d.FieldS32("value", scalar.Description("32-bit integer"))
113+
case 5:
114+
d.FieldS48("value", scalar.Description("48-bit integer"))
115+
case 6:
116+
d.FieldS64("value", scalar.Description("64-bit integer"))
117+
case 7:
118+
d.FieldF64("value", scalar.Description("64-bit float"))
119+
case 8:
120+
d.FieldValueU("value", 0, scalar.Description("constant 0"))
121+
case 9:
122+
d.FieldValueU("value", 1, scalar.Description("constant 1"))
123+
case 10, 11:
124+
default:
125+
if typ%2 == 0 {
126+
// N => 12 and even: (N-12)/2 bytes blob.
127+
d.FieldRawLen("value", (typ-12)/2*8, scalar.Description("blob"))
128+
} else {
129+
// N => 13 and odd: (N-13)/2 bytes text
130+
d.FieldUTF8("value", int(typ-13)/2, scalar.Description("text"))
131+
}
132+
}
133+
}
134+
135+
func sqlite3CellFreeblockDecode(d *decode.D) uint64 {
136+
nextOffset := d.FieldU16("next_offset")
137+
if nextOffset == 0 {
138+
return 0
139+
}
140+
// TODO: "header" is size bytes or offset+size? seems to be just size
141+
// "size of the freeblock in bytes, including the 4-byte header"
142+
size := d.FieldU16("size")
143+
d.FieldRawLen("space", int64(size-4)*8)
144+
return nextOffset
145+
}
146+
147+
func sqlite3CellPayloadDecode(d *decode.D) {
148+
lengthStart := d.Pos()
149+
length := d.FieldSFn("length", varintDecode)
150+
lengtbBits := d.Pos() - lengthStart
151+
var serialTypes []int64
152+
d.LenFn((length)*8-lengtbBits, func(d *decode.D) {
153+
d.FieldArray("serials", func(d *decode.D) {
154+
for !d.End() {
155+
serialTypes = append(serialTypes, d.FieldSFn("serial", varintDecode))
156+
}
157+
})
158+
})
159+
d.FieldArray("contents", func(d *decode.D) {
160+
for _, s := range serialTypes {
161+
sqlite3DecodeSerialType(d, s)
162+
}
163+
})
164+
}
165+
166+
func sqlite3Decode(d *decode.D, in interface{}) interface{} {
167+
var pageSizeS *scalar.S
168+
var databaseSizePages uint64
169+
170+
d.FieldStruct("header", func(d *decode.D) {
171+
d.FieldUTF8("magic", 16, d.AssertStr("SQLite format 3\x00"))
172+
pageSizeS = d.FieldScalarU16("page_size", scalar.UToSymU{1: 65536}) // in bytes. Must be a power of two between 512 and 32768 inclusive, or the value 1 representing a page size of 65536.
173+
d.FieldU8("write_version", versionMap) // 1 for legacy; 2 for WAL.
174+
d.FieldU8("read_version", versionMap) // . 1 for legacy; 2 for WAL.
175+
d.FieldU8("unused_space") // at the end of each page. Usually 0.
176+
d.FieldU8("maximum_embedded_payload_fraction") // . Must be 64.
177+
d.FieldU8("minimum_embedded_payload_fraction") // . Must be 32.
178+
d.FieldU8("leaf_payload_fraction") // . Must be 32.
179+
d.FieldU32("file_change_counter") //
180+
databaseSizePages = d.FieldU32("database_size_pages") // . The "in-header database size".
181+
d.FieldU32("page_number_freelist") // of the first freelist trunk page.
182+
d.FieldU32("total_number_freelist") // pages.
183+
d.FieldU32("schema_cookie") // .
184+
d.FieldU32("schema_format_number") // . Supported schema formats are 1, 2, 3, and 4.
185+
d.FieldU32("default_page_cache_size") // .
186+
d.FieldU32("page_number_largest_root_btree") // page when in auto-vacuum or incremental-vacuum modes, or zero otherwise.
187+
d.FieldU32("text_encoding", textEncodingMap)
188+
d.FieldU32("user_version") // " as read and set by the user_version pragma.
189+
d.FieldU32("incremental_vacuum_mode") // False (zero) otherwise.
190+
d.FieldU32("application_id") // " set by PRAGMA application_id.
191+
d.FieldRawLen("reserved", 160, d.BitBufIsZero()) // for expansion. Must be zero.
192+
d.FieldU32("version_valid_for") // number.
193+
d.FieldU32("sqlite_version_number") //
194+
})
195+
196+
// TODO: nicer API for fallback?
197+
pageSize := pageSizeS.ActualU()
198+
if pageSizeS.Sym != nil {
199+
pageSize = pageSizeS.SymU()
200+
}
201+
202+
d.FieldArray("pages", func(d *decode.D) {
203+
for i := uint64(0); i < databaseSizePages; i++ {
204+
pageOffset := int64(pageSize) * int64(i)
205+
d.SeekAbs(pageOffset * 8)
206+
// skip header for first page
207+
if i == 0 {
208+
d.SeekRel(100 * 8)
209+
}
210+
211+
d.FieldStruct("page", func(d *decode.D) {
212+
typ := d.FieldU8("type", bTreeTypeMap)
213+
startFreeblocks := d.FieldU16("start_freeblocks") // The two-byte integer at offset 1 gives the start of the first freeblock on the page, or is zero if there are no freeblocks.
214+
pageCells := d.FieldU16("page_cells") // The two-byte integer at offset 3 gives the number of cells on the page.
215+
d.FieldU16("cell_start") // sThe two-byte integer at offset 5 designates the start of the cell content area. A zero value for this integer is interpreted as 65536.
216+
d.FieldU8("cell_fragments") // The one-byte integer at offset 7 gives the number of fragmented free bytes within the cell content area.
217+
switch typ {
218+
case bTreeIndexInterior,
219+
bTreeTableInterior:
220+
d.FieldU32("right_pointer") // The four-byte page number at offset 8 is the right-most pointer. This value appears in the header of interior b-tree pages only and is omitted from all other pages.
221+
}
222+
var cellPointers []uint64
223+
d.FieldArray("cells_pointers", func(d *decode.D) {
224+
for i := uint64(0); i < pageCells; i++ {
225+
cellPointers = append(cellPointers, d.FieldU16("pointer"))
226+
}
227+
})
228+
if startFreeblocks != 0 {
229+
d.FieldArray("freeblocks", func(d *decode.D) {
230+
nextOffset := startFreeblocks
231+
for nextOffset != 0 {
232+
d.SeekAbs((pageOffset + int64(nextOffset)) * 8)
233+
d.FieldStruct("freeblock", func(d *decode.D) {
234+
nextOffset = sqlite3CellFreeblockDecode(d)
235+
})
236+
}
237+
})
238+
}
239+
d.FieldArray("cells", func(d *decode.D) {
240+
for _, p := range cellPointers {
241+
d.FieldStruct("cell", func(d *decode.D) {
242+
// TODO: SeekAbs with fn later?
243+
d.SeekAbs((pageOffset + int64(p)) * 8)
244+
switch typ {
245+
case bTreeIndexInterior:
246+
d.FieldU32("left_child")
247+
payLoadLen := d.FieldSFn("payload_len", varintDecode)
248+
d.LenFn(payLoadLen*8, func(d *decode.D) {
249+
d.FieldStruct("payload", sqlite3CellPayloadDecode)
250+
})
251+
case bTreeTableInterior:
252+
d.FieldU32("left_child")
253+
d.FieldSFn("rowid", varintDecode)
254+
case bTreeIndexLeaf:
255+
payLoadLen := d.FieldSFn("payload_len", varintDecode)
256+
d.LenFn(payLoadLen*8, func(d *decode.D) {
257+
d.FieldStruct("payload", sqlite3CellPayloadDecode)
258+
})
259+
case bTreeTableLeaf:
260+
payLoadLen := d.FieldSFn("payload_len", varintDecode)
261+
d.FieldSFn("rowid", varintDecode)
262+
d.LenFn(payLoadLen*8, func(d *decode.D) {
263+
d.FieldStruct("payload", sqlite3CellPayloadDecode)
264+
})
265+
}
266+
})
267+
}
268+
})
269+
})
270+
}
271+
})
272+
273+
return nil
274+
}

format/sqlite3/sqlite3.jq

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2+
# TODO: two columns tables are index tables?
3+
# NOTE: page numbers in the file format are 1-based (the first page is page 1) while .pages is a 0-based array, hence the -1 when indexing
4+
# TODO: verify with chinook.db that [sqlite3_table("Track")] | length is now 3503 rows
#       (was 3496 when interior index cells were skipped, presumably the 7 missing rows)

# Output the cells of the b-tree rooted at $page.
# $root is the decoded sqlite3 value and $page is one of $root.pages.
#
# Table b-trees keep all rows in leaf pages, interior cells only point to child
# pages, so only leaf cells are output.
# Index b-trees also store an entry in every interior cell and that entry is not
# repeated in any leaf, so interior index cells are output too, each one after
# the cells of its left child to keep key order.
# A page of any other type is output as is.
def sqlite3_traverse($root; $page):
  def _t:
    if .type == "index_interior" then
      ( ( .cells[]
        | ($root.pages[.left_child-1] | _t), .
        )
      , ( $root.pages[.right_pointer-1]
        | _t
        )
      )
    elif .type == "table_interior" then
      ( $root.pages[.cells[].left_child-1, .right_pointer-1]
      | _t
      )
    elif .type == "table_leaf" or .type == "index_leaf" then
      ( .cells[]
      )
    end;
  ( $page
  | _t
  );
22+
23+
# Output the record contents of table $name, one output per entry of the first
# index found for the table. Input is the decoded sqlite3 value.
#
# The schema b-tree rooted at the first page is searched for the first "table"
# and the first "index" record whose third column equals $name, the fourth
# column of those records is used as root page number. Each index entry's
# second column is then matched against the rowid of the table cells.
def sqlite3_table($name):
  # first schema cell of the given type that belongs to $name
  def _schema_cell($root; $type):
    first(
      ( sqlite3_traverse($root; $root.pages[0])
      | select(.payload.contents | .[0] == $type and .[2] == $name)
      )
    );
  ( . as $root
  | _schema_cell($root; "table") as $table_start_cell
  | _schema_cell($root; "index") as $index_start_cell
  | sqlite3_traverse($root; $root.pages[$index_start_cell.payload.contents[3]-1]) as $index_row
  | sqlite3_traverse($root; $root.pages[$table_start_cell.payload.contents[3]-1])
  | first(select(.rowid == $index_row.payload.contents[1]))
  | .payload.contents
  );

format/sqlite3/testdata/test.db

24 KB
Binary file not shown.

format/sqlite3/testdata/test.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/sh

# Build test.db by running the statements in test.sql through the sqlite3
# command line tool. Run from the directory containing test.sql.
# Input is redirected instead of piped from cat so that a missing test.sql
# makes the script fail instead of running sqlite3 with empty input.
# NOTE(review): the statements are applied to an existing test.db if there is
# one — presumably it should be removed before regenerating, confirm.
sqlite3 test.db <test.sql

format/sqlite3/testdata/test.sql

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
CREATE TABLE aaa (
2+
cint int primary key,
3+
cvarchar varchar(30),
4+
ctext text,
5+
creal real,
6+
cblob blob
7+
);
8+
INSERT INTO "aaa" VALUES(0, 'var1', 'text1', 0, "blob1");
9+
INSERT INTO "aaa" VALUES(1, 'var2', 'test2', 1, "blob2");
10+
INSERT INTO "aaa" VALUES(128, 'var3', 'test3', 128, "blob3");
11+
INSERT INTO "aaa" VALUES(-128, 'var3', 'test3', -128, "blob3");
12+
INSERT INTO "aaa" VALUES(9223372036854775807, 'var4', 'test4', 9223372036854775807, "blob4");
13+
INSERT INTO "aaa" VALUES(-9223372036854775808, 'var5', 'test5', -9223372036854775808, "blob5");
14+
15+
-- CREATE TABLE aaa (
16+
-- cint int primary key
17+
-- );
18+
-- INSERT INTO "aaa" VALUES(123);

0 commit comments

Comments
 (0)