-
Notifications
You must be signed in to change notification settings - Fork 4
Description
While integrating Bodkin in one of our pipelines we bumped into a panic that we could narrow down to a very specific piece of shaped data. I’m opening this issue in case it’s helpful for you to reproduce and (if appropriate) guard against.
When the reader hits the offending record, it panics somewhere inside Arrow’s JSON reader, presumably because the inferred schema can’t accommodate the second record’s shape. Our wrapper catches the panic, but because Arrow panics before returning an error, the resulting message is fairly generic.
I think that, ideally, if Arrow can't handle the downstream data, Bodkin would surface a clear validation error (schema mismatch, nested map too deep, etc.) instead of Arrow panicking. If that’s outside Bodkin’s remit, that's totally understood, because Arrow shouldn't be panicking in the first place, but I can see warnings being surfaced from Bodkin in the console logs. It would be helpful to understand why this happens, if you're familiar with it.
Thanks for your amazing repo! Here's a reproduction script:
package main
import (
"bytes"
"fmt"
"github.com/loicalleyne/bodkin"
"log"
"strings"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/memory"
)
// main runs the reproduction twice: first with a small two-line NDJSON
// payload that loads cleanly, then with a deeply nested two-line payload
// that triggers the failure described in the report.
func main() {
	// Case 1: two documents that share the "books" list; one element of the
	// second document carries "newIDColumn" in place of "id".
	fmt.Println("json 1")
	flatDocs := []byte(`{"books": [{ "id": 1, "title": "To Kill a Mockingbird", "author": "Harper Lee" }, { "id": 2, "title": "1984", "author": "George Orwell" }, { "id": 3, "title": "The Great Gatsby", "author": "F. Scott Fitzgerald" }]}
{"books": [{ "id": 1, "title": "To Kill a Mockingbird", "author": "Harper Lee" }, { "newIDColumn": 2, "title": "1984", "author": "George Orwell" }, { "id": 3, "title": "The Great Gatsby", "author": "F. Scott Fitzgerald" }]}`)
	buildTableFromJSON(flatDocs)
	fmt.Println("successfully created arrow table from json 1")

	// Case 2: NDJSON (one document per line) whose "metadata" objects are
	// deeply nested and contain lists mixing scalars with objects.
	fmt.Println("json 2")
	nestedDocs := []byte(`{"file_path":"s3://test-bucket/batch-gnarly/file-0.jpg","some_label":"weird_label_#0","confidence":0.99,"model_name":"🤖_model_v9","processed_at":"2025-07-23T14:00:00Z","batch_id":"gnarly-batch-0001","metadata":{"notes":"This is a \"very\" strange record with \n newlines, tabs\t and unicode: 🌐🚀","edge_cases":{"null_value":null,"bool_true":true,"bool_false":false,"nested":{"inner":{"value":123,"list":[1,2,3,{"deep":"value"}]}}},"timestamps":["2020-01-01T00:00:00Z","2050-12-31T23:59:59Z"],"weird_keys":{"1key":"numeric start","spaces in key":"hello world","symbols":"!@#$%^&*()"},"json_string":{"embedded_json":"yes"}}}
{"file_path":"s3://test-bucket/batch-gnarly/file-1.jpg","some_label":"$$$-odd_label","confidence":0.42,"model_name":"model_<script>alert(1)</script>","processed_at":"2025-07-23T14:00:00Z","batch_id":"gnarly-batch-0001","metadata":{"long_text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit.","deeply_nested":{"a":{"b":{"c":{"d":{"e":{"f":{"g":"too deep?"}}}}}}},"mixed_list":[true,42,"string",null,{"inner":"thing"}],"random_data":"<0001f9ec>🔣🌈"}}`)
	buildTableFromJSON(nestedDocs)
}
// buildTableFromJSON infers an Arrow schema from NDJSON input with Bodkin and
// then loads that same input into an Arrow table via Arrow's JSON reader.
//
// json is newline-delimited JSON, one document per line; surrounding
// whitespace is trimmed and blank lines are skipped. Nothing is returned:
// progress is written with the log package, and any failure ends the process
// through log.Fatal / log.Fatalf.
func buildTableFromJSON(json []byte) {
	unifier := bodkin.NewBodkin(
		bodkin.WithInferTimeUnits(),
		bodkin.WithTypeConversion(),
	)

	// First pass: hand every non-blank line to Bodkin so it can unify the
	// schema, remembering the trimmed lines for the second pass.
	var docs []string
	for _, raw := range strings.Split(strings.TrimSpace(string(json)), "\n") {
		doc := strings.TrimSpace(raw)
		if doc == "" {
			continue
		}
		if err := unifier.Unify(doc); err != nil {
			log.Fatal(err)
		}
		docs = append(docs, doc)
	}

	schema, err := unifier.Schema()
	if err != nil {
		log.Fatal(err)
	}

	// Dump the unified schema to make mismatches easier to diagnose.
	fields := schema.Fields()
	log.Printf("Schema has %d fields", len(fields))
	for idx, f := range fields {
		log.Printf("Field %d: %s (%s)", idx, f.Name, f.Type)
	}

	// Second pass: re-join the trimmed lines into NDJSON and decode them
	// against the unified schema.
	ndjson := strings.Join(docs, "\n")
	reader := array.NewJSONReader(
		bytes.NewReader([]byte(ndjson)),
		schema,
		array.WithAllocator(memory.DefaultAllocator),
		// Per the Arrow docs a negative chunk size reads the entire input as
		// a single record batch. Setting it to 1 instead reveals:
		// 2025/07/24 11:33:12 Reader error: json: cannot unmarshal { into Go value of type int64
		array.WithChunk(-1),
	)
	defer reader.Release()

	var batches []arrow.Record
	for reader.Next() {
		batch := reader.Record()
		// Take our own reference so the batch stays valid once the reader
		// advances or is released.
		batch.Retain()
		batches = append(batches, batch)
		log.Printf("Successfully read record with %d columns", batch.NumCols())
	}

	// A reader error takes precedence over the "nothing read" diagnosis.
	switch readErr := reader.Err(); {
	case readErr != nil:
		log.Fatalf("Reader error: %v", readErr)
	case len(batches) == 0:
		log.Fatal("No records were read")
	}

	tbl := array.NewTableFromRecords(schema, batches)
	defer tbl.Release()
	log.Printf("Successfully created table with %d records and %d columns",
		tbl.NumRows(), tbl.NumCols())

	// Drop the references taken with Retain in the read loop.
	for i := range batches {
		batches[i].Release()
	}
}