Skip to content

Panic/Err when reading certain NDJSON with Bodkin + Arrow JSON reader #28

@caldempsey

Description

@caldempsey

While integrating Bodkin in one of our pipelines we bumped into a panic that we could narrow down to a very specific piece of shaped data. I’m opening this issue in case it’s helpful for you to reproduce and (if appropriate) guard against.

When the reader hits the second record it panics somewhere inside Arrow’s JSON reader, presumably because the inferred schema can’t accommodate that record’s shape. Our wrapper catches the panic, but because Arrow panics before returning an error, the resulting message is fairly generic.

I think ideally, if Arrow can't handle the downstream data, Bodkin would surface a clear validation error (schema mismatch, nested map too deep, etc.) instead of Arrow panicking. If that’s outside Bodkin’s remit, totally understood — Arrow shouldn't be panicking in the first place — but I could see warnings being surfaced from Bodkin in the console logs. It would also be helpful to understand why this happens, if you're familiar with the cause.

Thanks for your amazing repo! Here's a reproduction script:

package main

import (
	"bytes"
	"fmt"
	"github.com/loicalleyne/bodkin"
	"log"
	"strings"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"
)

// main exercises the reader twice: first with a flat NDJSON payload that
// round-trips cleanly, then with a deeply nested payload that triggers a
// panic inside Arrow's JSON reader.
func main() {
	fmt.Println("json 1")
	passing := []byte(`{"books": [{ "id": 1, "title": "To Kill a Mockingbird", "author": "Harper Lee" }, { "id": 2, "title": "1984", "author": "George Orwell" }, { "id": 3, "title": "The Great Gatsby", "author": "F. Scott Fitzgerald" }]}
{"books": [{ "id": 1, "title": "To Kill a Mockingbird", "author": "Harper Lee" }, { "newIDColumn": 2, "title": "1984", "author": "George Orwell" }, { "id": 3, "title": "The Great Gatsby", "author": "F. Scott Fitzgerald" }]}`)
	buildTableFromJSON(passing)
	fmt.Println("successfully created arrow table from json 1")

	fmt.Println("json 2")
	// Deeply nested JSON data (NDJSON format - each record on its own line)
	failing := []byte(`{"file_path":"s3://test-bucket/batch-gnarly/file-0.jpg","some_label":"weird_label_#0","confidence":0.99,"model_name":"🤖_model_v9","processed_at":"2025-07-23T14:00:00Z","batch_id":"gnarly-batch-0001","metadata":{"notes":"This is a \"very\" strange record with \n newlines, tabs\t and unicode: 🌐🚀","edge_cases":{"null_value":null,"bool_true":true,"bool_false":false,"nested":{"inner":{"value":123,"list":[1,2,3,{"deep":"value"}]}}},"timestamps":["2020-01-01T00:00:00Z","2050-12-31T23:59:59Z"],"weird_keys":{"1key":"numeric start","spaces in key":"hello world","symbols":"!@#$%^&*()"},"json_string":{"embedded_json":"yes"}}}
{"file_path":"s3://test-bucket/batch-gnarly/file-1.jpg","some_label":"$$$-odd_label","confidence":0.42,"model_name":"model_<script>alert(1)</script>","processed_at":"2025-07-23T14:00:00Z","batch_id":"gnarly-batch-0001","metadata":{"long_text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit.","deeply_nested":{"a":{"b":{"c":{"d":{"e":{"f":{"g":"too deep?"}}}}}}},"mixed_list":[true,42,"string",null,{"inner":"thing"}],"random_data":"<0001f9ec>🔣🌈"}}`)
	buildTableFromJSON(failing)
}

// buildTableFromJSON infers a unified Arrow schema for the given NDJSON
// payload via Bodkin, then reads the same lines back through Arrow's JSON
// reader and assembles the resulting records into a single Arrow table.
// Any failure terminates the process via log.Fatal.
func buildTableFromJSON(json []byte) {
	unifier := bodkin.NewBodkin(
		bodkin.WithInferTimeUnits(),
		bodkin.WithTypeConversion(),
	)

	// Feed every non-empty line to Bodkin so the inferred schema covers
	// the union of all record shapes seen in the payload.
	var kept []string
	for _, raw := range strings.Split(strings.TrimSpace(string(json)), "\n") {
		line := strings.TrimSpace(raw)
		if line == "" {
			continue
		}
		if err := unifier.Unify(line); err != nil {
			log.Fatal(err)
		}
		kept = append(kept, line)
	}

	schema, err := unifier.Schema()
	if err != nil {
		log.Fatal(err)
	}

	// Debug: dump the inferred schema before attempting the read.
	log.Printf("Schema has %d fields", len(schema.Fields()))
	for i, field := range schema.Fields() {
		log.Printf("Field %d: %s (%s)", i, field.Name, field.Type)
	}

	// Re-join the trimmed lines into well-formed NDJSON.
	ndjson := strings.Join(kept, "\n")

	// NOTE(review): with a chunk size of 1 the panic instead surfaces as
	// 2025/07/24 11:33:12 Reader error: json: cannot unmarshal { into Go value of type int64
	reader := array.NewJSONReader(bytes.NewReader([]byte(ndjson)),
		schema,
		array.WithAllocator(memory.DefaultAllocator),
		array.WithChunk(-1),
	)
	defer reader.Release()

	var records []arrow.Record
	for reader.Next() {
		rec := reader.Record()
		// Retain so the record survives the next call to Next().
		rec.Retain()
		records = append(records, rec)
		log.Printf("Successfully read record with %d columns", rec.NumCols())
	}
	if err := reader.Err(); err != nil {
		log.Fatalf("Reader error: %v", err)
	}

	if len(records) == 0 {
		log.Fatal("No records were read")
	}

	table := array.NewTableFromRecords(schema, records)
	defer table.Release()

	log.Printf("Successfully created table with %d records and %d columns",
		table.NumRows(), table.NumCols())

	// Drop our extra references; the table holds its own.
	for _, rec := range records {
		rec.Release()
	}
}

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions