Skip to content

Commit 5dedff0

Browse files
authored
Merge pull request #354 from Krmjn09/feature-string-decoding
String Decoding Logic
2 parents 00d083c + 70dfd71 commit 5dedff0

File tree

2 files changed: +179 additions, −80 deletions

demo/node/rntuple_selector.js

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,41 @@ import { TSelector, openFile } from 'jsroot';
44
const selector = new TSelector();
55
selector.sum = 0;
66
selector.count = 0;
7-
selector.addBranch('myDouble');
7+
selector.addBranch('Nation');
88
selector.Begin = function() {
99
console.log('Begin processing');
1010
};
1111

1212
selector.Process = function() {
13-
// console.log('Entry : ', this.tgtobj);
14-
this.sum += this.tgtobj.myDouble;
13+
console.log('Entry : ', this.tgtobj);
1514
this.count++;
1615
};
1716

17+
1818
selector.Terminate = function() {
1919
if (this.count === 0)
2020
console.error('No entries processed');
21-
else
22-
console.log(`Mean = ${(this.sum / this.count).toFixed(4)} from ${this.count} entries`);
2321
};
2422

2523
if (typeof window === 'undefined') {
26-
openFile('./simple.root')
27-
.then(file => file.readObject('myNtuple'))
24+
openFile('./ntpl001_staff.root')
25+
.then(file => file.readObject('Staff'))
2826
.then(rntuple => {
2927
if (!rntuple) throw new Error('myNtuple not found');
3028
return rntupleProcess(rntuple, selector);
3129
})
3230
.then(() => console.log('RNTuple::Process finished'))
3331
.catch(err => console.error(err));
3432
}
33+
34+
35+
// if (typeof window === 'undefined') {
36+
// openFile('./simple.root')
37+
// .then(file => file.readObject('myNtuple'))
38+
// .then(rntuple => {
39+
// if (!rntuple) throw new Error('myNtuple not found');
40+
// return rntupleProcess(rntuple, selector);
41+
// })
42+
// .then(() => console.log('RNTuple::Process finished'))
43+
// .catch(err => console.error(err));
44+
// }

modules/rntuple.mjs

Lines changed: 162 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -150,17 +150,23 @@ function getTypeByteSize(coltype) {
150150
case ENTupleColumnType.kReal64:
151151
case ENTupleColumnType.kInt64:
152152
case ENTupleColumnType.kUInt64:
153+
case ENTupleColumnType.kIndex64:
153154
return 8;
154155
case ENTupleColumnType.kReal32:
155156
case ENTupleColumnType.kInt32:
157+
case ENTupleColumnType.kIndex32:
156158
case ENTupleColumnType.kUInt32:
159+
case ENTupleColumnType.kSplitIndex64:
157160
return 4;
158161
case ENTupleColumnType.kInt16:
159162
case ENTupleColumnType.kUInt16:
160163
return 2;
161164
case ENTupleColumnType.kInt8:
162165
case ENTupleColumnType.kUInt8:
163166
case ENTupleColumnType.kByte:
167+
case ENTupleColumnType.kByteArray:
168+
case ENTupleColumnType.kIndexArrayU8:
169+
case ENTupleColumnType.kChar:
164170
return 1;
165171
default:
166172
throw new Error(`Unsupported coltype for byte size: ${coltype} (0x${coltype.toString(16).padStart(2, '0')})`);
@@ -599,54 +605,68 @@ class RNTupleDescriptorBuilder {
599605

600606
// Example Of Deserializing Page Content
601607
deserializePage(blob, columnDescriptor) {
602-
const reader = new RBufferReader(blob),
603-
values = [],
604-
byteSize = getTypeByteSize(columnDescriptor.coltype),
605-
numValues = blob.byteLength / byteSize;
606-
607-
for (let i = 0; i < numValues; ++i) {
608-
let val;
609-
switch (columnDescriptor.coltype) {
610-
case ENTupleColumnType.kReal64:
611-
val = reader.readF64();
612-
break;
613-
case ENTupleColumnType.kReal32:
614-
val = reader.readF32();
615-
break;
616-
case ENTupleColumnType.kInt64:
617-
val = reader.readI64();
618-
break;
619-
case ENTupleColumnType.kUInt64:
620-
val = reader.readU64();
621-
break;
622-
case ENTupleColumnType.kInt32:
623-
val = reader.readI32();
624-
break;
625-
case ENTupleColumnType.kUInt32:
626-
val = reader.readU32();
627-
break;
628-
case ENTupleColumnType.kInt16:
629-
val = reader.readI16();
630-
break;
631-
case ENTupleColumnType.kUInt16:
632-
val = reader.readU16();
633-
break;
634-
case ENTupleColumnType.kInt8:
635-
val = reader.readI8();
636-
break;
637-
case ENTupleColumnType.kUInt8:
638-
case ENTupleColumnType.kByte:
639-
val = reader.readU8();
640-
break;
641-
default:
608+
const reader = new RBufferReader(blob),
609+
values = [],
610+
coltype = columnDescriptor.coltype,
611+
byteSize = getTypeByteSize(coltype),
612+
numValues = byteSize ? blob.byteLength / byteSize : undefined;
613+
614+
for (let i = 0; i < (numValues ?? blob.byteLength); ++i) {
615+
let val;
616+
617+
switch (coltype) {
618+
case ENTupleColumnType.kReal64:
619+
val = reader.readF64();
620+
break;
621+
case ENTupleColumnType.kReal32:
622+
val = reader.readF32();
623+
break;
624+
case ENTupleColumnType.kInt64:
625+
val = reader.readI64();
626+
break;
627+
case ENTupleColumnType.kUInt64:
628+
val = reader.readU64();
629+
break;
630+
case ENTupleColumnType.kInt32:
631+
case ENTupleColumnType.kIndex32:
632+
val = reader.readU32();
633+
break;
634+
case ENTupleColumnType.kUInt32:
635+
val = reader.readU32();
636+
break;
637+
case ENTupleColumnType.kInt16:
638+
val = reader.readI16();
639+
break;
640+
case ENTupleColumnType.kUInt16:
641+
val = reader.readU16();
642+
break;
643+
case ENTupleColumnType.kInt8:
644+
val = reader.readS8();
645+
break;
646+
case ENTupleColumnType.kUInt8:
647+
case ENTupleColumnType.kByte:
648+
case ENTupleColumnType.kByteArray:
649+
case ENTupleColumnType.kIndexArrayU8:
650+
val = reader.readU8();
651+
break;
652+
case ENTupleColumnType.kChar:
653+
val = String.fromCharCode(reader.readS8());
654+
break;
655+
case ENTupleColumnType.kIndex64:
656+
val = reader.readU64();
657+
break;
658+
case ENTupleColumnType.kSplitIndex64:
659+
val = reader.readU32();
660+
break;
661+
default:
642662
throw new Error(`Unsupported column type: ${columnDescriptor.coltype}`);
643-
}
644-
values.push(val);
645663
}
646-
647-
return values;
664+
values.push(val);
648665
}
649666

667+
return values;
668+
}
669+
650670

651671
}
652672

@@ -662,22 +682,24 @@ async function readHeaderFooter(tuple) {
662682
if (blobs?.length !== 2)
663683
return false;
664684

665-
// unzip both buffers
685+
// Handle both compressed and uncompressed cases
686+
const processBlob = (blob, uncompressedSize) => {
687+
// If uncompressedSize matches blob size, it's uncompressed
688+
if (blob.byteLength === uncompressedSize)
689+
return Promise.resolve(blob);
690+
return R__unzip(blob, uncompressedSize);
691+
};
692+
666693
return Promise.all([
667-
R__unzip(blobs[0], tuple.fLenHeader),
668-
R__unzip(blobs[1], tuple.fLenFooter)
694+
processBlob(blobs[0], tuple.fLenHeader),
695+
processBlob(blobs[1], tuple.fLenFooter)
669696
]).then(unzip_blobs => {
670-
const header_blob = unzip_blobs[0],
671-
footer_blob = unzip_blobs[1];
672-
if (!header_blob || !footer_blob)
697+
const [header_blob, footer_blob] = unzip_blobs;
698+
if (!header_blob || !footer_blob)
673699
return false;
674700

675-
// create builder description and decode it - dummy for the moment
676-
677701
tuple.builder = new RNTupleDescriptorBuilder;
678-
679702
tuple.builder.deserializeHeader(header_blob);
680-
681703
tuple.builder.deserializeFooter(footer_blob);
682704

683705
// Build fieldToColumns mapping
@@ -703,13 +725,20 @@ async function readHeaderFooter(tuple) {
703725
if (!(page_list_blob instanceof DataView))
704726
throw new Error(`Expected DataView from readBuffer, got ${Object.prototype.toString.call(page_list_blob)}`);
705727

706-
return R__unzip(page_list_blob, uncompressedSize).then(unzipped_blob => {
707-
if (!(unzipped_blob instanceof DataView))
708-
throw new Error(`Unzipped page list is not a DataView, got ${Object.prototype.toString.call(unzipped_blob)}`);
709-
710-
tuple.builder.deserializePageList(unzipped_blob);
728+
// Check if page list data is uncompressed
729+
if (page_list_blob.byteLength === uncompressedSize) {
730+
// Data is uncompressed, use directly
731+
tuple.builder.deserializePageList(page_list_blob);
711732
return true;
712-
});
733+
}
734+
// Attempt to decompress the page list
735+
return R__unzip(page_list_blob, uncompressedSize).then(unzipped_blob => {
736+
if (!(unzipped_blob instanceof DataView))
737+
throw new Error(`Unzipped page list is not a DataView, got ${Object.prototype.toString.call(unzipped_blob)}`);
738+
739+
tuple.builder.deserializePageList(unzipped_blob);
740+
return true;
741+
});
713742
});
714743
});
715744
}).catch(err => {
@@ -718,11 +747,40 @@ async function readHeaderFooter(tuple) {
718747
});
719748
}
720749

750+
function readEntry(rntuple, fieldName, entryIndex) {
751+
const builder = rntuple.builder,
752+
field = builder.fieldDescriptors.find(f => f.fieldName === fieldName),
753+
fieldData = rntuple._clusterData[fieldName];
754+
755+
if (!field)
756+
throw new Error(`No descriptor for field ${fieldName}`);
757+
if (!fieldData)
758+
throw new Error(`No data for field ${fieldName}`);
759+
760+
// Detect and decode string fields
761+
if (Array.isArray(fieldData) && fieldData.length === 2) {
762+
const [offsets, payload] = fieldData,
763+
start = entryIndex === 0 ? 0 : Number(offsets[entryIndex - 1]),
764+
end = Number(offsets[entryIndex]),
765+
decoded = payload.slice(start, end).join(''); // Convert to string
766+
return decoded;
767+
}
768+
769+
// Fallback: primitive type (e.g. int, float)
770+
return fieldData[0][entryIndex];
771+
}
772+
773+
721774
// Read and process the next data cluster from the RNTuple
722775
function readNextCluster(rntuple, selector) {
723-
const builder = rntuple.builder,
724-
clusterIndex = selector.currentCluster,
725-
clusterSummary = builder.clusterSummaries[clusterIndex],
776+
const builder = rntuple.builder;
777+
778+
// Add validation
779+
if (!builder.clusterSummaries || builder.clusterSummaries.length === 0)
780+
throw new Error('No cluster summaries available - possibly incomplete file reading');
781+
782+
const clusterIndex = selector.currentCluster,
783+
clusterSummary = builder.clusterSummaries[clusterIndex],
726784

727785
// Gather all pages for this cluster from selected fields only
728786
pages = [],
@@ -742,11 +800,11 @@ function readNextCluster(rntuple, selector) {
742800
const colEntry = builder.pageLocations[clusterIndex]?.[colDesc.index];
743801

744802
// When the data is missing or broken
745-
if (!colEntry || !colEntry.pages)
803+
if (!colEntry || !colEntry.pages)
746804
throw new Error(`No pages for column ${colDesc.index} in cluster ${clusterIndex}`);
747805

748806
for (const page of colEntry.pages)
749-
pages.push({ page, colDesc });
807+
pages.push({ page, colDesc, fieldName });
750808
}
751809
}
752810

@@ -766,36 +824,67 @@ function readNextCluster(rntuple, selector) {
766824
return rntuple.$file.readBuffer(dataToRead).then(blobsRaw => {
767825
const blobs = Array.isArray(blobsRaw) ? blobsRaw : [blobsRaw],
768826
unzipPromises = blobs.map((blob, idx) => {
769-
const { page, colDesc } = pages[idx],
827+
const { page, colDesc } = pages[idx],
828+
colEntry = builder.pageLocations[clusterIndex][colDesc.index], // Access column entry
770829
numElements = Number(page.numElements),
771830
elementSize = colDesc.bitsOnStorage / 8;
772-
return R__unzip(blob, numElements * elementSize);
831+
832+
// Check if data is compressed
833+
if (colEntry.compression === 0)
834+
return Promise.resolve(blob); // Uncompressed: use blob directly
835+
return R__unzip(blob, numElements * elementSize);
773836
});
774837

775838
return Promise.all(unzipPromises).then(unzipBlobs => {
776839
rntuple._clusterData = {}; // store deserialized data per field
777840

778841
for (let i = 0; i < unzipBlobs.length; ++i) {
842+
const blob = unzipBlobs[i];
843+
// Ensure blob is a DataView
844+
if (!(blob instanceof DataView))
845+
throw new Error(`Invalid blob type for page ${i}: ${Object.prototype.toString.call(blob)}`);
779846
const {
780847
colDesc
781848
} = pages[i],
782849
field = builder.fieldDescriptors[colDesc.fieldId],
783-
values = builder.deserializePage(unzipBlobs[i], colDesc, field);
850+
values = builder.deserializePage(blob, colDesc);
784851

785-
// TODO: Handle fields with multiple columns (e.g., data + metadata).
786-
// For now, we only store the first column's data to avoid overwriting.
852+
// Support multiple representations (e.g., string fields with offsets + payload)
787853
if (!rntuple._clusterData[field.fieldName])
788-
rntuple._clusterData[field.fieldName] = values;
854+
rntuple._clusterData[field.fieldName] = [];
855+
856+
// splitting string fields into offset and payload components
857+
if (field.typeName === 'std::string') {
858+
if (colDesc.coltype === ENTupleColumnType.kIndex64) // Index64/Index32
859+
rntuple._clusterData[field.fieldName][0] = values; // Offsets
860+
else if (colDesc.coltype === ENTupleColumnType.kChar)
861+
rntuple._clusterData[field.fieldName][1] = values; // Payload
862+
else
863+
throw new Error(`Unsupported column type for string field: ${colDesc.coltype}`);
864+
} else
865+
rntuple._clusterData[field.fieldName][0] = values;
866+
}
867+
868+
// Ensure string fields have ending offset for proper reconstruction of the last entry
869+
for (const fieldName of selectedFields) {
870+
const field = builder.fieldDescriptors.find(f => f.fieldName === fieldName),
871+
colData = rntuple._clusterData[fieldName];
872+
if (field.typeName === 'std::string') {
873+
if (!Array.isArray(colData) || colData.length !== 2)
874+
throw new Error(`String field '${fieldName}' must have 2 columns`);
875+
if (colData[0].length !== builder.clusterSummaries[clusterIndex].numEntries)
876+
throw new Error(`Malformed string field '${fieldName}': missing final offset`);
877+
}
789878
}
790879

791880
const numEntries = clusterSummary.numEntries;
792881
for (let i = 0; i < numEntries; ++i) {
793882
for (let b = 0; b < selector.numBranches(); ++b) {
794883
const fieldName = selector.nameOfBranch(b),
795884
values = rntuple._clusterData[fieldName];
796-
if (!values)
885+
if (!values)
797886
throw new Error(`Missing values for selected field: ${fieldName}`);
798-
selector.tgtobj[fieldName] = values[i];
887+
selector.tgtobj[fieldName] = readEntry(rntuple, fieldName, i);
799888
}
800889
selector.Process();
801890
}

0 commit comments

Comments (0)