@@ -150,17 +150,23 @@ function getTypeByteSize(coltype) {
       case ENTupleColumnType.kReal64:
       case ENTupleColumnType.kInt64:
       case ENTupleColumnType.kUInt64:
+      case ENTupleColumnType.kIndex64:
         return 8;
       case ENTupleColumnType.kReal32:
       case ENTupleColumnType.kInt32:
+      case ENTupleColumnType.kIndex32:
       case ENTupleColumnType.kUInt32:
+      case ENTupleColumnType.kSplitIndex64:
         return 4;
       case ENTupleColumnType.kInt16:
       case ENTupleColumnType.kUInt16:
         return 2;
       case ENTupleColumnType.kInt8:
       case ENTupleColumnType.kUInt8:
       case ENTupleColumnType.kByte:
+      case ENTupleColumnType.kByteArray:
+      case ENTupleColumnType.kIndexArrayU8:
+      case ENTupleColumnType.kChar:
         return 1;
       default:
         throw new Error(`Unsupported coltype for byte size: ${coltype} (0x${coltype.toString(16).padStart(2, '0')})`);
@@ -599,54 +605,68 @@ class RNTupleDescriptorBuilder {
 
    // Example Of Deserializing Page Content
    deserializePage(blob, columnDescriptor) {
-      const reader = new RBufferReader(blob),
-            values = [],
-            byteSize = getTypeByteSize(columnDescriptor.coltype),
-            numValues = blob.byteLength / byteSize;
-
-      for (let i = 0; i < numValues; ++i) {
-         let val;
-         switch (columnDescriptor.coltype) {
-            case ENTupleColumnType.kReal64:
-               val = reader.readF64();
-               break;
-            case ENTupleColumnType.kReal32:
-               val = reader.readF32();
-               break;
-            case ENTupleColumnType.kInt64:
-               val = reader.readI64();
-               break;
-            case ENTupleColumnType.kUInt64:
-               val = reader.readU64();
-               break;
-            case ENTupleColumnType.kInt32:
-               val = reader.readI32();
-               break;
-            case ENTupleColumnType.kUInt32:
-               val = reader.readU32();
-               break;
-            case ENTupleColumnType.kInt16:
-               val = reader.readI16();
-               break;
-            case ENTupleColumnType.kUInt16:
-               val = reader.readU16();
-               break;
-            case ENTupleColumnType.kInt8:
-               val = reader.readI8();
-               break;
-            case ENTupleColumnType.kUInt8:
-            case ENTupleColumnType.kByte:
-               val = reader.readU8();
-               break;
-            default:
+      const reader = new RBufferReader(blob),
+            values = [],
+            coltype = columnDescriptor.coltype,
+            byteSize = getTypeByteSize(coltype),
+            numValues = byteSize ? blob.byteLength / byteSize : undefined;
+
+      for (let i = 0; i < (numValues ?? blob.byteLength); ++i) {
+         let val;
+
+         switch (coltype) {
+            case ENTupleColumnType.kReal64:
+               val = reader.readF64();
+               break;
+            case ENTupleColumnType.kReal32:
+               val = reader.readF32();
+               break;
+            case ENTupleColumnType.kInt64:
+               val = reader.readI64();
+               break;
+            case ENTupleColumnType.kUInt64:
+               val = reader.readU64();
+               break;
+            case ENTupleColumnType.kInt32:
+            case ENTupleColumnType.kIndex32:
+               val = reader.readU32();
+               break;
+            case ENTupleColumnType.kUInt32:
+               val = reader.readU32();
+               break;
+            case ENTupleColumnType.kInt16:
+               val = reader.readI16();
+               break;
+            case ENTupleColumnType.kUInt16:
+               val = reader.readU16();
+               break;
+            case ENTupleColumnType.kInt8:
+               val = reader.readS8();
+               break;
+            case ENTupleColumnType.kUInt8:
+            case ENTupleColumnType.kByte:
+            case ENTupleColumnType.kByteArray:
+            case ENTupleColumnType.kIndexArrayU8:
+               val = reader.readU8();
+               break;
+            case ENTupleColumnType.kChar:
+               val = String.fromCharCode(reader.readS8());
+               break;
+            case ENTupleColumnType.kIndex64:
+               val = reader.readU64();
+               break;
+            case ENTupleColumnType.kSplitIndex64:
+               val = reader.readU32();
+               break;
+            default:
                throw new Error(`Unsupported column type: ${columnDescriptor.coltype}`);
-         }
-         values.push(val);
          }
-
-      return values;
+         values.push(val);
       }
 
+      return values;
+   }
+
 
 }
 
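For reference, a minimal sketch of exercising the updated `deserializePage` on a hand-built page (not part of the patch; it assumes `RNTupleDescriptorBuilder` and `ENTupleColumnType` from this module, and that `RBufferReader` decodes little-endian values, as RNTuple stores them):

```js
// Hypothetical two-value kReal64 page, built by hand for illustration
const builder = new RNTupleDescriptorBuilder,
      page = new DataView(new ArrayBuffer(16));
page.setFloat64(0, 1.5, true); // little-endian, per the RNTuple on-disk format
page.setFloat64(8, 2.5, true);
// Only `coltype` is read from the descriptor in the method above
const values = builder.deserializePage(page, { coltype: ENTupleColumnType.kReal64 });
// expected: values = [1.5, 2.5]
```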
@@ -662,22 +682,24 @@ async function readHeaderFooter(tuple) {
       if (blobs?.length !== 2)
          return false;
 
-      // unzip both buffers
+      // Handle both compressed and uncompressed cases
+      const processBlob = (blob, uncompressedSize) => {
+         // If uncompressedSize matches blob size, it's uncompressed
+         if (blob.byteLength === uncompressedSize)
+            return Promise.resolve(blob);
+         return R__unzip(blob, uncompressedSize);
+      };
+
       return Promise.all([
-         R__unzip(blobs[0], tuple.fLenHeader),
-         R__unzip(blobs[1], tuple.fLenFooter)
+         processBlob(blobs[0], tuple.fLenHeader),
+         processBlob(blobs[1], tuple.fLenFooter)
       ]).then(unzip_blobs => {
-         const header_blob = unzip_blobs[0],
-               footer_blob = unzip_blobs[1];
-         if (!header_blob || !footer_blob)
+         const [header_blob, footer_blob] = unzip_blobs;
+         if (!header_blob || !footer_blob)
             return false;
 
-         // create builder description and decode it - dummy for the moment
-
         tuple.builder = new RNTupleDescriptorBuilder;
-
         tuple.builder.deserializeHeader(header_blob);
-
         tuple.builder.deserializeFooter(footer_blob);
 
         // Build fieldToColumns mapping
@@ -703,13 +725,20 @@ async function readHeaderFooter(tuple) {
            if (!(page_list_blob instanceof DataView))
               throw new Error(`Expected DataView from readBuffer, got ${Object.prototype.toString.call(page_list_blob)}`);
 
-           return R__unzip(page_list_blob, uncompressedSize).then(unzipped_blob => {
-              if (!(unzipped_blob instanceof DataView))
-                 throw new Error(`Unzipped page list is not a DataView, got ${Object.prototype.toString.call(unzipped_blob)}`);
-
-              tuple.builder.deserializePageList(unzipped_blob);
+           // Check if page list data is uncompressed
+           if (page_list_blob.byteLength === uncompressedSize) {
+              // Data is uncompressed, use directly
+              tuple.builder.deserializePageList(page_list_blob);
               return true;
-           });
+           }
+           // Attempt to decompress the page list
+           return R__unzip(page_list_blob, uncompressedSize).then(unzipped_blob => {
+              if (!(unzipped_blob instanceof DataView))
+                 throw new Error(`Unzipped page list is not a DataView, got ${Object.prototype.toString.call(unzipped_blob)}`);
+
+              tuple.builder.deserializePageList(unzipped_blob);
+              return true;
+           });
        });
     });
  }).catch(err => {
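The size test used in both hunks relies on how ROOT writes buffers: when compression does not shrink a buffer it is stored as-is, so the stored length equals the recorded uncompressed length. Factored out, the check looks roughly like this (a sketch only, not part of the patch; `R__unzip` is the existing JSROOT decompression helper):

```js
// Return the blob untouched when it was stored uncompressed, otherwise inflate it
function maybeUnzip(blob, uncompressedSize) {
   if (blob.byteLength === uncompressedSize)
      return Promise.resolve(blob); // stored without compression
   return R__unzip(blob, uncompressedSize);
}
```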
@@ -718,11 +747,40 @@ async function readHeaderFooter(tuple) {
    });
 }
 
+function readEntry(rntuple, fieldName, entryIndex) {
+   const builder = rntuple.builder,
+         field = builder.fieldDescriptors.find(f => f.fieldName === fieldName),
+         fieldData = rntuple._clusterData[fieldName];
+
+   if (!field)
+      throw new Error(`No descriptor for field ${fieldName}`);
+   if (!fieldData)
+      throw new Error(`No data for field ${fieldName}`);
+
+   // Detect and decode string fields
+   if (Array.isArray(fieldData) && fieldData.length === 2) {
+      const [offsets, payload] = fieldData,
+            start = entryIndex === 0 ? 0 : Number(offsets[entryIndex - 1]),
+            end = Number(offsets[entryIndex]),
+            decoded = payload.slice(start, end).join(''); // Convert to string
+      return decoded;
+   }
+
+   // Fallback: primitive type (e.g. int, float)
+   return fieldData[0][entryIndex];
+}
+
+
 // Read and process the next data cluster from the RNTuple
 function readNextCluster(rntuple, selector) {
-   const builder = rntuple.builder,
-         clusterIndex = selector.currentCluster,
-         clusterSummary = builder.clusterSummaries[clusterIndex],
+   const builder = rntuple.builder;
+
+   // Add validation
+   if (!builder.clusterSummaries || builder.clusterSummaries.length === 0)
+      throw new Error('No cluster summaries available - possibly incomplete file reading');
+
+   const clusterIndex = selector.currentCluster,
+         clusterSummary = builder.clusterSummaries[clusterIndex],
 
         // Gather all pages for this cluster from selected fields only
         pages = [],
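To make the offset/payload convention concrete, a small worked example of `readEntry` on fabricated cluster data (illustration only; the field name and values are made up, and the offsets work as plain numbers or BigInt since `readEntry` wraps them in `Number()`):

```js
// Fabricated example: two string entries, 'hello' and 'test'
const rntuple = {
   builder: { fieldDescriptors: [{ fieldName: 'name', typeName: 'std::string' }] },
   _clusterData: {
      // [0] = offsets column (end position of each entry), [1] = char payload column
      name: [[5n, 9n], ['h', 'e', 'l', 'l', 'o', 't', 'e', 's', 't']]
   }
};
console.log(readEntry(rntuple, 'name', 0)); // 'hello'
console.log(readEntry(rntuple, 'name', 1)); // 'test'
```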
@@ -742,11 +800,11 @@ function readNextCluster(rntuple, selector) {
         const colEntry = builder.pageLocations[clusterIndex]?.[colDesc.index];
 
         // When the data is missing or broken
-        if (!colEntry || !colEntry.pages)
+        if (!colEntry || !colEntry.pages)
            throw new Error(`No pages for column ${colDesc.index} in cluster ${clusterIndex}`);
 
        for (const page of colEntry.pages)
-          pages.push({ page, colDesc });
+          pages.push({ page, colDesc, fieldName });
      }
   }
 
@@ -766,36 +824,67 @@ function readNextCluster(rntuple, selector) {
    return rntuple.$file.readBuffer(dataToRead).then(blobsRaw => {
       const blobs = Array.isArray(blobsRaw) ? blobsRaw : [blobsRaw],
             unzipPromises = blobs.map((blob, idx) => {
-               const { page, colDesc } = pages[idx],
+               const { page, colDesc } = pages[idx],
+                     colEntry = builder.pageLocations[clusterIndex][colDesc.index], // Access column entry
                      numElements = Number(page.numElements),
                      elementSize = colDesc.bitsOnStorage / 8;
-               return R__unzip(blob, numElements * elementSize);
+
+               // Check if data is compressed
+               if (colEntry.compression === 0)
+                  return Promise.resolve(blob); // Uncompressed: use blob directly
+               return R__unzip(blob, numElements * elementSize);
             });
 
      return Promise.all(unzipPromises).then(unzipBlobs => {
         rntuple._clusterData = {}; // store deserialized data per field
 
         for (let i = 0; i < unzipBlobs.length; ++i) {
+           const blob = unzipBlobs[i];
+           // Ensure blob is a DataView
+           if (!(blob instanceof DataView))
+              throw new Error(`Invalid blob type for page ${i}: ${Object.prototype.toString.call(blob)}`);
            const {
              colDesc
            } = pages[i],
            field = builder.fieldDescriptors[colDesc.fieldId],
-           values = builder.deserializePage(unzipBlobs[i], colDesc, field);
+           values = builder.deserializePage(blob, colDesc);
 
-           // TODO: Handle fields with multiple columns (e.g., data + metadata).
-           // For now, we only store the first column's data to avoid overwriting.
+           // Support multiple representations (e.g., string fields with offsets + payload)
            if (!rntuple._clusterData[field.fieldName])
-              rntuple._clusterData[field.fieldName] = values;
+              rntuple._clusterData[field.fieldName] = [];
+
+           // splitting string fields into offset and payload components
+           if (field.typeName === 'std::string') {
+              if (colDesc.coltype === ENTupleColumnType.kIndex64) // Index64/Index32
+                 rntuple._clusterData[field.fieldName][0] = values; // Offsets
+              else if (colDesc.coltype === ENTupleColumnType.kChar)
+                 rntuple._clusterData[field.fieldName][1] = values; // Payload
+              else
+                 throw new Error(`Unsupported column type for string field: ${colDesc.coltype}`);
+           } else
+              rntuple._clusterData[field.fieldName][0] = values;
+        }
+
+        // Ensure string fields have ending offset for proper reconstruction of the last entry
+        for (const fieldName of selectedFields) {
+           const field = builder.fieldDescriptors.find(f => f.fieldName === fieldName),
+                 colData = rntuple._clusterData[fieldName];
+           if (field.typeName === 'std::string') {
+              if (!Array.isArray(colData) || colData.length !== 2)
+                 throw new Error(`String field '${fieldName}' must have 2 columns`);
+              if (colData[0].length !== builder.clusterSummaries[clusterIndex].numEntries)
+                 throw new Error(`Malformed string field '${fieldName}': missing final offset`);
+           }
        }
 
        const numEntries = clusterSummary.numEntries;
        for (let i = 0; i < numEntries; ++i) {
           for (let b = 0; b < selector.numBranches(); ++b) {
              const fieldName = selector.nameOfBranch(b),
                    values = rntuple._clusterData[fieldName];
-             if (!values)
+             if (!values)
                 throw new Error(`Missing values for selected field: ${fieldName}`);
-             selector.tgtobj[fieldName] = values[i];
+             selector.tgtobj[fieldName] = readEntry(rntuple, fieldName, i);
          }
          selector.Process();
       }
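For context, the entry loop above only touches a handful of selector members. A minimal stand-in that satisfies it could look like the sketch below (illustration only — real usage goes through the JSROOT TSelector/branch machinery, and the branch name here is hypothetical):

```js
const selector = {
   currentCluster: 0,           // cluster index consumed by readNextCluster
   tgtobj: {},                  // per-entry values are written here
   numBranches() { return 1; },
   nameOfBranch(/* b */) { return 'name'; },
   Process() { console.log(this.tgtobj.name); } // called once per entry
};
```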