@@ -16,7 +16,7 @@ import {
16
16
import { TGetResultSetMetadataResp , TColumnDesc } from '../../thrift/TCLIService_types' ;
17
17
import IClientContext from '../contracts/IClientContext' ;
18
18
import IResultsProvider , { ResultsProviderFetchNextOptions } from './IResultsProvider' ;
19
- import { getSchemaColumns , convertThriftValue } from './utils' ;
19
+ import { ArrowBatch , getSchemaColumns , convertThriftValue } from './utils' ;
20
20
21
21
const { isArrowBigNumSymbol, bigNumToBigInt } = arrowUtils ;
22
22
@@ -26,15 +26,23 @@ type ArrowSchemaField = Field<DataType<Type, TypeMap>>;
26
26
export default class ArrowResultConverter implements IResultsProvider < Array < any > > {
27
27
protected readonly context : IClientContext ;
28
28
29
- private readonly source : IResultsProvider < Array < Buffer > > ;
29
+ private readonly source : IResultsProvider < ArrowBatch > ;
30
30
31
31
private readonly schema : Array < TColumnDesc > ;
32
32
33
- private reader ?: IterableIterator < RecordBatch < TypeMap > > ;
33
+ private recordBatchReader ?: IterableIterator < RecordBatch < TypeMap > > ;
34
34
35
- private pendingRecordBatch ?: RecordBatch < TypeMap > ;
35
+ // Remaining rows in current Arrow batch (not the record batch!)
36
+ private remainingRows : number = 0 ;
36
37
37
- constructor ( context : IClientContext , source : IResultsProvider < Array < Buffer > > , { schema } : TGetResultSetMetadataResp ) {
38
+ // This is the next (!!) record batch to be read. It is unset only in two cases:
39
+ // - prior to the first call to `fetchNext`
40
+ // - when no more data available
41
+ // This field is primarily used by `hasMore`, so it can tell whether the next `fetchNext` will
42
+ // actually return a non-empty result
43
+ private prefetchedRecordBatch ?: RecordBatch < TypeMap > ;
44
+
45
+ constructor ( context : IClientContext , source : IResultsProvider < ArrowBatch > , { schema } : TGetResultSetMetadataResp ) {
38
46
this . context = context ;
39
47
this . source = source ;
40
48
this . schema = getSchemaColumns ( schema ) ;
@@ -44,7 +52,7 @@ export default class ArrowResultConverter implements IResultsProvider<Array<any>
44
52
if ( this . schema . length === 0 ) {
45
53
return false ;
46
54
}
47
- if ( this . pendingRecordBatch ) {
55
+ if ( this . prefetchedRecordBatch ) {
48
56
return true ;
49
57
}
50
58
return this . source . hasMore ( ) ;
@@ -55,47 +63,80 @@ export default class ArrowResultConverter implements IResultsProvider<Array<any>
55
63
return [ ] ;
56
64
}
57
65
58
- // eslint-disable-next-line no-constant-condition
59
- while ( true ) {
60
- // It's not possible to know if iterator has more items until trying
61
- // to get the next item. But we need to know if iterator is empty right
62
- // after getting the next item. Therefore, after creating the iterator,
63
- // we get one item more and store it in `pendingRecordBatch`. Next time,
64
- // we use that stored item, and prefetch the next one. Prefetched item
65
- // is therefore the next item we are going to return, so it can be used
66
- // to know if we actually can return anything next time
67
- const recordBatch = this . pendingRecordBatch ;
68
- this . pendingRecordBatch = this . prefetch ( ) ;
69
-
70
- if ( recordBatch ) {
71
- const table = new Table ( recordBatch ) ;
72
- return this . getRows ( table . schema , table . toArray ( ) ) ;
66
+ // It's not possible to know if iterator has more items until trying to get the next item.
67
+ // So each time we read one batch ahead and store it, but process the batch prefetched on
68
+ // a previous `fetchNext` call. Because we actually already have the next item - it's easy
69
+ // to tell if the subsequent `fetchNext` will be able to read anything, and `hasMore` logic
70
+ // becomes trivial
71
+
72
+ // This prefetch handles the first call to `fetchNext`, when none of the internal fields are initialized yet.
73
+ // On subsequent calls to `fetchNext` it will do nothing
74
+ await this . prefetch ( options ) ;
75
+
76
+ if ( this . prefetchedRecordBatch ) {
77
+ // Consume a record batch fetched during previous call to `fetchNext`
78
+ const table = new Table ( this . prefetchedRecordBatch ) ;
79
+ this . prefetchedRecordBatch = undefined ;
80
+ // Get table rows, but not more than remaining count
81
+ const arrowRows = table . toArray ( ) . slice ( 0 , this . remainingRows ) ;
82
+ const result = this . getRows ( table . schema , arrowRows ) ;
83
+
84
+ // Reduce remaining rows count by a count of rows we just processed.
85
+ // If the remaining count reached zero - we're done with current arrow
86
+ // batch, so discard the batch reader
87
+ this . remainingRows -= result . length ;
88
+ if ( this . remainingRows === 0 ) {
89
+ this . recordBatchReader = undefined ;
73
90
}
74
91
75
- // eslint-disable-next-line no-await-in-loop
76
- const batches = await this . source . fetchNext ( options ) ;
77
- if ( batches . length === 0 ) {
78
- this . reader = undefined ;
79
- break ;
80
- }
92
+ // Prefetch the next record batch
93
+ await this . prefetch ( options ) ;
81
94
82
- const reader = RecordBatchReader . from < TypeMap > ( batches ) ;
83
- this . reader = reader [ Symbol . iterator ] ( ) ;
84
- this . pendingRecordBatch = this . prefetch ( ) ;
95
+ return result ;
85
96
}
86
97
87
98
return [ ] ;
88
99
}
89
100
90
- private prefetch ( ) : RecordBatch < TypeMap > | undefined {
91
- const item = this . reader ?. next ( ) ?? { done : true , value : undefined } ;
101
+ // This method tries to read one more record batch and store it in `prefetchedRecordBatch` field.
102
+ // If `prefetchedRecordBatch` is already non-empty - the method does nothing.
103
+ // This method pulls the next item from source if needed, initializes a record batch reader and
104
+ // gets the next item from it - until it either reaches the end of data or finds a non-empty record batch
105
+ private async prefetch ( options : ResultsProviderFetchNextOptions ) {
106
+ // This loop will be executed until the next non-empty record batch is retrieved
107
+ // Another implicit loop condition (end of data) is checked in the loop body
108
+ while ( ! this . prefetchedRecordBatch ) {
109
+ // First, try to fetch next item from source and initialize record batch reader.
110
+ // If source has no more data - exit prematurely
111
+ if ( ! this . recordBatchReader ) {
112
+ const sourceHasMore = await this . source . hasMore ( ) ; // eslint-disable-line no-await-in-loop
113
+ if ( ! sourceHasMore ) {
114
+ return ;
115
+ }
116
+
117
+ const arrowBatch = await this . source . fetchNext ( options ) ; // eslint-disable-line no-await-in-loop
118
+ if ( arrowBatch . batches . length > 0 && arrowBatch . rowCount > 0 ) {
119
+ const reader = RecordBatchReader . from < TypeMap > ( arrowBatch . batches ) ;
120
+ this . recordBatchReader = reader [ Symbol . iterator ] ( ) ;
121
+ this . remainingRows = arrowBatch . rowCount ;
122
+ }
123
+ }
92
124
93
- if ( item . done || item . value === undefined ) {
94
- this . reader = undefined ;
95
- return undefined ;
125
+ // Try to get the next item from the current record batch reader. The reader may be unavailable at this point -
126
+ // in this case we fall back to a "done" state, and the `while` loop will do one more iteration attempting
127
+ // to create a new reader. Eventually it will either succeed or reach end of source. This scenario also
128
+ // handles readers which are already empty
129
+ const item = this . recordBatchReader ?. next ( ) ?? { done : true , value : undefined } ;
130
+ if ( item . done || item . value === undefined ) {
131
+ this . recordBatchReader = undefined ;
132
+ } else {
133
+ // Skip empty batches
134
+ // eslint-disable-next-line no-lonely-if
135
+ if ( item . value . numRows > 0 ) {
136
+ this . prefetchedRecordBatch = item . value ;
137
+ }
138
+ }
96
139
}
97
-
98
- return item . value ;
99
140
}
100
141
101
142
private getRows ( schema : ArrowSchema , rows : Array < StructRow | MapRow > ) : Array < any > {
0 commit comments