package com.marklogic.spark.reader.file;

import com.marklogic.client.io.DocumentMetadataHandle;
+import com.marklogic.client.io.InputStreamHandle;
import com.marklogic.spark.ConnectorException;
import com.marklogic.spark.Options;
import com.marklogic.spark.Util;

import org.apache.commons.io.IOUtils;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.read.PartitionReader;

+import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

-class ArchiveFileReader implements PartitionReader<InternalRow> {
+public class ArchiveFileReader implements PartitionReader<InternalRow> {

    private final FilePartition filePartition;
    private final FileContext fileContext;
    private final List<String> metadataCategories;
+    private final StreamingMode streamingMode;

    private String currentFilePath;
    private ZipInputStream currentZipInputStream;
@@ -32,9 +36,25 @@ class ArchiveFileReader implements PartitionReader<InternalRow> {
    // Legacy = content first, then metadata.
    private Boolean isLegacyFormat;

+    public enum StreamingMode {
+        STREAM_DURING_READER_PHASE,
+        STREAM_DURING_WRITER_PHASE
+    }
+
    ArchiveFileReader(FilePartition filePartition, FileContext fileContext) {
+        this(
+            filePartition, fileContext,
+            // Will refactor this later to avoid duplication of this comparison.
+            // Should be a nice little method in FileContext.
+            "true".equalsIgnoreCase(fileContext.getStringOption(Options.STREAM_FILES)) ? StreamingMode.STREAM_DURING_READER_PHASE : null
+        );
+    }
+
+    public ArchiveFileReader(FilePartition filePartition, FileContext fileContext, StreamingMode streamingMode) {
        this.filePartition = filePartition;
        this.fileContext = fileContext;
+        this.streamingMode = streamingMode;
+
        this.metadataCategories = new ArrayList<>();
        if (fileContext.hasOption(Options.READ_ARCHIVES_CATEGORIES)) {
            for (String category : fileContext.getStringOption(Options.READ_ARCHIVES_CATEGORIES).split(",")) {
@@ -47,6 +67,10 @@ class ArchiveFileReader implements PartitionReader<InternalRow> {

    @Override
    public boolean next() {
+        if (StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode)) {
+            return nextWhileStreamingDuringReaderPhase();
+        }
+
        try {
            ZipEntry nextZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
            if (nextZipEntry == null) {
@@ -55,6 +79,7 @@ public boolean next() {

            if (isLegacyFormat == null) {
                isLegacyFormat = !nextZipEntry.getName().endsWith(".metadata");
+                logArchiveFormat();
            }

            return isLegacyFormat ? readContentFollowedByMetadata(nextZipEntry) : readMetadataFollowedByContent();
@@ -70,14 +95,52 @@ public boolean next() {

    @Override
    public InternalRow get() {
-        return nextRowToReturn;
+        return StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode) ?
+            buildSingleRowForArchiveFile() :
+            nextRowToReturn;
    }

    @Override
    public void close() {
        IOUtils.closeQuietly(this.currentZipInputStream);
    }

+    /**
+     * Exposed for {@code ArchiveFileIterator} to be able to read from the zip stream when it produces a set of
+     * document inputs.
+     *
+     * @return an {@code InputStreamHandle} to avoid reading a content zip entry into memory.
+     */
+    public InputStreamHandle getContentHandleForCurrentZipEntry() {
+        return new InputStreamHandle(currentZipInputStream);
+    }
+
+    private void logArchiveFormat() {
+        if (Util.MAIN_LOGGER.isInfoEnabled() && isLegacyFormat) {
+            Util.MAIN_LOGGER.info("Archive {} uses Flux 1.0 format, will read content and then metadata.", this.currentFilePath);
+        }
+        if (Util.MAIN_LOGGER.isDebugEnabled() && !isLegacyFormat.booleanValue()) {
+            Util.MAIN_LOGGER.debug("Archive {} uses Flux 1.1+ format, will read metadata and then content.", this.currentFilePath);
+        }
+    }
+
+    /**
+     * Implementation of {@code next()} while streaming during the reader phase, where we don't want to actually read
+     * anything from a zip file. We just want to build a row per zip file.
+     *
+     * @return true if another archive file remains to be returned as a row; false when this partition is exhausted.
+     */
+    private boolean nextWhileStreamingDuringReaderPhase() {
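+        // A non-null path means the current archive file has not yet been returned as a row via get().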
+        if (currentFilePath != null) {
+            return true;
+        }
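+        // Every archive file path in this partition has already produced a row.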
+        if (nextFilePathIndex >= filePartition.getPaths().size()) {
+            return false;
+        }
+        openNextFile();
+        return true;
+    }
+

    /**
     * This is the Flux 1.0 "legacy" approach, where content was written first, followed by metadata. This does not
     * support streaming.
@@ -87,16 +150,15 @@ private boolean readContentFollowedByMetadata(ZipEntry contentZipEntry) throws IOException {
        if (content == null || content.length == 0) {
            return openNextFileAndReadNextEntry();
        }
-        final String zipEntryName = contentZipEntry.getName();

+        final String zipEntryName = contentZipEntry.getName();
        byte[] metadataBytes = readMetadataEntry(zipEntryName);
        if (metadataBytes == null || metadataBytes.length == 0) {
            return openNextFileAndReadNextEntry();
        }

        DocumentMetadataHandle metadata = new DocumentMetadataHandle();
        metadata.fromBuffer(metadataBytes);
-
        this.nextRowToReturn = new DocumentRowBuilder(this.metadataCategories)
            .withUri(zipEntryName).withContent(content).withMetadata(metadata)
            .buildRow();
@@ -105,29 +167,45 @@ private boolean readContentFollowedByMetadata(ZipEntry contentZipEntry) throws IOException {

    /**
     * This is the Flux 1.1+ approach, where the metadata entry is written first. This supports streaming.
+     * <p>
+     * This is where we implement streaming-during-write-to-MarkLogic. We read the metadata entry as normal, then
+     * build everything in our row except the content.
     */
    private boolean readMetadataFollowedByContent() throws IOException {
        byte[] metadataBytes = fileContext.readBytes(currentZipInputStream);
        if (metadataBytes == null || metadataBytes.length == 0) {
            return openNextFileAndReadNextEntry();
        }

-        ZipEntry contentZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
-        byte[] content = fileContext.readBytes(currentZipInputStream);
-
        DocumentMetadataHandle metadata = new DocumentMetadataHandle();
        metadata.fromBuffer(metadataBytes);
-        this.nextRowToReturn = new DocumentRowBuilder(this.metadataCategories)
+
+        // We still advance to the content entry here, both to get its name for the URI and to leave the stream
+        // positioned to read the content.
+        ZipEntry contentZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
+
+        DocumentRowBuilder rowBuilder = new DocumentRowBuilder(this.metadataCategories)
            .withUri(contentZipEntry.getName())
-            .withContent(content).withMetadata(metadata)
-            .buildRow();
+            .withMetadata(metadata);
+
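+        // When streaming during the writer phase, the content is intentionally left out of the row; the writer
+        // reads it directly from the zip stream instead of buffering it in memory.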
+        if (!StreamingMode.STREAM_DURING_WRITER_PHASE.equals(this.streamingMode)) {
+            byte[] content = fileContext.readBytes(currentZipInputStream);
+            rowBuilder = rowBuilder.withContent(content);
+        }
+
+        this.nextRowToReturn = rowBuilder.buildRow();
        return true;
    }

    private void openNextFile() {
-        this.currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
+        final boolean isStreamingDuringRead = StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode);
+        final String nextFilePath = filePartition.getPaths().get(nextFilePathIndex);
+
+        this.currentFilePath = isStreamingDuringRead ? nextFilePath : fileContext.decodeFilePath(nextFilePath);
        nextFilePathIndex++;
-        this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
+
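+        // When streaming during the reader phase, no zip stream is opened here; the row for this archive only
+        // needs the file path and the serialized FileContext.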
+        if (!isStreamingDuringRead) {
+            this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
+        }
    }

    private boolean openNextFileAndReadNextEntry() {
@@ -151,4 +229,26 @@ private byte[] readMetadataEntry(String zipEntryName) throws IOException {
        }
        return fileContext.readBytes(currentZipInputStream);
    }
+
+    /**
+     * Builds a row containing the file path as the URI and the serialized {@code FileContext} as the content,
+     * giving the writer phase what it needs to re-open the archive and stream its entries.
+     */
+    private InternalRow buildSingleRowForArchiveFile() {
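+        // Serialize the FileContext so it can travel with the row to the writer phase.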
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
+            oos.writeObject(this.fileContext);
+            oos.flush();
+        } catch (Exception ex) {
+            String message = String.format("Unable to build row for archive file at %s; cause: %s",
+                this.currentFilePath, ex.getMessage());
+            throw new ConnectorException(message, ex);
+        }
+
+        InternalRow row = new DocumentRowBuilder(this.metadataCategories)
+            .withUri(this.currentFilePath)
+            .withContent(baos.toByteArray())
+            .buildRow();
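+        // Clear the current path so that the next call to next() advances to the next archive file.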
+        this.currentFilePath = null;
+        return row;
+    }
}