13
13
import javax .xml .stream .XMLInputFactory ;
14
14
import javax .xml .stream .XMLStreamException ;
15
15
import javax .xml .stream .XMLStreamReader ;
16
- import java .io .ByteArrayInputStream ;
17
16
import java .io .IOException ;
18
17
import java .io .InputStream ;
19
18
import java .util .Iterator ;
@@ -61,9 +60,13 @@ class AggregateXmlSplitter {
61
60
final String element = fileContext .getStringOption (Options .READ_AGGREGATES_XML_ELEMENT );
62
61
final String encoding = fileContext .getStringOption (Options .READ_FILES_ENCODING );
63
62
63
+ final XMLSplitter <StringHandle > splitter = this .uriElement != null ?
64
+ new XMLSplitter <>(new UriElementExtractingVisitor (namespace , element , uriNamespace , uriElement )) :
65
+ XMLSplitter .makeSplitter (namespace , element );
66
+
64
67
try {
65
68
XMLStreamReader reader = xmlInputFactory .createXMLStreamReader (inputStream , encoding );
66
- this .contentStream = XMLSplitter . makeSplitter ( namespace , element ) .split (reader ).iterator ();
69
+ this .contentStream = splitter .split (reader ).iterator ();
67
70
} catch (IOException | XMLStreamException e ) {
68
71
throw new ConnectorException (
69
72
String .format ("Unable to read XML at %s; cause: %s" , this .identifierForErrors , e .getMessage ()), e
@@ -81,88 +84,39 @@ boolean hasNext() {
81
84
}
82
85
83
86
/**
84
- * @param pathPrefix used to construct a path if no uriElement was specified
87
+ * @param uriPrefix used to construct a URI if no uriElement was specified
85
88
* @return a row corresponding to the {@code DocumentRowSchema}
86
89
*/
87
- InternalRow nextRow (String pathPrefix ) {
88
- String xml ;
90
+ InternalRow nextRow (String uriPrefix ) {
91
+ StringHandle stringHandle ;
89
92
try {
90
- xml = this .contentStream .next (). get ();
93
+ stringHandle = this .contentStream .next ();
91
94
} catch (RuntimeException ex ) {
92
95
String message = String .format ("Unable to read XML from %s; cause: %s" ,
93
96
this .identifierForErrors , ex .getMessage ());
94
97
throw new ConnectorException (message , ex );
95
98
}
96
99
97
- final String path = this .uriElement != null && !this .uriElement .trim ().isEmpty () ?
98
- extractUriElementValue (xml ) :
99
- pathPrefix + "-" + rowCounter + ".xml" ;
100
-
100
+ final String initialUri = determineInitialUri (stringHandle , uriPrefix );
101
101
rowCounter ++;
102
-
103
- byte [] content = xml .getBytes ();
104
102
return new GenericInternalRow (new Object []{
105
- UTF8String .fromString (path ),
106
- ByteArray .concat (content ),
103
+ UTF8String .fromString (initialUri ),
104
+ ByteArray .concat (stringHandle . get (). getBytes () ),
107
105
UTF8String .fromString ("xml" ),
108
106
null , null , null , null , null
109
107
});
110
108
}
111
109
112
- /**
113
- * MLCP has undocumented support for attribute references via "@(attribute-name)". We are not supporting this yet
114
- * as we are using XMLSplitter to find the user-defined element, and XMLSplitter does not support finding
115
- * attributes. Additionally, this feature is still fairly limited in comparison to the "URI template" that the
116
- * connector supports. Ultimately, we'd want to support N path expressions against both Spark columns and against
117
- * a JSON or XML tree in a single Spark column.
118
- *
119
- * @param xml
120
- * @return
121
- */
122
- private String extractUriElementValue (String xml ) {
123
- Iterator <StringHandle > iterator ;
124
- XMLSplitter <StringHandle > splitter = XMLSplitter .makeSplitter (this .uriNamespace , this .uriElement );
125
- splitter .setVisitor (new UriElementVisitor (this .uriNamespace , this .uriElement ));
126
- try {
127
- iterator = splitter .split (new ByteArrayInputStream (xml .getBytes ())).iterator ();
128
- } catch (Exception e ) {
129
- // We don't expect this to ever occur, as if the XML couldn't be parsed, an error would have been thrown
130
- // when the child element was originally extracted. But still have to catch an exception.
131
- String message = String .format ("Unable to parse XML in aggregate element %d in %s; cause: %s" ,
132
- rowCounter , this .identifierForErrors , e .getMessage ());
133
- throw new ConnectorException (message , e );
134
- }
135
-
136
- if (!iterator .hasNext ()) {
137
- String message = String .format ("No occurrence of URI element '%s' found in aggregate element %d in %s" ,
138
- this .uriElement , rowCounter , this .identifierForErrors );
139
- throw new ConnectorException (message );
140
- }
141
- return iterator .next ().get ();
142
- }
143
-
144
- /**
145
- * Extends the Java Client visitor class so that it can return a handle containing the text of the
146
- * user-defined URI element.
147
- */
148
- private class UriElementVisitor extends XMLSplitter .BasicElementVisitor {
149
- public UriElementVisitor (String nsUri , String localName ) {
150
- super (nsUri , localName );
151
- }
152
-
153
- @ Override
154
- public StringHandle makeBufferedHandle (XMLStreamReader xmlStreamReader ) {
155
- String text ;
156
- try {
157
- text = xmlStreamReader .getElementText ();
158
- } catch (XMLStreamException e ) {
159
- String message = String .format (
160
- "Unable to get text from URI element '%s' found in aggregate element %d in %s; cause: %s" ,
161
- uriElement , rowCounter , identifierForErrors , e .getMessage ()
162
- );
163
- throw new ConnectorException (message , e );
110
+ private String determineInitialUri (StringHandle stringHandle , String uriPrefix ) {
111
+ if (stringHandle instanceof StringHandleWithUriValue ) {
112
+ String uriValue = ((StringHandleWithUriValue ) stringHandle ).getUriValue ();
113
+ if (uriValue == null ) {
114
+ String message = String .format ("No occurrence of URI element '%s' found in aggregate element %d in %s" ,
115
+ this .uriElement , rowCounter , this .identifierForErrors );
116
+ throw new ConnectorException (message );
164
117
}
165
- return new StringHandle ( text ) ;
118
+ return uriValue ;
166
119
}
120
+ return String .format ("%s-%d.xml" , uriPrefix , rowCounter );
167
121
}
168
122
}
0 commit comments