```diff
@@ -24,12 +24,14 @@
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.pulsar.client.api.Schema;
 import org.apache.pulsar.client.api.schema.GenericRecord;
@@ -119,57 +121,65 @@ private void flush(Map<String, List<Record<GenericRecord>>> recordsToInsertByTop
         final long timeStampForPartitioning = System.currentTimeMillis();
         for (Map.Entry<String, List<Record<GenericRecord>>> entry : recordsToInsertByTopic.entrySet()) {
             String topicName = entry.getKey();
-            List<Record<GenericRecord>> singleTopicRecordsToInsert = entry.getValue();
-            Record<GenericRecord> firstRecord = singleTopicRecordsToInsert.get(0);
-            Schema<GenericRecord> schema;
-            try {
-                schema = getPulsarSchema(firstRecord);
-            } catch (Exception e) {
-                log.error("Failed to retrieve message schema", e);
-                bulkHandleFailedRecords(e, singleTopicRecordsToInsert);
-                return;
-            }
-
-            if (!format.doSupportPulsarSchemaType(schema.getSchemaInfo().getType())) {
-                String errorMsg = "Sink does not support schema type of pulsar: " + schema.getSchemaInfo().getType();
-                log.error(errorMsg);
-                bulkHandleFailedRecords(new UnsupportedOperationException(errorMsg), singleTopicRecordsToInsert);
-                return;
-            }
+            Map<String, List<Record<GenericRecord>>> schemaVersionMap = entry.getValue().stream()
+                    .collect(Collectors.groupingBy(r -> {
+                                byte[] schemaVersionBytes = r.getValue().getSchemaVersion();
+                                return schemaVersionBytes != null ? Arrays.toString(schemaVersionBytes) : "null";
+                            }
+                    ));
+            for (List<Record<GenericRecord>> singleTopicRecordsToInsert : schemaVersionMap.values()) {
+                Record<GenericRecord> firstRecord = singleTopicRecordsToInsert.get(0);
+                Schema<GenericRecord> schema;
+                try {
+                    schema = getPulsarSchema(firstRecord);
+                } catch (Exception e) {
+                    log.error("Failed to retrieve message schema", e);
+                    bulkHandleFailedRecords(e, singleTopicRecordsToInsert);
+                    return;
+                }
 
-            String filepath = "";
-            try {
-                format.initSchema(schema);
-                final Iterator<Record<GenericRecord>> iter = singleTopicRecordsToInsert.iterator();
-                filepath = buildPartitionPath(firstRecord, partitioner, format, timeStampForPartitioning);
-                ByteBuffer payload = bindValue(iter, format);
-                int uploadSize = singleTopicRecordsToInsert.size();
-                long uploadBytes = getBytesSum(singleTopicRecordsToInsert);
-                log.info("Uploading blob {} from topic {} uploadSize:{} uploadBytes:{} currentBatchStatus:{}",
-                        filepath, topicName, uploadSize, uploadBytes, batchManager.getCurrentStatsStr());
-                long elapsedMs = System.currentTimeMillis();
-                blobWriter.uploadBlob(filepath, payload);
-                elapsedMs = System.currentTimeMillis() - elapsedMs;
-                log.debug("Uploading blob {} elapsed time in ms: {}", filepath, elapsedMs);
-                singleTopicRecordsToInsert.forEach(Record::ack);
-                if (sinkContext != null) {
-                    sinkContext.recordMetric(METRICS_TOTAL_SUCCESS, singleTopicRecordsToInsert.size());
-                    sinkContext.recordMetric(METRICS_LATEST_UPLOAD_ELAPSED_TIME, elapsedMs);
+                if (!format.doSupportPulsarSchemaType(schema.getSchemaInfo().getType())) {
+                    String errorMsg = "Sink does not support schema type of pulsar: "
+                            + schema.getSchemaInfo().getType();
+                    log.error(errorMsg);
+                    bulkHandleFailedRecords(new UnsupportedOperationException(errorMsg), singleTopicRecordsToInsert);
+                    return;
                 }
-                log.info("Successfully uploaded blob {} from topic {} uploadSize {} uploadBytes {}",
-                        filepath, entry.getKey(),
-                        uploadSize, uploadBytes);
-            } catch (Exception e) {
-                if (e instanceof ContainerNotFoundException) {
-                    log.error("Blob {} is not found", filepath, e);
-                } else if (e instanceof IOException) {
-                    log.error("Failed to write to blob {}", filepath, e);
-                } else if (e instanceof UnsupportedOperationException || e instanceof IllegalArgumentException) {
-                    log.error("Failed to handle message schema {}", schema, e);
-                } else {
-                    log.error("Encountered unknown error writing to blob {}", filepath, e);
+
+                String filepath = "";
+                try {
+                    format.initSchema(schema);
+                    final Iterator<Record<GenericRecord>> iter = singleTopicRecordsToInsert.iterator();
+                    filepath = buildPartitionPath(firstRecord, partitioner, format, timeStampForPartitioning);
+                    ByteBuffer payload = bindValue(iter, format);
+                    int uploadSize = singleTopicRecordsToInsert.size();
+                    long uploadBytes = getBytesSum(singleTopicRecordsToInsert);
+                    log.info("Uploading blob {} from topic {} uploadSize:{} uploadBytes:{} currentBatchStatus:{}",
+                            filepath, topicName, uploadSize, uploadBytes, batchManager.getCurrentStatsStr());
+                    long elapsedMs = System.currentTimeMillis();
+                    blobWriter.uploadBlob(filepath, payload);
+                    elapsedMs = System.currentTimeMillis() - elapsedMs;
+                    log.debug("Uploading blob {} elapsed time in ms: {}", filepath, elapsedMs);
+                    singleTopicRecordsToInsert.forEach(Record::ack);
+                    if (sinkContext != null) {
+                        sinkContext.recordMetric(METRICS_TOTAL_SUCCESS, singleTopicRecordsToInsert.size());
+                        sinkContext.recordMetric(METRICS_LATEST_UPLOAD_ELAPSED_TIME, elapsedMs);
+                    }
+                    log.info("Successfully uploaded blob {} from topic {} uploadSize {} uploadBytes {}",
+                            filepath, topicName,
+                            uploadSize, uploadBytes);
+                } catch (Exception e) {
+                    if (e instanceof ContainerNotFoundException) {
+                        log.error("Blob {} is not found", filepath, e);
+                    } else if (e instanceof IOException) {
+                        log.error("Failed to write to blob {}", filepath, e);
+                    } else if (e instanceof UnsupportedOperationException || e instanceof IllegalArgumentException) {
+                        log.error("Failed to handle message schema {}", schema, e);
+                    } else {
+                        log.error("Encountered unknown error writing to blob {}", filepath, e);
+                    }
+                    bulkHandleFailedRecords(e, singleTopicRecordsToInsert);
                 }
-                bulkHandleFailedRecords(e, singleTopicRecordsToInsert);
             }
         }
     }
```
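For context on the grouping step this diff introduces: the raw schema-version `byte[]` cannot key the map directly, because Java arrays use identity `equals`/`hashCode`, so two equal version arrays would land in different buckets. The change therefore keys each bucket on `Arrays.toString(schemaVersionBytes)`, with a literal `"null"` bucket for records carrying no schema version, so that each flushed blob contains records of a single schema version. Below is a minimal standalone sketch of that idiom; the `Msg` record is a hypothetical stand-in for Pulsar's `Record<GenericRecord>`, not part of the actual sink.

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class SchemaVersionGroupingSketch {

    // Hypothetical stand-in for Pulsar's Record<GenericRecord>;
    // only the schema-version bytes matter for the grouping logic.
    record Msg(String payload, byte[] schemaVersion) {}

    public static void main(String[] args) {
        List<Msg> batch = List.of(
                new Msg("a", new byte[]{0, 0, 0, 1}),
                new Msg("b", new byte[]{0, 0, 0, 2}),
                new Msg("c", new byte[]{0, 0, 0, 1}),
                new Msg("d", null)); // record without a schema version

        // byte[] has identity equals/hashCode, so Arrays.toString(...) is used
        // to build a stable, value-based map key; "null" collects version-less records.
        Map<String, List<Msg>> bySchemaVersion = batch.stream()
                .collect(Collectors.groupingBy(m ->
                        m.schemaVersion() != null ? Arrays.toString(m.schemaVersion()) : "null"));

        // Each value list can now be flushed as one blob with a uniform schema,
        // mirroring the inner loop added in the diff above.
        bySchemaVersion.forEach((version, msgs) ->
                System.out.println(version + " -> " + msgs.size() + " record(s)"));
    }
}
```

`Arrays.toString` serves purely as a readable grouping key; the original bytes stay attached to each record, so the downstream schema lookup via the group's first record (`getPulsarSchema(firstRecord)` in the diff) is unaffected.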