Skip to content

Commit 0da3b77

Browse files
authored
Merge pull request #1618 from synthetichealth/limit-csv-lines
Limit resources per CSV file
2 parents 0091254 + bce933b commit 0da3b77

File tree

4 files changed

+381
-78
lines changed

4 files changed

+381
-78
lines changed

src/main/java/org/mitre/synthea/export/CSVConstants.java

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -27,26 +27,6 @@ public class CSVConstants {
2727
public static final String CLAIM_TRANSACTION_KEY = "claims_transactions";
2828
public static final String PATIENT_EXPENSE_KEY = "patient_expenses";
2929

30-
public static final String BASE_PATIENT_FILENAME = "patients.csv";
31-
public static final String BASE_ALLERGY_FILENAME = "allergies.csv";
32-
public static final String BASE_MEDICATION_FILENAME = "medications.csv";
33-
public static final String BASE_CONDITION_FILENAME = "conditions.csv";
34-
public static final String BASE_CAREPLAN_FILENAME = "careplans.csv";
35-
public static final String BASE_OBSERVATION_FILENAME = "observations.csv";
36-
public static final String BASE_PROCEDURE_FILENAME = "procedures.csv";
37-
public static final String BASE_IMMUNIZATION_FILENAME = "immunizations.csv";
38-
public static final String BASE_ENCOUNTER_FILENAME = "encounters.csv";
39-
public static final String BASE_IMAGING_STUDY_FILENAME = "imaging_studies.csv";
40-
public static final String BASE_DEVICE_FILENAME = "devices.csv";
41-
public static final String BASE_SUPPLY_FILENAME = "supplies.csv";
42-
public static final String BASE_ORGANIZATION_FILENAME = "organizations.csv";
43-
public static final String BASE_PROVIDER_FILENAME = "providers.csv";
44-
public static final String BASE_PAYER_FILENAME = "payers.csv";
45-
public static final String BASE_PAYER_TRANSITION_FILENAME = "payer_transitions.csv";
46-
public static final String BASE_CLAIM_FILENAME = "claims.csv";
47-
public static final String BASE_CLAIM_TRANSACTION_FILENAME = "claims_transactions.csv";
48-
public static final String BASE_PATIENT_EXPENSE_FILENAME = "patient_expenses.csv";
49-
5030
public static final String PATIENT_HEADER_LINE =
5131
"Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,"
5232
+ "PREFIX,FIRST,MIDDLE,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,"
@@ -130,35 +110,8 @@ public class CSVConstants {
130110
+ "HEALTHCARE_EXPENSES,INSURANCE_COSTS,COVERED_COSTS"
131111
+ NEWLINE;
132112

133-
public static final Map<String, String> BASE_FILENAME_MAP = initializeFilenameMap();
134113
public static final Map<String, String> HEADER_LINE_MAP = initializeHeaderMap();
135114

136-
private static Map<String, String> initializeFilenameMap() {
137-
Map<String, String> map = new HashMap<>();
138-
139-
map.put(PATIENT_KEY, BASE_PATIENT_FILENAME);
140-
map.put(ALLERGY_KEY, BASE_ALLERGY_FILENAME);
141-
map.put(MEDICATION_KEY, BASE_MEDICATION_FILENAME);
142-
map.put(CONDITION_KEY, BASE_CONDITION_FILENAME);
143-
map.put(CAREPLAN_KEY, BASE_CAREPLAN_FILENAME);
144-
map.put(OBSERVATION_KEY, BASE_OBSERVATION_FILENAME);
145-
map.put(PROCEDURE_KEY, BASE_PROCEDURE_FILENAME);
146-
map.put(IMMUNIZATION_KEY, BASE_IMMUNIZATION_FILENAME);
147-
map.put(ENCOUNTER_KEY, BASE_ENCOUNTER_FILENAME);
148-
map.put(IMAGING_STUDY_KEY, BASE_IMAGING_STUDY_FILENAME);
149-
map.put(DEVICE_KEY, BASE_DEVICE_FILENAME);
150-
map.put(SUPPLY_KEY, BASE_SUPPLY_FILENAME);
151-
map.put(ORGANIZATION_KEY, BASE_ORGANIZATION_FILENAME);
152-
map.put(PROVIDER_KEY, BASE_PROVIDER_FILENAME);
153-
map.put(PAYER_KEY, BASE_PAYER_FILENAME);
154-
map.put(PAYER_TRANSITION_KEY, BASE_PAYER_TRANSITION_FILENAME);
155-
map.put(CLAIM_KEY, BASE_CLAIM_FILENAME);
156-
map.put(CLAIM_TRANSACTION_KEY, BASE_CLAIM_TRANSACTION_FILENAME);
157-
map.put(PATIENT_EXPENSE_KEY, BASE_PATIENT_EXPENSE_FILENAME);
158-
159-
return Collections.unmodifiableMap(map);
160-
}
161-
162115
private static Map<String, String> initializeHeaderMap() {
163116
Map<String, String> map = new HashMap<>();
164117

src/main/java/org/mitre/synthea/export/CSVFileManager.java

Lines changed: 164 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
import java.io.FileOutputStream;
55
import java.io.IOException;
66
import java.io.OutputStreamWriter;
7+
import java.lang.NumberFormatException;
78
import java.nio.charset.Charset;
9+
import java.nio.file.Files;
810
import java.nio.file.Path;
911
import java.util.Arrays;
1012
import java.util.Collections;
@@ -16,6 +18,7 @@
1618
import org.apache.commons.io.output.NullOutputStream;
1719
import org.mitre.synthea.export.CSVConstants;
1820
import org.mitre.synthea.helpers.Config;
21+
import org.mitre.synthea.helpers.SimpleCSV;
1922

2023
public class CSVFileManager {
2124
/**
@@ -26,16 +29,11 @@ public class CSVFileManager {
2629
private Path outputDirectory;
2730
private List<String> includedFiles;
2831
private List<String> excludedFiles;
29-
private Map<String, String> filenameMap = initializeFilenameMap();
32+
private Map<String, String> filenameMap = new HashMap<>();
3033
private Map<String, OutputStreamWriter> writerMap = new HashMap<>();
31-
32-
private Map<String, String> initializeFilenameMap() {
33-
HashMap<String, String> map = new HashMap<>();
34-
35-
map.putAll(CSVConstants.BASE_FILENAME_MAP);
36-
37-
return map;
38-
}
34+
private Map<String, Integer> resourceCountMap = new HashMap<>();
35+
private int maxLinesPerFile;
36+
private int fileNumberDigits;
3937

4038
/**
4139
* "No-op" writer to use to prevent writing to excluded files.
@@ -50,6 +48,8 @@ private Map<String, String> initializeFilenameMap() {
5048
*/
5149
public CSVFileManager() {
// Each helper below populates one piece of this manager's state. The call order is
// kept as written; NOTE(review): whether later helpers depend on earlier ones is
// not visible from here -- confirm before reordering.
5250
initializeAppend();
51+
initializeMaxLinesPerFile();
52+
initializeFileNumberDigits();
5353
initializeOutputDirectory();
5454
initializeIncludedAndExcludedFiles();
5555
}
@@ -58,6 +58,24 @@ private void initializeAppend() {
5858
append = Config.getAsBoolean("exporter.csv.append_mode");
5959
}
6060

61+
private void initializeMaxLinesPerFile() {
62+
try {
63+
maxLinesPerFile = Config.getAsInteger("exporter.csv.max_lines_per_file", 0);
64+
} catch (NumberFormatException ex) {
65+
// if the property is present but not a numeric string
66+
maxLinesPerFile = 0;
67+
}
68+
}
69+
70+
private void initializeFileNumberDigits() {
71+
try {
72+
fileNumberDigits = Config.getAsInteger("exporter.csv.file_number_digits", 1);
73+
} catch (NumberFormatException ex) {
74+
// if the property is present but not a numeric string
75+
fileNumberDigits = 1;
76+
}
77+
}
78+
6179
private void initializeOutputDirectory() {
6280
File output = Exporter.getOutputFolder("csv", null);
6381
output.mkdirs();
@@ -72,6 +90,24 @@ private void initializeOutputDirectory() {
7290
}
7391
}
7492

93+
private boolean multipleFilesPerResource() {
94+
return maxLinesPerFile > 0;
95+
}
96+
97+
private String filename(String resourceKey) {
98+
return resourceKey + ".csv";
99+
}
100+
101+
private String filename(String resourceKey, int fileNumber) {
102+
String formattedNumber = String.valueOf(fileNumber);
103+
104+
if (fileNumberDigits > 1) {
105+
formattedNumber = String.format("%0" + fileNumberDigits + "d", fileNumber);
106+
}
107+
108+
return resourceKey + "-" + formattedNumber + ".csv";
109+
}
110+
75111
private void initializeIncludedAndExcludedFiles() {
76112
String includedFilesStr = Config.get("exporter.csv.included_files", "").trim();
77113
String excludedFilesStr = Config.get("exporter.csv.excluded_files", "").trim();
@@ -126,6 +162,25 @@ private static List<String> propStringToList(String fileListString) {
126162
return files;
127163
}
128164

165+
private int incrementResourceCount(String resourceKey) {
166+
Integer resourceCount = resourceCountMap.get(resourceKey);
167+
168+
if (resourceCount == null) {
169+
resourceCount = 0;
170+
}
171+
172+
resourceCount++;
173+
resourceCountMap.put(resourceKey, resourceCount);
174+
175+
return resourceCount;
176+
}
177+
178+
private boolean resourceIsExcluded(String resourceKey) {
179+
String baseFilename = filename(resourceKey);
180+
return (!includedFiles.isEmpty() && !includedFiles.contains(baseFilename))
181+
|| excludedFiles.contains(baseFilename);
182+
}
183+
129184
/**
130185
* Helper method to instantiate, if necessary, and return the writer for the
131186
* resource type's CSV file. Returns a "no-op" writer for any excluded files.
@@ -134,20 +189,87 @@ private static List<String> propStringToList(String fileListString) {
134189
*
135190
* @return OutputStreamWriter for the given resource type's CSV file
136191
*/
137-
private OutputStreamWriter getResourceWriter(String resourceKey) throws IOException {
138-
String baseFilename = CSVConstants.BASE_FILENAME_MAP.get(resourceKey);
139-
boolean excluded = (!includedFiles.isEmpty() && !includedFiles.contains(baseFilename))
140-
|| excludedFiles.contains(baseFilename);
141-
if (excluded) {
192+
private OutputStreamWriter initializeResourceWriter(String resourceKey) throws IOException {
193+
if (resourceIsExcluded(resourceKey)) {
142194
return NO_OP;
143195
}
144196

145-
String filename = filenameMap.get(resourceKey);
197+
String filename = filename(resourceKey);
146198
File file = outputDirectory.resolve(filename).toFile();
147199
// file writing may fail if we tell it to append to a file that doesn't already exist
148200
boolean appendToThisFile = append && file.exists();
149201

150-
return new OutputStreamWriter(new FileOutputStream(file, appendToThisFile), charset);
202+
OutputStreamWriter writer =
203+
new OutputStreamWriter(new FileOutputStream(file, appendToThisFile), charset);
204+
if (!append) {
205+
writer.write(CSVConstants.HEADER_LINE_MAP.get(resourceKey));
206+
}
207+
208+
return writer;
209+
}
210+
211+
/**
212+
* Helper method to instantiate, if necessary, and return the writer for the
213+
* resource type's CSV file. Returns a "no-op" writer for any excluded files.
214+
*
215+
* @param resourceKey Key from CSVConstants for the resource type being written
216+
*
217+
* @return OutputStreamWriter for the given resource type's CSV file
218+
*/
219+
private OutputStreamWriter initializeResourceWriter(String resourceKey, int resourceCount)
220+
throws IOException {
221+
if (resourceIsExcluded(resourceKey)) {
222+
return NO_OP;
223+
}
224+
225+
if (append && resourceCount == 1) {
226+
resourceCount = getResourceCount(resourceKey) + 1;
227+
resourceCountMap.put(resourceKey, resourceCount);
228+
}
229+
230+
int fileNumber = (resourceCount - 1) / maxLinesPerFile + 1;
231+
String filename = filename(resourceKey, fileNumber);
232+
233+
File file = outputDirectory.resolve(filename).toFile();
234+
// file writing may fail if we tell it to append to a file that doesn't already exist
235+
boolean appendToThisFile = append && file.exists();
236+
237+
OutputStreamWriter writer =
238+
new OutputStreamWriter(new FileOutputStream(file, appendToThisFile), charset);
239+
if (!append || resourceCount % maxLinesPerFile == 1) {
240+
writer.write(CSVConstants.HEADER_LINE_MAP.get(resourceKey));
241+
}
242+
243+
return writer;
244+
}
245+
246+
private int getResourceCount(String resourceKey) throws IOException {
247+
int fileNumber = 1;
248+
249+
String currentFilename = filename(resourceKey, fileNumber);
250+
File file = outputDirectory.resolve(currentFilename).toFile();
251+
252+
if (file.exists()) {
253+
do {
254+
fileNumber++;
255+
currentFilename = filename(resourceKey, fileNumber);
256+
file = outputDirectory.resolve(currentFilename).toFile();
257+
} while ((file.exists()));
258+
259+
fileNumber--;
260+
}
261+
262+
currentFilename = filename(resourceKey, fileNumber);
263+
file = outputDirectory.resolve(currentFilename).toFile();
264+
265+
int resourceCount = (fileNumber - 1) * maxLinesPerFile;
266+
267+
if (file.exists()) {
268+
String csvData = new String(Files.readAllBytes(file.toPath()));
269+
resourceCount += SimpleCSV.parse(csvData).size();
270+
}
271+
272+
return resourceCount;
151273
}
152274

153275
/**
@@ -158,13 +280,35 @@ private OutputStreamWriter getResourceWriter(String resourceKey) throws IOExcept
158280
* @return OutputStreamWriter for the given resource type's CSV file
159281
*/
160282
public OutputStreamWriter getWriter(String resourceKey) throws IOException {
283+
if (multipleFilesPerResource()) {
284+
return getWriterForMultipleFiles(resourceKey);
285+
}
286+
161287
OutputStreamWriter writer = writerMap.get(resourceKey);
162288
if (writer == null) {
163-
writer = getResourceWriter(resourceKey);
289+
writer = initializeResourceWriter(resourceKey);
164290
writerMap.put(resourceKey, writer);
165-
if (!append) {
166-
writer.write(CSVConstants.HEADER_LINE_MAP.get(resourceKey));
291+
}
292+
293+
return writer;
294+
}
295+
296+
private OutputStreamWriter getWriterForMultipleFiles(String resourceKey) throws IOException {
297+
if (resourceIsExcluded(resourceKey)) {
298+
return NO_OP;
299+
}
300+
301+
int resourceCount = incrementResourceCount(resourceKey);
302+
303+
OutputStreamWriter writer = writerMap.get(resourceKey);
304+
305+
if (resourceCount % maxLinesPerFile == 1) {
306+
if (writer != null) {
307+
writer.flush();
167308
}
309+
310+
writer = initializeResourceWriter(resourceKey, resourceCount);
311+
writerMap.put(resourceKey, writer);
168312
}
169313

170314
return writer;
@@ -177,7 +321,7 @@ public OutputStreamWriter getWriter(String resourceKey) throws IOException {
177321
*/
178322
public void flushWriter(String resourceKey) throws IOException {
179323
synchronized (resourceKey) {
180-
OutputStreamWriter writer = getWriter(resourceKey);
324+
OutputStreamWriter writer = writerMap.get(resourceKey);
181325
if (writer != null) {
182326
writer.flush();
183327
}

src/main/resources/synthea.properties

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ exporter.csv.folder_per_run = false
5353
# NOTE: the csv exporter does not actively delete files, so if you include a file in Run 1 and then exclude that file in Run 2, the version from Run 1 will still be present
5454
exporter.csv.included_files =
5555
exporter.csv.excluded_files = patient_expenses.csv
56+
# If set to a positive number, the output for resource types with more than this number
57+
# of resources will be split among multiple files. Leave blank (or 0) to write a single file per resource type.
58+
exporter.csv.max_lines_per_file =
59+
# When using multiple files, the numbers in the filenames will be zero-padded to
60+
# this many digits. Leave blank (or 1) for no padding.
61+
exporter.csv.file_number_digits =
5662

5763
exporter.cpcds.export = false
5864
exporter.cpcds.append_mode = false

0 commit comments

Comments
 (0)