Skip to content

Commit 9b7286f

Browse files
committed
process the file using the native csv module to avoid high memory usage
1 parent 38a6847 commit 9b7286f

File tree

1 file changed

+18
-11
lines changed

1 file changed

+18
-11
lines changed

utilities/reduce_output_size.py

Lines changed: 18 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,26 @@
1-
import pandas as pd
21
import os
2+
import gzip
3+
import csv
34

45
file_path = 'test_output.csv'
56

67
# Check if the file exists
78
if os.path.exists(file_path):
8-
df = pd.read_csv(file_path)
9+
# Open the input and output files
10+
with open(file_path, 'r') as infile, gzip.open('test_output.csv.gz', 'wt', newline='') as outfile:
11+
reader = csv.DictReader(infile)
912

10-
# Columns to be rounded to four decimal places
11-
columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
12-
for column in columns_to_round:
13-
df[column] = df[column].round(4)
13+
# Drop b_values columns
14+
fieldnames = [field for field in reader.fieldnames if not field.startswith('bval_')]
15+
writer = csv.DictWriter(outfile, fieldnames=fieldnames)
16+
writer.writeheader()
1417

15-
#drop b_values columns.
16-
df = df.loc[:, ~df.columns.str.startswith('bval')]
17-
18-
#compress and save the file.
19-
df.to_csv('test_output.csv.gz', compression='gzip', index=False)
18+
columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
19+
20+
# Process each row
21+
for row in reader:
22+
filtered_row = {column: row[column] for column in fieldnames}
23+
for column in columns_to_round:
24+
if column in filtered_row:
25+
filtered_row[column] = round(float(filtered_row[column]), 4)
26+
writer.writerow(filtered_row)

0 commit comments

Comments (0)