Process the file row by row with the standard-library csv module to avoid high memory usage

AhmedBasem20 · AhmedBasem20 · commit 9b7286fcc660 · 2024-08-07T16:24:24.000+03:00
diff --git a/utilities/reduce_output_size.py b/utilities/reduce_output_size.py
@@ -1,19 +1,26 @@
-import pandas as pd
 import os
+import gzip
+import csv
 
 file_path = 'test_output.csv'
 
 # Check if the file exists
 if os.path.exists(file_path):
-    df = pd.read_csv(file_path)
+    # Open the input and output files
+    with open(file_path, 'r') as infile, gzip.open('test_output.csv.gz', 'wt', newline='') as outfile:
+        reader = csv.DictReader(infile)
 
-    # Columns to be rounded to four decimal places
-    columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
-    for column in columns_to_round:
-        df[column] = df[column].round(4)
+        # Drop the b-value columns (those whose names start with 'bval_')
+        fieldnames = [field for field in reader.fieldnames if not field.startswith('bval_')]
+        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
+        writer.writeheader()
 
-    #drop b_values columns.
-    df = df.loc[:, ~df.columns.str.startswith('bval')] 
-
-    #compress and save the file.
-    df.to_csv('test_output.csv.gz', compression='gzip', index=False)
+        columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
+        
+        # Process each row
+        for row in reader:
+            filtered_row = {column: row[column] for column in fieldnames}
+            for column in columns_to_round:
+                if column in filtered_row:
+                    filtered_row[column] = round(float(filtered_row[column]), 4)
+            writer.writerow(filtered_row)