@@ -659,7 +659,55 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None):
     if resource_format.upper() == "CSV":
         logger.info("Normalizing/UTF-8 transcoding {}...".format(resource_format))
     else:
-        logger.info("Normalizing/UTF-8 transcoding {} to CSV...".format(format))
+        # if not CSV (e.g. TSV, TAB, etc.) we need to normalize to CSV
+        logger.info(
+            "Normalizing/UTF-8 transcoding {} to CSV...".format(resource_format)
+        )
+
+    qsv_input_utf_8_encoded_csv = os.path.join(temp_dir, 'qsv_input_utf_8_encoded.csv')
+
+    # using uchardet to determine encoding
+    file_encoding = subprocess.run(
+        [
+            "uchardet",
+            tmp
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    logger.info("Identified encoding of the file: {}".format(file_encoding.stdout))
+
+    # trim the encoding string
+    file_encoding.stdout = file_encoding.stdout.strip()
+
+    # using iconv to re-encode in UTF-8
+    if file_encoding.stdout != "UTF-8":
+        logger.info("File is not UTF-8 encoded. Re-encoding from {} to UTF-8".format(
+            file_encoding.stdout)
+        )
+        try:
+            subprocess.run(
+                [
+                    "iconv",
+                    "-f",
+                    file_encoding.stdout,
+                    "-t",
+                    "UTF-8",
+                    tmp,
+                    "--output",
+                    qsv_input_utf_8_encoded_csv,
+                ],
+                check=True,
+            )
+        except subprocess.CalledProcessError as e:
+            # return as we can't push a non UTF-8 CSV
+            logger.error(
+                "Job aborted as the file cannot be re-encoded to UTF-8: {}.".format(e)
+            )
+            return
+    else:
+        qsv_input_utf_8_encoded_csv = tmp
     try:
         qsv_input = subprocess.run(
             [
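For reference, the new hunk shells out to uchardet to detect the file encoding and to iconv to transcode anything that is not already UTF-8. Below is a condensed, self-contained sketch of that detect-then-transcode step; the function name, file paths, and temp-directory handling are illustrative only, and it assumes the uchardet and iconv binaries are available on PATH.

```python
# Sketch of the detect-then-transcode step added above.
# Assumes `uchardet` and `iconv` are installed; paths are hypothetical.
import os
import subprocess
import tempfile


def transcode_to_utf8(src_path, temp_dir):
    """Return a path to a UTF-8 encoded copy of src_path (or src_path itself)."""
    # detect the encoding with uchardet; stdout is e.g. "UTF-8\n" or "WINDOWS-1252\n"
    detected = subprocess.run(
        ["uchardet", src_path], check=True, capture_output=True, text=True
    ).stdout.strip()

    if detected == "UTF-8":
        return src_path

    utf8_path = os.path.join(temp_dir, "utf_8_encoded.csv")
    # re-encode with iconv; a non-zero exit (check=True raises) means the bytes
    # could not be converted, which the job treats as a fatal error
    subprocess.run(
        ["iconv", "-f", detected, "-t", "UTF-8", src_path, "--output", utf8_path],
        check=True,
    )
    return utf8_path


if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp_dir:
        print(transcode_to_utf8("input.csv", tmp_dir))  # hypothetical input file
```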
@@ -693,7 +741,7 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None):
         )
     except subprocess.CalledProcessError as e:
         # return as we can't push an invalid CSV file
-        validate_error_msg = qsv_validate.stderr
+        validate_error_msg = e.stderr
         logger.error("Invalid CSV! Job aborted: {}.".format(validate_error_msg))
         return
     logger.info("Well-formed, valid CSV file confirmed...")
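The one-line fix above matters because the validation command is run with check=True and capture_output=True: when it fails, subprocess.run() raises before its result is ever assigned, so the captured stderr has to be read from the exception rather than from qsv_validate. A minimal sketch of that pattern, using a hypothetical failing command:

```python
import subprocess

try:
    # with check=True, a non-zero exit raises before qsv_validate is bound
    qsv_validate = subprocess.run(
        ["qsv", "validate", "no_such_file.csv"],  # hypothetical failing command
        check=True,
        capture_output=True,
        text=True,
    )
except subprocess.CalledProcessError as e:
    # the captured output lives on the exception object
    print("validation failed:", e.stderr)
```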
@@ -1359,6 +1407,7 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None):
     except psycopg2.Error as e:
         raise utils.JobError("Could not connect to the Datastore: {}".format(e))
     else:
+        copy_readbuffer_size = config.get("COPY_READBUFFER_SIZE")
         cur = raw_connection.cursor()
         """
         truncate table to use copy freeze option and further increase
@@ -1383,9 +1432,10 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None):
             sql.Identifier(resource_id),
             column_names,
         )
-        with open(tmp, "rb") as f:
+        # specify a 1MB buffer size for COPY read from disk
+        with open(tmp, "rb", copy_readbuffer_size) as f:
             try:
-                cur.copy_expert(copy_sql, f)
+                cur.copy_expert(copy_sql, f, size=copy_readbuffer_size)
             except psycopg2.Error as e:
                 raise utils.JobError("Postgres COPY failed: {}".format(e))
             else:
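The last two hunks wire a configurable read-buffer size into the COPY step: the value is passed both as the buffering argument to open() and as the size argument to psycopg2's copy_expert(), which controls how many bytes are read from the file per chunk. Below is a standalone sketch under assumed settings; the DSN, table name, file path, and the 1MB literal are placeholders, not values from the patch.

```python
import psycopg2
from psycopg2 import sql

COPY_READBUFFER_SIZE = 1048576  # assumed 1MB value for illustration

raw_connection = psycopg2.connect("dbname=datastore_default")  # hypothetical DSN
cur = raw_connection.cursor()

copy_sql = sql.SQL(
    "COPY {} FROM STDIN WITH (FORMAT CSV, HEADER 1, ENCODING 'UTF8');"
).format(sql.Identifier("my_resource_id"))  # hypothetical table name

# the same chunk size is used for buffered reads from disk and for the
# COPY stream, so data moves in 1MB blocks end to end
with open("data.csv", "rb", COPY_READBUFFER_SIZE) as f:  # hypothetical file
    cur.copy_expert(copy_sql, f, size=COPY_READBUFFER_SIZE)

raw_connection.commit()
cur.close()
raw_connection.close()
```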