@@ -71,6 +71,18 @@ private static string GetColumnWidths(CsvDefinition csvdef, bool abspos)
71
71
return res ;
72
72
}
73
73
74
/// <summary>
/// Standard disclaimer for generated scripts
/// </summary>
/// <param name="sb"> script text under construction, the disclaimer comment lines are appended to it </param>
private static void ScriptDisclaimer(StringBuilder sb)
{
    // every line starts with '#' so the same text can be shared by all generated
    // script languages in this file (Python, R and PowerShell all use '#' comments)
    sb.Append("#\r\n# NOTE:\r\n");
    sb.Append("# This is a generated script and it doesn't handle all potential data errors.\r\n");
    sb.Append("# The script is meant as a starting point for processing your data files.\r\n");
    sb.Append("# Adjust and expand the script for your specific data processing needs.\r\n");

    // the extra line break leaves one empty line between the disclaimer and the script itself
    sb.Append("# Always back-up your data files to prevent data loss.\r\n\r\n");
}
85
+
74
86
/// <summary>
75
87
/// generate JSON metadata
76
88
/// </summary>
@@ -189,6 +201,15 @@ public static void GenerateSchemaJSON(CsvDefinition csvdef)
189
201
}
190
202
}
191
203
204
/// <summary>
/// generate CSV datadictionary metadata
/// </summary>
/// <param name="csvdef"> csv definition to describe (currently unused, the method is not implemented yet) </param>
public static void GenerateDatadictionaryCSV(CsvDefinition csvdef)
{
    // TODO: not implemented yet, calling this method has no effect
}
212
+
192
213
/// <summary>
193
214
/// generate Python Panda code based on columns (most asked on stackoverflow)
194
215
/// </summary>
@@ -215,13 +236,10 @@ public static void GeneratePythonPanda(CsvDefinition csvdef)
215
236
List < String > comment = CsvEdit . ScriptInfo ( notepad ) ;
216
237
foreach ( var str in comment ) python . Append ( string . Format ( "# {0}\r \n " , str ) ) ;
217
238
218
- // warning
219
- python . Append ( "#\r \n # NOTE:\r \n " ) ;
220
- python . Append ( "# This is a generated script and it doesn't handle all potential data errors.\r \n " ) ;
221
- python . Append ( "# The script is meant as a starting point for processing your data files.\r \n " ) ;
222
- python . Append ( "# Adjust and expand the script for your specific data processing needs.\r \n " ) ;
223
- python . Append ( "# Always back-up your data files to prevent data loss.\r \n \r \n " ) ;
239
+ // add standard disclaimer for generated scripts
240
+ ScriptDisclaimer ( python ) ;
224
241
242
+ // start Python script
225
243
python . Append ( "# Library\r \n " ) ;
226
244
python . Append ( "import os\r \n " ) ;
227
245
python . Append ( "import numpy as np\r \n " ) ;
@@ -476,13 +494,10 @@ public static void GenerateRScript(CsvDefinition csvdef)
476
494
List < String > comment = CsvEdit . ScriptInfo ( notepad ) ;
477
495
foreach ( var str in comment ) rscript . Append ( string . Format ( "# {0}\r \n " , str ) ) ;
478
496
479
- // warning
480
- rscript . Append ( "#\r \n # NOTE:\r \n " ) ;
481
- rscript . Append ( "# This is a generated script and it doesn't handle all potential data errors.\r \n " ) ;
482
- rscript . Append ( "# The script is meant as a starting point for processing your data files.\r \n " ) ;
483
- rscript . Append ( "# Adjust and expand the script for your specific data processing needs.\r \n " ) ;
484
- rscript . Append ( "# Always back-up your data files to prevent data loss.\r \n \r \n " ) ;
497
+ // add standard disclaimer for generated scripts
498
+ ScriptDisclaimer ( rscript ) ;
485
499
500
+ // start R-script
486
501
rscript . Append ( "# Library\r \n " ) ;
487
502
rscript . Append ( "library(dplyr)\r \n \r \n " ) ;
488
503
@@ -674,5 +689,231 @@ public static void GenerateRScript(CsvDefinition csvdef)
674
689
notepad . SetCurrentLanguage ( LangType . L_R ) ;
675
690
}
676
691
}
692
+
693
/// <summary>
/// generate PowerShell code based on columns (sometimes asked on stackoverflow)
/// </summary>
/// <remarks>
/// The generated script is placed in a new Notepad++ document. It reads the current
/// data file (character separated or fixed width), converts typed columns, checks
/// enumeration columns and ends with a couple of data transformation suggestions.
/// </remarks>
/// <param name="csvdef"> csv definition, the columns and csv parameters the script is generated for </param>
public static void GeneratePowerShell(CsvDefinition csvdef)
{
    // get access to Notepad++
    INotepadPPGateway notepad = new NotepadPPGateway();
    IScintillaGateway editor = new ScintillaGateway(PluginBase.GetCurrentScintilla());

    // directory (with exactly one trailing separator) and name of the current data file;
    // GetDirectoryName can return null (e.g. for a root path), treat that as "no directory"
    string currentFile = notepad.GetCurrentFilePath();
    string filePath = (Path.GetDirectoryName(currentFile) ?? string.Empty).TrimEnd(Path.DirectorySeparatorChar) + Path.DirectorySeparatorChar;
    string fileName = Path.GetFileName(currentFile);

    StringBuilder ps1 = new StringBuilder();

    // build PowerShell script
    ps1.Append("# PowerShell - read csv with datatypes\r\n");

    // default comment
    List<String> comment = CsvEdit.ScriptInfo(notepad);
    foreach (var str in comment) ps1.Append(string.Format("# {0}\r\n", str));

    // add standard disclaimer for generated scripts
    ScriptDisclaimer(ps1);

    // start PowerShell script
    ps1.Append("# working directory and filename\r\n");
    ps1.Append(string.Format("$pathname = \"{0}\"\r\n", filePath));
    ps1.Append(string.Format("$filename = $pathname + \"{0}\"\r\n\r\n", fileName));

    // script fragments collected per column, inserted into the script further down
    var colNames = new StringBuilder();    // quoted column names for Import-Csv -Header
    var colOrder = new StringBuilder();    // "name = $_.name" lines for the reorder example
    var colTypes = new StringBuilder();    // datatype conversion statements
    var colEnums = new StringBuilder();    // arrays with the allowed values of coded columns
    var checkEnums = new StringBuilder();  // checks of each row against those arrays

    // first datetime column, used in the "add columns" example
    var exampleDate = "";

    for (int c = 0; c < csvdef.Fields.Count; c++)
    {
        // next field
        var coldef = csvdef.Fields[c];

        // column name is used as-is
        // NOTE(review): names containing spaces or special characters probably need quoting
        // in the generated PowerShell code, adjust the generated script by hand in that case
        var colname = coldef.Name;
        var colnamepad = colname.PadRight(15, ' ');

        var comma = (c < csvdef.Fields.Count - 1 ? ", " : "");

        // list all column names
        colNames.AppendFormat("\"{0}\"{1}", colname, comma);
        colOrder.AppendFormat("\t\t{0} = $_.{1}\r\n", colnamepad, colname);

        // enumeration
        if (coldef.isCodedValue)
        {
            var enumvals = string.Join("\", \"", coldef.CodedList);

            // constraints for string or integer values
            if (coldef.DataType == ColumnType.String)
            {
                enumvals = string.Format("\"{0}\"", enumvals); // use quotes
            }
            else
            {
                enumvals = enumvals.Replace("\"", ""); // no quotes
            }

            colEnums.AppendFormat("${0}_array = @({1})\r\n", coldef.Name, enumvals);

            // every message starts with a space, so several messages for the same row stay readable
            checkEnums.AppendFormat("\tif (!(${0}_array -contains $row.{0})) {{$errmsg += \" $($row.{0}) is invalid {0}\"}}\r\n", coldef.Name);
        }

        // PowerShell datatypes, string columns need no conversion
        switch (coldef.DataType)
        {
            case ColumnType.DateTime:
                // [datetime]::ParseExact expects a .NET custom date format string, not a C/strftime
                // mask like "%m/%d/%Y", so the column mask is passed on without conversion
                // NOTE(review): assumes coldef.Mask is a .NET style mask, e.g. "M/d/yyyy HH:m:s" - confirm
                colTypes.AppendFormat("\t$row.{0} = [datetime]::parseexact($row.{1}, '{2}', $null)\r\n", colnamepad, colname, coldef.Mask);
                if (exampleDate == "") exampleDate = colname;
                break;
            case ColumnType.Integer:
                colTypes.AppendFormat("\t$row.{0} = [int]($row.{1} -replace 'NaN', '')\r\n", colnamepad, colname);
                break;
            case ColumnType.Decimal:
                colTypes.AppendFormat("\t$row.{0} = [decimal]($row.{1} -replace ',', '.')\r\n", colnamepad, colname);
                break;
        }
    }

    // csv-parameters
    var nameparam = "";
    var separator = csvdef.Separator.ToString();
    if (separator != "\0")
    {
        // a tab character is written as the PowerShell escape sequence
        if (separator == "\t") separator = "`t";
        nameparam = string.Format(" -Delimiter \"{0}\"", separator);
    }

    // no header line in the data, then pass the column names explicitly;
    // append, so the delimiter parameter set above is not lost
    if (!csvdef.ColNameHeader)
    {
        nameparam += string.Format(" -Header @({0})", colNames);
    }

    // PowerShell skip comment lines
    if (csvdef.SkipLines > 0) nameparam += string.Format(" | Select-Object -Skip {0}", csvdef.SkipLines);

    // PowerShell comment character not supported(?)

    // read csv file
    if (csvdef.Separator == '\0')
    {
        // fixed width
        ps1.Append(string.Format("# read fixed width data file, positions {0}\r\n", GetColumnWidths(csvdef, true)));

        // $filename already contains the directory, so don't prefix it with $pathname again
        ps1.Append("$stream_in = [System.IO.StreamReader]::new($filename)\r\n\r\n");
        ps1.Append("$csvdata = while ($line = $stream_in.ReadLine()) {\r\n");
        ps1.Append("\t[PSCustomObject]@{\r\n");

        // fixed width columns, each column starts where the previous one ended
        var startpos = 0;
        for (int c = 0; c < csvdef.Fields.Count; c++)
        {
            // next field
            var coldef = csvdef.Fields[c];

            var colname = coldef.Name.PadRight(15, ' ');
            var strpos = startpos.ToString().PadLeft(3, ' ');
            var strwid = coldef.MaxWidth.ToString().PadLeft(2, ' ');
            ps1.Append(string.Format("\t\t{0} = $line.Substring({1}, {2}).Trim(' \"')\r\n", colname, strpos, strwid));
            startpos += coldef.MaxWidth;
        }
        ps1.Append("\t}\r\n}\r\n");

        // release the file handle when all lines are read
        ps1.Append("$stream_in.Close()\r\n\r\n");
    }
    else
    {
        // character separated
        ps1.Append("# read csv data file\r\n");
        ps1.Append(string.Format("$csvdata = Import-Csv -Path $filename{0}\r\n\r\n", nameparam));
    }

    // column types
    if (colTypes.Length > 0)
    {
        ps1.Append("# Explicit datatypes\r\n");
        ps1.Append("# WARNING: The script below doesn't have any error handling for null/empty values,\r\n");
        ps1.Append("# so if your data file contains int, decimal or datetime columns with empty or incorrect values,\r\n");
        ps1.Append("# this script can throw errors or silently change values to '0', so beware.\r\n");
        ps1.Append("foreach ($row in $csvdata)\r\n{\r\n");
        ps1.Append(colTypes.ToString());
        ps1.Append("}\r\n\r\n");
    }

    // PowerShell enumeration check
    if (colEnums.Length > 0)
    {
        ps1.Append("# Enumeration allowed values\r\n");
        ps1.Append(colEnums.ToString()).Append("\r\n");
        ps1.Append("# enumeration check invalid values\r\n");
        ps1.Append("$line = 0\r\n");

        // the opening brace pairs with the closing brace emitted after the report line below
        ps1.Append("foreach ($row in $csvdata)\r\n{\r\n");
        ps1.Append("\t# check invalid values\r\n");
        ps1.Append("\t$errmsg = \"\"\r\n");
        ps1.Append(checkEnums.ToString()).Append("\r\n");
        ps1.Append("\t# report invalid values\r\n");
        ps1.Append("\t$line = $line + 1\r\n");
        ps1.Append("\tif ($errmsg) {Write-Output \"line $($line):$errmsg\"}\r\n}\r\n\r\n");
    }

    if (exampleDate == "") exampleDate = "myDateField";

    // PowerShell examples of typical data transformations
    ps1.Append("# --------------------------------------\r\n");
    ps1.Append("# Data transformation suggestions\r\n");
    ps1.Append("# --------------------------------------\r\n\r\n");

    // the "name = value" lines are only valid inside a hashtable literal, so every row
    // is rebuilt as a [PSCustomObject], same as in the fixed width reader above
    ps1.Append("# Reorder or remove columns (edit code below)\r\n");
    ps1.Append("$csvnew = $csvdata | ForEach-Object {\r\n");
    ps1.Append("\t[PSCustomObject]@{\r\n");
    ps1.Append("\t\t# Reorder columns\r\n");
    ps1.Append(colOrder.ToString());
    ps1.Append("#\t\t# Add columns\r\n");
    ps1.Append(string.Format("#\t\t{0} = $_.{0}.ToString(\"yyyy-MM-dd\")\r\n", exampleDate));
    ps1.Append("#\t\tYesNo_code = switch ($_.YesNoValue) {\r\n");
    ps1.Append("#\t\t\t\t\"No\" {\"0\"}\r\n");
    ps1.Append("#\t\t\t\t\"Yes\" {\"1\"}\r\n");
    ps1.Append("#\t\t\t\tdefault {$_}\r\n");
    ps1.Append("#\t\t\t}\r\n");
    ps1.Append("#\t\tbmi = [math]::Round($_.Weight / ($_.Height * $_.Height), 2)\r\n");
    ps1.Append("#\t\tcenter_patient = $_.centercode.SubString(0, 2) + \"-\" + $_.patientcode # '01-123' etc\r\n");
    ps1.Append("\t}\r\n");
    ps1.Append("}\r\n\r\n");

    ps1.Append("# Merge datasets example, to join on multiple columns use a list, for example: on=['patient_id', 'center_id']\r\n");
    ps1.Append("# $merged_df = Join-Object -Left $PSCustomObject -Right $DataTable -LeftJoinProperty 'ID' -RightJoinProperty 'IDD' -ExcludeRightProperties 'Junk' -Prefix 'R_' | Format-Table # same key column name\r\n");
    ps1.Append("# $merged_df = Join-Object -Left $PSCustomObject -Right $DataTable -LeftJoinProperty 'ID' -RightJoinProperty 'IDD' -ExcludeRightProperties 'Junk' -Prefix 'R_' | Format-Table # different key column names\r\n\r\n");

    // the example output is always written tab separated
    ps1.Append("# csv write new output\r\n");
    ps1.Append("$filenew = $pathname + \"output.txt\"\r\n");
    ps1.Append("$csvnew | Export-Csv -Path $filenew -Delimiter \"`t\" -NoTypeInformation\r\n");

    // create new file
    notepad.FileNew();
    editor.SetText(ps1.ToString());

    // syntax highlighting only for scripts below the configured size limit
    if (ps1.Length < Main.Settings.AutoSyntaxLimit)
    {
        notepad.SetCurrentLanguage(LangType.L_POWERSHELL);
    }
}
677
918
}
678
919
}
0 commit comments