Skip to content

Commit a6f9b06

Browse files
committed
Generate script PowerShell
Generate script, new option for PowerShell scripts
1 parent fab8c95 commit a6f9b06

File tree

4 files changed

+325
-31
lines changed

4 files changed

+325
-31
lines changed

CSVLintNppPlugin/CsvLint/CsvGenerateCode.cs

Lines changed: 253 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,18 @@ private static string GetColumnWidths(CsvDefinition csvdef, bool abspos)
7171
return res;
7272
}
7373

74+
/// <summary>
/// Append the standard "generated script" disclaimer to a script under construction.
/// Every disclaimer line is a '#' comment terminated with CRLF, and the block
/// ends with one extra blank line.
/// </summary>
/// <param name="sb"> builder holding the script text; the disclaimer is appended at its end </param>
private static void ScriptDisclaimer(StringBuilder sb)
{
    // disclaimer fragments, appended in order exactly as written
    string[] disclaimerParts =
    {
        "#\r\n# NOTE:\r\n",
        "# This is a generated script and it doesn't handle all potential data errors.\r\n",
        "# The script is meant as a starting point for processing your data files.\r\n",
        "# Adjust and expand the script for your specific data processing needs.\r\n",
        "# Always back-up your data files to prevent data loss.\r\n\r\n"
    };

    foreach (string part in disclaimerParts)
    {
        sb.Append(part);
    }
}
85+
7486
/// <summary>
7587
/// generate JSON metadata
7688
/// </summary>
@@ -189,6 +201,15 @@ public static void GenerateSchemaJSON(CsvDefinition csvdef)
189201
}
190202
}
191203

204+
/// <summary>
205+
/// Generate CSV data dictionary metadata (placeholder: not implemented yet, the body is empty)
206+
/// </summary>
207+
/// <param name="csvdef"> csv definition; currently unused </param>
208+
public static void GenerateDatadictionaryCSV(CsvDefinition csvdef)
209+
{
210+
// TODO: not implemented yet; this method currently does nothing
211+
}
212+
192213
/// <summary>
193214
/// generate Python Panda code based on columns (most asked on stackoverflow)
194215
/// </summary>
@@ -215,13 +236,10 @@ public static void GeneratePythonPanda(CsvDefinition csvdef)
215236
List<String> comment = CsvEdit.ScriptInfo(notepad);
216237
foreach (var str in comment) python.Append(string.Format("# {0}\r\n", str));
217238

218-
// warning
219-
python.Append("#\r\n# NOTE:\r\n");
220-
python.Append("# This is a generated script and it doesn't handle all potential data errors.\r\n");
221-
python.Append("# The script is meant as a starting point for processing your data files.\r\n");
222-
python.Append("# Adjust and expand the script for your specific data processing needs.\r\n");
223-
python.Append("# Always back-up your data files to prevent data loss.\r\n\r\n");
239+
// add standard disclaimer for generated scripts
240+
ScriptDisclaimer(python);
224241

242+
// start Python script
225243
python.Append("# Library\r\n");
226244
python.Append("import os\r\n");
227245
python.Append("import numpy as np\r\n");
@@ -476,13 +494,10 @@ public static void GenerateRScript(CsvDefinition csvdef)
476494
List<String> comment = CsvEdit.ScriptInfo(notepad);
477495
foreach (var str in comment) rscript.Append(string.Format("# {0}\r\n", str));
478496

479-
// warning
480-
rscript.Append("#\r\n# NOTE:\r\n");
481-
rscript.Append("# This is a generated script and it doesn't handle all potential data errors.\r\n");
482-
rscript.Append("# The script is meant as a starting point for processing your data files.\r\n");
483-
rscript.Append("# Adjust and expand the script for your specific data processing needs.\r\n");
484-
rscript.Append("# Always back-up your data files to prevent data loss.\r\n\r\n");
497+
// add standard disclaimer for generated scripts
498+
ScriptDisclaimer(rscript);
485499

500+
// start R-script
486501
rscript.Append("# Library\r\n");
487502
rscript.Append("library(dplyr)\r\n\r\n");
488503

@@ -674,5 +689,231 @@ public static void GenerateRScript(CsvDefinition csvdef)
674689
notepad.SetCurrentLanguage(LangType.L_R);
675690
}
676691
}
692+
693+
/// <summary>
/// Generate a PowerShell script based on the column definitions (sometimes asked on stackoverflow).
/// The script reads the current data file (character separated or fixed width), converts
/// columns to explicit datatypes, checks coded values and lists example transformations.
/// The result is placed in a new Notepad++ document.
/// </summary>
/// <param name="csvdef"> csv definition (fields, separator, header and skip-lines settings) of the current file </param>
public static void GeneratePowerShell(CsvDefinition csvdef)
{
    // get access to Notepad++
    INotepadPPGateway notepad = new NotepadPPGateway();
    IScintillaGateway editor = new ScintillaGateway(PluginBase.GetCurrentScintilla());

    // directory (always ending with a separator) and name of the current file
    string FILE_PATH = Path.GetDirectoryName(notepad.GetCurrentFilePath()).TrimEnd(Path.DirectorySeparatorChar) + Path.DirectorySeparatorChar;
    string FILE_NAME = Path.GetFileName(notepad.GetCurrentFilePath());

    StringBuilder ps1 = new StringBuilder();

    // build PowerShell script
    ps1.Append("# PowerShell - read csv with datatypes\r\n");

    // default comment
    List<String> comment = CsvEdit.ScriptInfo(notepad);
    foreach (var str in comment) ps1.Append(string.Format("# {0}\r\n", str));

    // add standard disclaimer for generated scripts
    ScriptDisclaimer(ps1);

    // start PowerShell script
    ps1.Append("# working directory and filename\r\n");
    // fix: no stray ')' after the closing quote, it made the generated script invalid
    ps1.Append(string.Format("$pathname = \"{0}\"\r\n", FILE_PATH));
    ps1.Append(string.Format("$filename = $pathname + \"{0}\"\r\n\r\n", FILE_NAME));

    var col_names = "";
    var col_order = "";
    var col_types = "";
    var col_enums = "";
    var check_enums = "";

    var exampleDate = "";

    for (int c = 0; c < csvdef.Fields.Count; c++)
    {
        // next field
        var coldef = csvdef.Fields[c];

        // column names are used as-is in the generated script
        var colname = coldef.Name;
        //colname = Regex.Replace(colname, "[^a-zA-Z0-9]", "_"); // not letter or digit
        var colnamepad = colname.PadRight(15, ' ');

        var comma = (c < csvdef.Fields.Count - 1 ? ", " : "");

        // list all column names
        col_names += string.Format("\"{0}\"{1}", colname, comma);
        col_order += string.Format("\t{0} = $_.{1}\r\n", colnamepad, colname);

        // enumeration
        if (coldef.isCodedValue)
        {
            var enumvals = string.Join("\", \"", coldef.CodedList);
            // Constraints for string or integer values
            if (coldef.DataType == ColumnType.String)
            {
                enumvals = string.Format("\"{0}\"", enumvals); // use quotes
            }
            else
            {
                enumvals = enumvals.Replace("\"", ""); // no quotes
            }
            col_enums += string.Format("${0}_array = @({1})\r\n", coldef.Name, enumvals);
            check_enums += string.Format("\tif (!(${0}_array -contains $row.{0})) {{$errmsg += \" $($row.{0}) is invalid {0}\"}}\r\n", coldef.Name);
        }

        // PowerShell datatype conversions
        switch (coldef.DataType)
        {
            case ColumnType.DateTime:
                // NOTE(review): DateMaskStandardToCstr is defined elsewhere; [datetime]::ParseExact
                // expects a .NET custom date format string - confirm the converted mask is suitable.
                var msk = coldef.Mask;
                msk = DateMaskStandardToCstr(msk);
                col_types += string.Format("\t$row.{0} = [datetime]::parseexact($row.{1}, '{2}', $null)\r\n", colnamepad, colname, msk);
                if (exampleDate == "") exampleDate = colname;
                break;
            case ColumnType.Integer:
                col_types += string.Format("\t$row.{0} = [int]($row.{1} -replace 'NaN', '')\r\n", colnamepad, colname);
                break;
            case ColumnType.Decimal:
                col_types += string.Format("\t$row.{0} = [decimal]($row.{1} -replace ',', '.')\r\n", colnamepad, colname);
                break;
            //default:
            //    col_types += string.Format("#\t$row.{0} = $row.{1}\r\n", colnamepad, colname);
            //    break;
        }
    }

    // csv-parameters
    var nameparam = "";
    var separator = csvdef.Separator.ToString();
    if (separator != "\0")
    {
        // a tab must be written as the PowerShell escape sequence `t
        if (separator == "\t") separator = "`t";
        nameparam = string.Format(" -Delimiter \"{0}\"", separator);
    }

    if (!csvdef.ColNameHeader)
    {
        // fix: append, so the -Delimiter parameter set above is not discarded
        nameparam += string.Format(" -Header @({0})", col_names);
    }

    // PowerShell skip comment lines
    if (csvdef.SkipLines > 0) nameparam += string.Format(" | Select-Object -Skip {0}", csvdef.SkipLines);

    // PowerShell comment character not supported(?)
    //if (csvdef.CommentChar != '\0') nameparam += string.Format(" -Comment '{0}'", csvdef.CommentChar);

    // read csv file
    if (csvdef.Separator == '\0')
    {
        // fixed width
        ps1.Append(string.Format("# read fixed width data file, positions {0}\r\n", GetColumnWidths(csvdef, true)));

        // fix: $filename already contains $pathname, so don't prepend the path a second time
        ps1.Append("$stream_in = [System.IO.StreamReader]::new($filename)\r\n\r\n");
        ps1.Append("$csvdata = while ($line = $stream_in.ReadLine()) {\r\n");
        ps1.Append("\t[PSCustomObject]@{\r\n");

        // fixed width columns
        var startpos = 0;
        for (int c = 0; c < csvdef.Fields.Count; c++)
        {
            // next field
            var coldef = csvdef.Fields[c];

            // space characters are not allowed in PowerShell customobject field names
            var colname = coldef.Name.PadRight(15, ' ');
            var strpos = startpos.ToString().PadLeft(3, ' ');
            var strwid = coldef.MaxWidth.ToString().PadLeft(2, ' ');
            ps1.Append(string.Format("\t\t{0} = $line.Substring({1}, {2}).Trim(' \"')\r\n", colname, strpos, strwid));
            startpos += coldef.MaxWidth;
        }
        ps1.Append("\t}\r\n}\r\n\r\n");
    }
    else
    {
        // character separated
        ps1.Append("# read csv data file\r\n");
        ps1.Append(string.Format("$csvdata = Import-Csv -Path $filename{0}\r\n\r\n", nameparam));
    }

    // column types
    if (col_types != "")
    {
        ps1.Append("# Explicit datatypes\r\n");
        ps1.Append("# WARNING: The script below doesn't have any eror handling for null/empty values,\r\n");
        ps1.Append("# so if your data file contains int, decimal or datetime columns with empty or incorrect values,\r\n");
        ps1.Append("# this script can throw errors or silently change values to '0', so beware.\r\n");
        ps1.Append("foreach ($row in $csvdata)\r\n{\r\n");
        ps1.Append(col_types);
        ps1.Append("}\r\n\r\n");
    }

    // PowerShell enumeration check
    if (col_enums != "")
    {
        ps1.Append("# Enumeration allowed values\r\n");
        ps1.Append(string.Format("{0}\r\n", col_enums));
        ps1.Append("# enumeration check invalid values\r\n");
        ps1.Append("$line = 0\r\n");
        // fix: emit the opening brace of the foreach body, matching the closing brace written below
        ps1.Append("foreach ($row in $csvdata)\r\n{\r\n");
        ps1.Append("\t# check invalid values\r\n");
        ps1.Append("\t$errmsg = \"\"\r\n");
        ps1.Append(string.Format("{0}\r\n", check_enums));
        ps1.Append("\t# report invalid values\r\n");
        ps1.Append("\t$line = $line + 1\r\n");
        ps1.Append("\tif ($errmsg) {Write-Output \"line $($line):$errmsg\" }\r\n}\r\n\r\n");
    }

    // placeholder column name for the date example when the file has no datetime column
    if (exampleDate == "") exampleDate = "myDateField";

    // PowerShell examples of typical data transformations
    ps1.Append("# --------------------------------------\r\n");
    ps1.Append("# Data transformation suggestions\r\n");
    ps1.Append("# --------------------------------------\r\n\r\n");

    ps1.Append("# Reorder or remove columns (edit code below)\r\n");
    ps1.Append("$csvnew = $csvdata | ForEach-Object {\r\n");
    ps1.Append("\t# Reorder columns\r\n");
    ps1.Append(col_order);
    ps1.Append("#\t# Add columns\r\n");
    ps1.Append(string.Format("#\t{0} = $_.{0}.ToString(\"yyyy-MM-dd\")\r\n", exampleDate));
    ps1.Append("#\tYesNo_code = switch ($_.YesNoValue) {\r\n");
    ps1.Append("#\t\t\t\"No\" {\"0\"}\r\n");
    ps1.Append("#\t\t\t\"Yes\" {\"1\"}\r\n");
    ps1.Append("#\t\t\tdefault {$_}\r\n");
    ps1.Append("#\t\t}\r\n");
    ps1.Append("#\tbmi = [math]::Round($_.Weight / ($_.Height * $_.Height), 2)\r\n");
    // fix: patientcode is a property of the current row, so it needs the $_. prefix
    ps1.Append("#\tcenter_patient = $_.centercode.SubString(0, 2) + \"-\" + $_.patientcode # '01-123' etc\r\n");
    ps1.Append("}\r\n\r\n");

    ps1.Append("# Merge datasets example, to join on multiple columns use a list, for example: on=['patient_id', 'center_id']\r\n");
    ps1.Append("# $merged_df = Join-Object -Left $PSCustomObject -Right $DataTable -LeftJoinProperty 'ID' -RightJoinProperty 'IDD' -ExcludeRightProperties 'Junk' -Prefix 'R_' | Format-Table # same key column name\r\n");
    ps1.Append("# $merged_df = Join-Object -Left $PSCustomObject -Right $DataTable -LeftJoinProperty 'ID' -RightJoinProperty 'IDD' -ExcludeRightProperties 'Junk' -Prefix 'R_' | Format-Table # different key column names\r\n\r\n");

    // the example output is always written tab separated
    ps1.Append("# csv write new output\r\n");
    ps1.Append("$filenew = $pathname + \"output.txt\"\r\n");
    ps1.Append("$csvnew | Export-Csv -Path $filenew -Delimiter \"`t\" -NoTypeInformation\r\n");

    // create new file
    notepad.FileNew();
    editor.SetText(ps1.ToString());
    // only switch on syntax highlighting when the script is below the configured size limit
    if (ps1.Length < Main.Settings.AutoSyntaxLimit)
    {
        notepad.SetCurrentLanguage(LangType.L_POWERSHELL);
    }
}
677918
}
678919
}

0 commit comments

Comments
 (0)