Skip to content

Commit 97f57c2

Browse files
author
u0028003
committed
Incorporation of spot request cancellation and instance termination
functionality into JobRunner. Menu txt changes to VersionManager
1 parent 479a8bb commit 97f57c2

File tree

4 files changed

+142
-68
lines changed

4 files changed

+142
-68
lines changed

pom.xml

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,19 @@
3838
<artifactId>junit</artifactId>
3939
<version>4.13.1</version>
4040
</dependency>
41-
<dependency>
42-
<groupId>com.sun.mail</groupId>
43-
<artifactId>javax.mail</artifactId>
44-
<version>1.6.0</version>
45-
</dependency>
46-
<dependency>
47-
<groupId>software.amazon.awssdk</groupId>
48-
<artifactId>dynamodb</artifactId>
41+
<dependency>
42+
<groupId>com.sun.mail</groupId>
43+
<artifactId>javax.mail</artifactId>
44+
<version>1.6.0</version>
45+
</dependency>
46+
<dependency>
47+
<groupId>com.googlecode.json-simple</groupId>
48+
<artifactId>json-simple</artifactId>
49+
<version>1.1.1</version>
50+
</dependency>
51+
<dependency>
52+
<groupId>software.amazon.awssdk</groupId>
53+
<artifactId>dynamodb</artifactId>
4954
</dependency>
5055
<dependency>
5156
<groupId>software.amazon.awssdk</groupId>

src/main/java/edu/utah/hci/aws/apps/jobrunner/JobRunner.java

Lines changed: 113 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,14 @@
99
import java.net.UnknownHostException;
1010
import java.util.regex.Matcher;
1111
import java.util.regex.Pattern;
12+
import org.json.simple.JSONArray;
13+
import org.json.simple.JSONObject;
14+
import org.json.simple.parser.JSONParser;
1215
import edu.utah.hci.aws.util.Util;
1316
import java.util.ArrayList;
1417
import java.util.Arrays;
1518
import java.util.HashMap;
19+
import java.util.Iterator;
1620

1721

1822
/**Looks for bash scripts in a particular s3 bucket, renames each, downloads, and runs, and transfers back jobs results
@@ -75,7 +79,7 @@ public class JobRunner {
7579
private boolean verbose = false;
7680
private boolean syncDirs = true;
7781
private boolean terminateInstance = false;
78-
private int minToWait = 60;
82+
private int minToWait = 10;
7983
private boolean testing = false;
8084

8185
//internal fields
@@ -86,8 +90,10 @@ public class JobRunner {
8690
private String ram = "NA";
8791
private String availableDisk = "NA";
8892
private String availabilityZone = null;
93+
private String region = null;
8994
private String instanceId = null;
9095
private String instanceType = null;
96+
private String spotId = null;
9197
private double spotPrice = 0;
9298
private String awsPath = "aws";
9399
private StringBuilder hostLog = new StringBuilder();
@@ -120,8 +126,12 @@ public JobRunner (String[] args){
120126
processArgs(args);
121127

122128
loadCredentials();
123-
129+
124130
checkAwsCli();
131+
132+
loadHostInfo();
133+
loadSpotInfo();
134+
printHostInfo();
125135

126136
checkResourceBundle();
127137

@@ -262,7 +272,7 @@ private double fetchSpotPrice(long currentTime) throws IOException {
262272
availabilityZone="us-west-2d";
263273
}
264274

265-
String[] cmd = {awsPath, "ec2", "describe-spot-price-history",
275+
String[] cmd = {awsPath, "--region", region, "ec2", "describe-spot-price-history",
266276
"--instance-types", instanceType,
267277
"--availability-zone", availabilityZone,
268278
"--start-time", seconds.toString(),
@@ -359,9 +369,17 @@ private void deleteAndCopyLocalJobDirWithS3JobDir() throws Exception {
359369
private void shutDown(int ec) throws Exception {
360370
exitCode = ec;
361371

372+
//kill the spot request
373+
if (spotId != null) {
374+
if (verbose) pl ("Canceling spot request...");
375+
String[] cmd = new String[]{awsPath, "--region", region, "ec2", "cancel-spot-instance-requests", "--spot-instance-request-ids", spotId};
376+
executeReturnExitCode(cmd, false, true, null);
377+
}
378+
362379
// this will system exit
363380
if (availabilityZone != null && terminateInstance) {
364-
String[] cmd = new String[]{awsPath, "ec2", "terminate-instances", "--instance-ids", instanceId};
381+
if (verbose) pl ("Terminating instance...");
382+
String[] cmd = new String[]{awsPath, "--region", region, "ec2", "terminate-instances", "--instance-ids", instanceId};
365383
executeReturnExitCode(cmd, false, true, null);
366384
}
367385
Util.pl("\tComplete");
@@ -833,12 +851,19 @@ private void loadCredentials() throws IOException {
833851
//check it, the downloaded file might be an error message from AWS about expired
834852
String[] lines = Util.loadTxtFile(credentialsFile);
835853
int keyCount = 0;
836-
for (String l: lines) if (l.contains("aws_access_key_id")) keyCount++;
854+
//String regionLine = null; //region = us-west-2
855+
for (String l: lines) {
856+
if (l.contains("aws_access_key_id")) keyCount++;
857+
//if (l.contains("region")) regionLine = l;
858+
}
837859
String merged = Util.stringArrayToString(lines, "\n\t");
838860
if (keyCount !=1 || merged.contains("region") == false || merged.contains("aws_access_key_id") == false || merged.contains("aws_secret_access_key") == false) {
839-
throw new IOException("\tError: the credential file is malformed -> "+credentialsUrl+ "\n\t"+merged+"\n\tSee the JobRunner help menu.");
861+
throw new IOException("\tError: the credential file is malformed, does it have just one set of credentials? with region, aws_access_key_id, and aws_secret_access_key? -> "+credentialsUrl+"\n\tSee the JobRunner help menu.");
840862
}
841863

864+
//String[] splitRegionLine = Util.EQUALS.split(regionLine);
865+
//region = splitRegionLine[1].trim();
866+
842867
//since it was downloaded, mark it for deletion upon exit
843868
credentialsFile.deleteOnExit();
844869
}
@@ -848,14 +873,11 @@ private void loadCredentials() throws IOException {
848873

849874

850875
//TODO: Only needed in Eclipse
851-
if (hostName.endsWith("local") || hostName.contains("utah")) {
876+
if (testing) {
852877
envPropToAdd.put("PATH", "/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/opt/X11/bin");
853878
awsPath="/usr/local/bin/aws";
854879
}
855880

856-
//fetch the spot price
857-
spotPrice = fetchSpotPrice(System.currentTimeMillis());
858-
859881
}
860882

861883

@@ -892,26 +914,18 @@ public void processArgs(String[] args) throws Exception {
892914
}
893915
}
894916
}
895-
917+
896918
//work directory, need this to find RAM on machine
897919
if (workDirectory == null) {
898920
throw new IOException("Error: failed to find your -t temporary local work directory.");
899921
}
900-
922+
901923
workDirectory.mkdirs();
902924
if (workDirectory.exists() == false || workDirectory.canWrite() == false) {
903925
throw new IOException("Error: failed to find a writable work directory -> "+ workDirectory);
904926
}
905927
tmpDirectory = new File (workDirectory, "TmpDir");
906928
tmpDirectory.mkdir();
907-
908-
//set node info
909-
loadHostInfo();
910-
911-
//add workingDir to env
912-
envPropToAdd.put("JR_WORKING_DIR", workDirectory.getCanonicalPath());
913-
914-
printParams();
915929

916930
// required args?
917931
if (resourceS3Uri == null || credentialsUrl == null || jobsS3Uri == null || logsS3Uri == null) {
@@ -928,8 +942,49 @@ public void processArgs(String[] args) throws Exception {
928942
}
929943
if (resourceS3Uri.endsWith(".zip") == false) throw new IOException("Error: the zip resource S3Uri must end with xxx.zip, see "+resourceS3Uri);
930944

945+
//add workingDir to env
946+
envPropToAdd.put("JR_WORKING_DIR", workDirectory.getCanonicalPath());
947+
948+
printParams();
949+
931950
}
951+
952+
private void loadSpotInfo() {
953+
String jsonString = null;
954+
try {
955+
if (instanceId == null) return;
956+
957+
958+
//fetch the spot price
959+
spotPrice = fetchSpotPrice(System.currentTimeMillis());
960+
961+
String[] cmd = {awsPath, "--region", region, "ec2", "describe-spot-instance-requests"};
932962

963+
String[] out = executeViaProcessBuilder(cmd, false, null);
964+
965+
jsonString = Util.stringArrayToString(out, " ");
966+
Object obj = new JSONParser().parse(jsonString);
967+
JSONObject jo = (JSONObject) obj;
968+
JSONArray ja = (JSONArray) jo.get("SpotInstanceRequests");
969+
970+
Iterator<JSONObject> it = ja.iterator();
971+
while (it.hasNext()) {
972+
JSONObject sr = it.next();
973+
String iid = sr.get("InstanceId").toString();
974+
if (iid.equals(instanceId)) {
975+
spotId = sr.get("SpotInstanceRequestId").toString();
976+
return;
977+
}
978+
}
979+
throw new Exception();
980+
} catch (Exception e) {
981+
el("\nFailed to fetch spot information for "+instanceId+" from ->\n"+jsonString+"\n"+Util.getStackTrace(e));
982+
e.printStackTrace();
983+
try {
984+
shutDown(1);
985+
} catch (Exception e1) {}
986+
}
987+
}
933988
/*Attempt to get machine info, doesn't work on a mac*/
934989
private void loadHostInfo() throws UnknownHostException {
935990

@@ -969,6 +1024,7 @@ private void loadHostInfo() throws UnknownHostException {
9691024
} catch (Exception e) {}
9701025
if (out!= null && out.length == 3) {
9711026
availabilityZone = Util.WHITESPACE.split(out[0])[1];
1027+
region = availabilityZone.substring(0, availabilityZone.length()-1);
9721028
instanceId = Util.WHITESPACE.split(out[1])[1];
9731029
hostName = instanceId;
9741030
instanceType = Util.WHITESPACE.split(out[2])[1];
@@ -991,9 +1047,12 @@ private void printParams() {
9911047
pl(" -l Node Logs S3 URI : "+ logsS3Uri);
9921048
pl(" -d Local work dir : "+ workDirectory);
9931049
pl(" -t Terminate node on exit : "+ terminateInstance);
994-
pl(" -w Min 2 wait before exit : "+ minToWait);
1050+
pl(" -w Min to wait before exit : "+ minToWait);
9951051
pl(" -x Replace S3 job with local : "+ (syncDirs==false));
9961052

1053+
}
1054+
1055+
private void printHostInfo() {
9971056
pl("\nJob Runner Info:");
9981057
pl(" Host name : "+ hostName);
9991058
pl(" Number processors : "+ numberProcessors);
@@ -1003,40 +1062,49 @@ private void printParams() {
10031062

10041063
if (availabilityZone != null) {
10051064
pl(" Availability Zone : "+ availabilityZone);
1065+
pl(" Region : "+ region);
10061066
pl(" Instance ID : "+ instanceId);
1067+
pl(" Spot Request ID : "+ spotId);
10071068
pl(" Instance Type : "+ instanceType);
10081069
pl(" Terminate upon exit : "+ terminateInstance);
10091070
}
10101071
}
10111072

10121073
public static void printDocs(){
10131074
System.out.println("\n" +
1014-
"****************************************************************************************************************************\n" +
1015-
"** AWS Job Runner : December 2021 **\n" +
1016-
"****************************************************************************************************************************\n" +
1017-
"JR is an app for running bash scripts on AWS EC2 nodes. It downloads and uncompressed your resource bundle and looks for\n"+
1018-
"xxx.sh_JR_START files in your S3 Jobs directories. For each, it copies over the directory contents, executes the\n"+
1019-
"associated xxx.sh script, and transfers back the results. This is repeated until no unrun jobs are found. Launch many\n"+
1020-
"EC2 JR nodes, each running an instance of the JR, to process hundreds of jobs in parallel. Use spot requests and\n"+
1021-
"hibernation to reduce costs.\n"+
1075+
"**************************************************************************************\n" +
1076+
"** AWS Job Runner : January 2021 **\n" +
1077+
"**************************************************************************************\n" +
1078+
"JR is an app for running bash scripts on AWS EC2 nodes. It downloads and uncompressed\n"+
1079+
"your resource bundle and looks for xxx.sh_JR_START files in your S3 Jobs directories.\n"+
1080+
"For each, it copies over the directory contents, executes the associated xxx.sh\n"+
1081+
"script, and transfers back the results. This is repeated until no unrun jobs are\n"+
1082+
"found. Launch many EC2 JR nodes, each running an instance of the JR, to process\n"+
1083+
"hundreds of jobs in parallel. Use spot requests and hibernation to reduce costs.\n"+
1084+
"Upon termination, JR will cancel the spot request and kill the instance.\n"+
10221085

10231086
"\nTo use:\n"+
1024-
"1) Install and configure the aws cli on your local workstation, see https://aws.amazon.com/cli/\n"+
1025-
"2) Upload your aws credentials file into a private bucket on aws, e.g.\n"+
1026-
" aws s3 cp ~/.aws/credentials s3://my-jr/aws.cred.txt\n"+
1087+
"1) Install and configure the aws cli on your local workstation, see\n"+
1088+
" https://aws.amazon.com/cli/\n"+
1089+
"2) Upload a [default] aws credential file containing a single set of region,\n"+
1090+
" aws_access_key_id, and aws_secret_access_key info into a private bucket, e.g.\n"+
1091+
" aws s3 cp ~/.aws/credentials s3://my-jr/aws.cred.txt \n"+
10271092
"3) Generate a secure 24hr timed URL for the credentials file, e.g.\n"+
10281093
" aws --region us-west-2 s3 presign s3://my-jr/aws.cred.txt --expires-in 259200\n"+
10291094
"4) Upload a zip archive containing resources needed to run your jobs into S3, e.g.\n"+
10301095
" aws s3 cp ~/TNRunnerResourceBundle.zip s3://my-jr/TNRunnerResourceBundle.zip\n"+
10311096
" This will be copied into the /JRDir/ directory and then unzipped.\n"+
10321097
"5) Upload script and job files into a 'Jobs' directory on S3, e.g.\n"+
10331098
" aws s3 cp ~/JRJobs/A/ s3://my-jr/Jobs/A/ --recursive\n"+
1034-
"6) Optional, upload bash script files ending with JR_INIT.sh and or JR_TERM.sh. These are executed by JR before and after\n"+
1035-
" running the main bash script. Use these to copy in sample specific resources, e.g. fastq/ cram/ bam files, and to run\n"+
1099+
"6) Optional, upload bash script files ending with JR_INIT.sh and or JR_TERM.sh. These\n"+
1100+
" are executed by JR before and after running the main bash script. Use these to\n"+
1101+
" copy in sample specific resources, e.g. fastq/ cram/ bam files, and to run\n"+
10361102
" post job clean up.\n"+
1037-
"7) Upload a file named XXX_JR_START to let the JobRunner know the bash script named XXX is ready to run, e.g.\n"+
1103+
"7) Upload a file named XXX_JR_START to let the JobRunner know the bash script named\n"+
1104+
" XXX is ready to run, e.g.\n"+
10381105
" aws s3 cp s3://my-jr/emptyFile s3://my-jr/Jobs/A/dnaAlignQC.sh_JR_START\n"+
1039-
"8) Launch the JobRunner.jar on one or more JR configured EC2 nodes. See https://ri-confluence.hci.utah.edu/x/gYCgBw\n"+
1106+
"8) Launch the JobRunner.jar on one or more JR configured EC2 nodes. See\n"+
1107+
" https://ri-confluence.hci.utah.edu/x/gYCgBw\n"+
10401108

10411109
"\nJob Runner Required Options:\n"+
10421110
"-c URL to your secure timed config credentials file.\n"+
@@ -1045,23 +1113,25 @@ public static void printDocs(){
10451113
"-l S3URI to your Log folder for node logs.\n"+
10461114

10471115
"\nDefault Options:\n"+
1048-
"-d Directory on the local worker node, full path, in which resources and job files will be processed, defaults to /JRDir/\n"+
1116+
"-d Directory on the local worker node, full path, in which resources and job files\n"+
1117+
" will be processed, defaults to /JRDir/\n"+
10491118
"-a Aws credentials directory, defaults to ~/.aws/\n"+
1050-
"-t Terminate the EC2 node upon job completion. Defaults to looking for jobs for the min2Wait.\n"+
1051-
"-w Minutes to wait when jobs are not found before termination, defaults to 10.\n"+
1052-
"-x Replace S3 job directories with processed analysis, defaults to syncing local with S3. WARNING, if selected, don't place\n"+
1053-
" any files in these S3 jobs directories that cannot be replaced. JR will delete them.\n"+
1119+
"-t Terminate the EC2 node upon program exit, defaults to leaving it running. \n"+
1120+
"-w Minutes to wait looking for jobs before exiting, defaults to 10.\n"+
1121+
"-x Replace S3 job directories with processed analysis, defaults to syncing local with\n"+
1122+
" S3. WARNING, if selected, don't place any files in these S3 jobs directories that\n"+
1123+
" cannot be replaced. JR will delete them.\n"+
10541124
"-v Verbose debugging output.\n"+
10551125

10561126
"\nExample: java -jar -Xmx1G JobRunner.jar -x -t \n"+
10571127
" -r s3://my-jr/TNRunnerResourceBundle.zip\n"+
10581128
" -j s3://my-jr/Jobs/\n"+
10591129
" -l s3://my-jr/NodeLogs/\n"+
1060-
" -c 'https://my-jr.s3.us-west-2.amazonaws.com/aws.cred.txt?X-Amz-Algorithm=AWS4-HMXXX...'\n\n"+
1130+
" -c 'https://my-jr.s3.us-west-2.amazonaws.com/aws.cred.txt?X-AmRun...'\n\n"+
10611131

10621132

10631133

1064-
"****************************************************************************************************************************\n");
1134+
"**************************************************************************************\n");
10651135

10661136
}
10671137

0 commit comments

Comments
 (0)