Testing by running with plumbing data (#5)

kachulis · web-flow · commit d620e8664d17 · 2019-12-11T11:28:59.000-05:00
Add infrastructure for testing by running workflows with plumbing data, and some tests for BenchmarkVCFs
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -1,20 +1,77 @@
-# CircleCI 2.0 configuration file
+# CircleCI 2.1 configuration file
 #
 #
-version: 2
-jobs:
-  build:
+version: 2.1
+# Execution environment shared by the womtool-based validation jobs.
+executors:
+  womtool-executor:
     docker:
       # specify the version you desire here
       - image: broadinstitute/womtool:47-b36d920
         entrypoint: /bin/bash
     working_directory: ~/palantir-workflows
+    # `environment` must be a map of NAME: value pairs, not a NAME=value string.
+    # WOMTOOL_JAR is the jar the Makefile's validate targets run.
+    environment:
+      WOMTOOL_JAR: /app/womtool.jar
 
-    environment:
-      # Customize the JVM maximum heap limit
+# Reusable step sequences, invoked by name from the jobs below.
+commands:
+  get-cromwell-jar:
+    description: "Download cromwell jar"
+    steps:
+      - run: wget https://github.com/broadinstitute/cromwell/releases/download/47/cromwell-47.jar
+
+  install-make:
+    description: "Install make"
+    steps:
+      - run: |
+            apt-get update
+            apt-get install -y make
+jobs:
+  # Runs `make validate` inside the womtool image.
+  validate:
+    executor: womtool-executor
+    steps:
+      - checkout
+      - install-make
+      - run: make validate
 
+  # Runs `make validate-with-json` inside the womtool image.
+  validate-with-json:
+    executor: womtool-executor
     steps:
       - checkout
-      - run: apt-get update
-      - run: apt-get install -y make
-      - run: make all
+      - install-make
+      - run: make validate-with-json
+
+  # Runs `make test` on a machine executor and uploads the log directories.
+  test:
+    machine: true
+    environment:
+      # Must match where get-cromwell-jar leaves the download (assumes the
+      # checkout/working directory is /home/circleci/project).
+      CROMWELL_JAR: /home/circleci/project/cromwell-47.jar
+    steps:
+      - checkout
+      - get-cromwell-jar
+      - run: make test
+      - store_artifacts:
+          path: /home/circleci/project/logs
+      - store_artifacts:
+          path: /home/circleci/project/workflow_logs
+      - store_artifacts:
+          path: /home/circleci/project/call_logs
+
+
+workflows:
+  version: 2
+  # `test` starts only after both validation jobs have succeeded.
+  validate-and-test:
+    jobs:
+      - validate
+      - validate-with-json
+      - test:
+          requires:
+            - validate
+            - validate-with-json
diff --git a/.gitignore b/.gitignore
@@ -37,10 +37,16 @@ old_*
 *.recal
 *.recal.idx
 
+#cromwell executions folders
+*cromwell-executions*
+
 #hidden and backup files
 ._*
 .vimbup
 .*.done
 .queue
 *~
-.*.swp
+.*.swp
+
+#IntelliJ project files
+.idea
diff --git a/BenchmarkVCFs/BenchmarkVCFs.wdl b/BenchmarkVCFs/BenchmarkVCFs.wdl
@@ -94,6 +94,8 @@ workflow Benchmark {
         input:
             evalVcf=evalVcf,
             truthVcf=truthVcf,
+            evalVcfIndex=evalVcfIndex,
+            truthVcfIndex=truthVcfIndex,
             hapMap=hapMap,
             gatkTag=gatkTag,
             preemptible=preemptible
@@ -1385,8 +1387,10 @@ task CombineSummaries {
 #Use CrosscheckFingerprints to match evaluation vcfs to appropriate truth vcfs
 task MatchEvalTruth {
     input{
-        String evalVcf
-        String truthVcf
+        File evalVcf
+        File truthVcf
+        File evalVcfIndex
+        File truthVcfIndex
         File hapMap
         Int? preemptible
         Int? memoryMaybe
@@ -1397,6 +1401,21 @@ task MatchEvalTruth {
     Int memoryRam=memoryJava+2
     Int disk_size = 10 + ceil(size(hapMap, "GB"))
 
+    parameter_meta {
+        evalVcf: {
+            localization_optional: true
+        }
+        truthVcf: {
+            localization_optional: true
+        }
+        evalVcfIndex: {
+            localization_optional: true
+        }
+        truthVcfIndex: {
+            localization_optional: true
+        }
+    }
+
     command <<<
         gatk --java-options "-Xmx~{memoryJava}G" CrosscheckFingerprints -I ~{evalVcf} -SI ~{truthVcf} -H ~{hapMap} --CROSSCHECK_MODE CHECK_ALL_OTHERS --CROSSCHECK_BY FILE --EXPECT_ALL_GROUPS_TO_MATCH
     >>>
diff --git a/BenchmarkVCFs/BenchmarkVCFs.wdl.json b/BenchmarkVCFs/BenchmarkVCFs.wdl.json
diff --git a/Makefile b/Makefile
@@ -1,22 +1,54 @@
+# Validate and test the WDL workflows in this repository.
+#   CROMWELL_JAR  path to the Cromwell jar (needed by `make test`)
+#   WOMTOOL_JAR   path to the womtool jar (needed by the validate targets)
+
+# The test recipe uses `set -o pipefail`, which plain sh may not support.
+SHELL := /bin/bash
+
+# Simple (:=) assignment so each `find` runs once, when the Makefile is parsed.
+TEST_JSON := $(shell find test -name '*.json')
 
-all: validate validate_with_json
+VALIDATE_WDL := $(shell find . -name '*.wdl' ! -path './test/*')
 
-WDL= $(shell find . -name '*.wdl')
+TEST = java -jar $(CROMWELL_JAR) run
 
-VALIDATE=java -jar /app/womtool.jar validate
+VALIDATE = java -jar $(WOMTOOL_JAR) validate
 
-all-tests := $(addsuffix .test, $(basename $(WDL)))
-all-tests-json := $(addsuffix .json, $(all-tests))
+# Target names below are labels derived from input paths; no such files are created.
+all-tests := $(addsuffix .test, $(TEST_JSON))
 
-validate: $(all-tests)
-validate_with_json : $(all-tests) $(all-tests-json)
+all-validations := $(addsuffix .validate, $(VALIDATE_WDL))
 
-test : $(all-tests)
+all-validations-with-json := $(addsuffix .validate, $(TEST_JSON))
 
-test_with_json: test $(all-tests-json)
+.PHONY: all
+all: test validate
 
-%.test : %.wdl
-	$(VALIDATE)  $?
+.PHONY: test
+test: $(all-tests)
 
-%.test.json : %.wdl.json
-	$(VALIDATE) $(basename $? .json) -i $?
+.PHONY: validate
+validate: $(all-validations)
+
+.PHONY: validate-with-json
+validate-with-json: $(all-validations-with-json)
+
+# .PHONY takes literal target names, not patterns, so the pattern rules below
+# depend on an always-out-of-date FORCE target to make sure they run every time.
+.PHONY: FORCE
+FORCE:
+
+# <path>.wdl.validate: validate <path>.wdl with womtool.
+%.wdl.validate: FORCE
+	$(VALIDATE) $(basename $@)
+
+# <dir>/<name>_json/<input>.json.test: run <dir>/<name>.wdl with that input JSON
+# and copy the output to logs/. pipefail keeps a Cromwell failure from being
+# hidden by tee's exit status.
+%.json.test: FORCE
+	mkdir -p logs
+	set -o pipefail && $(TEST) $(subst _json/,.wdl, $(dir $(basename $@))) -i $(basename $@) -o test_options.json 2>&1 | tee logs/$(notdir $(subst _json/,.wdl, $(dir $(basename $@))))_with_$(notdir $(basename $@)).log
+
+# <dir>/<name>_json/<input>.json.validate: validate <dir>/<name>.wdl against that input JSON.
+%.json.validate: FORCE
+	$(VALIDATE) $(subst _json/,.wdl, $(dir $(basename $@))) -i $(basename $@)
diff --git a/README.md b/README.md
@@ -7,4 +7,21 @@ Utility workflows used by the DSP's Palantir team.  This repository should be us
 **Remember, this is a public repository, so anything you put in this repo is publicly viewable.**
 
 
-**To enable testable workflows, please make sure that a FILE.wdl.json is present with every FILE.wdl you add.
+## Testing Workflows
+
+All workflows should have associated tests.
+To add tests, add a test workflow under the `test` directory, in a subdirectory for the workflow being tested (for example `test/MyWorkflow/`).
+The test workflow should call the workflow you are testing, and (preferably) compare the outputs to those expected. 
+Input JSONs for the test workflow must be placed in a directory whose name is the same as the test workflow, with `.wdl` replaced by `_json`.
+So, the test directory structure will be built like this:
+
+```bash
++-- palantir-workflows
+|   +-- test
+|   |   +-- MyWorkflow
+|   |   |   +-- my_test_workflow.wdl
+|   |   |   +-- my_test_workflow_json
+|   |   |   |   +-- test_input_1.json
+|   |   |   |   +-- test_input_2.json
++++++++++++++++++
+```
diff --git a/test/BenchmarkVCFs/plumbing_data/test.confidence.interval_list b/test/BenchmarkVCFs/plumbing_data/test.confidence.interval_list
@@ -0,0 +1,5 @@
+@HD	VN:1.6	SO:coordinate
+@SQ	SN:chr1	LN:100
+chr1	1	17	+	.
+chr1	20	65	+	.
+chr1	70	97	+	.
diff --git a/test/BenchmarkVCFs/plumbing_data/test.eval.vcf b/test/BenchmarkVCFs/plumbing_data/test.eval.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=chr1,length=100>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SM-1
+chr1	10	.	G	T	30	.	.	GT	0/1
+chr1	22	.	CCACG	C	30	.	.	GT	1/1
+chr1	73	.	A	AGT	30	.	.	GT	1/1
+chr1	86	.	G	T	30	SUPER_FILTER	.	GT	0/1
diff --git a/test/BenchmarkVCFs/plumbing_data/test.fasta b/test/BenchmarkVCFs/plumbing_data/test.fasta
@@ -0,0 +1,3 @@
+>chr1	LN:100
+GGTGGAGCGCGCCGCCACGGACCACGGGCGGGCTGGCGGGCGAGCGGCGAGCGCGCGGCG
+ATCCGAGCCCCTAGGGCGGATCCCGGCTCCAGGCCCGCGC
diff --git a/test/BenchmarkVCFs/plumbing_data/test.fasta.dict b/test/BenchmarkVCFs/plumbing_data/test.fasta.dict
@@ -0,0 +1,2 @@
+@HD	VN:1.0	SO:unsorted
+@SQ	SN:chr1	LN:100	M5:cc081e3e70932dda461569ee09e668ba
diff --git a/test/BenchmarkVCFs/plumbing_data/test.fasta.fai b/test/BenchmarkVCFs/plumbing_data/test.fasta.fai
@@ -0,0 +1 @@
+chr1	100	13	60	61
diff --git a/test/BenchmarkVCFs/plumbing_data/test.haplotype_database.txt b/test/BenchmarkVCFs/plumbing_data/test.haplotype_database.txt
@@ -0,0 +1,6 @@
+@HD	VN:1.6	SO:coordinate
+@SQ	SN:chr1	LN:100
+#CHROMOSOME	POSITION	NAME	MAJOR_ALLELE	MINOR_ALLELE	MAF	ANCHOR_SNP	PANELS
+chr1	10	rs1	G	T	0.5
+chr1	22	rs2	CCACG	C	0.9
+chr1	73	rs3	A	AGT	0.5
diff --git a/test/BenchmarkVCFs/plumbing_data/test.truth.vcf b/test/BenchmarkVCFs/plumbing_data/test.truth.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=chr1,length=100>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SM-1
+chr1	10	.	G	T	30	.	.	GT	0/1
+chr1	22	.	CCACG	C	30	.	.	GT	1/1
+chr1	73	.	A	AGT	30	.	.	GT	0/1
+chr1	86	.	G	T	30	.	.	GT	0/1
diff --git a/test/BenchmarkVCFs/test_BenchmarkVCFs.wdl b/test/BenchmarkVCFs/test_BenchmarkVCFs.wdl
@@ -0,0 +1,117 @@
+version 1.0
+
+import "../../BenchmarkVCFs/BenchmarkVCFs.wdl"
+
+# Test harness for the Benchmark workflow: bgzips and indexes the two input
+# VCFs, runs Benchmark on them, and fails if an observed precision/recall
+# value differs from the expected one.
+workflow testBenchmarkVCFs {
+	input {
+		Float expected_snpPrecision
+		Float expected_indelPrecision
+		Float expected_snpRecall
+		Float expected_indelRecall
+
+		File evalVcf
+		File truthVcf
+	}
+
+	call BgzipAndIndex as BgzipAndIndexEval {
+		input:
+			vcf = evalVcf
+	}
+
+	call BgzipAndIndex as BgzipAndIndexTruth {
+		input:
+			vcf = truthVcf
+	}
+
+	# Only the VCFs and their indexes are wired here; any other Benchmark
+	# inputs are presumably supplied through the test input JSON.
+	call BenchmarkVCFs.Benchmark {
+		input:
+			evalVcf = BgzipAndIndexEval.bgzipped_vcf,
+			evalVcfIndex = BgzipAndIndexEval.vcf_index,
+			truthVcf = BgzipAndIndexTruth.bgzipped_vcf,
+			truthVcfIndex = BgzipAndIndexTruth.vcf_index
+	}
+
+	call AssertPassed {
+		input:
+			expected_snpPrecision = expected_snpPrecision,
+			expected_indelPrecision = expected_indelPrecision,
+			expected_snpRecall = expected_snpRecall,
+			expected_indelRecall = expected_indelRecall,
+			observed_snpPrecision = Benchmark.snpPrecision,
+			observed_indelPrecision = Benchmark.indelPrecision,
+			observed_snpRecall = Benchmark.snpRecall,
+			observed_indelRecall = Benchmark.indelRecall
+	}
+}
+
+# Compresses a VCF with bgzip and indexes the result with tabix.
+# The input is symlinked into the working directory so that bgzip writes
+# <name>.gz there, where the output section expects to find it.
+task BgzipAndIndex {
+	input {
+		File vcf
+	}
+
+	command <<<
+		set -xeuo pipefail
+
+		ln -s ~{vcf} .
+		bgzip ~{basename(vcf)}
+		tabix -p vcf ~{basename(vcf) + ".gz"}
+	>>>
+
+	runtime {
+		docker: "biocontainers/tabix@sha256:7e093436d00c01cf6ad7b285680bf1657f9fcb692cc083c972e5df5a7e951f49"
+	}
+
+	output {
+		File bgzipped_vcf = "~{basename(vcf) + '.gz'}"
+		File vcf_index = "~{basename(vcf) + '.gz.tbi'}"
+	}
+}
+
+# Exits non-zero if any observed metric differs from its expected value.
+# Values are compared as the text WDL interpolates for each Float, not
+# numerically, so there is no tolerance.
+task AssertPassed {
+	input {
+		Float expected_snpPrecision
+		Float expected_indelPrecision
+		Float expected_snpRecall
+		Float expected_indelRecall
+
+		Float observed_snpPrecision
+		Float observed_indelPrecision
+		Float observed_snpRecall
+		Float observed_indelRecall
+	}
+
+	command <<<
+		set -euo pipefail
+
+		assert_eq() {
+			local variable="$1"
+			local expected="$2"
+			local observed="$3"
+
+			if [[ "$expected" != "$observed" ]]; then
+				>&2 echo "$variable expected to be $expected, observed as $observed"
+				exit 1
+			fi
+		}
+
+		assert_eq snpPrecision ~{expected_snpPrecision} ~{observed_snpPrecision}
+		assert_eq indelPrecision ~{expected_indelPrecision} ~{observed_indelPrecision}
+		assert_eq snpRecall ~{expected_snpRecall} ~{observed_snpRecall}
+		assert_eq indelRecall ~{expected_indelRecall} ~{observed_indelRecall}
+	>>>
+
+	runtime {
+		docker: "ubuntu@sha256:134c7fe821b9d359490cd009ce7ca322453f4f2d018623f849e580a89a685e5d"
+	}
+}
diff --git a/test/BenchmarkVCFs/test_BenchmarkVCFs_json/BenchmarkVCFs.test.ignoreFilters.wdl.json b/test/BenchmarkVCFs/test_BenchmarkVCFs_json/BenchmarkVCFs.test.ignoreFilters.wdl.json
diff --git a/test/BenchmarkVCFs/test_BenchmarkVCFs_json/BenchmarkVCFs.wdl.test.json b/test/BenchmarkVCFs/test_BenchmarkVCFs_json/BenchmarkVCFs.wdl.test.json
diff --git a/test/BenchmarkVCFs/test_BenchmarkVCFs_json/BenchmarkVCFs.wdl.test.unmatchingGenotypes.json b/test/BenchmarkVCFs/test_BenchmarkVCFs_json/BenchmarkVCFs.wdl.test.unmatchingGenotypes.json
diff --git a/test_options.json b/test_options.json

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+>chr1 LN:100`
	`2`	`+GGTGGAGCGCGCCGCCACGGACCACGGGCGGGCTGGCGGGCGAGCGGCGAGCGCGCGGCG`
	`3`	`+ATCCGAGCCCCTAGGGCGGATCCCGGCTCCAGGCCCGCGC`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+@HD VN:1.0 SO:unsorted`
	`2`	`+@SQ SN:chr1 LN:100 M5:cc081e3e70932dda461569ee09e668ba`