Skip to content

Commit 3bb2b8b

Browse files
authored
Add SVStratify and GroupedSVCluster tools (#8990)
1 parent 18707c6 commit 3bb2b8b

File tree

42 files changed

+4368
-543
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+4368
-543
lines changed

src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ public final class GATKSVVCFConstants {
1313
// VCF standard keys reserved for sv
1414
public static final String SVTYPE = "SVTYPE";
1515
public static final String SVLEN = "SVLEN";
16+
public static final String EVIDENCE = "EVIDENCE";
1617
public static final String IMPRECISE = "IMPRECISE";
1718
public static final String CIPOS = "CIPOS";
1819
public static final String CIEND = "CIEND";
@@ -31,6 +32,14 @@ public final class GATKSVVCFConstants {
3132
public static final Allele DEL_ALLELE = Allele.create("<DEL>", false);
3233
public static final Allele DUP_ALLELE = Allele.create("<DUP>", false);
3334

35+
// Evidence types
36+
/**
 * Kinds of evidence that can be attached to a structural variant record.
 *
 * <p>Constant names are part of the file format contract: they are the tokens
 * that get matched by name when an evidence list is parsed, so renaming a
 * constant changes which input strings are accepted.</p>
 *
 * <p>NOTE(review): the expansions given on each constant are the conventional
 * readings of these acronyms and are not defined in this file - confirm.</p>
 */
public enum EvidenceTypes {
    /** Presumably B-allele frequency evidence - TODO confirm. */
    BAF,
    /** Presumably discordant paired-end read evidence - TODO confirm. */
    PE,
    /** Presumably read-depth evidence - TODO confirm. */
    RD,
    /** Presumably split-read evidence - TODO confirm. */
    SR
}
42+
3443
// GATK-SV specific header lines
3544
// TODO: 10/3/17 the following comment is a goal we are trying to achieve
3645
// applicable to all records all the time
@@ -136,8 +145,13 @@ public enum ComplexVariantSubtype {
136145
public static final String BND_DELETION_STRANDS = "+-";
137146
public static final String BND_DUPLICATION_STRANDS = "-+";
138147

148+
// SR support
149+
public static final String BOTHSIDES_SUPPORT_ATTRIBUTE = "BOTHSIDES_SUPPORT";
150+
public static final String HIGH_SR_BACKGROUND_ATTRIBUTE = "HIGH_SR_BACKGROUND";
151+
139152
// format block
140153
public static final String COPY_NUMBER_FORMAT = "CN";
154+
public static final String DEPTH_GENOTYPE_COPY_NUMBER_FORMAT = "RD_CN";
141155
public static final String EXPECTED_COPY_NUMBER_FORMAT = "ECN";
142156
public static final String COPY_NUMBER_QUALITY_FORMAT = "CNQ";
143157

@@ -175,6 +189,9 @@ public enum ComplexVariantSubtype {
175189
public static final String TRUTH_ALLELE_NUMBER_INFO = "TRUTH_AN";
176190
public static final String TRUTH_ALLELE_FREQUENCY_INFO = "TRUTH_AF";
177191

192+
// stratification
193+
public static final String STRATUM_INFO_KEY = "STRAT";
194+
178195
// functional annotations
179196
public static final String LOF = "PREDICTED_LOF";
180197
public static final String INT_EXON_DUP = "PREDICTED_INTRAGENIC_EXON_DUP";

src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.stream.Stream;
2222

2323
import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.COPY_NUMBER_FORMAT;
24+
import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT;
2425

2526
public class SVCallRecord implements SVLocatable {
2627

@@ -31,6 +32,7 @@ public class SVCallRecord implements SVLocatable {
3132
VCFConstants.END_KEY,
3233
GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
3334
GATKSVVCFConstants.SVLEN,
35+
GATKSVVCFConstants.EVIDENCE,
3436
GATKSVVCFConstants.CONTIG2_ATTRIBUTE,
3537
GATKSVVCFConstants.END2_ATTRIBUTE,
3638
GATKSVVCFConstants.STRANDS_ATTRIBUTE,
@@ -48,6 +50,7 @@ public class SVCallRecord implements SVLocatable {
4850
private final Boolean strandB;
4951
private final GATKSVVCFConstants.StructuralVariantAnnotationType type;
5052
private final Integer length;
53+
private final List<GATKSVVCFConstants.EvidenceTypes> evidence;
5154
private final List<String> algorithms;
5255
private final List<Allele> alleles;
5356
private final Allele refAllele;
@@ -72,14 +75,15 @@ public SVCallRecord(final String id,
7275
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
7376
final List<ComplexEventInterval> cpxIntervals,
7477
final Integer length,
78+
final List<GATKSVVCFConstants.EvidenceTypes> evidence,
7579
final List<String> algorithms,
7680
final List<Allele> alleles,
7781
final List<Genotype> genotypes,
7882
final Map<String,Object> attributes,
7983
final Set<String> filters,
8084
final Double log10PError,
8185
final SAMSequenceDictionary dictionary) {
82-
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
86+
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, evidence, algorithms, alleles, genotypes, attributes, filters, log10PError);
8387
validateCoordinates(dictionary);
8488
}
8589

@@ -94,6 +98,7 @@ protected SVCallRecord(final String id,
9498
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
9599
final List<ComplexEventInterval> cpxIntervals,
96100
final Integer length,
101+
final List<GATKSVVCFConstants.EvidenceTypes> evidence,
97102
final List<String> algorithms,
98103
final List<Allele> alleles,
99104
final List<Genotype> genotypes,
@@ -106,6 +111,7 @@ protected SVCallRecord(final String id,
106111
Utils.nonNull(attributes);
107112
Utils.nonNull(filters);
108113
Utils.nonNull(cpxIntervals);
114+
Utils.nonNull(evidence);
109115
this.id = Utils.nonNull(id);
110116
this.contigA = contigA;
111117
this.positionA = positionA;
@@ -123,6 +129,7 @@ protected SVCallRecord(final String id,
123129
this.genotypes = GenotypesContext.copy(genotypes).immutable();
124130
this.attributes = validateAttributes(attributes);
125131
this.length = inferLength(type, positionA, positionB, length);
132+
this.evidence = evidence;
126133
final Pair<Boolean, Boolean> strands = inferStrands(type, strandA, strandB);
127134
this.strandA = strands.getLeft();
128135
this.strandB = strands.getRight();
@@ -272,7 +279,8 @@ private boolean isCarrier(final Genotype genotype) {
272279
}
273280

274281
// Otherwise, try to infer status if it's a biallelic CNV with a copy number call
275-
final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT, expectedCopyNumber);
282+
final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT,
283+
VariantContextGetters.getAttributeAsInt(genotype, DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, expectedCopyNumber));
276284
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL) {
277285
return copyNumber < expectedCopyNumber;
278286
} else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) {
@@ -370,6 +378,10 @@ public Integer getLength() {
370378
return length;
371379
}
372380

381+
public List<GATKSVVCFConstants.EvidenceTypes> getEvidence() {
382+
return evidence;
383+
}
384+
373385
public List<String> getAlgorithms() {
374386
return algorithms;
375387
}

src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import java.util.stream.Collectors;
1919
import java.util.stream.Stream;
2020

21+
import static htsjdk.variant.vcf.VCFConstants.MISSING_VALUE_v4;
2122
import static org.broadinstitute.hellbender.tools.sv.SVCallRecord.UNDEFINED_LENGTH;
2223

2324
public final class SVCallRecordUtils {
@@ -91,6 +92,9 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
9192
&& record.getStrandA() != null && record.getStrandB() != null) {
9293
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
9394
}
95+
if (!record.getEvidence().isEmpty()) {
96+
builder.attribute(GATKSVVCFConstants.EVIDENCE, record.getEvidence());
97+
}
9498
if (!record.getFilters().isEmpty()) {
9599
builder.filters(record.getFilters());
96100
}
@@ -173,12 +177,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
173177
*/
174178
public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
175179
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
176-
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
180+
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(),
177181
genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
178182
}
179183
public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map<String, Object> attr) {
180184
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
181-
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
185+
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(),
182186
record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
183187
}
184188

@@ -291,10 +295,10 @@ public static Stream<SVCallRecord> convertInversionsToBreakends(final SVCallReco
291295
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
292296
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
293297
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
294-
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
298+
record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
295299
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
296300
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
297-
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
301+
record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
298302
return Stream.of(positiveBreakend, negativeBreakend);
299303
}
300304

@@ -319,8 +323,9 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
319323

320324
final GATKSVVCFConstants.StructuralVariantAnnotationType type = inferStructuralVariantType(variant);
321325
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype = getComplexSubtype(variant);
322-
final List<SVCallRecord.ComplexEventInterval> cpxIntervals = parseComplexIntervals(variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null), dictionary);
326+
final List<SVCallRecord.ComplexEventInterval> cpxIntervals = parseComplexIntervals(variant, dictionary);
323327
final List<String> algorithms = getAlgorithms(variant);
328+
final List<GATKSVVCFConstants.EvidenceTypes> evidence = getEvidence(variant);
324329

325330
final String strands;
326331
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL
@@ -375,12 +380,13 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
375380

376381
final Map<String, Object> sanitizedAttributes = sanitizeAttributes(attributes);
377382
return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype,
378-
cpxIntervals, length, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
383+
cpxIntervals, length, evidence, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
379384
variant.getFilters(), log10PError);
380385
}
381386

382-
private static List<SVCallRecord.ComplexEventInterval> parseComplexIntervals(final List<String> intervals, final SAMSequenceDictionary dictionary) {
383-
return intervals.stream().map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
387+
private static List<SVCallRecord.ComplexEventInterval> parseComplexIntervals(final VariantContext variant, final SAMSequenceDictionary dictionary) {
388+
return variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null).stream()
389+
.map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
384390
}
385391

386392
private static Map<String, Object> sanitizeAttributes(final Map<String, Object> attributes) {
@@ -402,6 +408,19 @@ private static Integer getLength(final VariantContext variant, final GATKSVVCFCo
402408
return length;
403409
}
404410

411+
public static List<GATKSVVCFConstants.EvidenceTypes> getEvidence(final VariantContext variant) {
412+
Utils.nonNull(variant);
413+
final List<String> value = variant.getAttributeAsStringList(GATKSVVCFConstants.EVIDENCE, null);
414+
if (value == null) {
415+
return Collections.emptyList();
416+
} else {
417+
return value.stream()
418+
.filter(v -> v != null && !v.equals(MISSING_VALUE_v4))
419+
.map(GATKSVVCFConstants.EvidenceTypes::valueOf)
420+
.collect(Collectors.toList());
421+
}
422+
}
423+
405424
public static List<String> getAlgorithms(final VariantContext variant) {
406425
Utils.nonNull(variant);
407426
Utils.validateArg(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE), "Expected " + GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE + " field for variant " + variant.getID());

0 commit comments

Comments
 (0)