Skip to content

Commit 18707c6

Browse files
bbimberlbergelson
andauthored
Retain all source IDs on VariantContext (#9032)
* Retain all source IDs on VariantContext merge In GATK3, when merging variants, the IDs of all the source VCFs were retained. This code path seems like it intended that, since the variantSources set is generated, but it doesnt get used for anything. This PR will use that set to set the source of the resulting merged VC. * Add overloaded method to GATKVariantContextUtils.simpleMerge to preserve existing behavior when storeAllVcfSources=false Co-authored-by: Louis Bergelson <louisb@broadinstitute.org>
1 parent 3fe011a commit 18707c6

File tree

2 files changed

+70
-19
lines changed

2 files changed

+70
-19
lines changed

src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtils.java

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,14 @@
3434
import java.nio.file.Path;
3535
import java.util.*;
3636
import java.util.function.BiFunction;
37+
import java.util.stream.Collector;
3738
import java.util.stream.Collectors;
3839
import java.util.stream.IntStream;
3940

4041
public final class GATKVariantContextUtils {
4142

43+
/** maximum number of sources to include when merging sources */
44+
private static final int MAX_SOURCES_TO_INCLUDE = 10;
4245
private static final Logger logger = LogManager.getLogger(GATKVariantContextUtils.class);
4346

4447
public static final String MERGE_FILTER_PREFIX = "filterIn";
@@ -1096,31 +1099,46 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
10961099
final GenotypeMergeType genotypeMergeOptions,
10971100
final boolean filteredAreUncalled) {
10981101
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
1099-
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled);
1102+
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, false, -1);
1103+
}
1104+
1105+
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
1106+
final List<String> priorityListOfVCs,
1107+
final FilteredRecordMergeType filteredRecordMergeType,
1108+
final GenotypeMergeType genotypeMergeOptions,
1109+
final boolean filteredAreUncalled,
1110+
final boolean storeAllVcfSources,
1111+
final int maxSourceFieldLength) {
1112+
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
1113+
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, storeAllVcfSources, maxSourceFieldLength);
11001114
}
11011115

11021116
/**
1103-
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
1104-
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
1105-
* the sample name.
1106-
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
1107-
* SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
1108-
*
1109-
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
1110-
*
1111-
* @param unsortedVCs collection of unsorted VCs
1112-
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
1113-
* @param filteredRecordMergeType merge type for filtered records
1114-
* @param genotypeMergeOptions merge option for genotypes
1115-
* @param filteredAreUncalled are filtered records uncalled?
1116-
* @return new VariantContext representing the merge of unsortedVCs
1117-
*/
1117+
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
1118+
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
1119+
* the sample name.
1120+
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
1121+
* SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
1122+
*
1123+
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
1124+
*
1125+
* @param unsortedVCs collection of unsorted VCs
1126+
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
1127+
* @param filteredRecordMergeType merge type for filtered records
1128+
* @param genotypeMergeOptions merge option for genotypes
1129+
* @param filteredAreUncalled are filtered records uncalled?
1130+
* @param storeAllVcfSources if true, the sources of all VCs where isVariable()=true will be concatenated in the output VC's source field. If false, the source of the first VC will be used. This mirror's GATK3's behavior
1131+
* @param maxSourceFieldLength This can be used to enforce a maximum length for the value of the source field (primarily useful if storeAllVcfSources=true). Set to -1 for unlimited
1132+
* @return new VariantContext representing the merge of unsortedVCs
1133+
*/
11181134
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
11191135
final List<String> priorityListOfVCs,
11201136
final int originalNumOfVCs,
11211137
final FilteredRecordMergeType filteredRecordMergeType,
11221138
final GenotypeMergeType genotypeMergeOptions,
1123-
final boolean filteredAreUncalled) {
1139+
final boolean filteredAreUncalled,
1140+
final boolean storeAllVcfSources,
1141+
final int maxSourceFieldLength) {
11241142
if ( unsortedVCs == null || unsortedVCs.isEmpty() )
11251143
return null;
11261144

@@ -1165,7 +1183,7 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
11651183
longestVC = vc; // get the longest location
11661184

11671185
nFiltered += vc.isFiltered() ? 1 : 0;
1168-
if ( vc.isVariant() ) variantSources.add(vc.getSource());
1186+
if ( storeAllVcfSources && vc.isVariant() ) variantSources.add(vc.getSource());
11691187

11701188
AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc);
11711189

@@ -1236,7 +1254,19 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
12361254

12371255
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
12381256

1239-
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
1257+
// This preserves the GATK3-like behavior of reporting multiple sources, delimited with hyphen:
1258+
// NOTE: if storeAllVcfSources is false, variantSources will be empty and therefore no sorting is performed
1259+
String allSources = variantSources.isEmpty() ? name : variantSources.stream()
1260+
.sorted()
1261+
.distinct()
1262+
.limit(MAX_SOURCES_TO_INCLUDE)
1263+
.collect(Collectors.joining("-"));
1264+
1265+
if (maxSourceFieldLength != -1 && allSources.length() > maxSourceFieldLength) {
1266+
allSources = allSources.substring(0, maxSourceFieldLength);
1267+
}
1268+
1269+
final VariantContextBuilder builder = new VariantContextBuilder().source(allSources).id(ID);
12401270
builder.loc(longestVC.getContig(), longestVC.getStart(), longestVC.getEnd());
12411271
builder.alleles(alleles);
12421272
builder.genotypes(genotypes);

src/test/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtilsUnitTest.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,27 @@ public void testMergeGenotypesUniquify() {
659659
Assert.assertEquals(merged.getSampleNames(), new LinkedHashSet<>(Arrays.asList("s1.1", "s1.2")));
660660
}
661661

662+
@Test
663+
public void testMergeSourceIDs() {
664+
final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1));
665+
final VariantContext vc1WithSource= new VariantContextBuilder(vc1).source("source1").make();
666+
final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2));
667+
final VariantContext vc2WithSource = new VariantContextBuilder(vc2).source("source2").make();
668+
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
669+
Arrays.asList(vc1WithSource, vc2WithSource), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
670+
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, true, -1);
671+
672+
// test sources are merged
673+
Assert.assertEquals(merged.getSource(), "source1-source2");
674+
675+
final VariantContext mergedTruncated = GATKVariantContextUtils.simpleMerge(
676+
Arrays.asList(vc1WithSource, vc2WithSource), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
677+
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, true, 2);
678+
679+
// test sources are merged and respects maxSourceFieldLength
680+
Assert.assertEquals(mergedTruncated.getSource(), "so");
681+
}
682+
662683
// TODO: remove after testing
663684
// @Test(expectedExceptions = IllegalStateException.class)
664685
// public void testMergeGenotypesRequireUnique() {

0 commit comments

Comments
 (0)