|
34 | 34 | import java.nio.file.Path;
|
35 | 35 | import java.util.*;
|
36 | 36 | import java.util.function.BiFunction;
|
| 37 | +import java.util.stream.Collector; |
37 | 38 | import java.util.stream.Collectors;
|
38 | 39 | import java.util.stream.IntStream;
|
39 | 40 |
|
40 | 41 | public final class GATKVariantContextUtils {
|
41 | 42 |
|
| 43 | + /** maximum number of sources to include when merging sources */ |
| 44 | + private static final int MAX_SOURCES_TO_INCLUDE = 10; |
42 | 45 | private static final Logger logger = LogManager.getLogger(GATKVariantContextUtils.class);
|
43 | 46 |
|
44 | 47 | public static final String MERGE_FILTER_PREFIX = "filterIn";
|
@@ -1096,31 +1099,46 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
|
1096 | 1099 | final GenotypeMergeType genotypeMergeOptions,
|
1097 | 1100 | final boolean filteredAreUncalled) {
|
1098 | 1101 | int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
|
1099 |
| - return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled); |
| 1102 | + return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, false, -1); |
| 1103 | + } |
| 1104 | + |
| 1105 | + public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs, |
| 1106 | + final List<String> priorityListOfVCs, |
| 1107 | + final FilteredRecordMergeType filteredRecordMergeType, |
| 1108 | + final GenotypeMergeType genotypeMergeOptions, |
| 1109 | + final boolean filteredAreUncalled, |
| 1110 | + final boolean storeAllVcfSources, |
| 1111 | + final int maxSourceFieldLength) { |
| 1112 | + int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); |
| 1113 | + return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, storeAllVcfSources, maxSourceFieldLength); |
1100 | 1114 | }
|
1101 | 1115 |
|
1102 | 1116 | /**
|
1103 |
| - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. |
1104 |
| - * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with |
1105 |
| - * the sample name. |
1106 |
| - * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use |
1107 |
| - * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. |
1108 |
| - * |
1109 |
| - * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ |
1110 |
| - * |
1111 |
| - * @param unsortedVCs collection of unsorted VCs |
1112 |
| - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs |
1113 |
| - * @param filteredRecordMergeType merge type for filtered records |
1114 |
| - * @param genotypeMergeOptions merge option for genotypes |
1115 |
| - * @param filteredAreUncalled are filtered records uncalled? |
1116 |
| - * @return new VariantContext representing the merge of unsortedVCs |
1117 |
| - */ |
| 1117 | + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. |
| 1118 | + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with |
| 1119 | + * the sample name. |
| 1120 | + * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use |
| 1121 | + * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. |
| 1122 | + * |
| 1123 | + * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ |
| 1124 | + * |
| 1125 | + * @param unsortedVCs collection of unsorted VCs |
| 1126 | + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs |
| 1127 | + * @param filteredRecordMergeType merge type for filtered records |
| 1128 | + * @param genotypeMergeOptions merge option for genotypes |
| 1129 | + * @param filteredAreUncalled are filtered records uncalled? |
| 1130 | + * @param storeAllVcfSources if true, the sources of all VCs where isVariant()=true will be concatenated in the output VC's source field. If false, the source of the first VC will be used. This mirrors GATK3's behavior |
| 1131 | + * @param maxSourceFieldLength This can be used to enforce a maximum length for the value of the source field (primarily useful if storeAllVcfSources=true). Set to -1 for unlimited |
| 1132 | + * @return new VariantContext representing the merge of unsortedVCs |
| 1133 | + */ |
1118 | 1134 | public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
|
1119 | 1135 | final List<String> priorityListOfVCs,
|
1120 | 1136 | final int originalNumOfVCs,
|
1121 | 1137 | final FilteredRecordMergeType filteredRecordMergeType,
|
1122 | 1138 | final GenotypeMergeType genotypeMergeOptions,
|
1123 |
| - final boolean filteredAreUncalled) { |
| 1139 | + final boolean filteredAreUncalled, |
| 1140 | + final boolean storeAllVcfSources, |
| 1141 | + final int maxSourceFieldLength) { |
1124 | 1142 | if ( unsortedVCs == null || unsortedVCs.isEmpty() )
|
1125 | 1143 | return null;
|
1126 | 1144 |
|
@@ -1165,7 +1183,7 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
|
1165 | 1183 | longestVC = vc; // get the longest location
|
1166 | 1184 |
|
1167 | 1185 | nFiltered += vc.isFiltered() ? 1 : 0;
|
1168 |
| - if ( vc.isVariant() ) variantSources.add(vc.getSource()); |
| 1186 | + if ( storeAllVcfSources && vc.isVariant() ) variantSources.add(vc.getSource()); |
1169 | 1187 |
|
1170 | 1188 | AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc);
|
1171 | 1189 |
|
@@ -1236,7 +1254,19 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
|
1236 | 1254 |
|
1237 | 1255 | final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
|
1238 | 1256 |
|
1239 |
| - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); |
| 1257 | + // This preserves the GATK3-like behavior of reporting multiple sources, delimited with hyphen: |
| 1258 | + // NOTE: if storeAllVcfSources is false, variantSources will be empty and therefore no sorting is performed |
| 1259 | + String allSources = variantSources.isEmpty() ? name : variantSources.stream() |
| 1260 | + .sorted() |
| 1261 | + .distinct() |
| 1262 | + .limit(MAX_SOURCES_TO_INCLUDE) |
| 1263 | + .collect(Collectors.joining("-")); |
| 1264 | + |
| 1265 | + if (maxSourceFieldLength != -1 && allSources.length() > maxSourceFieldLength) { |
| 1266 | + allSources = allSources.substring(0, maxSourceFieldLength); |
| 1267 | + } |
| 1268 | + |
| 1269 | + final VariantContextBuilder builder = new VariantContextBuilder().source(allSources).id(ID); |
1240 | 1270 | builder.loc(longestVC.getContig(), longestVC.getStart(), longestVC.getEnd());
|
1241 | 1271 | builder.alleles(alleles);
|
1242 | 1272 | builder.genotypes(genotypes);
|
|
0 commit comments