Skip to content

Commit 59e8dae

Browse files
committed
Add bundle support for FeatureInputs.
1 parent 59c9c1b commit 59e8dae

File tree

13 files changed

+1098
-61
lines changed

13 files changed

+1098
-61
lines changed

src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ private StandardArgumentDefinitions(){}
5656
public static final String INTERVALS_SHORT_NAME = "L";
5757
public static final String COMPARISON_SHORT_NAME = "comp";
5858
public static final String READ_INDEX_SHORT_NAME = READ_INDEX_LONG_NAME;
59+
public static final String PRIMARY_INPUT_LONG_NAME = "primary";
60+
public static final String PRIMARY_INPUT_SHORT_NAME = "PI";
61+
public static final String SECONDARY_INPUT_LONG_NAME = "secondaryI";
62+
public static final String SECONDARY_INPUT_SHORT_NAME = "SI";
5963
public static final String LENIENT_SHORT_NAME = "LE";
6064
public static final String READ_VALIDATION_STRINGENCY_SHORT_NAME = "VS";
6165
public static final String SAMPLE_ALIAS_SHORT_NAME = "ALIAS";

src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java

Lines changed: 71 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
package org.broadinstitute.hellbender.engine;
22

3+
import htsjdk.beta.io.bundle.Bundle;
4+
import htsjdk.beta.io.bundle.BundleJSON;
5+
import htsjdk.beta.io.bundle.BundleResource;
6+
import htsjdk.beta.io.bundle.BundleResourceType;
7+
import htsjdk.io.IOPath;
38
import htsjdk.samtools.SAMSequenceDictionary;
49
import htsjdk.samtools.util.IOUtil;
510
import htsjdk.samtools.util.Locatable;
@@ -148,7 +153,7 @@ public FeatureDataSource(final File featureFile) {
148153
* generated name, and will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES})
149154
* during queries that produce cache misses.
150155
*
151-
* @param featurePath path or URI to source of Features
156+
* @param featurePath path or URI to source of Features (may be a Bundle)
152157
*/
153158
public FeatureDataSource(final String featurePath) {
154159
this(featurePath, null, DEFAULT_QUERY_LOOKAHEAD_BASES, null);
@@ -159,7 +164,7 @@ public FeatureDataSource(final String featurePath) {
159164
* name. We will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) during queries
160165
* that produce cache misses.
161166
*
162-
* @param featureFile file containing Features
167+
* @param featureFile file or Bundle containing Features
163168
* @param name logical name for this data source (may be null)
164169
*/
165170
public FeatureDataSource(final File featureFile, final String name) {
@@ -170,7 +175,7 @@ public FeatureDataSource(final File featureFile, final String name) {
170175
* Creates a FeatureDataSource backed by the provided File and assigns this data source the specified logical
171176
* name. We will look ahead the specified number of bases during queries that produce cache misses.
172177
*
173-
* @param featureFile file containing Features
178+
* @param featureFile file or Bundle containing Features
174179
* @param name logical name for this data source (may be null)
175180
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
176181
*/
@@ -181,7 +186,7 @@ public FeatureDataSource(final File featureFile, final String name, final int qu
181186
/**
182187
* Creates a FeatureDataSource backed by the resource at the provided path.
183188
*
184-
* @param featurePath path to file or GenomicsDB url containing features
189+
* @param featurePath path to file or GenomicsDB url or Bundle containing features
185190
* @param name logical name for this data source (may be null)
186191
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
187192
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
@@ -195,7 +200,7 @@ public FeatureDataSource(final String featurePath, final String name, final int
195200
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
196201
* during queries that produce cache misses.
197202
*
198-
* @param featureInput a FeatureInput specifying a source of Features
203+
* @param featureInput a FeatureInput specifying a source of Features (may be a Bundle)
199204
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
200205
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
201206
* that produce this type of Feature. May be null, which results in an unrestricted search.
@@ -207,7 +212,7 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
207212
/**
208213
* Creates a FeatureDataSource backed by the resource at the provided path.
209214
*
210-
* @param featurePath path to file or GenomicsDB url containing features
215+
* @param featurePath path to file or GenomicsDB url or Bundle containing features
211216
* @param name logical name for this data source (may be null)
212217
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
213218
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
@@ -224,7 +229,7 @@ public FeatureDataSource(final String featurePath, final String name, final int
224229
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
225230
* during queries that produce cache misses.
226231
*
227-
* @param featureInput a FeatureInput specifying a source of Features
232+
* @param featureInput a FeatureInput specifying a source of Features (may be a Bundle)
228233
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
229234
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
230235
* that produce this type of Feature. May be null, which results in an unrestricted search.
@@ -241,7 +246,7 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
241246
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
242247
* during queries that produce cache misses.
243248
*
244-
* @param featureInput a FeatureInput specifying a source of Features
249+
* @param featureInput a FeatureInput specifying a source of Features (may be a Bundle)
245250
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
246251
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
247252
* that produce this type of Feature. May be null, which results in an unrestricted search.
@@ -259,7 +264,7 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
259264
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
260265
* during queries that produce cache misses.
261266
*
262-
* @param featureInput a FeatureInput specifying a source of Features
267+
* @param featureInput a FeatureInput specifying a source of Features (may be a Bundle)
263268
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
264269
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
265270
* that produce this type of Feature. May be null, which results in an unrestricted search.
@@ -278,7 +283,7 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
278283
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
279284
* during queries that produce cache misses.
280285
*
281-
* @param featureInput a FeatureInput specifying a source of Features
286+
* @param featureInput a FeatureInput specifying a source of Features (may be a Bundle)
282287
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
283288
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
284289
* that produce this type of Feature. May be null, which results in an unrestricted search.
@@ -296,7 +301,7 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
296301
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
297302
* during queries that produce cache misses.
298303
*
299-
* @param featureInput a FeatureInput specifying a source of Features
304+
* @param featureInput a FeatureInput specifying a source of Features (may be a Bundle)
300305
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
301306
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
302307
* that produce this type of Feature. May be null, which results in an unrestricted search.
@@ -369,9 +374,26 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
369374
} catch (final ClassCastException e) {
370375
throw new UserException("GenomicsDB inputs can only be used to provide VariantContexts.", e);
371376
}
377+
} else if (featureInput.hasExtension(BundleJSON.BUNDLE_EXTENSION)) {
378+
// the feature input specifies a serialized JSON bundle file
379+
final Bundle vcfBundle = BundleJSON.toBundle(htsjdk.beta.plugin.IOUtils.getStringFromPath(featureInput), GATKPath::new);
380+
final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get();
381+
// to get the codec we have to use the path of the underlying vcf resource, not the bundle path
382+
final FeatureInput<T> fi = new FeatureInput<T>(vcfPath.getRawInputString(), featureInput.getName());
383+
final FeatureCodec<T, ?> codec = getCodecForFeatureInput(fi, targetFeatureType, setNameOnCodec);
384+
// propagate the bundle path, not the vcf path, to the reader, so that downstream code can retrieve
385+
// the index path from the bundle
386+
return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper);
387+
} else if (featureInput.getParentBundle() != null) {
388+
// the featureInput was created from a bundle list expansion (i.e, MultiVariantWalkers). it has the
389+
// primary resource as the underlying resource path, and the containing bundle attached as the
390+
// "parent bundle". Use the original FI to get the codec, but to get the feature reader, we use
391+
// the FI that contains the bundle path, since the feature reader may require access to the index
392+
final FeatureCodec<T, ?> codec = getCodecForFeatureInput(featureInput, targetFeatureType, setNameOnCodec);
393+
return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper);
372394
} else {
373395
final FeatureCodec<T, ?> codec = getCodecForFeatureInput(featureInput, targetFeatureType, setNameOnCodec);
374-
if ( featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION) ) {
396+
if (featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION)) {
375397
return new Reader(featureInput, codec);
376398
}
377399
return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper);
@@ -419,18 +441,48 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
419441
private static <T extends Feature> AbstractFeatureReader<T, ?> getTribbleFeatureReader(final FeatureInput<T> featureInput, final FeatureCodec<T, ?> codec, final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper, final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper) {
420442
Utils.nonNull(codec);
421443
try {
422-
// Must get the path to the data file from the codec here:
423-
final String absoluteRawPath = featureInput.getRawInputString();
424-
425444
// Instruct the reader factory to not require an index. We will require one ourselves as soon as
426445
// a query by interval is attempted.
427446
final boolean requireIndex = false;
428447

429-
// Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching.
430-
if (BucketUtils.isEligibleForPrefetching(featureInput)) {
431-
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
448+
if (featureInput.hasExtension(BundleJSON.BUNDLE_EXTENSION)) {
449+
final Bundle vcfBundle = BundleJSON.toBundle(htsjdk.beta.plugin.IOUtils.getStringFromPath(featureInput), GATKPath::new);
450+
final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get();
451+
final Optional<BundleResource> vcfIndexPath = vcfBundle.get(BundleResourceType.CT_VARIANTS_INDEX);
452+
final String rawIndexResourcePath =
453+
vcfIndexPath.isPresent() ? vcfIndexPath.get().getIOPath().get().getRawInputString() : null;
454+
455+
// Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching.
456+
if (BucketUtils.isEligibleForPrefetching(vcfPath)) {
457+
final String absoluteRawPath = vcfPath.getRawInputString();
458+
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
459+
} else {
460+
return AbstractFeatureReader.getFeatureReader(vcfPath.getRawInputString(), rawIndexResourcePath, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction());
461+
}
462+
} else if (featureInput.getParentBundle() != null) {
463+
final Bundle vcfBundle = featureInput.getParentBundle();
464+
// code path for when a user has specified multiple bundles on the command line, so there is no single
465+
// serialized bundle file to access
466+
final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get();
467+
// Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching.
468+
final Optional<BundleResource> vcfIndexPath = vcfBundle.get(BundleResourceType.CT_VARIANTS_INDEX);
469+
final String rawIndexResourcePath =
470+
vcfIndexPath.isPresent() ? vcfIndexPath.get().getIOPath().get().getRawInputString() : null;
471+
final String absoluteRawPath = vcfPath.getRawInputString();
472+
if (BucketUtils.isEligibleForPrefetching(vcfPath)) {
473+
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
474+
} else {
475+
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction());
476+
}
432477
} else {
433-
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction());
478+
final String absoluteRawPath = featureInput.getRawInputString();
479+
480+
// Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching.
481+
if (BucketUtils.isEligibleForPrefetching(featureInput)) {
482+
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
483+
} else {
484+
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction());
485+
}
434486
}
435487
} catch (final TribbleException e) {
436488
throw new GATKException("Error initializing feature reader for path " + featureInput.getFeaturePath(), e);

src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package org.broadinstitute.hellbender.engine;
22

33
import com.google.common.annotations.VisibleForTesting;
4+
import htsjdk.beta.io.bundle.Bundle;
45
import htsjdk.tribble.Feature;
56
import htsjdk.tribble.FeatureCodec;
67
import org.apache.logging.log4j.LogManager;
@@ -52,6 +53,11 @@ public final class FeatureInput<T extends Feature> extends GATKPath implements S
5253
*/
5354
private transient Class<FeatureCodec<T, ?>> featureCodecClass;
5455

56+
/**
57+
* retain any containing bundle in case we need to extract other resources from it
58+
*/
59+
private Bundle parentBundle;
60+
5561
/**
5662
* Delimiter between the logical name and the file name in the --argument_name logical_name:feature_file syntax
5763
*/
@@ -129,6 +135,34 @@ public FeatureInput(final String rawInputSpecifier, final String name, final Map
129135
setTagAttributes(keyValueMap);
130136
}
131137

138+
/**
139+
* Construct a FeatureInput from a Bundle.
140+
*
141+
* @param primaryResourcePath the path for the primary feature resource for this bundle
142+
* @param featureBundle an existing Bundle object; resources in this bundle MUST be IOPathBundleResources (that is,
143+
* they must be backed by an IOPath, not an in-memory object)
144+
* @param name the tag name for this feature input - may be null
145+
*/
146+
public FeatureInput(
147+
final GATKPath primaryResourcePath,
148+
final Bundle featureBundle,
149+
final String name) {
150+
super(primaryResourcePath);
151+
// retain the containing bundle for later so we can interrogate it for other resources, like the index
152+
this.parentBundle = featureBundle;
153+
if (name != null) {
154+
if (primaryResourcePath.getTag() != null) {
155+
logger.warn(String.format(
156+
"FeatureInput: user-provided tag name %s will be replaced with %s",
157+
primaryResourcePath.getTag(),
158+
name));
159+
}
160+
setTag(name);
161+
}
162+
163+
}
164+
165+
132166
/**
133167
* Remember the FeatureCodec class for this input the first time it is discovered so we can bypass dynamic codec
134168
* discovery when multiple FeatureDataSources are created for the same input.
@@ -144,6 +178,14 @@ public void setFeatureCodecClass(final Class<FeatureCodec<T, ?>> featureCodecCla
144178
return this.featureCodecClass;
145179
}
146180

181+
/**
182+
* @return the parent bundle for this FeatureInput, if this input was derived from a Bundle. May
183+
* return {@code null}. The returned bundle can be interrogated for companion resources.
184+
*/
185+
public Bundle getParentBundle() {
186+
return parentBundle;
187+
}
188+
147189
/**
148190
* creates a name from the given filePath by finding the absolute path of the given input
149191
*/

0 commit comments

Comments
 (0)