84 changes: 77 additions & 7 deletions metagenomics.py
@@ -861,6 +861,77 @@ def kaiju(inBam, db, taxDb, outReport, outReads=None, threads=None):
__commands__.append(('kaiju', parser_kaiju))


def extract_kraken_unclassified(in_kraken_reads, in_bam, out_bam, filter_taxids=None, filter_children=None, tax_dir=None, p_threshold=0.05):
    if filter_children:
        assert tax_dir, '--filterChildren requires --taxDb'
        db = ncbitax.TaxonomyDb(tax_dir, load_nodes=True, load_merged=True)
        # expand the filter set to include all descendant taxids
        filter_taxids = collect_children(db.children, set(filter_taxids))

qnames = set()
with gzip.open(in_kraken_reads, 'rt') as f:
for line in f:
parts = line.split('\t')
classified = parts[0] == 'C'
qname = parts[1]
taxid = parts[2]
length = parts[3]
            # If the reads file was already filtered once, a P=<score> column
            # precedes the k-mer string; the score is recomputed below regardless.
            if '=' in parts[4]:
                kmer_str = parts[5]
            else:
                kmer_str = parts[4]
            kmers = kmer_str.rstrip().split(' ')
kmer_counts = []
for kmer in kmers:
t = kmer.split(':')
                # Kraken2 joins the paired-read k-mer lists with a '|:|' token; skip it
                if t[0] == '|':
                    continue
kmer_counts.append((t[0], int(t[1])))
            n_kmers = 0
            n_classified_kmers = 0
            for kmer_taxid, count in kmer_counts:
                # 'A' marks ambiguous k-mers; leave them out of the denominator
                if kmer_taxid != 'A':
                    n_kmers += count
                # '0' marks unclassified k-mers
                if kmer_taxid not in ('0', 'A'):
                    if filter_taxids is None:
                        n_classified_kmers += count
                    elif int(kmer_taxid) in filter_taxids:
                        n_classified_kmers += count
if n_kmers > 0:
p = n_classified_kmers / n_kmers
else:
p = 0

if p <= float(p_threshold):
if qname.endswith('/1') or qname.endswith('/2'):
qname = qname[:-2]
qnames.add(qname)

    in_sam_f = pysam.AlignmentFile(in_bam, 'rb', check_sq=False)
    out_sam_f = pysam.AlignmentFile(out_bam, 'wb', template=in_sam_f)

    for sam1 in in_sam_f:
        if sam1.query_name in qnames:
            out_sam_f.write(sam1)

    # close both files so the output BAM is flushed and gets its EOF block
    out_sam_f.close()
    in_sam_f.close()
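
# Worked example (illustrative only): for a Kraken2 read-level line such as
#   C   read1/1   9606   100   9606:66 0:34
# extract_kraken_unclassified() collects [('9606', 66), ('0', 34)], so
# n_kmers = 100 and n_classified_kmers = 66, giving p = 0.66; the read is
# written to out_bam only when p <= p_threshold.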


def parser_kraken_unclassified(parser=argparse.ArgumentParser()):
parser.add_argument('in_bam', metavar='inBam', help='Input original bam.')
    parser.add_argument('in_kraken_reads', metavar='inKrakenReads', help='Input kraken read-level output (gzipped).')
    parser.add_argument('out_bam', metavar='outBam', help='Output extracted bam.')
    parser.add_argument('-p', '--pThreshold', dest='p_threshold', type=float, default=0.05, help='Kraken p threshold')
    parser.add_argument('--filterTaxids', dest='filter_taxids', nargs='+', type=int, help='Taxids to filter/deplete out')
    parser.add_argument('--filterChildren', dest='filter_children', action='store_true', help='Also filter descendants of the filter taxids - requires --taxDb')
    parser.add_argument('--taxDb', dest='tax_dir', help='Directory of the NCBI taxonomy db')
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, extract_kraken_unclassified, split_args=True)
return parser

__commands__.append(('kraken_unclassified', parser_kraken_unclassified))
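
# Example invocation (hypothetical paths):
#   metagenomics.py kraken_unclassified in.bam kraken_reads.txt.gz out.bam \
#       -p 0.05 --filterTaxids 9606 --filterChildren --taxDb /refs/taxonomy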


def sam_lca_report(tax_db, bam_aligned, outReport, outReads=None, unique_only=None):

if outReads:
@@ -950,7 +1021,6 @@ def metagenomic_report_merge(metagenomic_reports, out_kraken_summary, kraken_db,
__commands__.append(('report_merge', parser_metagenomic_report_merge))



def fasta_library_accessions(library):
'''Parse accession from ids of fasta files in library directory. '''
library_accessions = set()
@@ -962,7 +1032,7 @@ def fasta_library_accessions(library):
for seqr in SeqIO.parse(filepath, 'fasta'):
name = seqr.name
# Search for accession
mo = re.search('([A-Z]+_?\d+\.\d+)', name)
mo = re.search(r'([A-Z]+_?\d+\.\d+)', name)
if mo:
accession = mo.group(1)
library_accessions.add(accession)
@@ -973,9 +1043,9 @@ class KrakenUniqBuildError(Exception):
'''Error while building KrakenUniq database.'''

def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
parser.add_argument('in_bam', help='Input bam file.')
parser.add_argument('read_IDs_to_tax_IDs', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
parser.add_argument('out_bam', help='Output bam file, filtered to the taxa specified')
parser.add_argument('in_bam', metavar='inBam', help='Input bam file.')
parser.add_argument('reads_tsv', metavar='readsTsv', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
parser.add_argument('out_bam', metavar='outBam', help='Output bam file, filtered to the taxa specified')
parser.add_argument('nodes_dmp', help='nodes.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
parser.add_argument('names_dmp', help='names.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
parser.add_argument('--taxNames', nargs="+", dest="tax_names", help='The taxonomic names to include. More than one can be specified. Mapped to Tax IDs by lowercase exact match only. Ex. "Viruses" This is in addition to any taxonomic IDs provided.')
@@ -992,7 +1062,7 @@ def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
util.cmd.attach_main(parser, filter_bam_to_taxa, split_args=True)
return parser

def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
def filter_bam_to_taxa(in_bam, reads_tsv, out_bam,
nodes_dmp, names_dmp,
tax_names=None, tax_ids=None,
omit_children=False,
@@ -1049,7 +1119,7 @@ def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
with util.file.tempfname(".txt.gz") as temp_read_list:
with open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
read_ids_written = 0
for row in util.file.read_tabfile(read_IDs_to_tax_IDs):
for row in util.file.read_tabfile(reads_tsv):
                assert tax_id_col < len(row), "tax_id_col does not appear to be in range for number of columns present in mapping file"
                assert read_id_col < len(row), "read_id_col does not appear to be in range for number of columns present in mapping file"
read_id = row[read_id_col]
2 changes: 1 addition & 1 deletion pipes/WDL/dx-launcher/dx-yml-build
@@ -9,7 +9,7 @@ import os, sys
import yaml, json

with open('dxapp.yml') as infile:
data = yaml.load(infile)
    # PyYAML >= 5.1 wants an explicit Loader; FullLoader avoids the unsafe-load warning
    data = yaml.load(infile, Loader=yaml.FullLoader)
data["00COMMENT"] = "This dxapp.json has been generated from dxapp.yml automatically using dx-yml-build"
with open('dxapp.json', 'w') as outfile:
json.dump(data, outfile, sort_keys=True, indent=2)
Expand Down
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/classify_kraken.wdl
@@ -0,0 +1,5 @@
import "tasks_metagenomics.wdl" as metagenomics

workflow classify_kraken {
call metagenomics.krakenuniq
}
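
# (sketch: workflow inputs are bound at launch time via the inputs JSON; the
# exact key names come from the krakenuniq task in tasks_metagenomics.wdl)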
2 changes: 1 addition & 1 deletion pipes/rules/common.rules
@@ -70,7 +70,7 @@ def download_file(uriToGet, dest, destFileName=None):
req = urlopen(uriToGet)

if not destFileName:
m = re.search('filename="(?P<filename>.+)"', req.info()['Content-Disposition'])
m = re.search(r'filename="(?P<filename>.+)"', req.info()['Content-Disposition'])

if m:
destFileName = m.group("filename")
65 changes: 65 additions & 0 deletions taxon_filter.py
@@ -246,6 +246,71 @@ def parser_filter_lastal_bam(parser=argparse.ArgumentParser()):
__commands__.append(('filter_lastal_bam', parser_filter_lastal_bam))


# ==============================
# *** deplete kraken2 ***
# ==============================
def parser_deplete_bam_kraken2(parser=argparse.ArgumentParser()):
parser.add_argument('in_bam', metavar='inBam', help='Input BAM file.')
parser.add_argument('db', help='Kraken2 database directory.')
parser.add_argument('out_bam', metavar='outBam', help='Output BAM file.')
    parser.add_argument('--taxDb', dest='tax_dir', help='Directory of the NCBI taxonomy db')
parser.add_argument(
'--JVMmemory',
default=tools.picard.FilterSamReadsTool.jvmMemDefault,
help='JVM virtual memory size (default: %(default)s)'
)
    parser.add_argument('-p', '--pThreshold', dest='p_threshold', type=float, default=0.5, help='Kraken p threshold')
parser.add_argument('--filterTaxid', dest='filter_taxid', type=int, help='Taxid to filter/deplete out')
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, main_deplete_bam_kraken2)
return parser

def main_deplete_bam_kraken2(args):
    '''Use kraken2 to deplete input reads against a kraken2 database.'''

    kraken2 = tools.kraken.Kraken2()
    with util.file.tempfname() as k_reads_fn, util.file.tempfname() as k_report_fn:
        kraken2.classify(args.db, args.tax_dir, args.in_bam,
                         output_report=k_report_fn, output_reads=k_reads_fn)
        metagenomics.extract_kraken_unclassified(
            k_reads_fn, args.in_bam, args.out_bam,
            # extract_kraken_unclassified expects a list of taxids
            filter_taxids=[args.filter_taxid] if args.filter_taxid is not None else None,
            tax_dir=args.tax_dir,
            p_threshold=args.p_threshold)

__commands__.append(('deplete_bam_kraken2', parser_deplete_bam_kraken2))
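
# Example invocation (hypothetical paths):
#   taxon_filter.py deplete_bam_kraken2 in.bam /refs/kraken2_db out.bam \
#       --taxDb /refs/taxonomy --filterTaxid 9606 -p 0.5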


def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, **kwargs):

tmpDb = None
if len(refDbs)>1 and not any(
not os.path.exists(db) # indexed db prefix
or os.path.isdir(db) # indexed db in directory
or (os.path.isfile(db) and ('.tar' in db or '.tgz' in db or '.zip' in db)) # packaged indexed db
for db in refDbs):
# this is a scenario where all refDbs are unbuilt fasta
# files. we can simplify and speed up execution by
# concatenating them all and running deplete_method
# just once
tmpDb = mkstempfname('.fasta')
merge_compressed_files(refDbs, tmpDb, sep='\n')
refDbs = [tmpDb]

samtools = tools.samtools.SamtoolsTool()
tmpBamIn = inBam
for db in refDbs:
if not samtools.isEmpty(tmpBamIn):
tmpBamOut = mkstempfname('.bam')
deplete_method(tmpBamIn, db, tmpBamOut, **kwargs)
if tmpBamIn != inBam:
os.unlink(tmpBamIn)
tmpBamIn = tmpBamOut
shutil.copyfile(tmpBamIn, outBam)

if tmpDb:
os.unlink(tmpDb)
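
# Example (sketch): deplete sequentially against two databases, where each
# pass consumes the survivors of the previous one; deplete_bmtagger_bam
# (defined below) is one usable per-db depletion function:
#   multi_db_deplete_bam('in.bam', ['/refs/db1', '/refs/db2'],
#                        deplete_bmtagger_bam, 'out.bam')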

# ==============================
# *** deplete_bmtagger_bam ***
# ==============================
5 changes: 4 additions & 1 deletion test/__init__.py
@@ -55,7 +55,10 @@ def assert_equal_bam_reads(testCase, bam_filename1, bam_filename2):
samtools.view(args=[], inFile=bam_filename2, outFile=sam_two)

try:
testCase.assertTrue(filecmp.cmp(sam_one, sam_two, shallow=False))
if testCase:
testCase.assertTrue(filecmp.cmp(sam_one, sam_two, shallow=False))
else:
            assert filecmp.cmp(sam_one, sam_two, shallow=False), 'BAM reads differ: %s vs %s' % (bam_filename1, bam_filename2)
finally:
for fname in [sam_one, sam_two]:
if os.path.exists(fname):
2 changes: 1 addition & 1 deletion test/pipelines/snakemake/snake.py
@@ -71,7 +71,7 @@ def setup(self):
os.symlink(self.root, self.bindir)
os.symlink(join(self.root, 'pipes', 'Snakefile'), join(self.workdir, 'Snakefile'))
with open(join(self.root, 'pipes', 'config.yaml')) as f:
config = yaml.load(f)
            # PyYAML >= 5.1 wants an explicit Loader; FullLoader avoids the unsafe-load warning
            config = yaml.load(f, Loader=yaml.FullLoader)
self.config = merge_yaml_dicts(config, {'number_of_threads': _CPUS})
if self.override_config:
self.config = merge_yaml_dicts(self.config, self.override_config)