Skip to content

Commit 979583f

Browse files
authored
Merge pull request #111 from broadinstitute/dp-fasta-handling
make samtools happy with spaces in fasta headers
2 parents b15de41 + 390b07d commit 979583f

File tree

3 files changed

+27
-10
lines changed

3 files changed

+27
-10
lines changed

test/unit/test_tools_samtools.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,22 @@ def test_fasta_index(self):
3232
samtools.faidx(inRef)
3333
self.assertEqualContents(outFai, expected_fai)
3434

35+
def test_messy_fasta_index(self):
    # Index a FASTA whose headers contain spaces and check that the names
    # written to the .fai are neither truncated nor collapsed into duplicates.
    messy_fasta = os.path.join(util.file.get_test_input_path(), 'TestToolPicard', 'messy-headers.fasta')
    indexer = tools.samtools.SamtoolsTool()
    with util.file.tempfname('.fasta') as inRef:
        # Work on a copy so the index is written next to the temp file
        # rather than into the test input directory.
        shutil.copyfile(messy_fasta, inRef)
        indexer.faidx(inRef, overwrite=True)

        # The first tab-separated column of each .fai row is the sequence name.
        with open(inRef + '.fai', 'rt') as fai:
            names = [row.strip().split('\t')[0] for row in fai]

        for name in names:
            # old versions of code cut this off at "Influenza"
            self.assertGreater(len(name), 50)

        # require that all sequence names are unique
        self.assertEqual(len(set(names)), 8)
50+
3551
def test_isEmpty(self):
3652
samtools = tools.samtools.SamtoolsTool()
3753
self.assertTrue(samtools.isEmpty(os.path.join(util.file.get_test_input_path(), 'empty.bam')))

tools/samtools.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,10 @@ def faidx(self, inFasta, overwrite=False):
154154
else:
155155
return
156156
#pysam.faidx(inFasta)
157-
self.execute('faidx', [inFasta])
157+
with util.file.fastas_with_sanitized_ids(inFasta, use_tmp=True) as sanitized_fastas:
158+
sanitized_fasta = sanitized_fastas[0]
159+
self.execute('faidx', [sanitized_fasta])
160+
shutil.copyfile(sanitized_fasta + '.fai', outfname)
158161

159162
def depth(self, inBam, outFile, options=None):
160163
""" Write a TSV file with coverage depth by position """

util/file.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -647,17 +647,15 @@ def sanitize_id_for_sam_rname(string_in):
647647
return string_value
648648

649649
def write_fasta_with_sanitized_ids(fasta_in, out_filepath):
    """Copy a FASTA file, passing every header through sanitize_id_for_sam_rname.

    The file is processed line by line as plain text: surrounding whitespace
    is stripped from each line, lines starting with ">" have everything after
    the ">" replaced by the result of sanitize_id_for_sam_rname(), and all
    other non-empty lines are copied unchanged. Every output line ends with
    a single "\\n".

    Args:
        fasta_in: path of the FASTA file to read.
        out_filepath: path of the FASTA file to write (overwritten if present).

    Returns:
        out_filepath, unchanged, for the caller's convenience.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # Open the input first so a missing/unreadable input does not leave a
    # freshly truncated output file behind.
    with open(fasta_in, "rt") as inf, open(out_filepath, "wt") as outf:
        for line in inf:
            line = line.strip()
            if not line:
                # Blank lines carry no FASTA content; emitting them as empty
                # lines can trip tools that expect uniform line lengths.
                continue
            if line.startswith(">"):
                line = ">" + sanitize_id_for_sam_rname(line[1:])
            outf.write(line + "\n")

    # Diagnostics go to the logging system instead of stdout so that callers
    # capturing or piping stdout are not polluted.
    logging.getLogger(__name__).debug("wrote sanitized fasta to %s", out_filepath)
    return out_filepath
662660

663661
@contextlib.contextmanager

0 commit comments

Comments
 (0)