Skip to content

Commit b15de41

Browse files
authored
Merge pull request #110 from broadinstitute/dp-fasta-handling
handle spaces in fasta headers
2 parents dd0d3c8 + d5de7d5 commit b15de41

File tree

3 files changed

+61
-1
lines changed

3 files changed

+61
-1
lines changed
Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,43 @@
1+
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-1
2+
CTGGAGCAAGACAAATGATGCCGGATCAGATCGAGTGATGGTATCACCCCTGGCTGTGAC
3+
ATGGTGGAATAGGAATGGACCAACAACAAGTACAATTCACTATCCAAAGGTATACAAAAC
4+
AAAGGGAGAGAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTGATGAAACG
5+
GAAACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGC
6+
CATCAATTAGTGTGGAATTGTTTAAAAACGACCTTGTTTCTACT
7+
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-2
8+
CGAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTCTTCTTGAAAGTTCC
9+
AGCGCAAAATGCCATAAGCACCACATTCCCGTATACTGGAGATCCTCCATACAGCCATGG
10+
ACGGATTAAGAAGGAGGAGTTTGCTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCT
11+
CAGACGGCAGAAATAGTGAATTTAGCTTGTCCTTCATGAAAAAATGCCTTGTTTCTACT
12+
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-3
13+
AAGCAGGTACTGATTCAAAATGGAAGACTTTGTGCGACAATGCTTCAATCCAATGATTGT
14+
CGAGCTTGCGGAAAAAGCAATGAAAGAACATGGGGAAGATCCGAAAATCGAGACAAACAA
15+
GATTAACGATCCCTGGGTTTTGCTTAATGCATCTTGGTTCAACTCCTTCCTCACACATGC
16+
ACTGAAATAGTTGTGGCAATGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTG
17+
TTTCTACT
18+
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-4
19+
GCAGTGTAGCTGGATGGCTCCTCGGAAACCCAATGTGCGACGAATTCATCAGAGTGCCGG
20+
AATGGTCTTACATAGTGGAGCGGGCTAACCCAGCTAATGACCTCTGTTACCCAGGGAGCC
21+
TATCTTTATGGATGTGCTCCAATGGGTCGTTACAATGCAGAATTTGCATTTAGATTTATG
22+
AGCTCAGATTGTAGTTAAAAACACCCTTGTTTCT
23+
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-5
24+
AAGCAGGGTAGATAATCACTCACTGAGTGACATCCACATCATGGCGTCTCAAGGCACCAA
25+
ATCTGTTGGAAGAATGGTTGGCGGAATCGGGAGATTCTACATACAGATGTGCACTGAGCT
26+
GGACGAAAAGGCAACGAACCCGATCGTGCCTTCCTTTGACATGAACAATGAAGGATCTTA
27+
TTTCTTCGGAGACAATGCAGAGGAGTATGACAATTAAAGAAAAATACCCTTGTTTCTACT
28+
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-6
29+
CGGGCAATTCATCTCTTTGCCCTATTAGTGGGTGGGCAATATACAGTAAGGACAACGGTA
30+
TAAGAATTGGATCTAAGGGGGATGTGTTTGTTATAAGAGAACCATTCATCTCATGCTCCC
31+
GTGGTGTAAATAGTGACACTGTGGGTTGGTCTTGGCCAGACGGTGCTGAGTTGCCATTCA
32+
CCATTGACAAGTAGTTTGTTCAAAAAACTCCTTGTTTCTACT
33+
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-7
34+
AAGCAGGTAGATATTGAAAGATGAGTCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTA
35+
TCGTCCCGTCGGGCCCCCTCAAAGCCGAGATCGCGCAGAGACTTGAAGATGTCTTTGCAG
36+
GGAAGAACACCGATCTTGAGGCTCTCATGGAATGGCTAAAGACAAGACCAATCCTGTCAC
37+
CTCTGACCAAGGGGATTTTGGGATTTGTGTTCACGCTCACCGTGCCCAGTGAGCGAGGAC
38+
CT
39+
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-8
40+
AGCDGGGTGACAAAAACATAATGGATTCCAACACTGTGTTAAGCTTTCAGGTAGACTGCT
41+
TTCTTTGGCATGTCCGCAAACGATTTGCAGACCAAGAACTGGGTGATGCCCCATTCCTTG
42+
TCGAGACGGCCACTCGTGCTGGGAAGCAGATAGTGGAGAGGATTCTGGAGGAAGAATCCG
43+
TCGTTTCAGCTTATTTAATAATAAAAAACACCCTTGTTTCTACT

test/unit/test_tools_picard.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,23 @@ def test_fasta_index(self):
3636
actual_first3 = [re.sub(r'VN:1\.[4-6]','VN:1.4',x.strip()).split('\t')[:3] for x in inf.readlines()]
3737
self.assertEqual(actual_first3, expected_first3)
3838

39+
def test_messy_fasta_index(self):
40+
orig_ref = os.path.join(util.file.get_test_input_path(self), 'messy-headers.fasta')
41+
picard_index = tools.picard.CreateSequenceDictionaryTool()
42+
with util.file.tempfname('.fasta') as inRef:
43+
shutil.copyfile(orig_ref, inRef)
44+
picard_index.execute(inRef, overwrite=True)
45+
with open(inRef[:-6] + '.dict', 'rt') as inf:
46+
seqnames = set()
47+
for line in inf:
48+
if line.startswith('@SQ'):
49+
seq_name = dict(x.split(':', maxsplit=1) for x in line.strip().split('\t')[1:])['SN']
50+
# old versions of code cut this off at "Influenza"
51+
self.assertGreater(len(seq_name), 50)
52+
seqnames.add(seq_name)
53+
# require that all sequence names are unique
54+
self.assertEqual(len(seqnames), 8)
55+
3956
def test_sam_downsample(self):
4057
desired_count = 100
4158
tolerance = 0.02

util/file.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -651,7 +651,7 @@ def write_fasta_with_sanitized_ids(fasta_in, out_filepath):
651651
fasta_out = FastaIO.FastaWriter(handle, wrap=None)
652652
fasta_out.write_header()
653653
for record in SeqIO.parse(fasta_in, "fasta"):
654-
record.id=sanitize_id_for_sam_rname(record.id)
654+
record.id=sanitize_id_for_sam_rname(record.description)
655655
fasta_out.write_record(record)
656656
print("out_filepath",out_filepath)
657657
print("os.path.dirname(out_filepath)",os.path.dirname(out_filepath))

0 commit comments

Comments
 (0)