Skip to content

Commit 385d2a4

Browse files
authored
Model update (#5)
* Change from minimap2 to winnowmap - winnowmap performs better in repeat regions as shown in the paper https://doi.org/10.1093/bioinformatics/btaa435 * Add new error catch * Add script to check if magnipore positions are within annotated regions * add (0-based) information to readmes * - changed default parameters for winnowmap - fixed bug that doubled the read counts - minor bug fixes * fix tests * remove settings force_rebuild to true if segmentation is not done * Update test according to new default mapping parameters * add file paths to output * add file paths to output
1 parent 95b4a82 commit 385d2a4

File tree

16 files changed

+8168
-11224
lines changed

16 files changed

+8168
-11224
lines changed

magnipore/magnipore.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,8 +397,8 @@ def magnipore(mapping : dict, unaligned : dict, seq_dict : dict, aln_dict: dict,
397397
with no_data.get_lock(): no_data_val = no_data.value
398398
with low_cov_count.get_lock(): low_cov_count_val = low_cov_count.value
399399

400-
LOGGER.printLog('Writing indels file')
401400
indelFile = os.path.join(working_dir, f'{first_sample_label}_{sec_sample_label}.indels')
401+
LOGGER.printLog(f'Writing indel file to {os.path.dirname(indelFile)}')
402402
with open(indelFile, 'w') as f:
403403
f.write('type\tstrand\tref\tpos\tbase\n')
404404
for seq in unaligned:

magnipore/nanosherlock.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def readSegSum(segSum : str):
9191
print(f'Reading line {lidx + 1}', end='\r')
9292
r_index, r_ID = line.strip().split()[:2]
9393
read_index2ID[int(r_index)] = r_ID
94-
LOGGER.printLog(f'Mapped {len(read_index2ID)} readids')
94+
LOGGER.printLog(f'Segmented {len(read_index2ID)} readids')
9595
return read_index2ID
9696

9797
def parse() -> Namespace:
@@ -208,8 +208,6 @@ def signalSegmentation(raw_data : str, file_format : str, basecalls : str, refer
208208
if os.path.exists(summary_csv) and os.path.exists(result_csv) and not force_rebuild:
209209
LOGGER.printLog(f'segmentation files already exist:\n-\t{summary_csv}\n-\t{result_csv}')
210210
return summary_csv, result_csv, force_rebuild
211-
else:
212-
force_rebuild = True
213211

214212
if not os.path.exists(segmentation_path):
215213
os.makedirs(segmentation_path)
@@ -227,7 +225,7 @@ def signalSegmentation(raw_data : str, file_format : str, basecalls : str, refer
227225
end = perf_counter_ns()
228226
LOGGER.printLog(f'{ANSI.YELLOW}TIMED: samtools indexing took {pd.to_timedelta(end-start)}, {end - start} nanoseconds{ANSI.END}')
229227
else:
230-
LOGGER.printLog(f"Using existing {os.path.exists(alignment_bam + '.bai')}")
228+
LOGGER.printLog(f"Alignment index already exists\n-\t{alignment_bam + '.bai'}")
231229

232230
# segmentation indexing
233231
if not os.path.exists(basecalls + '.index') or force_rebuild:
@@ -242,7 +240,7 @@ def signalSegmentation(raw_data : str, file_format : str, basecalls : str, refer
242240
end = perf_counter_ns()
243241
LOGGER.printLog(f'{ANSI.YELLOW}TIMED: segmentation indexing took {pd.to_timedelta(end-start)}, {end - start} nanoseconds{ANSI.END}')
244242
else:
245-
LOGGER.printLog(f"Using existing {os.path.exists(basecalls + '.index')}")
243+
LOGGER.printLog(f"f5c idnex already exists\n-\t{basecalls + '.index'}")
246244

247245
# segmentation
248246
log_file = os.path.join(segmentation_path, "log.txt")
@@ -323,7 +321,7 @@ def aggregate_events(seg_result : str, seg_sum: str, raw_data : str, file_format
323321
end = perf_counter_ns()
324322
LOGGER.printLog(f'{ANSI.YELLOW}TIMED: Building distribution models took {pd.to_timedelta(end-start)}, {end - start} nanoseconds{ANSI.END}')
325323

326-
LOGGER.printLog('Writing output files')
324+
LOGGER.printLog(f'Writing .red file to {os.path.dirname(red_file)}')
327325
writeOutput(red_file, red_dict)
328326
return red_file
329327

@@ -436,6 +434,10 @@ def buildModels(red_dict : dict, omvs : dict, nano2readid : dict, readID2File :
436434
# maybe haplotypes end up here as NNNNN? -> actually mutations in the reads, segmentation has no clue what to do?
437435
continue
438436

437+
# case: mapping file got replaced by a custom one with additional contigs
438+
if event['contig'] not in red_dict:
439+
continue
440+
439441
# prepare signal data for new read
440442
readid = nano2readid[event['read_index']]
441443
# found new read, store last information
Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
strand td_score kl_divergence bayesian_p signal_type ref_1 pos_1 base_1 motif_1 signal_mean_1 signal_std_1 n_datapoints_1 contained_datapoints_1 n_segments_1 contained_segments_1 n_reads_1 ref_2 pos_2 base_2 motif_2 signal_mean_2 signal_std_2 n_datapoints_2 contained_datapoints_2 n_segments_2 contained_segments_2 n_reads_2
2-
- 0.00000000 0.00000000 1.00000000 mut sample1 2 G TTGAT 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 2 G TTGGT 100.00000000 999.50000000 450000 400000 350000 300000 250000
3-
+ 1.08016350 0.58337659 0.58914066 mut sample1 2 C ATCAA -0.69412115 0.12474506 462977 458042 29191 27871 16452 sample2 2 C ACCAA -0.82886621 0.12474506 462977 458042 29191 27871 16452
2+
- 0.00000000 0.00000000 1.00000000 mut sample1 3 T ATTGA 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 3 T CTTGG 100.00000000 999.50000000 450000 400000 350000 300000 250000
43
+ 0.00000000 0.00000000 1.00000000 mut sample1 3 A TCAAT -0.42774528 0.15967356 3705397 3659799 200313 184824 58289 sample2 3 A CCAAG -0.42774528 0.15967356 3705397 3659799 200313 184824 58289
5-
- 0.00000000 0.00000000 1.00000000 mut sample1 4 T AATTG 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 4 T CCTTG 100.00000000 999.50000000 450000 400000 350000 300000 250000
4+
+ 1.08016350 0.58337659 0.58914066 mut sample1 2 C ATCAA -0.69412115 0.12474506 462977 458042 29191 27871 16452 sample2 2 C ACCAA -0.82886621 0.12474506 462977 458042 29191 27871 16452
65
+ 1.99863643 1.99727379 0.31764056 mut sample1 4 A CAATT -0.12040068 0.20013645 4285190 4244593 213433 193041 61695 sample2 4 A CAAGG -0.52040068 0.20013645 4285190 4244593 213433 193041 61695
7-
- 0.00000000 0.00000000 1.00000000 mut sample1 3 T ATTGA 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 3 T CTTGG 100.00000000 999.50000000 450000 400000 350000 300000 250000
6+
- 0.00000000 0.00000000 1.00000000 mut sample1 2 G TTGAT 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 2 G TTGGT 100.00000000 999.50000000 450000 400000 350000 300000 250000
7+
- 0.00000000 0.00000000 1.00000000 mut sample1 4 T AATTG 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 4 T CCTTG 100.00000000 999.50000000 450000 400000 350000 300000 250000
88
+ 2.20157177 2.42345913 0.27098986 mut sample1 10 G TTGGA -0.99948980 0.22711047 1765086 1753265 105062 99905 65868 sample2 5 G AAGGA -0.49948980 0.22711047 1765086 1753265 105062 99905 65868
9+
+ 0.00000000 0.00000000 1.00000000 mut sample1 11 G TGGAC 0.55857725 0.20900872 5351321 4656614 252294 243878 54642 sample2 6 G AGGAC 0.55857725 0.20900872 5351321 4656614 252294 243878 54642
910
- 0.00000000 0.00000000 1.00000000 mut sample1 10 C TCCAA 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 5 C TCCTT 100.00000000 999.50000000 450000 400000 350000 300000 250000
1011
- 0.00000000 0.00000000 1.00000000 mut sample1 11 C GTCCA 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 6 C GTCCT 100.00000000 999.50000000 450000 400000 350000 300000 250000
11-
+ 0.00000000 0.00000000 1.00000000 mut sample1 11 G TGGAC 0.55857725 0.20900872 5351321 4656614 252294 243878 54642 sample2 6 G AGGAC 0.55857725 0.20900872 5351321 4656614 252294 243878 54642
12+
+ 0.00000000 0.00000000 1.00000000 mod sample1 12 A GGACC 0.62131312 0.30007160 9519518 8496545 321121 305045 54644 sample2 7 A GGACC 0.62131312 0.30007160 9519518 8496545 321121 305045 54644
1213
- 0.00000000 0.00000000 1.00000000 mod sample1 12 T GGTCC 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 7 T GGTCC 100.00000000 999.50000000 450000 400000 350000 300000 250000
1314
+ 0.00000000 0.00000000 1.00000000 mod sample1 13 C GACCA -0.80830759 0.12240194 2342558 2317373 127716 114675 69140 sample2 8 C GACCA -0.80830759 0.12240194 2342558 2317373 127716 114675 69140
14-
+ 0.00000000 0.00000000 1.00000000 mod sample1 12 A GGACC 0.62131312 0.30007160 9519518 8496545 321121 305045 54644 sample2 7 A GGACC 0.62131312 0.30007160 9519518 8496545 321121 305045 54644
1515
- 0.00000000 0.00000000 1.00000000 mod sample1 13 G TGGTC 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 8 G TGGTC 100.00000000 999.50000000 450000 400000 350000 300000 250000
1616
+ 0.00000000 0.00000000 1.00000000 mod sample1 14 C ACCAA -0.15731964 0.28819697 1850817 1848556 104976 103831 71375 sample2 9 C ACCAA -0.15731964 0.28819697 1850817 1848556 104976 103831 71375
1717
- 0.00000000 0.00000000 1.00000000 mod sample1 14 G TTGGT 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 9 G TTGGT 100.00000000 999.50000000 450000 400000 350000 300000 250000
@@ -20,12 +20,12 @@ strand td_score kl_divergence bayesian_p signal_type ref_1 pos_1 base_1 motif_1
2020
+ 1.91018369 1.82440086 0.33953125 mod sample1 16 A CAACA 0.02282725 0.18095354 6265978 6222182 366204 343396 73982 sample2 11 A CAACA -0.32282725 0.18095354 6265978 6222182 366204 343396 73982
2121
- 9.50000000 20.70972340 0.00000178 mod sample1 16 T TGTTG -0.90000000 0.10000000 6265978 6222182 366204 343396 73982 sample2 11 T TGTTG 1.00000000 0.30000000 6265978 6222182 366204 343396 73982
2222
+ 0.00000000 0.00000000 1.00000000 mod sample1 17 C AACAC 0.54594578 0.22102544 4846856 5988485 254324 244985 70918 sample2 12 C AACAC 0.54594578 0.22102544 4846856 5988485 254324 244985 70918
23-
- 0.00000000 0.00000000 1.00000000 mod sample1 17 G GTGTT 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 12 G GTGTT 100.00000000 999.50000000 450000 400000 350000 300000 250000
2423
+ 0.00000000 0.00000000 1.00000000 mod sample1 18 A ACACT 0.30000000 0.20000000 1234567 1234567 123456 123456 12345 sample2 13 A ACACT 0.30000000 0.20000000 1234567 1234567 123456 123456 12345
24+
- 0.00000000 0.00000000 1.00000000 mod sample1 17 G GTGTT 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 12 G GTGTT 100.00000000 999.50000000 450000 400000 350000 300000 250000
2525
- 0.00000000 0.00000000 1.00000000 mod sample1 18 T AGTGT 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 13 T AGTGT 100.00000000 999.50000000 450000 400000 350000 300000 250000
2626
+ 0.00000000 0.00000000 1.00000000 mod sample1 19 C CACTT 0.25252343 0.12157210 5461264 3356565 232988 206498 80510 sample2 14 C CACTT 0.25252343 0.12157210 5461264 3356565 232988 206498 8051012345
2727
- 0.00000000 0.00000000 1.00000000 mod sample1 19 G AAGTG 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 14 G AAGTG 100.00000000 999.50000000 450000 400000 350000 300000 250000
28-
- nan nan nan mut sample1 20 A AAAGT 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 15 A AAGT 0.00000000 0.00000000 0 0 0 0 0
2928
+ nan nan nan mut sample1 20 T ACTTT 0.45252343 0.22157210 5461264 3356565 232988 206498 80510 sample2 15 T ACTT 0.00000000 0.00000000 0 0 0 0 0
29+
- nan nan nan mut sample1 20 A AAAGT 100.00000000 999.50000000 450000 400000 350000 300000 250000 sample2 15 A AAGT 0.00000000 0.00000000 0 0 0 0 0
3030
+ nan nan nan mut sample1 21 T CTTT 0.00000000 0.00000000 0 0 0 0 0 sample2 16 T CTT 0.00000000 0.00000000 0 0 0 0 0
3131
- nan nan nan mut sample1 21 A AAAG 0.00000000 0.00000000 0 0 0 0 0 sample2 16 A AAG 0.00000000 0.00000000 0 0 0 0 0
Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
Number of indels: 6
2-
Number of significant positions: 5 - Classified as mutations: 3
3-
Positions with no data 4, at least one aligned position without information (no signals)
4-
Number of positions with low coverage in at least one sample: 4 - I recommend filtering out these positions in the .magnipore file.
5-
Positions with now data or low coverage can be high if one strand has no aligned reads!
1+
Total alignment positions: 34
2+
Indels: 6
3+
Significant positions: 5
4+
Classified as mutations: 3
5+
Positions with no data 4, at least one sample at aligned position has no data
6+
Positions with coverage < 10 in at least one sample: 4 - filtering recommended.
7+
Positions with no data or low coverage can be high if one strand has no aligned reads!

0 commit comments

Comments
 (0)