rnajena
diff --git a/magnipore/magnipore.py b/magnipore/magnipore.py
Lines changed: 1 addition & 1 deletion
diff --git a/magnipore/nanosherlock.py b/magnipore/nanosherlock.py
Lines changed: 8 additions & 6 deletions
diff --git a/tests/magnipore/sample1_sample2/sample1_sample2.all b/tests/magnipore/sample1_sample2/sample1_sample2.all
Lines changed: 8 additions & 8 deletions
diff --git a/tests/magnipore/sample1_sample2/sample1_sample2.txt b/tests/magnipore/sample1_sample2/sample1_sample2.txt
Lines changed: 7 additions & 5 deletions
@@ -397,8 +397,8 @@ def magnipore(mapping : dict, unaligned : dict, seq_dict : dict, aln_dict: dict,
     with no_data.get_lock(): no_data_val = no_data.value
     with low_cov_count.get_lock(): low_cov_count_val = low_cov_count.value
 
-    LOGGER.printLog('Writing indels file')
     indelFile = os.path.join(working_dir, f'{first_sample_label}_{sec_sample_label}.indels')
+    LOGGER.printLog(f'Writing indel file to {os.path.dirname(indelFile)}')
     with open(indelFile, 'w') as f:
         f.write('type\tstrand\tref\tpos\tbase\n')
         for seq in unaligned:
 
@@ -91,7 +91,7 @@ def readSegSum(segSum : str):
                 print(f'Reading line {lidx + 1}', end='\r')
             r_index, r_ID = line.strip().split()[:2]
             read_index2ID[int(r_index)] = r_ID
-    LOGGER.printLog(f'Mapped {len(read_index2ID)} readids')
+    LOGGER.printLog(f'Segmented {len(read_index2ID)} readids')
     return read_index2ID
 
 def parse() -> Namespace:
@@ -208,8 +208,6 @@ def signalSegmentation(raw_data : str, file_format : str, basecalls : str, refer
     if os.path.exists(summary_csv) and os.path.exists(result_csv) and not force_rebuild:
         LOGGER.printLog(f'segmentation files already exist:\n-\t{summary_csv}\n-\t{result_csv}')
         return summary_csv, result_csv, force_rebuild
-    else:
-        force_rebuild = True
 
     if not os.path.exists(segmentation_path):
         os.makedirs(segmentation_path)
@@ -227,7 +225,7 @@ def signalSegmentation(raw_data : str, file_format : str, basecalls : str, refer
             end = perf_counter_ns()
             LOGGER.printLog(f'{ANSI.YELLOW}TIMED: samtools indexing took {pd.to_timedelta(end-start)}, {end - start} nanoseconds{ANSI.END}')
     else:
-        LOGGER.printLog(f"Using existing {os.path.exists(alignment_bam + '.bai')}")
+        LOGGER.printLog(f"Alignment index already exists\n-\t{alignment_bam + '.bai'}")
 
     # segmentation indexing
     if not os.path.exists(basecalls + '.index') or force_rebuild:
@@ -242,7 +240,7 @@ def signalSegmentation(raw_data : str, file_format : str, basecalls : str, refer
             end = perf_counter_ns()
             LOGGER.printLog(f'{ANSI.YELLOW}TIMED: segmentation indexing took {pd.to_timedelta(end-start)}, {end - start} nanoseconds{ANSI.END}')
     else:
-        LOGGER.printLog(f"Using existing {os.path.exists(basecalls + '.index')}")
+        LOGGER.printLog(f"f5c index already exists\n-\t{basecalls + '.index'}")
 
     # segmentation
     log_file = os.path.join(segmentation_path, "log.txt")
@@ -323,7 +321,7 @@ def aggregate_events(seg_result : str, seg_sum: str, raw_data : str, file_format
         end = perf_counter_ns()
         LOGGER.printLog(f'{ANSI.YELLOW}TIMED: Building distribution models took {pd.to_timedelta(end-start)}, {end - start} nanoseconds{ANSI.END}')
 
-    LOGGER.printLog('Writing output files')
+    LOGGER.printLog(f'Writing .red file to {os.path.dirname(red_file)}')
     writeOutput(red_file, red_dict)
     return red_file
 
@@ -436,6 +434,10 @@ def buildModels(red_dict : dict, omvs : dict, nano2readid : dict, readID2File :
                     # maybe haplotypes end up here as NNNNN? -> actually mutations in the reads, segmentation has no clue what to do?
                     continue
 
+                # case: mapping file got replaced by a custom one with additional contigs
+                if event['contig'] not in red_dict:
+                    continue
+
                 # prepare signal data for new read
                 readid = nano2readid[event['read_index']]
                 # found new read, store last information
 
@@ -1,17 +1,17 @@
 strand	td_score	kl_divergence	bayesian_p	signal_type	ref_1	pos_1	base_1	motif_1	signal_mean_1	signal_std_1	n_datapoints_1	contained_datapoints_1	n_segments_1	contained_segments_1	n_reads_1	ref_2	pos_2	base_2	motif_2	signal_mean_2	signal_std_2	n_datapoints_2	contained_datapoints_2	n_segments_2	contained_segments_2	n_reads_2
--	0.00000000	0.00000000	1.00000000	mut	sample1	2	G	TTGAT	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	2	G	TTGGT	100.00000000	999.50000000	450000	400000	350000	300000	250000
-+	1.08016350	0.58337659	0.58914066	mut	sample1	2	C	ATCAA	-0.69412115	0.12474506	462977	458042	29191	27871	16452	sample2	2	C	ACCAA	-0.82886621	0.12474506	462977	458042	29191	27871	16452
+-	0.00000000	0.00000000	1.00000000	mut	sample1	3	T	ATTGA	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	3	T	CTTGG	100.00000000	999.50000000	450000	400000	350000	300000	250000
 +	0.00000000	0.00000000	1.00000000	mut	sample1	3	A	TCAAT	-0.42774528	0.15967356	3705397	3659799	200313	184824	58289	sample2	3	A	CCAAG	-0.42774528	0.15967356	3705397	3659799	200313	184824	58289
--	0.00000000	0.00000000	1.00000000	mut	sample1	4	T	AATTG	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	4	T	CCTTG	100.00000000	999.50000000	450000	400000	350000	300000	250000
++	1.08016350	0.58337659	0.58914066	mut	sample1	2	C	ATCAA	-0.69412115	0.12474506	462977	458042	29191	27871	16452	sample2	2	C	ACCAA	-0.82886621	0.12474506	462977	458042	29191	27871	16452
 +	1.99863643	1.99727379	0.31764056	mut	sample1	4	A	CAATT	-0.12040068	0.20013645	4285190	4244593	213433	193041	61695	sample2	4	A	CAAGG	-0.52040068	0.20013645	4285190	4244593	213433	193041	61695
--	0.00000000	0.00000000	1.00000000	mut	sample1	3	T	ATTGA	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	3	T	CTTGG	100.00000000	999.50000000	450000	400000	350000	300000	250000
+-	0.00000000	0.00000000	1.00000000	mut	sample1	2	G	TTGAT	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	2	G	TTGGT	100.00000000	999.50000000	450000	400000	350000	300000	250000
+-	0.00000000	0.00000000	1.00000000	mut	sample1	4	T	AATTG	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	4	T	CCTTG	100.00000000	999.50000000	450000	400000	350000	300000	250000
 +	2.20157177	2.42345913	0.27098986	mut	sample1	10	G	TTGGA	-0.99948980	0.22711047	1765086	1753265	105062	99905	65868	sample2	5	G	AAGGA	-0.49948980	0.22711047	1765086	1753265	105062	99905	65868
++	0.00000000	0.00000000	1.00000000	mut	sample1	11	G	TGGAC	0.55857725	0.20900872	5351321	4656614	252294	243878	54642	sample2	6	G	AGGAC	0.55857725	0.20900872	5351321	4656614	252294	243878	54642
 -	0.00000000	0.00000000	1.00000000	mut	sample1	10	C	TCCAA	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	5	C	TCCTT	100.00000000	999.50000000	450000	400000	350000	300000	250000
 -	0.00000000	0.00000000	1.00000000	mut	sample1	11	C	GTCCA	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	6	C	GTCCT	100.00000000	999.50000000	450000	400000	350000	300000	250000
-+	0.00000000	0.00000000	1.00000000	mut	sample1	11	G	TGGAC	0.55857725	0.20900872	5351321	4656614	252294	243878	54642	sample2	6	G	AGGAC	0.55857725	0.20900872	5351321	4656614	252294	243878	54642
++	0.00000000	0.00000000	1.00000000	mod	sample1	12	A	GGACC	0.62131312	0.30007160	9519518	8496545	321121	305045	54644	sample2	7	A	GGACC	0.62131312	0.30007160	9519518	8496545	321121	305045	54644
 -	0.00000000	0.00000000	1.00000000	mod	sample1	12	T	GGTCC	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	7	T	GGTCC	100.00000000	999.50000000	450000	400000	350000	300000	250000
 +	0.00000000	0.00000000	1.00000000	mod	sample1	13	C	GACCA	-0.80830759	0.12240194	2342558	2317373	127716	114675	69140	sample2	8	C	GACCA	-0.80830759	0.12240194	2342558	2317373	127716	114675	69140
-+	0.00000000	0.00000000	1.00000000	mod	sample1	12	A	GGACC	0.62131312	0.30007160	9519518	8496545	321121	305045	54644	sample2	7	A	GGACC	0.62131312	0.30007160	9519518	8496545	321121	305045	54644
 -	0.00000000	0.00000000	1.00000000	mod	sample1	13	G	TGGTC	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	8	G	TGGTC	100.00000000	999.50000000	450000	400000	350000	300000	250000
 +	0.00000000	0.00000000	1.00000000	mod	sample1	14	C	ACCAA	-0.15731964	0.28819697	1850817	1848556	104976	103831	71375	sample2	9	C	ACCAA	-0.15731964	0.28819697	1850817	1848556	104976	103831	71375
 -	0.00000000	0.00000000	1.00000000	mod	sample1	14	G	TTGGT	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	9	G	TTGGT	100.00000000	999.50000000	450000	400000	350000	300000	250000
@@ -20,12 +20,12 @@ strand	td_score	kl_divergence	bayesian_p	signal_type	ref_1	pos_1	base_1	motif_1
 +	1.91018369	1.82440086	0.33953125	mod	sample1	16	A	CAACA	0.02282725	0.18095354	6265978	6222182	366204	343396	73982	sample2	11	A	CAACA	-0.32282725	0.18095354	6265978	6222182	366204	343396	73982
 -	9.50000000	20.70972340	0.00000178	mod	sample1	16	T	TGTTG	-0.90000000	0.10000000	6265978	6222182	366204	343396	73982	sample2	11	T	TGTTG	1.00000000	0.30000000	6265978	6222182	366204	343396	73982
 +	0.00000000	0.00000000	1.00000000	mod	sample1	17	C	AACAC	0.54594578	0.22102544	4846856	5988485	254324	244985	70918	sample2	12	C	AACAC	0.54594578	0.22102544	4846856	5988485	254324	244985	70918
--	0.00000000	0.00000000	1.00000000	mod	sample1	17	G	GTGTT	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	12	G	GTGTT	100.00000000	999.50000000	450000	400000	350000	300000	250000
 +	0.00000000	0.00000000	1.00000000	mod	sample1	18	A	ACACT	0.30000000	0.20000000	1234567	1234567	123456	123456	12345	sample2	13	A	ACACT	0.30000000	0.20000000	1234567	1234567	123456	123456	12345
+-	0.00000000	0.00000000	1.00000000	mod	sample1	17	G	GTGTT	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	12	G	GTGTT	100.00000000	999.50000000	450000	400000	350000	300000	250000
 -	0.00000000	0.00000000	1.00000000	mod	sample1	18	T	AGTGT	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	13	T	AGTGT	100.00000000	999.50000000	450000	400000	350000	300000	250000
 +	0.00000000	0.00000000	1.00000000	mod	sample1	19	C	CACTT	0.25252343	0.12157210	5461264	3356565	232988	206498	80510	sample2	14	C	CACTT	0.25252343	0.12157210	5461264	3356565	232988	206498	8051012345
 -	0.00000000	0.00000000	1.00000000	mod	sample1	19	G	AAGTG	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	14	G	AAGTG	100.00000000	999.50000000	450000	400000	350000	300000	250000
--	nan	nan	nan	mut	sample1	20	A	AAAGT	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	15	A	AAGT	0.00000000	0.00000000	0	0	0	0	0
 +	nan	nan	nan	mut	sample1	20	T	ACTTT	0.45252343	0.22157210	5461264	3356565	232988	206498	80510	sample2	15	T	ACTT	0.00000000	0.00000000	0	0	0	0	0
+-	nan	nan	nan	mut	sample1	20	A	AAAGT	100.00000000	999.50000000	450000	400000	350000	300000	250000	sample2	15	A	AAGT	0.00000000	0.00000000	0	0	0	0	0
 +	nan	nan	nan	mut	sample1	21	T	CTTT	0.00000000	0.00000000	0	0	0	0	0	sample2	16	T	CTT	0.00000000	0.00000000	0	0	0	0	0
 -	nan	nan	nan	mut	sample1	21	A	AAAG	0.00000000	0.00000000	0	0	0	0	0	sample2	16	A	AAG	0.00000000	0.00000000	0	0	0	0	0
@@ -1,5 +1,7 @@
-Number of indels: 6
-Number of significant positions: 5 - Classified as mutations: 3
-Positions with no data 4, at least one aligned position without information (no signals)
-Number of positions with low coverage in at least one sample: 4 - I recommend filtering out these positions in the .magnipore file.
-Positions with now data or low coverage can be high if one strand has no aligned reads!
+Total alignment positions: 34
+Indels: [93m6[0m
+Significant positions: [93m5[0m
+Classified as mutations: [93m3[0m
+Positions with no data [93m4[0m, at least one sample at aligned position has no data
+Positions with coverage < 10 in at least one sample: [93m4[0m - filtering recommended.
+Positions with no data or low coverage can be high if one strand has no aligned reads!