20
20
blacklist = ["1_43815093" ,"1_115256626" ,"2_25463686" ,"2_25467178" ,"2_25469567" ,"2_25458738" ,"2_209113336" ,"2_25536827" ,"2_198267672" ,\
21
21
"2_209113332" ,"4_106194088" ,"4_106157187" ,"4_106196675" ,"4_106196287" ,"7_148504716" ,"7_148506185" ,"7_148506191" ,"7_148504854" ,\
22
22
"7_148506194" ,"7_148526908" ,"7_148504717" ,"7_148543582" ,"7_148543583" ,"8_128750698" ,"8_117864842" ,"12_25378673" ,"12_25378676" ,\
23
- "13_28592669" ,"13_28602256" ,"17_29559928" ,"17_29562734" ,"17_7578587" ,"17_7578115" ,"17_7579472" ,"17_7579801" ,"17_29559932" ,\
23
+ "10_77210191" ,"10_106721610" ,\
24
+ "13_28592669" ,"13_28602256" ,"16_55770629" ,"17_29559928" ,"17_29562734" ,"17_7578587" ,"17_7579472" ,"17_7579801" ,"17_29559932" ,\
24
25
"17_7579414" ,"17_29483195" ,"17_29559926" ,"17_29562734" ,"17_7579440" ,"20_31024389" ,"21_44524505" ,"21_36259324" ,"X_133527541" ,\
25
26
"X_44911052" ,"X_39932806" ,"X_39932807" ,"X_44929002" ,"X_15809170" ,"X_39922359" ,"X_15821932" ,"X_15841334" ,"X_15838366" ,"X_15841334" ,\
26
27
"X_15841336" ,"X_39932907" ," X_53426504" ,"X_133549184" ,"X_53426504" ,"X_39914742" ,"X_53426570" ,"X_44949032" ,"X_39921505" ,"X_15827406" ]
@@ -93,10 +94,12 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
93
94
with loompy .connect (infile ) as ds :
94
95
n_loci_full ,n_cells_full = ds .shape
95
96
chromosomes_full = ds .ra ["CHROM" ]
97
+ chromosomes_full = [str (chr ) for chr in chromosomes_full ]
96
98
positions_full = ds .ra ["POS" ]
97
99
# The loom files appear to not always be sorted. (or partially sorted...)
98
100
# reorder loci by position and chromosome
99
101
chr_order = ["1" ,"2" ,"3" ,"4" ,"5" ,"6" ,"7" ,"8" ,"9" ,"10" ,"11" ,"12" ,"13" ,"14" ,"15" ,"16" ,"17" ,"18" ,"19" ,"20" ,"21" ,"22" ,"X" ,"Y" ]
102
+
100
103
for chrom in chromosomes_full :
101
104
chr = str (chrom )
102
105
if not chr in chr_order :
@@ -143,10 +146,11 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
143
146
elif len (genes_at_pos )== 1 and (panel_file is None ):
144
147
amplicon_to_gene [amplicons_full [i ]] = genes_at_pos [0 ]
145
148
else :
146
- amplicon_to_gene [amplicons_full [i ]] = amplicon_name .lstrip ("MYE_" ).split ("_" )[0 ] # Assume amplicon name is gene_xx, potentially prefixed by MYE_
149
+ if amplicon_name .startswith ("MYE_" ):
150
+ amplicon_name = amplicon_name [4 :]
151
+ amplicon_to_gene [amplicons_full [i ]] = amplicon_name .split ("_" )[0 ] # Assume amplicon name is gene_xx, potentially prefixed by MYE_
147
152
n_amplicons = len (amplicon_names )
148
-
149
-
153
+
150
154
151
155
genotypes = ds [:,:]
152
156
# First select loci where the alt allele is present in some cells, to avoid loading the 5 full matrices in memory
@@ -157,7 +161,7 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
157
161
if chr_pos in whitelist_positions : candidate_loci .append (i )
158
162
else :
159
163
if chr_pos in blacklist : continue
160
- n_alt = np .sum ( (genotypes [i ,:]== 1 ) | (genotypes [i ]== 2 ) )
164
+ n_alt = np .sum ( (genotypes [i ,:]== 1 ) | (genotypes [i ,: ]== 2 ) )
161
165
if n_alt / n_cells_full >= min_frac_cells_alt :
162
166
candidate_loci .append (i )
163
167
del genotypes
@@ -180,7 +184,6 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
180
184
RO = (ds .layers ["RO" ][sorted_candidate ,:])[reverse_sort ,:]
181
185
GQ = (ds .layers ["GQ" ][sorted_candidate ,:])[reverse_sort ,:]
182
186
genotypes = (ds [sorted_candidate ,:])[reverse_sort ,:]
183
-
184
187
# Set low quality genotypes to "missing"
185
188
prefiltered_loci = []
186
189
lowqual_genotypes = (GQ < min_GQ ) | (DP < min_DP ) | ( (genotypes != 0 )& (AD / (DP + 0.1 )< min_AF ) )
@@ -194,7 +197,6 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
194
197
count_cells_genotyped = np .sum (genotypes [i ,:]!= 3 )
195
198
if count_cells_genotyped / n_cells_full > min_frac_cells_genotyped :
196
199
prefiltered_loci .append (i )
197
-
198
200
# Keep cells for which at least X% of the variants are genotyped
199
201
filtered_cells = []
200
202
for j in range (n_cells_full ):
@@ -251,6 +253,8 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
251
253
else :
252
254
variant_frequency = get_1K_freq (SNP_f ,str (chromosomes [i ]),int (positions [i ]),str (ref [i ]),str (alt [i ]))
253
255
256
+ if variant_frequency > 0.01 :
257
+ variant_name = "SNP " + variant_name
254
258
255
259
# Stricter filters for SNPs and silent mutations
256
260
if variant_frequency > 0.0001 :
@@ -455,6 +459,79 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
455
459
df_genes = pd .DataFrame (gene_matrix ,index = index_genes ,dtype = int )
456
460
if region == "gene" :
457
461
df_genes .to_csv (os .path .join (outdir ,basename + "_regions.csv" ),sep = "," ,header = False ,index = True )
462
+
463
+ #############################################
464
+ # SCITE input: genotype matrix and gene names
465
+ genotype_matrix = np .zeros ((n_loci ,n_cells ))
466
+ gene_names = []
467
+
468
+ for i ,locus in enumerate (filtered_loci ):
469
+ gene_names .append (variant_names [i ])
470
+ for j in range (n_cells ):
471
+ if (GQ [filtered_loci [i ],filtered_cells [j ]]> 0.1 ):
472
+ genotype_matrix [i ,j ] = int (genotypes [filtered_loci [i ],filtered_cells [j ]])
473
+ else : # if genotype quality is 0, genotype is unknown
474
+ genotype_matrix [i ,j ]= 3
475
+ np .savetxt (os .path .join (outdir ,basename + "_genotypes.csv" ),genotype_matrix .astype (int ),delimiter = " " ,fmt = '%i' )
476
+ with open (os .path .join (outdir ,basename + ".geneNames" ), 'w' ) as outfile :
477
+ outfile .write ('\n ' .join (gene_names ))
478
+
479
+
480
+ ################################
481
+ # BiTSC2 input: DP, AD, segments
482
+
483
+
484
+ DP_matrix = np .zeros ((n_loci ,n_cells ))
485
+ AD_matrix = np .zeros ((n_loci ,n_cells ))
486
+
487
+ for i in range (len (filtered_loci )):
488
+ for j in range (n_cells ):
489
+ AD_matrix [i ,j ] = int (AD [filtered_loci [i ],filtered_cells [j ]])
490
+ DP_matrix [i ,j ] = int (AD [filtered_loci [i ],filtered_cells [j ]]) + int (RO [filtered_loci [i ],filtered_cells [j ]])
491
+
492
+ # Add regions which do not have any loci ??
493
+ region2loci = {}
494
+ df_region = df_genes if region == "gene" else df_amplicons
495
+ for x in df_region .index :
496
+ region2loci [x [x .find ("_" )+ 1 :]] = []
497
+ for i in df_variants .index :
498
+ region2loci [df_variants .loc [i ,"REGION" ]].append (i )
499
+ regions_depth = []
500
+ for x in df_region .index :
501
+ region = x [1 + x .find ("_" ):]
502
+ if len (region2loci [region ])== 0 :
503
+ regions_depth .append (df_region .loc [x ,:])
504
+ if len (regions_depth )> 0 :
505
+ regions_depth = np .array (regions_depth )
506
+ DP_matrix = np .concatenate ([DP_matrix ,regions_depth ],axis = 0 )
507
+ AD_matrix = np .concatenate ([AD_matrix ,np .zeros (regions_depth .shape )],axis = 0 )
508
+
509
+ np .savetxt (os .path .join (outdir ,basename + "_full_DP.csv" ),DP_matrix .astype (int ),delimiter = "," ,fmt = '%i' )
510
+ np .savetxt (os .path .join (outdir ,basename + "_full_AD.csv" ),AD_matrix .astype (int ),delimiter = "," ,fmt = '%i' )
511
+
512
+ # Subsample cells
513
+ n_cells_BITSC2 = min (200 ,n_cells )
514
+ cells_subset = np .random .choice (n_cells ,n_cells_BITSC2 ,replace = False )
515
+ DP_matrix = DP_matrix [:,cells_subset ]
516
+ AD_matrix = AD_matrix [:,cells_subset ]
517
+ np .savetxt (os .path .join (outdir ,basename + "_DP.csv" ),DP_matrix .astype (int ),delimiter = "," ,fmt = '%i' )
518
+ np .savetxt (os .path .join (outdir ,basename + "_AD.csv" ),AD_matrix .astype (int ),delimiter = "," ,fmt = '%i' )
519
+
520
+ # Create genomic segments for BiTSC2
521
+ segments = []
522
+ start_region = 0
523
+ end_region = 0
524
+ while end_region < n_loci :
525
+ if end_region == n_loci - 1 or amplicon_to_gene [amplicons [filtered_loci [start_region ]]] != amplicon_to_gene [amplicons [filtered_loci [end_region + 1 ]]]:
526
+ segments .append ((start_region + 1 ,end_region + 1 ))
527
+ start_region = end_region + 1
528
+ end_region = start_region
529
+ else :
530
+ end_region += 1
531
+ for i in range (n_loci + 1 ,AD_matrix .shape [0 ]+ 1 ):
532
+ segments .append ((i ,i ))
533
+ np .savetxt (os .path .join (outdir ,basename + "_segments.csv" ),np .array (segments ).astype (int ),delimiter = "," ,fmt = '%i' )
534
+ np .savetxt (os .path .join (outdir ,basename + "_full_segments.csv" ),np .array (segments ).astype (int ),delimiter = "," ,fmt = '%i' )
458
535
459
536
460
537
0 commit comments