
Commit 6e5db0e

Merge pull request #8 from cbg-ethz/revision
Revision
2 parents 017d694 + 5eb597e commit 6e5db0e


472 files changed: +11139 -10545 lines changed


.github/workflows/build.yml

Lines changed: 0 additions & 19 deletions
This file was deleted.

.github/workflows/docker.yml

Lines changed: 0 additions & 38 deletions
This file was deleted.

COMPASS.cpp

Lines changed: 27 additions & 11 deletions
@@ -24,13 +24,13 @@ int main(int argc, char* argv[]){
     parameters.verbose=false;
     // Read command line arguments
     std::string input_file{};
+    std::string regionweights_file{};
     int n_chains=4;
     int chain_length=5000;
-    int burn_in = 1000;
+    int burn_in = -1;
     double temperature=10;
     double betabin_overdisp = parameters.omega_het;
-    bool use_CNV=true;
-    bool apply_filter_regions = true;
+    bool use_CNA=true;
     bool output_simplified = true;
     std::string output{};
     data.sex = "female";
@@ -40,6 +40,9 @@ int main(int argc, char* argv[]){
         if (strcmp(argv[i],"-i")==0){
            input_file = argv[i+1];
         }
+        else if (strcmp(argv[i],"--regionweights")==0){
+            regionweights_file = argv[i+1];
+        }
         else if (strcmp(argv[i],"--nchains")==0){
            n_chains=atoi(argv[i+1]);
         }
@@ -61,11 +64,22 @@ int main(int argc, char* argv[]){
         else if (strcmp(argv[i],"-d")==0){
            if (strcmp(argv[i+1],"0")==0) parameters.use_doublets=false;
         }
-        else if (strcmp(argv[i],"--CNV")==0){
-           if (strcmp(argv[i+1],"0")==0) use_CNV=false;
+        else if (strcmp(argv[i],"--CNA")==0){
+           if (strcmp(argv[i+1],"0")==0) use_CNA=false;
         }
         else if (strcmp(argv[i],"--filterregions")==0){
-           if (strcmp(argv[i+1],"0")==0) apply_filter_regions=false;
+           if (strcmp(argv[i+1],"0")==0){
+               parameters.filter_regions=false;
+               parameters.filter_regions_CNLOH=false;
+           }
+        }
+        else if (strcmp(argv[i],"--filterregionsCNLOH")==0){
+           if (strcmp(argv[i+1],"0")==0){
+               parameters.filter_regions_CNLOH=false;
+           }
+        }
+        else if (strcmp(argv[i],"--verbose")==0){
+           if (strcmp(argv[i+1],"1")==0) parameters.verbose=true;
         }
         else if (strcmp(argv[i],"--sex")==0){
            data.sex= std::string(argv[i+1]);
@@ -91,8 +105,11 @@ int main(int argc, char* argv[]){
     if (output.size()==0){
         std::cout << "No output name was provided. COMPASS will use the same basename as the input for the output." <<std::endl;
     }
+    if (burn_in==-1){
+        burn_in=chain_length/2;
+    }
 
-    load_CSV(input_file,use_CNV,apply_filter_regions);
+    load_CSV(input_file,regionweights_file,use_CNA);
 
     parameters.omega_het = std::min(parameters.omega_het,betabin_overdisp);
     parameters.omega_het_indel = std::min(parameters.omega_het_indel,betabin_overdisp);
@@ -118,7 +135,7 @@ int main(int argc, char* argv[]){
     for (int i=0;i<n_chains;i++){
         std::srand(i);
         Inference infer{"",temperature,i};
-        best_trees[i] = infer.find_best_tree(use_CNV,chain_length,burn_in);
+        best_trees[i] = infer.find_best_tree(use_CNA,chain_length,burn_in);
         results[i]=best_trees[i].log_score;
     }
     double best_score=-DBL_MAX;
@@ -129,11 +146,10 @@ int main(int argc, char* argv[]){
             best_score_index = i;
         }
     }
-    if (output_simplified) best_trees[best_score_index].to_dot_pretty(output);
-    else best_trees[best_score_index].to_dot(output);
+    if (output_simplified) best_trees[best_score_index].to_dot(output,true);
+    else best_trees[best_score_index].to_dot(output,false);
 
     std::string gv_filename(output);
-    std::cout<<output.size() << std::endl;
     if ( output.size()<= 3 || (output.size()>3 && output.substr(output.size()-3)!=".gv")){
         gv_filename = output + + "_tree.gv";
     }
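A minimal command-line sketch of the revised interface follows, using only options visible in this diff. The binary name, input basename, and weights file name are placeholders (not taken from this commit); with no explicit burn-in, the new default of chain_length/2 applies.

# Hypothetical invocation; ./COMPASS, sample_data and region_weights.csv are placeholders.
./COMPASS -i sample_data --nchains 4 --CNA 1 --regionweights region_weights.csv --verbose 1
# Turning off region filtering now also disables CN-LOH region filtering:
./COMPASS -i sample_data --filterregions 0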

Experiments/preprocessing/README.md

Lines changed: 1 addition & 1 deletion
@@ -28,4 +28,4 @@ In case, you already know which mutations to include in the analysis, the list o
 ## Population Frequency
 
 Optionally, the preprocessing script can take as input a tsv file containing the population frequency of variants. This is used in the preprocessing to remove germline variants (unless they appear to be affected by LOH in some cells) and, in COMPASS, to penalize variants with a high population frequency which are not placed at the root (since they are likely to be germline variants).
-The file that we used can be downloaded [here](https://polybox.ethz.ch/index.php/s/V5Wr1wCrAAZw1S5). It was generated using the script `download_1000G.sh`, which was adapted from [this script](https://github.com/single-cell-genetics/cellSNP/blob/master/SNPlist_1Kgenome.sh).
+The file that we used was generated using the script `download_1000G.sh`, which was adapted from [this script](https://github.com/single-cell-genetics/cellSNP/blob/master/SNPlist_1Kgenome.sh).

Experiments/preprocessing/download_1000G.sh

Lines changed: 1 addition & 1 deletion
@@ -16,4 +16,4 @@ for chr in `seq 1 22` X; do
     files_list="$files_list chr${chr}.vcf.gz"
 done
 
-bcftools concat $files_list | bcftools view -H -Oz -o 1000G.vcf
+bcftools concat $files_list | bcftools view -H -Ov -o 1000G.vcf
Experiments/preprocessing/estimate_region_weights.py

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+# Estimate region weights using samples that do not have copy number alterations (if a CNA is present in less than 50% of the samples it is still OK).
+# Usage: python estimate_region_weights.py output.csv sample1_regions.csv sample2_regions.csv sample3_regions.csv
+# where the sampleX_regions.csv files contain the read counts for each region and each cell (same as COMPASS input)
+
+
+import sys
+import numpy as np
+import pandas as pd
+
+regions_proportions={}
+output_file = sys.argv[1]
+for infile in sys.argv[2:]:
+    df = pd.read_csv(infile,sep=",",index_col=0,header=None)
+    df = df / np.sum(df)
+    for i in df.index:
+        region = i.split("_")[-1]
+        if not region in regions_proportions:
+            regions_proportions[region] = []
+        for j in df.columns:
+            regions_proportions[region].append(df.loc[i,j])
+
+regions_weights = {}
+for region in regions_proportions:
+    regions_weights[region] = np.median(regions_proportions[region])
+
+
+with open(output_file,"w") as out:
+    for region in regions_weights:
+        tmp = out.write(region+","+str(regions_weights[region])+"\n")
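As a usage sketch tying this script to the new COMPASS option above (all file names and the COMPASS binary name are placeholders, not part of this commit), weights are estimated from samples without copy number alterations and then passed to COMPASS via --regionweights:

# Hypothetical example: estimate weights from CNA-free samples, then reuse them.
python estimate_region_weights.py region_weights.csv normal1_regions.csv normal2_regions.csv normal3_regions.csv
# The output contains one "region,weight" line per region (median fraction of reads per cell).
./COMPASS -i sample_data --regionweights region_weights.csv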

Experiments/preprocessing/preprocess.py renamed to Experiments/preprocessing/preprocess_loom.py

Lines changed: 84 additions & 7 deletions
@@ -20,7 +20,8 @@
 blacklist = ["1_43815093","1_115256626","2_25463686","2_25467178","2_25469567","2_25458738","2_209113336","2_25536827","2_198267672",\
     "2_209113332","4_106194088","4_106157187","4_106196675","4_106196287","7_148504716","7_148506185","7_148506191","7_148504854",\
     "7_148506194","7_148526908","7_148504717","7_148543582","7_148543583","8_128750698","8_117864842","12_25378673","12_25378676",\
-    "13_28592669","13_28602256","17_29559928","17_29562734","17_7578587","17_7578115","17_7579472","17_7579801","17_29559932",\
+    "10_77210191","10_106721610",\
+    "13_28592669","13_28602256","16_55770629","17_29559928","17_29562734","17_7578587","17_7579472","17_7579801","17_29559932",\
     "17_7579414","17_29483195","17_29559926","17_29562734","17_7579440","20_31024389","21_44524505","21_36259324","X_133527541",\
     "X_44911052","X_39932806","X_39932807","X_44929002","X_15809170","X_39922359","X_15821932","X_15841334","X_15838366","X_15841334",\
     "X_15841336","X_39932907"," X_53426504","X_133549184","X_53426504","X_39914742","X_53426570","X_44949032","X_39921505","X_15827406"]
@@ -93,10 +94,12 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
     with loompy.connect(infile) as ds:
         n_loci_full,n_cells_full = ds.shape
         chromosomes_full = ds.ra["CHROM"]
+        chromosomes_full = [str(chr) for chr in chromosomes_full]
         positions_full = ds.ra["POS"]
         # The loom files appear to not always be sorted. (or partially sorted...)
         # reorder loci by position and chromosome
         chr_order = ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y"]
+
         for chrom in chromosomes_full:
             chr = str(chrom)
             if not chr in chr_order:
@@ -143,10 +146,11 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
                 elif len(genes_at_pos)==1 and (panel_file is None):
                     amplicon_to_gene[amplicons_full[i]] = genes_at_pos[0]
                 else:
-                    amplicon_to_gene[amplicons_full[i]] = amplicon_name.lstrip("MYE_").split("_")[0] # Assume amplicon name is gene_xx, potentially prefixed by MYE_
+                    if amplicon_name.startswith("MYE_"):
+                        amplicon_name = amplicon_name[4:]
+                    amplicon_to_gene[amplicons_full[i]] = amplicon_name.split("_")[0] # Assume amplicon name is gene_xx, potentially prefixed by MYE_
         n_amplicons = len(amplicon_names)
-
-
+
 
         genotypes = ds[:,:]
         # First select loci where the alt allele is present in some cells, to avoid loading the 5 full matrices in memory
@@ -157,7 +161,7 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
             if chr_pos in whitelist_positions: candidate_loci.append(i)
             else:
                 if chr_pos in blacklist: continue
-                n_alt = np.sum( (genotypes[i,:]==1) | (genotypes[i]==2) )
+                n_alt = np.sum( (genotypes[i,:]==1) | (genotypes[i,:]==2) )
                 if n_alt / n_cells_full >= min_frac_cells_alt:
                     candidate_loci.append(i)
         del genotypes
@@ -180,7 +184,6 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
         RO = (ds.layers["RO"][sorted_candidate,:])[reverse_sort,:]
         GQ = (ds.layers["GQ"][sorted_candidate,:])[reverse_sort,:]
         genotypes = (ds[sorted_candidate,:])[reverse_sort,:]
-
         # Set low quality genotypes to "missing"
         prefiltered_loci = []
         lowqual_genotypes = (GQ<min_GQ) | (DP<min_DP) | ( (genotypes!=0)& (AD/(DP+0.1)<min_AF) )
@@ -194,7 +197,6 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
             count_cells_genotyped = np.sum(genotypes[i,:]!=3)
             if count_cells_genotyped/n_cells_full>min_frac_cells_genotyped:
                 prefiltered_loci.append(i)
-
         # Keep cells for which at least X% of the variants are genotyped
         filtered_cells = []
         for j in range(n_cells_full):
@@ -251,6 +253,8 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
             else:
                 variant_frequency = get_1K_freq(SNP_f,str(chromosomes[i]),int(positions[i]),str(ref[i]),str(alt[i]))
 
+            if variant_frequency>0.01:
+                variant_name = "SNP " + variant_name
 
             # Stricter filters for SNPs and silent mutations
             if variant_frequency>0.0001:
@@ -455,6 +459,79 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
         df_genes = pd.DataFrame(gene_matrix,index=index_genes,dtype=int)
         if region=="gene":
             df_genes.to_csv(os.path.join(outdir,basename+"_regions.csv"),sep=",",header=False,index=True)
+
+        #############################################
+        # SCITE input: genotype matrix and gene names
+        genotype_matrix = np.zeros((n_loci,n_cells))
+        gene_names=[]
+
+        for i,locus in enumerate(filtered_loci):
+            gene_names.append(variant_names[i])
+            for j in range(n_cells):
+                if (GQ[filtered_loci[i],filtered_cells[j]]>0.1):
+                    genotype_matrix[i,j] = int(genotypes[filtered_loci[i],filtered_cells[j]])
+                else: # if genotype quality is 0, genotype is unknown
+                    genotype_matrix[i,j]=3
+        np.savetxt(os.path.join(outdir,basename+"_genotypes.csv"),genotype_matrix.astype(int),delimiter=" ",fmt='%i')
+        with open(os.path.join(outdir,basename+".geneNames"), 'w') as outfile:
+            outfile.write('\n'.join(gene_names))
+
+
+        ################################
+        # BiTSC2 input: DP, AD, segments
+
+
+        DP_matrix=np.zeros((n_loci,n_cells))
+        AD_matrix=np.zeros((n_loci,n_cells))
+
+        for i in range(len(filtered_loci)):
+            for j in range(n_cells):
+                AD_matrix[i,j] = int(AD[filtered_loci[i],filtered_cells[j]])
+                DP_matrix[i,j] = int(AD[filtered_loci[i],filtered_cells[j]]) + int(RO[filtered_loci[i],filtered_cells[j]])
+
+        # Add regions which do not have any loci ??
+        region2loci={}
+        df_region = df_genes if region=="gene" else df_amplicons
+        for x in df_region.index:
+            region2loci[x[x.find("_")+1:]] = []
+        for i in df_variants.index:
+            region2loci[df_variants.loc[i,"REGION"]].append(i)
+        regions_depth=[]
+        for x in df_region.index:
+            region = x[1+x.find("_"):]
+            if len(region2loci[region])==0:
+                regions_depth.append(df_region.loc[x,:])
+        if len(regions_depth)>0:
+            regions_depth = np.array(regions_depth)
+            DP_matrix = np.concatenate([DP_matrix,regions_depth],axis=0)
+            AD_matrix = np.concatenate([AD_matrix,np.zeros(regions_depth.shape)],axis=0)
+
+        np.savetxt(os.path.join(outdir,basename+"_full_DP.csv"),DP_matrix.astype(int),delimiter=",",fmt='%i')
+        np.savetxt(os.path.join(outdir,basename+"_full_AD.csv"),AD_matrix.astype(int),delimiter=",",fmt='%i')
+
+        # Subsample cells
+        n_cells_BITSC2 = min(200,n_cells)
+        cells_subset = np.random.choice(n_cells,n_cells_BITSC2,replace=False)
+        DP_matrix = DP_matrix[:,cells_subset]
+        AD_matrix = AD_matrix[:,cells_subset]
+        np.savetxt(os.path.join(outdir,basename+"_DP.csv"),DP_matrix.astype(int),delimiter=",",fmt='%i')
+        np.savetxt(os.path.join(outdir,basename+"_AD.csv"),AD_matrix.astype(int),delimiter=",",fmt='%i')
+
+        # Create genomic segments for BiTSC2
+        segments = []
+        start_region = 0
+        end_region = 0
+        while end_region < n_loci:
+            if end_region==n_loci-1 or amplicon_to_gene[amplicons[filtered_loci[start_region]]] != amplicon_to_gene[amplicons[filtered_loci[end_region+1]]]:
+                segments.append((start_region+1,end_region+1))
+                start_region = end_region+1
+                end_region = start_region
+            else:
+                end_region+=1
+        for i in range(n_loci+1,AD_matrix.shape[0]+1):
+            segments.append((i,i))
+        np.savetxt(os.path.join(outdir,basename+"_segments.csv"),np.array(segments).astype(int),delimiter=",",fmt='%i')
+        np.savetxt(os.path.join(outdir,basename+"_full_segments.csv"),np.array(segments).astype(int),delimiter=",",fmt='%i')
 
 
 