
Commit 6e5db0e

Merge pull request #8 from cbg-ethz/revision
Revision
2 parents 017d694 + 5eb597e commit 6e5db0e


472 files changed: +11139 -10545 lines changed


.github/workflows/build.yml

Lines changed: 0 additions & 19 deletions
This file was deleted.

.github/workflows/docker.yml

Lines changed: 0 additions & 38 deletions
This file was deleted.

COMPASS.cpp

Lines changed: 27 additions & 11 deletions
@@ -24,13 +24,13 @@ int main(int argc, char* argv[]){
     parameters.verbose=false;
     // Read command line arguments
     std::string input_file{};
+    std::string regionweights_file{};
     int n_chains=4;
     int chain_length=5000;
-    int burn_in = 1000;
+    int burn_in = -1;
     double temperature=10;
     double betabin_overdisp = parameters.omega_het;
-    bool use_CNV=true;
-    bool apply_filter_regions = true;
+    bool use_CNA=true;
     bool output_simplified = true;
     std::string output{};
     data.sex = "female";
@@ -40,6 +40,9 @@ int main(int argc, char* argv[]){
         if (strcmp(argv[i],"-i")==0){
            input_file = argv[i+1];
         }
+        else if (strcmp(argv[i],"--regionweights")==0){
+            regionweights_file = argv[i+1];
+        }
         else if (strcmp(argv[i],"--nchains")==0){
            n_chains=atoi(argv[i+1]);
         }
@@ -61,11 +64,22 @@ int main(int argc, char* argv[]){
         else if (strcmp(argv[i],"-d")==0){
            if (strcmp(argv[i+1],"0")==0) parameters.use_doublets=false;
         }
-        else if (strcmp(argv[i],"--CNV")==0){
-           if (strcmp(argv[i+1],"0")==0) use_CNV=false;
+        else if (strcmp(argv[i],"--CNA")==0){
+           if (strcmp(argv[i+1],"0")==0) use_CNA=false;
         }
         else if (strcmp(argv[i],"--filterregions")==0){
-           if (strcmp(argv[i+1],"0")==0) apply_filter_regions=false;
+           if (strcmp(argv[i+1],"0")==0){
+               parameters.filter_regions=false;
+               parameters.filter_regions_CNLOH=false;
+           }
+        }
+        else if (strcmp(argv[i],"--filterregionsCNLOH")==0){
+           if (strcmp(argv[i+1],"0")==0){
+               parameters.filter_regions_CNLOH=false;
+           }
+        }
+        else if (strcmp(argv[i],"--verbose")==0){
+           if (strcmp(argv[i+1],"1")==0) parameters.verbose=true;
         }
         else if (strcmp(argv[i],"--sex")==0){
            data.sex= std::string(argv[i+1]);
@@ -91,8 +105,11 @@ int main(int argc, char* argv[]){
     if (output.size()==0){
         std::cout << "No output name was provided. COMPASS will use the same basename as the input for the output." <<std::endl;
     }
+    if (burn_in==-1){
+        burn_in=chain_length/2;
+    }
 
-    load_CSV(input_file,use_CNV,apply_filter_regions);
+    load_CSV(input_file,regionweights_file,use_CNA);
 
     parameters.omega_het = std::min(parameters.omega_het,betabin_overdisp);
     parameters.omega_het_indel = std::min(parameters.omega_het_indel,betabin_overdisp);
@@ -118,7 +135,7 @@ int main(int argc, char* argv[]){
     for (int i=0;i<n_chains;i++){
         std::srand(i);
         Inference infer{"",temperature,i};
-        best_trees[i] = infer.find_best_tree(use_CNV,chain_length,burn_in);
+        best_trees[i] = infer.find_best_tree(use_CNA,chain_length,burn_in);
         results[i]=best_trees[i].log_score;
     }
     double best_score=-DBL_MAX;
@@ -129,11 +146,10 @@ int main(int argc, char* argv[]){
             best_score_index = i;
         }
     }
-    if (output_simplified) best_trees[best_score_index].to_dot_pretty(output);
-    else best_trees[best_score_index].to_dot(output);
+    if (output_simplified) best_trees[best_score_index].to_dot(output,true);
+    else best_trees[best_score_index].to_dot(output,false);
 
     std::string gv_filename(output);
-    std::cout<<output.size() << std::endl;
     if ( output.size()<= 3 || (output.size()>3 && output.substr(output.size()-3)!=".gv")){
         gv_filename = output + + "_tree.gv";
     }
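A minimal command-line sketch of the revised interface follows, using only options visible in this diff. The binary name, input basename, and weights file name are placeholders (not taken from this commit); with no explicit burn-in, the new default of chain_length/2 applies.

# Hypothetical invocation; ./COMPASS, sample_data and region_weights.csv are placeholders.
./COMPASS -i sample_data --nchains 4 --CNA 1 --regionweights region_weights.csv --verbose 1
# Turning off region filtering now also disables CN-LOH region filtering:
./COMPASS -i sample_data --filterregions 0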

Experiments/preprocessing/README.md

Lines changed: 1 addition & 1 deletion
@@ -28,4 +28,4 @@ In case, you already know which mutations to include in the analysis, the list o
 ## Population Frequency
 
 Optionally, the preprocessing script can take as input a tsv file containing the population frequency of variants. This is used in the preprocessing to remove germline variants (unless they appear to be affected by LOH in some cells) and, in COMPASS, to penalize variants with a high population frequency which are not placed at the root (since they are likely to be germline variants).
-The file that we used can be downloaded [here](https://polybox.ethz.ch/index.php/s/V5Wr1wCrAAZw1S5). It was generated using the script `download_1000G.sh`, which was adapted from [this script](https://github.com/single-cell-genetics/cellSNP/blob/master/SNPlist_1Kgenome.sh).
+The file that we used was generated using the script `download_1000G.sh`, which was adapted from [this script](https://github.com/single-cell-genetics/cellSNP/blob/master/SNPlist_1Kgenome.sh).

Experiments/preprocessing/download_1000G.sh

Lines changed: 1 addition & 1 deletion
@@ -16,4 +16,4 @@ for chr in `seq 1 22` X; do
     files_list="$files_list chr${chr}.vcf.gz"
 done
 
-bcftools concat $files_list | bcftools view -H -Oz -o 1000G.vcf
+bcftools concat $files_list | bcftools view -H -Ov -o 1000G.vcf
Experiments/preprocessing/estimate_region_weights.py

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+# Estimate region weights using samples that do not have copy number alterations (if a CNA is present in less than 50% of the samples it is still OK).
+# Usage: python estimate_region_weights.py output.csv sample1_regions.csv sample2_regions.csv sample3_regions.csv
+# where the sampleX_regions.csv files contain the read counts for each region and each cell (same as COMPASS input)
+
+
+import sys
+import numpy as np
+import pandas as pd
+
+regions_proportions={}
+output_file = sys.argv[1]
+for infile in sys.argv[2:]:
+    df = pd.read_csv(infile,sep=",",index_col=0,header=None)
+    df = df / np.sum(df)
+    for i in df.index:
+        region = i.split("_")[-1]
+        if not region in regions_proportions:
+            regions_proportions[region] = []
+        for j in df.columns:
+            regions_proportions[region].append(df.loc[i,j])
+
+regions_weights = {}
+for region in regions_proportions:
+    regions_weights[region] = np.median(regions_proportions[region])
+
+
+with open(output_file,"w") as out:
+    for region in regions_weights:
+        tmp = out.write(region+","+str(regions_weights[region])+"\n")
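As a usage sketch tying this script to the new COMPASS option above (all file names and the COMPASS binary name are placeholders, not part of this commit), weights are estimated from samples without copy number alterations and then passed to COMPASS via --regionweights:

# Hypothetical example: estimate weights from CNA-free samples, then reuse them.
python estimate_region_weights.py region_weights.csv normal1_regions.csv normal2_regions.csv normal3_regions.csv
# The output contains one "region,weight" line per region (median fraction of reads per cell).
./COMPASS -i sample_data --regionweights region_weights.csv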

Experiments/preprocessing/preprocess.py renamed to Experiments/preprocessing/preprocess_loom.py

Lines changed: 84 additions & 7 deletions
@@ -20,7 +20,8 @@
 blacklist = ["1_43815093","1_115256626","2_25463686","2_25467178","2_25469567","2_25458738","2_209113336","2_25536827","2_198267672",\
     "2_209113332","4_106194088","4_106157187","4_106196675","4_106196287","7_148504716","7_148506185","7_148506191","7_148504854",\
     "7_148506194","7_148526908","7_148504717","7_148543582","7_148543583","8_128750698","8_117864842","12_25378673","12_25378676",\
-    "13_28592669","13_28602256","17_29559928","17_29562734","17_7578587","17_7578115","17_7579472","17_7579801","17_29559932",\
+    "10_77210191","10_106721610",\
+    "13_28592669","13_28602256","16_55770629","17_29559928","17_29562734","17_7578587","17_7579472","17_7579801","17_29559932",\
     "17_7579414","17_29483195","17_29559926","17_29562734","17_7579440","20_31024389","21_44524505","21_36259324","X_133527541",\
     "X_44911052","X_39932806","X_39932807","X_44929002","X_15809170","X_39922359","X_15821932","X_15841334","X_15838366","X_15841334",\
     "X_15841336","X_39932907"," X_53426504","X_133549184","X_53426504","X_39914742","X_53426570","X_44949032","X_39921505","X_15827406"]
@@ -93,10 +94,12 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
     with loompy.connect(infile) as ds:
         n_loci_full,n_cells_full = ds.shape
         chromosomes_full = ds.ra["CHROM"]
+        chromosomes_full = [str(chr) for chr in chromosomes_full]
         positions_full = ds.ra["POS"]
         # The loom files appear to not always be sorted. (or partially sorted...)
         # reorder loci by position and chromosome
         chr_order = ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y"]
+
         for chrom in chromosomes_full:
             chr = str(chrom)
             if not chr in chr_order:
@@ -143,10 +146,11 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
                 elif len(genes_at_pos)==1 and (panel_file is None):
                     amplicon_to_gene[amplicons_full[i]] = genes_at_pos[0]
                 else:
-                    amplicon_to_gene[amplicons_full[i]] = amplicon_name.lstrip("MYE_").split("_")[0] # Assume amplicon name is gene_xx, potentially prefixed by MYE_
+                    if amplicon_name.startswith("MYE_"):
+                        amplicon_name = amplicon_name[4:]
+                    amplicon_to_gene[amplicons_full[i]] = amplicon_name.split("_")[0] # Assume amplicon name is gene_xx, potentially prefixed by MYE_
         n_amplicons = len(amplicon_names)
-
-
+
 
         genotypes = ds[:,:]
         # First select loci where the alt allele is present in some cells, to avoid loading the 5 full matrices in memory
@@ -157,7 +161,7 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
             if chr_pos in whitelist_positions: candidate_loci.append(i)
             else:
                 if chr_pos in blacklist: continue
-                n_alt = np.sum( (genotypes[i,:]==1) | (genotypes[i]==2) )
+                n_alt = np.sum( (genotypes[i,:]==1) | (genotypes[i,:]==2) )
                 if n_alt / n_cells_full >= min_frac_cells_alt:
                     candidate_loci.append(i)
         del genotypes
@@ -180,7 +184,6 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
         RO = (ds.layers["RO"][sorted_candidate,:])[reverse_sort,:]
         GQ = (ds.layers["GQ"][sorted_candidate,:])[reverse_sort,:]
         genotypes = (ds[sorted_candidate,:])[reverse_sort,:]
-
         # Set low quality genotypes to "missing"
         prefiltered_loci = []
         lowqual_genotypes = (GQ<min_GQ) | (DP<min_DP) | ( (genotypes!=0)& (AD/(DP+0.1)<min_AF) )
@@ -194,7 +197,6 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
             count_cells_genotyped = np.sum(genotypes[i,:]!=3)
             if count_cells_genotyped/n_cells_full>min_frac_cells_genotyped:
                 prefiltered_loci.append(i)
-
         # Keep cells for which at least X% of the variants are genotyped
         filtered_cells = []
         for j in range(n_cells_full):
@@ -251,6 +253,8 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
             else:
                 variant_frequency = get_1K_freq(SNP_f,str(chromosomes[i]),int(positions[i]),str(ref[i]),str(alt[i]))
 
+            if variant_frequency>0.01:
+                variant_name = "SNP " + variant_name
 
             # Stricter filters for SNPs and silent mutations
             if variant_frequency>0.0001:
@@ -455,6 +459,79 @@ def convert_loom(infile,outdir, min_GQ, min_DP, min_AF, min_frac_cells_genotyped
         df_genes = pd.DataFrame(gene_matrix,index=index_genes,dtype=int)
         if region=="gene":
             df_genes.to_csv(os.path.join(outdir,basename+"_regions.csv"),sep=",",header=False,index=True)
+
+        #############################################
+        # SCITE input: genotype matrix and gene names
+        genotype_matrix = np.zeros((n_loci,n_cells))
+        gene_names=[]
+
+        for i,locus in enumerate(filtered_loci):
+            gene_names.append(variant_names[i])
+            for j in range(n_cells):
+                if (GQ[filtered_loci[i],filtered_cells[j]]>0.1):
+                    genotype_matrix[i,j] = int(genotypes[filtered_loci[i],filtered_cells[j]])
+                else: # if genotype quality is 0, genotype is unknown
+                    genotype_matrix[i,j]=3
+        np.savetxt(os.path.join(outdir,basename+"_genotypes.csv"),genotype_matrix.astype(int),delimiter=" ",fmt='%i')
+        with open(os.path.join(outdir,basename+".geneNames"), 'w') as outfile:
+            outfile.write('\n'.join(gene_names))
+
+
+        ################################
+        # BiTSC2 input: DP, AD, segments
+
+
+        DP_matrix=np.zeros((n_loci,n_cells))
+        AD_matrix=np.zeros((n_loci,n_cells))
+
+        for i in range(len(filtered_loci)):
+            for j in range(n_cells):
+                AD_matrix[i,j] = int(AD[filtered_loci[i],filtered_cells[j]])
+                DP_matrix[i,j] = int(AD[filtered_loci[i],filtered_cells[j]]) + int(RO[filtered_loci[i],filtered_cells[j]])
+
+        # Add regions which do not have any loci ??
+        region2loci={}
+        df_region = df_genes if region=="gene" else df_amplicons
+        for x in df_region.index:
+            region2loci[x[x.find("_")+1:]] = []
+        for i in df_variants.index:
+            region2loci[df_variants.loc[i,"REGION"]].append(i)
+        regions_depth=[]
+        for x in df_region.index:
+            region = x[1+x.find("_"):]
+            if len(region2loci[region])==0:
+                regions_depth.append(df_region.loc[x,:])
+        if len(regions_depth)>0:
+            regions_depth = np.array(regions_depth)
+            DP_matrix = np.concatenate([DP_matrix,regions_depth],axis=0)
+            AD_matrix = np.concatenate([AD_matrix,np.zeros(regions_depth.shape)],axis=0)
+
+        np.savetxt(os.path.join(outdir,basename+"_full_DP.csv"),DP_matrix.astype(int),delimiter=",",fmt='%i')
+        np.savetxt(os.path.join(outdir,basename+"_full_AD.csv"),AD_matrix.astype(int),delimiter=",",fmt='%i')
+
+        # Subsample cells
+        n_cells_BITSC2 = min(200,n_cells)
+        cells_subset = np.random.choice(n_cells,n_cells_BITSC2,replace=False)
+        DP_matrix = DP_matrix[:,cells_subset]
+        AD_matrix = AD_matrix[:,cells_subset]
+        np.savetxt(os.path.join(outdir,basename+"_DP.csv"),DP_matrix.astype(int),delimiter=",",fmt='%i')
+        np.savetxt(os.path.join(outdir,basename+"_AD.csv"),AD_matrix.astype(int),delimiter=",",fmt='%i')
+
+        # Create genomic segments for BiTSC2
+        segments = []
+        start_region = 0
+        end_region = 0
+        while end_region < n_loci:
+            if end_region==n_loci-1 or amplicon_to_gene[amplicons[filtered_loci[start_region]]] != amplicon_to_gene[amplicons[filtered_loci[end_region+1]]]:
+                segments.append((start_region+1,end_region+1))
+                start_region = end_region+1
+                end_region = start_region
+            else:
+                end_region+=1
+        for i in range(n_loci+1,AD_matrix.shape[0]+1):
+            segments.append((i,i))
+        np.savetxt(os.path.join(outdir,basename+"_segments.csv"),np.array(segments).astype(int),delimiter=",",fmt='%i')
+        np.savetxt(os.path.join(outdir,basename+"_full_segments.csv"),np.array(segments).astype(int),delimiter=",",fmt='%i')
 
 
 