From 732932d16a53af8574a3ac10e49e17f5240b08e2 Mon Sep 17 00:00:00 2001 From: Patrick Sorn Date: Thu, 24 Aug 2023 11:02:56 +0200 Subject: [PATCH 1/2] Added test sample output data; Simplified code; Updated bowtie version to allow native gzip input --- README.md | 16 +- confidence.py | 7 +- environment.yml | 2 +- fourdigits.py | 4 +- run_test.sh | 2 +- seq2HLA.py | 139 ++++++++++-------- test.py | 31 ++++ test_data/{ => input}/sample_R1.fastq.gz | Bin test_data/{ => input}/sample_R2.fastq.gz | Bin .../output/hla_1_classical.HLAgenotype2digits | 4 + .../output/hla_1_classical.HLAgenotype4digits | 4 + test_data/output/hla_1_classical.expression | 3 + .../hla_1_nonclassical.HLAgenotype2digits | 10 ++ .../hla_1_nonclassical.HLAgenotype4digits | 10 ++ .../output/hla_1_nonclassical.expression | 9 ++ .../output/hla_2_classical.HLAgenotype2digits | 7 + .../output/hla_2_classical.HLAgenotype4digits | 7 + test_data/output/hla_2_classical.expression | 6 + 18 files changed, 183 insertions(+), 78 deletions(-) create mode 100644 test.py rename test_data/{ => input}/sample_R1.fastq.gz (100%) rename test_data/{ => input}/sample_R2.fastq.gz (100%) create mode 100644 test_data/output/hla_1_classical.HLAgenotype2digits create mode 100644 test_data/output/hla_1_classical.HLAgenotype4digits create mode 100644 test_data/output/hla_1_classical.expression create mode 100644 test_data/output/hla_1_nonclassical.HLAgenotype2digits create mode 100644 test_data/output/hla_1_nonclassical.HLAgenotype4digits create mode 100644 test_data/output/hla_1_nonclassical.expression create mode 100644 test_data/output/hla_2_classical.HLAgenotype2digits create mode 100644 test_data/output/hla_2_classical.HLAgenotype4digits create mode 100644 test_data/output/hla_2_classical.expression diff --git a/README.md b/README.md index e9fd170..0387007 100755 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ as input, uses a bowtie index comprising all HLA alleles and outputs the most li - biopython (>= 1.79) - numpy (>= 1.21.5) - scipy (>= 1.7.3) -- bowtie (>= 1.0.0) +- bowtie (>= 1.3.1) ## Installation @@ -50,13 +50,13 @@ optional arguments: ## Output The results are output to stdout and to textfiles. Most important are: -- -ClassI.HLAgenotype2digits => 2 digit result of Class I -- -ClassII.HLAgenotype2digits => 2 digit result of Class II -- -ClassI.HLAgenotype4digits => 4 digit result of Class I -- -ClassII.HLAgenotype4digits => 4 digit result of Class II -- .ambiguity => reports typing ambuigities (more than one solution for an allele possible) -- -ClassI.expression => expression of Class I alleles -- -ClassII.expression => expression of Class II alleles +- `hla_1_*.HLAgenotype2digits` - 2 digit result of Class I +- `hla_2_classical.HLAgenotype2digits` - 2 digit result of Class II +- `hla_1_*.HLAgenotype4digits` - 4 digit result of Class I +- `hla_2_classical.HLAgenotype4digits` - 4 digit result of Class II +- `hla.ambiguity` - reports typing ambuigities (more than one solution for an allele possible) +- `hla_1_*.expression` - expression of Class I alleles +- `hla_2_classical.expression` - expression of Class II alleles ## Version history diff --git a/confidence.py b/confidence.py index 3d5472b..8c52038 100644 --- a/confidence.py +++ b/confidence.py @@ -3,9 +3,14 @@ import numpy as np import scipy.stats as st +np.seterr("raise") def calculate_confidence(allele_count, allele_vector): - paril = 1.0 - st.norm.cdf(allele_count, np.mean(allele_vector), np.std(allele_vector, ddof = 1)) + if not allele_vector or len(allele_vector) < 2: + return 1.0 + mean = np.mean(allele_vector) + std = np.std(allele_vector, ddof = 1) + paril = 1.0 - st.norm.cdf(allele_count, mean, std) poutlier = st.binom.pmf(0, len(allele_vector), paril) return (1.0 - poutlier) diff --git a/environment.yml b/environment.yml index 35f9f44..eb45195 100644 --- a/environment.yml +++ b/environment.yml @@ -6,7 +6,7 @@ channels: - defaults dependencies: - biopython=1.79 - - bowtie=1.0.0 + - bowtie=1.3.1 - numpy=1.21.5 - scipy=1.7.3 - python=3.7.16 diff --git a/fourdigits.py b/fourdigits.py index 3c5c415..6c47f64 100755 --- a/fourdigits.py +++ b/fourdigits.py @@ -176,13 +176,13 @@ def calculate_outlier(top, bgVec): """Call R-script "commmand_fourdigit.R" to calculate the confidence of the top-scoring allele (four-digits).""" # Convert bgVec into md5 - md5sum = hashlib.md5(bgVec.encode("utf-8")).hexdigest() + md5sum = hashlib.md5(','.join(str(x) for x in bgVec).encode("utf-8")).hexdigest() if not md5sum in top_alleles: top_alleles[md5sum] = {} if top in top_alleles[md5sum]: return top_alleles[md5sum][top] else: - allele_list = [float(x) for x in bgVec.rstrip(",").split(",")] + allele_list = [float(x) for x in bgVec] confidence = calculate_confidence(float(top), allele_list) top_alleles[md5sum][top] = confidence return confidence diff --git a/run_test.sh b/run_test.sh index b1daa0d..4707619 100644 --- a/run_test.sh +++ b/run_test.sh @@ -2,4 +2,4 @@ rm -rf test_out/ -python seq2HLA.py -1 test_data/sample_R1.fastq.gz -2 test_data/sample_R2.fastq.gz -o test_out/ -p 12 +python seq2HLA.py -1 test_data/input/sample_R1.fastq.gz -2 test_data/input/sample_R2.fastq.gz -o test_out/ -p 12 diff --git a/seq2HLA.py b/seq2HLA.py index 90df8f2..01bab62 100755 --- a/seq2HLA.py +++ b/seq2HLA.py @@ -12,7 +12,6 @@ from argparse import ArgumentParser -from glob import glob import gzip import linecache import logging @@ -72,8 +71,12 @@ def get_P(locus, finaloutput): return linecache.getline(finaloutput, locus_dict[locus]).split("\t")[4][0:-1] -#In case of ambiguous typings, the allele(s) with the best p-value (which is actually the smallest one, so the name of the function is misleading - sorry) is reported and thus the min(p) is returned -def get_max_P(infile): + +def get_best_P(infile): + """ + In case of ambiguous typings, the allele(s) with the best p-value + (which is actually the smallest one) is reported and thus the min(p) is returned. + """ p = [] with open(infile) as inf: for line in inf: @@ -87,8 +90,30 @@ def get_max_P(infile): return "NA" -#Open sam-file and count mappings for each allele + +def get_best_allele_per_group(readspergroup, readcount): + """ + For each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save + #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele + #ii) the number of reads mapping to the top-scoring allele => readspergroup + """ + + top_counts = {} + top_alleles = {} + for group in readspergroup: + top_counts[group] = readspergroup[group] + for key in readcount: + if readcount[key] > 0: + group = key.split(":")[0] + if top_counts[group] <= readcount[key]: + top_counts[group] = readcount[key] + top_alleles[group] = key + return (top_counts, top_alleles) + + + def parse_mapping(hla_dict, sam, readcount): + """Open sam-file and count mappings for each allele.""" with open(sam) as samhandle: for line in samhandle: if line[0] != '@': @@ -229,6 +254,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl output3 = os.path.join(self.working_dir, "{}.digitalhaplotype3".format(prefix)) medianfile = os.path.join(self.working_dir, "{}.digitalhaplotype1".format(prefix)) ambiguityfile = os.path.join(self.working_dir, "hla.ambiguity") + expressionfile = os.path.join(self.working_dir, "{}.expression".format(prefix)) finaloutput = os.path.join(self.working_dir, "{}.HLAgenotype2digits".format(prefix)) finaloutput4digit = os.path.join(self.working_dir, "{}.HLAgenotype4digits".format(prefix)) @@ -274,7 +300,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl self.report_HLA_genotype(output1, output2, finaloutput, locus_list) self.logger.info("Calculating locus-specific expression...") try: - self.expression(aligned_1, sam1, bowtielog, output1, locus_list, length_dict, hla_dict) + self.expression(aligned_1, sam1, expressionfile, bowtielog, output1, locus_list, length_dict, hla_dict) except IOError: #tmp = "" for locus in locus_list: @@ -324,7 +350,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl if fourDigit_solutions1[locus]: if int(fourDigit_solutions1[locus]) > 1: allele1 += "'" - finalAlleles[locus] = "{}\t{}\t".format(allele1, get_max_P("{}.4digits{}1.solutions".format(sam1, locus))) + finalAlleles[locus] = "{}\t{}\t".format(allele1, get_best_P("{}.4digits{}1.solutions".format(sam1, locus))) else: finalAlleles[locus] = "no\tNA\t" @@ -343,13 +369,13 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl allele3 = f3[alleleFourDigit_index] if int(f3[alleleFourDigit_index+1]) > 1: allele3 += "'" - finalAlleles[locus] += "{}\t{}".format(allele3, get_max_P("{}.4digits{}2.solutions".format(sam3, locus))) + finalAlleles[locus] += "{}\t{}".format(allele3, get_best_P("{}.4digits{}2.solutions".format(sam3, locus))) alleleFourDigit_index += 3 else: allele2 = fourdigits2[locus] if int(fourDigit_solutions2[locus]) > 1: allele2 += "'" - finalAlleles[locus] += "{}\t{}".format(allele2, get_max_P("{}.4digits{}2.solutions".format(sam2, locus))) + finalAlleles[locus] += "{}\t{}".format(allele2, get_best_P("{}.4digits{}2.solutions".format(sam2, locus))) alleleFourDigit_index += 3 #output the 4-digit type (stdout and to file) @@ -361,7 +387,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl finalAlleles[locus] = "no\tNA\tno\tNA" self.report_HLA_four_digit_genotype(finalAlleles, finaloutput4digit, locus_list) - self.cleanup() + #self.cleanup() def cleanup(self): @@ -379,14 +405,16 @@ def cleanup(self): "hla_2_classical.HLAgenotype2digits", "hla_2_classical.HLAgenotype4digits", "hla.ambiguity", - "run.expression", + "hla_1_classical.expression", + "hla_1_nonclassical.expression", + "hla_2_classical.expression", "run.log" ] for filename in os.listdir(self.working_dir): if not filename in files_to_keep: file_path = os.path.join(self.working_dir, filename) - self.logger.info("Removing {}.".format(file_path)) + self.logger.debug("Removing {}.".format(file_path)) os.remove(file_path) @@ -397,23 +425,13 @@ def mapping(self, sam, aligned_file, bowtielog, fq1, fq2, bowtiebuild, iteration self.logger.info("Skipping mapping step as already done...") return if iteration == 1: - if not self.gzipped: - mapping_cmd = "(bowtie -3 {} -S {} --al {} {} -1 {} -2 {} | awk -F \'\\t\' '$3 != \"*\"{{ print $0 }}' > {}) 2> {}".format(self.trim3, mapopt, aligned_file, bowtiebuild, fq1, fq2, sam, bowtielog) - else: - mapping_cmd = "(bowtie -3 {} -S {} --al {} {} -1 <(zcat {}) -2 <(zcat {}) | awk -F \'\\t\' '$3 != \"*\"{{ print $0 }}' > {}) 2> {}".format(self.trim3, mapopt, aligned_file, bowtiebuild, fq1, fq2, sam, bowtielog) + mapping_cmd = "(bowtie -3 {} -S {} --al {} -x {} -1 {} -2 {} | awk -F \'\\t\' '$3 != \"*\"{{ print $0 }}' > {}) 2> {}".format(self.trim3, mapopt, aligned_file, bowtiebuild, fq1, fq2, sam, bowtielog) elif iteration == 2: - mapping_cmd = "bowtie -3 {} -S {} {} -1 {} -2 {} {}".format(self.trim3, mapopt, bowtiebuild, fq1, fq2, sam) + mapping_cmd = "bowtie -3 {} -S {} -x {} -1 {} -2 {} {}".format(self.trim3, mapopt, bowtiebuild, fq1, fq2, sam) #execute bowtie self.logger.info("CMD: {}".format(mapping_cmd)) process_mapping = subprocess.Popen(['bash', '-c', mapping_cmd], stdout=subprocess.PIPE) out, err = process_mapping.communicate() - - if iteration == 1: - #print alignment stats - printcommand = "cat {}".format(bowtielog) - printcommand_proc = subprocess.Popen(['bash', '-c', printcommand], stdout=subprocess.PIPE) - out, err = printcommand_proc.communicate() - self.logger.info(out) def create_ref_dict(self, hlafasta, class1_list): @@ -465,31 +483,22 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r readspergroup_dict[locus] = {} fourdigits_dict[locus] = {} fourdigits_sorted_dict[locus] = {} - alleleVector[locus] = "" + alleleVector[locus] = [] - maxallelepergroup = {} - - #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save - #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele - #ii) the number of reads mapping to the top-scoring allele => readspergroup - for key in readcount: - if readcount[key] > 0: - group = key.split(":")[0] - if readspergroup[group] <= readcount[key]: - readspergroup[group] = readcount[key] - maxallelepergroup[group] = key - - readspergrouphandle = open("{}.readspergrouphandle".format(sam), "a") - for group in readspergroup: - readspergrouphandle.write("{}\t{}\n".format(group, readspergroup[group])) - readspergrouphandle.close() + + (top_counts, top_alleles) = get_best_allele_per_group(readspergroup, readcount) + + + with open("{}.readspergrouphandle".format(sam), "a") as outf: + for group in top_counts: + outf.write("{}\t{}\n".format(group, top_counts[group])) #consider all A-,B-, and C-groups seperately #readspergrouplist = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration) #readspergroup = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. B*27) - for key in readspergroup: + for key in top_counts: locus = key.split("*")[0] - readspergroup_dict_list[locus].append(readspergroup[key] - float(medians[locus])) - readspergroup_dict[locus][key] = readspergroup[key] - float(medians[locus]) + readspergroup_dict_list[locus].append(top_counts[key] - float(medians[locus])) + readspergroup_dict[locus][key] = top_counts[key] - float(medians[locus]) #Determine top-scoring group of the whole locus (A,B,C) and store it #maxkey = group (e.g. A*02) with the most reads @@ -499,7 +508,7 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r if len(readspergroup_dict_list[locus]) > 0: maxkey[locus] = max(readspergroup_dict[locus], key=lambda a:readspergroup_dict[locus].get(a)) if readspergroup_dict[locus][maxkey[locus]] > 0: - maxAlleles[locus] = maxallelepergroup[maxkey[locus]] + maxAlleles[locus] = top_alleles[maxkey[locus]] allele_dict[locus] = maxkey[locus] else: allele_dict[locus] = "no" @@ -519,10 +528,9 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r else: iteration = 1 - fourdigit_writehandle = open("{}.4digits{}{}".format(sam, locus, iteration), "w") - for key in fourdigits_sorted_dict[locus]: - fourdigit_writehandle.write("{}: {}\n".format(key[0], key[1])) - fourdigit_writehandle.close() + with open("{}.4digits{}{}".format(sam, locus, iteration), "w") as outf: + for key in fourdigits_sorted_dict[locus]: + outf.write("{}: {}\n".format(key[0], key[1])) readspergrouplocus = sorted(readspergroup_dict[locus].items(), key=itemgetter(1), reverse=True) @@ -530,36 +538,37 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r #in the 2nd iteration: #1.) DO NOT remove the top-scoring group from the set vec as this is more strict when calculating the probability of the top scoring group being an outlier #The strings vec are used by the R-script to calculate the probability of the top scoring group being an outlier - for key in maxallelepergroup: + + for key in top_alleles: if key.split("*")[0] == locus: - alleleVector[locus] += "{},".format(readcount[maxallelepergroup[key]]) + alleleVector[locus].append(readcount[top_alleles[key]]) #2.) Add the decision thresholds to the sets vec, as this enables measuring the distance of the top-scoring group to this distance if allele_dict[locus] == "no": - alleleVector[locus] += str(medians[locus]) + alleleVector[locus].append(int(float(medians[locus]))) #3.) In case of, e.g. a loss of whole HLA-locus (A,B,C), vec only contain median[<0|1|2>]. #To avoid errors in the R-Script, set to 0 - if alleleVector[locus] == "" or alleleVector[locus] == medians[locus]: - alleleVector[locus] = "0" - alleleCount[locus] = "0" + if not alleleVector[locus] or alleleVector[locus][0] == int(float(medians[locus])): + alleleVector[locus] = [] + alleleCount[locus] = 0 else: - alleleCount[locus] = str(readcount[maxallelepergroup[maxkey[locus]]]) + alleleCount[locus] = readcount[top_alleles[maxkey[locus]]] else: #in the 1st iteration: remove the top-scoring group from the set vec as this increases the certainty when calculating the probability of the top scoring group being an outlier - for key in maxallelepergroup: + for key in top_alleles: if key.split("*")[0] == locus: if key != maxkey[locus]: - alleleVector[locus] += "{},".format(readcount[maxallelepergroup[key]]) + alleleVector[locus].append(readcount[top_alleles[key]]) #2.) DO NOT add the decision thresholds to the sets vec - if alleleVector[locus] == "": - alleleVector[locus] = "0" - alleleCount[locus] = "0" + if not alleleVector[locus]: + alleleCount[locus] = 0 else: - alleleCount[locus] = str(readcount[maxallelepergroup[maxkey[locus]]]) + alleleCount[locus] = readcount[top_alleles[maxkey[locus]]] + confidence_vals = [] for locus in locus_list: - allele_list = [float(x) for x in alleleVector[locus].rstrip(",").split(",")] + allele_list = [float(x) for x in alleleVector[locus]] confidence_val = calculate_confidence(float(alleleCount[locus]), allele_list) confidence_vals.append((maxkey[locus], confidence_val, 1 - confidence_val)) @@ -572,7 +581,7 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r self.logger.info("Determining 4 digits HLA type...") for locus in locus_list: if not allele_dict[locus] == "no": - fourDigitString += "{},{},".format(allele_dict[locus], fourdigits.determine_four_digits_main("{}.4digits{}{}".format(sam, locus, iteration), alleleVector[locus].rstrip(","), hla_class, hla_classification, ambiguityfile)) + fourDigitString += "{},{},".format(allele_dict[locus], fourdigits.determine_four_digits_main("{}.4digits{}{}".format(sam, locus, iteration), alleleVector[locus], hla_class, hla_classification, ambiguityfile)) else: fourDigitString += "{},,,".format(allele_dict[locus]) @@ -717,7 +726,7 @@ def report_HLA_four_digit_genotype(self, finalAlleles, finaloutput4digit, locus_ self.logger.info("{}\t{}".format(locus, finalAlleles[locus])) - def expression(self, aligned, sam, logfile, alleles_in, locus_list, length_dict, hla_dict): + def expression(self, aligned, sam, output, logfile, alleles_in, locus_list, length_dict, hla_dict): """Calculate locus-specific expression.""" totalreads = float(linecache.getline(logfile, 1).split(':')[1]) @@ -756,7 +765,7 @@ def expression(self, aligned, sam, logfile, alleles_in, locus_list, length_dict, count[locus] += float(1.0 / float(n)) #Calculate RPKM and print expression values for each locus to stdout - with open(os.path.join(self.working_dir, "hla.expression"), 'w') as outf: + with open(output, 'w') as outf: for locus in count: rpkm = float((1000.0 / length_dict[locus])) * float((1000000.0 / totalreads)) * count[locus] self.logger.info("{}: {} RPKM".format(locus, rpkm)) diff --git a/test.py b/test.py new file mode 100644 index 0000000..a7307ac --- /dev/null +++ b/test.py @@ -0,0 +1,31 @@ +import unittest + +from seq2HLA import get_best_allele_per_group + + +class TestSeq2HLAMethods(unittest.TestCase): + def test_get_best_allele_per_group(self): + """ + Test that it can sum a list of integers + """ + readspergroup = { + 'A*01': 0, + 'A*02': 0, + 'B*45': 0, + 'C*04': 0, + 'G*29': 0 + } + readcount = { + 'A*01:05': 90, + 'A*01:01:02': 122, + 'B*45:07': 116, + 'C*04:02': 74 + } + + result = get_best_allele_per_group(readspergroup, readcount) + print(result) + self.assertEqual(result, ({'A*01': 122, 'A*02': 0, 'B*45': 116, 'C*04': 74, 'G*29': 0}, {'A*01': 'A*01:01:02', 'B*45': 'B*45:07', 'C*04': 'C*04:02'})) + + +if __name__ == '__main__': + unittest.main() diff --git a/test_data/sample_R1.fastq.gz b/test_data/input/sample_R1.fastq.gz similarity index 100% rename from test_data/sample_R1.fastq.gz rename to test_data/input/sample_R1.fastq.gz diff --git a/test_data/sample_R2.fastq.gz b/test_data/input/sample_R2.fastq.gz similarity index 100% rename from test_data/sample_R2.fastq.gz rename to test_data/input/sample_R2.fastq.gz diff --git a/test_data/output/hla_1_classical.HLAgenotype2digits b/test_data/output/hla_1_classical.HLAgenotype2digits new file mode 100644 index 0000000..d903c0f --- /dev/null +++ b/test_data/output/hla_1_classical.HLAgenotype2digits @@ -0,0 +1,4 @@ +#Locus Allele 1 Confidence Allele 2 Confidence +A A*24 0.0009938365 A*02 0.0009960293 +B B*35 0.02892281 B*15 0.0008318822 +C C*04 3.766758e-07 hoz("C*03") 0.0651378 diff --git a/test_data/output/hla_1_classical.HLAgenotype4digits b/test_data/output/hla_1_classical.HLAgenotype4digits new file mode 100644 index 0000000..e351b6b --- /dev/null +++ b/test_data/output/hla_1_classical.HLAgenotype4digits @@ -0,0 +1,4 @@ +#Locus Allele 1 Confidence Allele 2 Confidence +A A*24:02 0.0009938365 A*02:01 0.001056695 +B B*35:14' 0.02892281 B*15:01' 0.0008609877 +C C*04:01 3.766758e-07 C*04:01 0.0651378 diff --git a/test_data/output/hla_1_classical.expression b/test_data/output/hla_1_classical.expression new file mode 100644 index 0000000..cbff77f --- /dev/null +++ b/test_data/output/hla_1_classical.expression @@ -0,0 +1,3 @@ +A: 153474.83 RPKM +C: 299860.82 RPKM +B: 223649.0 RPKM diff --git a/test_data/output/hla_1_nonclassical.HLAgenotype2digits b/test_data/output/hla_1_nonclassical.HLAgenotype2digits new file mode 100644 index 0000000..c209090 --- /dev/null +++ b/test_data/output/hla_1_nonclassical.HLAgenotype2digits @@ -0,0 +1,10 @@ +#Locus Allele 1 Confidence Allele 2 Confidence +E E*01 NA hoz("E*01") NA +F F*01 NA hoz("F*01") NA +G no NA hoz("G*01") NA +H H*02 2.220446e-16 hoz("H*01") 0.0259602 +J J*01 NA hoz("J*01") NA +K K*01 NA hoz("K*01") NA +L L*01 NA hoz("L*01") NA +P no NA hoz("P*02") NA +V no NA hoz("V*01") NA diff --git a/test_data/output/hla_1_nonclassical.HLAgenotype4digits b/test_data/output/hla_1_nonclassical.HLAgenotype4digits new file mode 100644 index 0000000..faef3a4 --- /dev/null +++ b/test_data/output/hla_1_nonclassical.HLAgenotype4digits @@ -0,0 +1,10 @@ +#Locus Allele 1 Confidence Allele 2 Confidence +E E*01:03 NA E*01:03 NA +F F*01:01 NA F*01:01 NA +G no NA no NA +H H*02:06 2.220446e-16 H*02:06 0.0259602 +J J*01:01 NA J*01:01 NA +K K*01:01 NA K*01:01 NA +L L*01:02 NA L*01:02 NA +P no NA no NA +V no NA no NA diff --git a/test_data/output/hla_1_nonclassical.expression b/test_data/output/hla_1_nonclassical.expression new file mode 100644 index 0000000..d2b299c --- /dev/null +++ b/test_data/output/hla_1_nonclassical.expression @@ -0,0 +1,9 @@ +E: 248284.76 RPKM +G: 0.0 RPKM +F: 43661.47 RPKM +H: 55744.05 RPKM +K: 2619.91 RPKM +J: 854.15 RPKM +L: 853.37 RPKM +P: 0.0 RPKM +V: 0.0 RPKM diff --git a/test_data/output/hla_2_classical.HLAgenotype2digits b/test_data/output/hla_2_classical.HLAgenotype2digits new file mode 100644 index 0000000..9976684 --- /dev/null +++ b/test_data/output/hla_2_classical.HLAgenotype2digits @@ -0,0 +1,7 @@ +#Locus Allele 1 Confidence Allele 2 Confidence +DQA1 DQA1*01 0 hoz("DQA1*04") NA +DQB1 DQB1*05 0.3246214 DQB1*06 0.3701177 +DRB1 DRB1*01 0.005774051 DRB1*15 0.02749135 +DRA DRA*01 NA hoz("DRA*01") NA +DPA1 DPA1*01 0.1600902 hoz("DPA1*02") 0.0574801 +DPB1 DPB1*02 8.582919e-10 DPB1*04 0.0109329 diff --git a/test_data/output/hla_2_classical.HLAgenotype4digits b/test_data/output/hla_2_classical.HLAgenotype4digits new file mode 100644 index 0000000..e4740ab --- /dev/null +++ b/test_data/output/hla_2_classical.HLAgenotype4digits @@ -0,0 +1,7 @@ +#Locus Allele 1 Confidence Allele 2 Confidence +DQA1 DQA1*01:02 0.0 DQA1*01:02 NA +DQB1 DQB1*05:01 0.3246214 DQB1*06:02' 0.3701177 +DRB1 DRB1*01:01 0.005774051 DRB1*15:01 0.02749135 +DRA DRA*01:01 NA DRA*01:01 NA +DPA1 DPA1*01:03 0.1600902 DPA1*01:03 0.0574801 +DPB1 DPB1*02:01' 8.582919e-10 DPB1*04:01 0.0109329 diff --git a/test_data/output/hla_2_classical.expression b/test_data/output/hla_2_classical.expression new file mode 100644 index 0000000..8775696 --- /dev/null +++ b/test_data/output/hla_2_classical.expression @@ -0,0 +1,6 @@ +DQB1: 9352.96 RPKM +DQA1: 8557.96 RPKM +DRB1: 45209.69 RPKM +DPB1: 17124.51 RPKM +DRA: 75562.7 RPKM +DPA1: 37577.18 RPKM From a7dd26db6334211f71809f2670ab7fa26129400c Mon Sep 17 00:00:00 2001 From: Patrick Sorn Date: Thu, 24 Aug 2023 12:35:39 +0200 Subject: [PATCH 2/2] Restructured code and added test --- confidence.py | 105 ------------------------------ fourdigits.py => determine_hla.py | 40 ++++++++---- seq2HLA.py | 24 ++++--- test.py | 35 +++++++++- 4 files changed, 70 insertions(+), 134 deletions(-) delete mode 100644 confidence.py rename fourdigits.py => determine_hla.py (90%) diff --git a/confidence.py b/confidence.py deleted file mode 100644 index 8c52038..0000000 --- a/confidence.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python - -import numpy as np -import scipy.stats as st - -np.seterr("raise") - -def calculate_confidence(allele_count, allele_vector): - if not allele_vector or len(allele_vector) < 2: - return 1.0 - mean = np.mean(allele_vector) - std = np.std(allele_vector, ddof = 1) - paril = 1.0 - st.norm.cdf(allele_count, mean, std) - poutlier = st.binom.pmf(0, len(allele_vector), paril) - return (1.0 - poutlier) - - -def main(): - val = [8498, 6256, 25164] - types = ["A*24", "B*35", "C*04"] - val_list = [ - [ - 1244, - 5940, - 1722, - 660, - 5770, - 222, - 688, - 374, - 544, - 1890, - 636, - 4272, - 192, - 132, - 212, - 212, - 1150, - 2426, - 596, - 160 - ], - [ - 2718, - 782, - 1754, - 1284, - 5832, - 2436, - 1088, - 654, - 630, - 924, - 2086, - 420, - 344, - 1316, - 1314, - 4534, - 310, - 3386, - 1664, - 896, - 2002, - 1372, - 4500, - 3398, - 1374, - 3508, - 2194, - 3488, - 1192, - 300, - 436, - 2846, - 78, - 276, - 322 - ], - [ - 6002, - 2068, - 2332, - 6386, - 3518, - 3078, - 5418, - 3028, - 4836, - 2232, - 2034, - 2096, - 16068 - ] - ] - - print("['[1] A*24', '[1] 8498', '[1] 1452.1', '[1] 1810.365', '[1] 0.0009938365', '[1] B*35', '[1] 6256', '[1] 1761.657', '[1] 1430.283', '[1] 0.02892281', '[1] C*04', '[1] 25164', '[1] 4545.846', '[1] 3800.572', '[1] 3.766758e-07', '']") - - - print(calculate_confidence(val[0], val_list[0])) - - -if __name__ == "__main__": - main() diff --git a/fourdigits.py b/determine_hla.py similarity index 90% rename from fourdigits.py rename to determine_hla.py index 6c47f64..5de807e 100755 --- a/fourdigits.py +++ b/determine_hla.py @@ -5,17 +5,29 @@ import numpy as np +import scipy.stats as st -from confidence import calculate_confidence import config as cfg -""" -Determination of the 4-digit HLA type based on the 2-digit determination by seq2HLA.py -""" top_alleles = {} +np.seterr("raise") + +def calculate_confidence(allele_max, allele_count_list): + if not allele_count_list or len(allele_count_list) < 2: + return 1.0 + mean = np.mean(allele_count_list) + std = np.std(allele_count_list, ddof = 1) + paril = 1.0 - st.norm.cdf(allele_max, mean, std) + poutlier = st.binom.pmf(0, len(allele_count_list), paril) + return (1.0 - poutlier) + + def determine_four_digits_main(infile, bgVec, hla_class, hla_classification, ambiguityfile): + """ + Determination of the 4-digit HLA type based on the 2-digit determination by seq2HLA.py + """ if hla_classification == "nonclassical": result = determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile) else: @@ -90,7 +102,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile): if 0.0 < confidence < 0.001: for item in result_dict: if result_dict[item] == top: - writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec))) + writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec))) repeat = False mostProbableAllele = str(item.split(":")[0]) + ":" + str(item.split(":")[1]) numbersolutions = 1 @@ -113,7 +125,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile): count = 0 for item in result_dict: if result_dict[item] >= perc: - writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec))) + writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec))) repeat = False item_split = item.split(":") mostProbableAllele = "{}:{}".format(item_split[0], item_split[1]) @@ -133,7 +145,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile): #for item in result_dict: for item in result_dict2: item_split = item.split(":") - writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec))) + writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec))) repeat = False mostProbableAllele = "{}:{}".format(item_split[0], item_split[1]) numbersolutions = 1 @@ -157,7 +169,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile): ambiguity_handle.write("#Ambiguity:\n") ambiguity_handle.write("#Based on the RNA-Seq reads and the dbMHC table, the following 4-digits alleles are possible:\n") for ambi4DigitAllele in ambiguity_dict: - ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, 1 - calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec))) + ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec))) ambiguity_handle.write("#However, by taking into account the read data, the most probable 4-digit allele is:\n") ambiguity_handle.write("{}\n\n".format(mostProbableAllele)) @@ -167,7 +179,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile): item_split = item.split(":") item4digit = "{}:{}".format(item_split[0], item_split[1]) if result_dict2[item] == max: - writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec))) + writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec))) repeat = False writehandle.close() return "{},{}".format(mostProbableAllele, numbersolutions) @@ -221,7 +233,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile): if 0.0 < confidence < 0.001: for item in result_dict: if result_dict[item] == top: - writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec))) + writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec))) repeat = False item_split = item.split(":") mostProbableAllele = "{}:{}".format(item_split[0], item_split[1]) @@ -245,7 +257,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile): count = 0 for item in result_dict: if result_dict[item] >= perc: - writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec))) + writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec))) repeat = False item_split = item.split(":") mostProbableAllele = "{}:{}".format(item_split[0], item_split[1]) @@ -264,7 +276,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile): if len(set(result)) == 1: #for item in result_dict: for item in result_dict2: - writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec))) + writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec))) repeat = False item_split = item.split(":") mostProbableAllele = "{}:{}".format(item_split[0], item_split[1]) @@ -294,7 +306,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile): ambiguity_handle.write("#Ambiguity:\n") ambiguity_handle.write("#Based on the RNA-Seq reads and the dbMHC table, the following 4-digits alleles are possible:\n") for ambi4DigitAllele in ambiguity_dict: - ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, 1 - calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec))) + ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec))) ambiguity_handle.write("#However, by taking into account the read data, the most probable 4-digit allele is:\n") ambiguity_handle.write("{}\n\n".format(mostProbableAllele)) @@ -306,7 +318,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile): #if np.mean(dbmhc_prob[item4digit]) == max: #if meanPanPopFreq(dbmhc_prob[item4digit]) == max: if result_dict2[item] == max: - writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec))) + writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec))) repeat = False writehandle.close() return "{},{}".format(mostProbableAllele, numbersolutions) diff --git a/seq2HLA.py b/seq2HLA.py index 01bab62..c0b58d6 100755 --- a/seq2HLA.py +++ b/seq2HLA.py @@ -26,8 +26,9 @@ # Internal modules import config as cfg -from confidence import calculate_confidence -import fourdigits + +from determine_hla import calculate_confidence +from determine_hla import determine_four_digits_main from version import version @@ -55,8 +56,11 @@ def get_read_len(fq_file, gzipped): return read_len -#Return the p value of a prediction, which is stored in the intermediate textfile + def get_P(locus, finaloutput): + """ + Return the p value of a prediction, which is stored in the intermediate textfile. + """ locus_dict = { "A": 2, "DQA1": 2, "E": 2, "B": 3, "DQB1": 3, "F": 3, @@ -214,7 +218,6 @@ def run(self): ) - #---------------Class I------------------------------- def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_classification, length_dict): """This method calls HLA types for class I.""" twodigits1 = {} @@ -227,7 +230,6 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl twodigits3 = {} finalAlleles = {} - #-------1st iteration----------------------------------- if hla_class == 1: self.logger.info("----------HLA class I------------") if hla_classification == "classical": @@ -239,7 +241,6 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl prefix = "hla_{}_{}".format(hla_class, hla_classification) - #iteration = 1 fq1_2 = os.path.join(self.working_dir, "{}_iteration_2_1.fq".format(prefix)) fq2_2 = os.path.join(self.working_dir, "{}_iteration_2_2.fq".format(prefix)) sam1 = os.path.join(self.working_dir, "{}_iteration_1.sam".format(prefix)) @@ -281,7 +282,6 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl self.logger.info("Starting 2nd iteration:") self.logger.info("Mapping reads...") medians = {} - #iteration = 2 self.mapping(sam2, aligned, bowtielog, fq1_2, fq2_2, bowtiebuild, 2, mapopt) @@ -302,9 +302,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl try: self.expression(aligned_1, sam1, expressionfile, bowtielog, output1, locus_list, length_dict, hla_dict) except IOError: - #tmp = "" for locus in locus_list: - #tmp += "{}: 0 RPKM\n".format(locus) self.logger.debug("{}: 0 RPKM".format(locus)) #-----3rd iteration in case of at least one homozygous call----------- @@ -387,7 +385,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl finalAlleles[locus] = "no\tNA\tno\tNA" self.report_HLA_four_digit_genotype(finalAlleles, finaloutput4digit, locus_list) - #self.cleanup() + self.cleanup() def cleanup(self): @@ -570,7 +568,7 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r for locus in locus_list: allele_list = [float(x) for x in alleleVector[locus]] confidence_val = calculate_confidence(float(alleleCount[locus]), allele_list) - confidence_vals.append((maxkey[locus], confidence_val, 1 - confidence_val)) + confidence_vals.append((maxkey[locus], confidence_val)) fourDigitString = "" if medianflag: @@ -581,7 +579,7 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r self.logger.info("Determining 4 digits HLA type...") for locus in locus_list: if not allele_dict[locus] == "no": - fourDigitString += "{},{},".format(allele_dict[locus], fourdigits.determine_four_digits_main("{}.4digits{}{}".format(sam, locus, iteration), alleleVector[locus], hla_class, hla_classification, ambiguityfile)) + fourDigitString += "{},{},".format(allele_dict[locus], determine_four_digits_main("{}.4digits{}{}".format(sam, locus, iteration), alleleVector[locus], hla_class, hla_classification, ambiguityfile)) else: fourDigitString += "{},,,".format(allele_dict[locus]) @@ -599,7 +597,7 @@ def pred2file(self, entries, readspergroup_dict, output, allele_dict, HLAclass, outf.write("{}\t{}\t".format(locus, allele_dict[locus])) #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration outf.write("{}\t".format(np.median(list(readspergroup_dict[locus].values())))) - outf.write("{}\t{}\n".format(entries[index][0], entries[index][2])) + outf.write("{}\t{}\n".format(entries[index][0], entries[index][1])) index += 1 self.logger.info("The digital haplotype is written into {}".format(output)) diff --git a/test.py b/test.py index a7307ac..f2e7d5a 100644 --- a/test.py +++ b/test.py @@ -1,12 +1,13 @@ import unittest +from determine_hla import calculate_confidence from seq2HLA import get_best_allele_per_group class TestSeq2HLAMethods(unittest.TestCase): def test_get_best_allele_per_group(self): """ - Test that it can sum a list of integers + Test that best alleles are selected per locus. """ readspergroup = { 'A*01': 0, @@ -23,9 +24,39 @@ def test_get_best_allele_per_group(self): } result = get_best_allele_per_group(readspergroup, readcount) - print(result) + #print(result) self.assertEqual(result, ({'A*01': 122, 'A*02': 0, 'B*45': 116, 'C*04': 74, 'G*29': 0}, {'A*01': 'A*01:01:02', 'B*45': 'B*45:07', 'C*04': 'C*04:02'})) + def test_confidence_calculation_low_conf(self): + """ + Test to make sure that confidence calculation is correct. + """ + allele_max = 500 + allele_count_list = [500, 400, 300] + result = calculate_confidence(allele_max, allele_count_list) + print(result) + self.assertEqual(result, 0.40444488206853557) + + + def test_confidence_calculation_high_conf(self): + """ + Test to make sure that confidence calculation is correct. + """ + allele_max = 5000 + allele_count_list = [500, 400, 300] + result = calculate_confidence(allele_max, allele_count_list) + print(result) + self.assertEqual(result, 0.0) + + def test_confidence_calculation_too_few_elements(self): + """ + Test to make sure that too few elements in allele_count_list are not supported. + """ + allele_max = 500 + allele_count_list = [500] + result = calculate_confidence(allele_max, allele_count_list) + self.assertEqual(result, 1.0) + if __name__ == '__main__': unittest.main()