From 732932d16a53af8574a3ac10e49e17f5240b08e2 Mon Sep 17 00:00:00 2001
From: Patrick Sorn <patrick.sorn@tron-mainz.de>
Date: Thu, 24 Aug 2023 11:02:56 +0200
Subject: [PATCH 1/2] Added test sample output data; Simplified code; Updated
 bowtie version to allow native gzip input

---
 README.md                                     |  16 +-
 confidence.py                                 |   7 +-
 environment.yml                               |   2 +-
 fourdigits.py                                 |   4 +-
 run_test.sh                                   |   2 +-
 seq2HLA.py                                    | 139 ++++++++++--------
 test.py                                       |  31 ++++
 test_data/{ => input}/sample_R1.fastq.gz      | Bin
 test_data/{ => input}/sample_R2.fastq.gz      | Bin
 .../output/hla_1_classical.HLAgenotype2digits |   4 +
 .../output/hla_1_classical.HLAgenotype4digits |   4 +
 test_data/output/hla_1_classical.expression   |   3 +
 .../hla_1_nonclassical.HLAgenotype2digits     |  10 ++
 .../hla_1_nonclassical.HLAgenotype4digits     |  10 ++
 .../output/hla_1_nonclassical.expression      |   9 ++
 .../output/hla_2_classical.HLAgenotype2digits |   7 +
 .../output/hla_2_classical.HLAgenotype4digits |   7 +
 test_data/output/hla_2_classical.expression   |   6 +
 18 files changed, 183 insertions(+), 78 deletions(-)
 create mode 100644 test.py
 rename test_data/{ => input}/sample_R1.fastq.gz (100%)
 rename test_data/{ => input}/sample_R2.fastq.gz (100%)
 create mode 100644 test_data/output/hla_1_classical.HLAgenotype2digits
 create mode 100644 test_data/output/hla_1_classical.HLAgenotype4digits
 create mode 100644 test_data/output/hla_1_classical.expression
 create mode 100644 test_data/output/hla_1_nonclassical.HLAgenotype2digits
 create mode 100644 test_data/output/hla_1_nonclassical.HLAgenotype4digits
 create mode 100644 test_data/output/hla_1_nonclassical.expression
 create mode 100644 test_data/output/hla_2_classical.HLAgenotype2digits
 create mode 100644 test_data/output/hla_2_classical.HLAgenotype4digits
 create mode 100644 test_data/output/hla_2_classical.expression
diff --git a/README.md b/README.md
index e9fd170..0387007 100755
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ as input, uses a bowtie index comprising all HLA alleles and outputs the most li
   - biopython (>= 1.79)
   - numpy (>= 1.21.5)
   - scipy (>= 1.7.3)
-- bowtie (>= 1.0.0)
+- bowtie (>= 1.3.1)
 
 
 ## Installation
@@ -50,13 +50,13 @@ optional arguments:
 ## Output
 
 The results are output to stdout and to textfiles. Most important are:
-- <prefix>-ClassI.HLAgenotype2digits => 2 digit result of Class I
-- <prefix>-ClassII.HLAgenotype2digits => 2 digit result of Class II
-- <prefix>-ClassI.HLAgenotype4digits => 4 digit result of Class I
-- <prefix>-ClassII.HLAgenotype4digits => 4 digit result of Class II
-- <prefix>.ambiguity => reports typing ambuigities (more than one solution for an allele possible)
-- <prefix>-ClassI.expression => expression of Class I alleles
-- <prefix>-ClassII.expression => expression of Class II alleles
+- `hla_1_*.HLAgenotype2digits` - 2 digit result of Class I
+- `hla_2_classical.HLAgenotype2digits` - 2 digit result of Class II
+- `hla_1_*.HLAgenotype4digits` - 4 digit result of Class I
+- `hla_2_classical.HLAgenotype4digits` - 4 digit result of Class II
+- `hla.ambiguity` - reports typing ambuigities (more than one solution for an allele possible)
+- `hla_1_*.expression` - expression of Class I alleles
+- `hla_2_classical.expression` - expression of Class II alleles
 
 
 ## Version history
diff --git a/confidence.py b/confidence.py
index 3d5472b..8c52038 100644
--- a/confidence.py
+++ b/confidence.py
@@ -3,9 +3,14 @@
 import numpy as np
 import scipy.stats as st
 
+np.seterr("raise")
 
 def calculate_confidence(allele_count, allele_vector):
-    paril = 1.0 - st.norm.cdf(allele_count, np.mean(allele_vector), np.std(allele_vector, ddof = 1))
+    if not allele_vector or len(allele_vector) < 2:
+        return 1.0
+    mean = np.mean(allele_vector)
+    std = np.std(allele_vector, ddof = 1)
+    paril = 1.0 - st.norm.cdf(allele_count, mean, std)
     poutlier = st.binom.pmf(0, len(allele_vector), paril)
     return (1.0 - poutlier)
 
diff --git a/environment.yml b/environment.yml
index 35f9f44..eb45195 100644
--- a/environment.yml
+++ b/environment.yml
@@ -6,7 +6,7 @@ channels:
   - defaults
 dependencies:
   - biopython=1.79
-  - bowtie=1.0.0
+  - bowtie=1.3.1
   - numpy=1.21.5
   - scipy=1.7.3
   - python=3.7.16
diff --git a/fourdigits.py b/fourdigits.py
index 3c5c415..6c47f64 100755
--- a/fourdigits.py
+++ b/fourdigits.py
@@ -176,13 +176,13 @@ def calculate_outlier(top, bgVec):
     """Call R-script "commmand_fourdigit.R" to calculate the confidence of the top-scoring allele (four-digits)."""
 
     # Convert bgVec into md5
-    md5sum = hashlib.md5(bgVec.encode("utf-8")).hexdigest()
+    md5sum = hashlib.md5(','.join(str(x) for x in bgVec).encode("utf-8")).hexdigest()
     if not md5sum in top_alleles:
         top_alleles[md5sum] = {}
     if top in top_alleles[md5sum]:
         return top_alleles[md5sum][top]
     else:
-        allele_list = [float(x) for x in bgVec.rstrip(",").split(",")]
+        allele_list = [float(x) for x in bgVec]
         confidence = calculate_confidence(float(top), allele_list)
         top_alleles[md5sum][top] = confidence
         return confidence
diff --git a/run_test.sh b/run_test.sh
index b1daa0d..4707619 100644
--- a/run_test.sh
+++ b/run_test.sh
@@ -2,4 +2,4 @@
 
 rm -rf test_out/
 
-python seq2HLA.py -1 test_data/sample_R1.fastq.gz -2 test_data/sample_R2.fastq.gz -o test_out/ -p 12
+python seq2HLA.py -1 test_data/input/sample_R1.fastq.gz -2 test_data/input/sample_R2.fastq.gz -o test_out/ -p 12
diff --git a/seq2HLA.py b/seq2HLA.py
index 90df8f2..01bab62 100755
--- a/seq2HLA.py
+++ b/seq2HLA.py
@@ -12,7 +12,6 @@
 
 
 from argparse import ArgumentParser
-from glob import glob
 import gzip
 import linecache
 import logging
@@ -72,8 +71,12 @@ def get_P(locus, finaloutput):
     return linecache.getline(finaloutput, locus_dict[locus]).split("\t")[4][0:-1]
 
 
-#In case of ambiguous typings, the allele(s) with the best p-value (which is actually the smallest one, so the name of the function is misleading - sorry) is reported and thus the min(p) is returned
-def get_max_P(infile):
+
+def get_best_P(infile):
+    """
+    In case of ambiguous typings, the allele(s) with the best p-value
+    (which is actually the smallest one) is reported and thus the min(p) is returned.
+    """
     p = []
     with open(infile) as inf:
         for line in inf:
@@ -87,8 +90,30 @@ def get_max_P(infile):
         return "NA"
 
 
-#Open sam-file and count mappings for each allele 
+
+def get_best_allele_per_group(readspergroup, readcount):
+    """
+    For each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save
+    #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele
+    #ii) the number of reads mapping to the top-scoring allele => readspergroup
+    """
+
+    top_counts = {}
+    top_alleles = {}
+    for group in readspergroup:
+        top_counts[group] = readspergroup[group]
+    for key in readcount:
+        if readcount[key] > 0:
+            group = key.split(":")[0]
+            if top_counts[group] <= readcount[key]:
+                top_counts[group] = readcount[key]
+                top_alleles[group] = key
+    return (top_counts, top_alleles)
+
+
+
 def parse_mapping(hla_dict, sam, readcount):
+    """Open sam-file and count mappings for each allele."""
     with open(sam) as samhandle:
         for line in samhandle:
             if line[0] != '@':
@@ -229,6 +254,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
         output3 = os.path.join(self.working_dir, "{}.digitalhaplotype3".format(prefix))
         medianfile = os.path.join(self.working_dir, "{}.digitalhaplotype1".format(prefix))
         ambiguityfile = os.path.join(self.working_dir, "hla.ambiguity")
+        expressionfile = os.path.join(self.working_dir, "{}.expression".format(prefix))
         finaloutput = os.path.join(self.working_dir, "{}.HLAgenotype2digits".format(prefix))
         finaloutput4digit = os.path.join(self.working_dir, "{}.HLAgenotype4digits".format(prefix))
         
@@ -274,7 +300,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
         self.report_HLA_genotype(output1, output2, finaloutput, locus_list)
         self.logger.info("Calculating locus-specific expression...")
         try:
-            self.expression(aligned_1, sam1, bowtielog, output1, locus_list, length_dict, hla_dict)
+            self.expression(aligned_1, sam1, expressionfile, bowtielog, output1, locus_list, length_dict, hla_dict)
         except IOError:
             #tmp = ""
             for locus in locus_list:
@@ -324,7 +350,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
                 if fourDigit_solutions1[locus]:
                     if int(fourDigit_solutions1[locus]) > 1:
                         allele1 += "'"
-                    finalAlleles[locus] = "{}\t{}\t".format(allele1, get_max_P("{}.4digits{}1.solutions".format(sam1, locus)))
+                    finalAlleles[locus] = "{}\t{}\t".format(allele1, get_best_P("{}.4digits{}1.solutions".format(sam1, locus)))
                 else:
                     finalAlleles[locus] = "no\tNA\t"
 	
@@ -343,13 +369,13 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
                         allele3 = f3[alleleFourDigit_index]
                         if int(f3[alleleFourDigit_index+1]) > 1:
                             allele3 += "'"
-                        finalAlleles[locus] += "{}\t{}".format(allele3, get_max_P("{}.4digits{}2.solutions".format(sam3, locus)))
+                        finalAlleles[locus] += "{}\t{}".format(allele3, get_best_P("{}.4digits{}2.solutions".format(sam3, locus)))
                         alleleFourDigit_index += 3
                 else:
                     allele2 = fourdigits2[locus]
                     if int(fourDigit_solutions2[locus]) > 1:
                         allele2 += "'"
-                    finalAlleles[locus] += "{}\t{}".format(allele2, get_max_P("{}.4digits{}2.solutions".format(sam2, locus)))
+                    finalAlleles[locus] += "{}\t{}".format(allele2, get_best_P("{}.4digits{}2.solutions".format(sam2, locus)))
                     alleleFourDigit_index += 3
 	
             #output the 4-digit type (stdout and to file)
@@ -361,7 +387,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
                 finalAlleles[locus] = "no\tNA\tno\tNA"
             self.report_HLA_four_digit_genotype(finalAlleles, finaloutput4digit, locus_list)
 
-        self.cleanup()
+        #self.cleanup()
 
 
     def cleanup(self):
@@ -379,14 +405,16 @@ def cleanup(self):
             "hla_2_classical.HLAgenotype2digits",
             "hla_2_classical.HLAgenotype4digits",
             "hla.ambiguity",
-            "run.expression",
+            "hla_1_classical.expression",
+            "hla_1_nonclassical.expression",
+            "hla_2_classical.expression",
             "run.log"
         ]
 
         for filename in os.listdir(self.working_dir):
             if not filename in files_to_keep:
                 file_path = os.path.join(self.working_dir, filename)
-                self.logger.info("Removing {}.".format(file_path))
+                self.logger.debug("Removing {}.".format(file_path))
                 os.remove(file_path)
 
 
@@ -397,23 +425,13 @@ def mapping(self, sam, aligned_file, bowtielog, fq1, fq2, bowtiebuild, iteration
             self.logger.info("Skipping mapping step as already done...")
             return
         if iteration == 1:
-            if not self.gzipped:
-                mapping_cmd = "(bowtie -3 {} -S {} --al {} {} -1 {} -2 {} |  awk -F \'\\t\' '$3 != \"*\"{{ print $0 }}' > {}) 2> {}".format(self.trim3, mapopt, aligned_file, bowtiebuild, fq1, fq2, sam, bowtielog)
-            else:
-                mapping_cmd = "(bowtie -3 {} -S {} --al {} {} -1 <(zcat {}) -2 <(zcat {}) |  awk -F \'\\t\' '$3 != \"*\"{{ print $0 }}' > {}) 2> {}".format(self.trim3, mapopt, aligned_file, bowtiebuild, fq1, fq2, sam, bowtielog)
+            mapping_cmd = "(bowtie -3 {} -S {} --al {} -x {} -1 {} -2 {} |  awk -F \'\\t\' '$3 != \"*\"{{ print $0 }}' > {}) 2> {}".format(self.trim3, mapopt, aligned_file, bowtiebuild, fq1, fq2, sam, bowtielog)
         elif iteration == 2:
-            mapping_cmd = "bowtie -3 {} -S {} {} -1 {} -2 {} {}".format(self.trim3, mapopt, bowtiebuild, fq1, fq2, sam)
+            mapping_cmd = "bowtie -3 {} -S {} -x {} -1 {} -2 {} {}".format(self.trim3, mapopt, bowtiebuild, fq1, fq2, sam)
         #execute bowtie
         self.logger.info("CMD: {}".format(mapping_cmd))
         process_mapping = subprocess.Popen(['bash', '-c', mapping_cmd], stdout=subprocess.PIPE)
         out, err = process_mapping.communicate()
-	
-        if iteration == 1:
-            #print alignment stats
-            printcommand = "cat {}".format(bowtielog)
-            printcommand_proc = subprocess.Popen(['bash', '-c', printcommand], stdout=subprocess.PIPE)
-            out, err = printcommand_proc.communicate()
-            self.logger.info(out)
 
     
     def create_ref_dict(self, hlafasta, class1_list):
@@ -465,31 +483,22 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r
             readspergroup_dict[locus] = {}
             fourdigits_dict[locus] = {}
             fourdigits_sorted_dict[locus] = {}
-            alleleVector[locus] = ""
+            alleleVector[locus] = []
 
-        maxallelepergroup = {}
-	
-        #for each allele, to which at least 1 read map, find the allele which has the most reads within a group (2-digit-level, e.g. A*02") and save
-        #i) the key of this allele as ambassador for the group => maxallelepergroup holds for each group the top-scoring allele
-        #ii) the number of reads mapping to the top-scoring allele => readspergroup
-        for key in readcount:
-            if readcount[key] > 0:
-                group = key.split(":")[0]
-                if readspergroup[group] <= readcount[key]:
-                    readspergroup[group] = readcount[key]
-                    maxallelepergroup[group] = key
-
-        readspergrouphandle = open("{}.readspergrouphandle".format(sam), "a")
-        for group in readspergroup:
-            readspergrouphandle.write("{}\t{}\n".format(group, readspergroup[group]))
-        readspergrouphandle.close()
+
+        (top_counts, top_alleles) = get_best_allele_per_group(readspergroup, readcount)
+
+
+        with open("{}.readspergrouphandle".format(sam), "a") as outf:
+            for group in top_counts:
+                outf.write("{}\t{}\n".format(group, top_counts[group]))
         #consider all A-,B-, and C-groups seperately
         #readspergroup<A|B|C>list = list of all reads mapping to the top-scoring groups minus the decision threshold (which is 0 in the first iteration)
         #readspergroup<A|B|C> = contains the same entries as the list, but the entries are uniquely accessible via the group-key (e.g. B*27)
-        for key in readspergroup:
+        for key in top_counts:
             locus = key.split("*")[0]
-            readspergroup_dict_list[locus].append(readspergroup[key] - float(medians[locus]))
-            readspergroup_dict[locus][key] = readspergroup[key] - float(medians[locus])
+            readspergroup_dict_list[locus].append(top_counts[key] - float(medians[locus]))
+            readspergroup_dict[locus][key] = top_counts[key] - float(medians[locus])
 	
         #Determine top-scoring group of the whole locus (A,B,C) and store it
         #maxkey<A,B,C> = group (e.g. A*02) with the most reads
@@ -499,7 +508,7 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r
             if len(readspergroup_dict_list[locus]) > 0:
                 maxkey[locus] = max(readspergroup_dict[locus], key=lambda a:readspergroup_dict[locus].get(a))
                 if readspergroup_dict[locus][maxkey[locus]] > 0:
-                    maxAlleles[locus] = maxallelepergroup[maxkey[locus]]
+                    maxAlleles[locus] = top_alleles[maxkey[locus]]
                     allele_dict[locus] = maxkey[locus]
                 else:
                     allele_dict[locus] = "no"
@@ -519,10 +528,9 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r
             else:
                 iteration = 1
 	
-            fourdigit_writehandle = open("{}.4digits{}{}".format(sam, locus, iteration), "w")
-            for key in fourdigits_sorted_dict[locus]:
-                fourdigit_writehandle.write("{}: {}\n".format(key[0], key[1]))
-            fourdigit_writehandle.close()
+            with open("{}.4digits{}{}".format(sam, locus, iteration), "w") as outf:
+                for key in fourdigits_sorted_dict[locus]:
+                    outf.write("{}: {}\n".format(key[0], key[1]))
 
             readspergrouplocus = sorted(readspergroup_dict[locus].items(), key=itemgetter(1), reverse=True)
 
@@ -530,36 +538,37 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r
                 #in the 2nd iteration: 
                 #1.) DO NOT remove the top-scoring group from the set <a,b,c>vec as this is more strict when calculating the probability of the top scoring group being an outlier
                 #The strings <a,b,c>vec are used by the R-script to calculate the probability of the top scoring group being an outlier
-                for key in maxallelepergroup:
+
+                for key in top_alleles:
                     if key.split("*")[0] == locus:
-                        alleleVector[locus] += "{},".format(readcount[maxallelepergroup[key]])
+                        alleleVector[locus].append(readcount[top_alleles[key]])
                 #2.) Add the decision thresholds to the sets <a,b,c>vec, as this enables measuring the distance of the top-scoring group to this distance
                 if allele_dict[locus] == "no":
-                    alleleVector[locus] += str(medians[locus])
+                    alleleVector[locus].append(int(float(medians[locus])))
                 #3.) In case of, e.g. a loss of whole HLA-locus (A,B,C), <a,b,c>vec only contain median[<0|1|2>].
                 #To avoid errors in the R-Script, set to 0
-                if alleleVector[locus] == "" or alleleVector[locus] == medians[locus]:
-                    alleleVector[locus] = "0"
-                    alleleCount[locus] = "0"
+                if not alleleVector[locus] or alleleVector[locus][0] == int(float(medians[locus])):
+                    alleleVector[locus] = []
+                    alleleCount[locus] = 0
                 else:
-                    alleleCount[locus] = str(readcount[maxallelepergroup[maxkey[locus]]])
+                    alleleCount[locus] = readcount[top_alleles[maxkey[locus]]]
 
             else:
                 #in the 1st iteration: remove the top-scoring group from the set <a,b,c>vec as this increases the certainty when calculating the probability of the top scoring group being an outlier
-                for key in maxallelepergroup:
+                for key in top_alleles:
                     if key.split("*")[0] == locus:
                         if key != maxkey[locus]:
-                            alleleVector[locus] += "{},".format(readcount[maxallelepergroup[key]])
+                            alleleVector[locus].append(readcount[top_alleles[key]])
                 #2.) DO NOT add the decision thresholds to the sets <a,b,c>vec
-                if alleleVector[locus] == "":
-                    alleleVector[locus] = "0"
-                    alleleCount[locus] = "0"
+                if not alleleVector[locus]:
+                    alleleCount[locus] = 0
                 else:
-                    alleleCount[locus] = str(readcount[maxallelepergroup[maxkey[locus]]])
+                    alleleCount[locus] = readcount[top_alleles[maxkey[locus]]]
+
 
         confidence_vals = []
         for locus in locus_list:
-            allele_list = [float(x) for x in alleleVector[locus].rstrip(",").split(",")]
+            allele_list = [float(x) for x in alleleVector[locus]]
             confidence_val = calculate_confidence(float(alleleCount[locus]), allele_list)
             confidence_vals.append((maxkey[locus], confidence_val, 1 - confidence_val))
 
@@ -572,7 +581,7 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r
         self.logger.info("Determining 4 digits HLA type...")
         for locus in locus_list:
             if not allele_dict[locus] == "no":
-                fourDigitString += "{},{},".format(allele_dict[locus], fourdigits.determine_four_digits_main("{}.4digits{}{}".format(sam, locus, iteration), alleleVector[locus].rstrip(","), hla_class, hla_classification, ambiguityfile))
+                fourDigitString += "{},{},".format(allele_dict[locus], fourdigits.determine_four_digits_main("{}.4digits{}{}".format(sam, locus, iteration), alleleVector[locus], hla_class, hla_classification, ambiguityfile))
             else:
                 fourDigitString += "{},,,".format(allele_dict[locus])
 
@@ -717,7 +726,7 @@ def report_HLA_four_digit_genotype(self, finalAlleles, finaloutput4digit, locus_
                 self.logger.info("{}\t{}".format(locus, finalAlleles[locus]))
 
 
-    def expression(self, aligned, sam, logfile, alleles_in, locus_list, length_dict, hla_dict):
+    def expression(self, aligned, sam, output, logfile, alleles_in, locus_list, length_dict, hla_dict):
         """Calculate locus-specific expression."""
         totalreads = float(linecache.getline(logfile, 1).split(':')[1])
 
@@ -756,7 +765,7 @@ def expression(self, aligned, sam, logfile, alleles_in, locus_list, length_dict,
                     count[locus] += float(1.0 / float(n))
 	
         #Calculate RPKM and print expression values for each locus to stdout
-        with open(os.path.join(self.working_dir, "hla.expression"), 'w') as outf:
+        with open(output, 'w') as outf:
             for locus in count:
                 rpkm = float((1000.0 / length_dict[locus])) * float((1000000.0 / totalreads)) * count[locus]
                 self.logger.info("{}: {} RPKM".format(locus, rpkm))
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..a7307ac
--- /dev/null
+++ b/test.py
@@ -0,0 +1,31 @@
+import unittest
+
+from seq2HLA import get_best_allele_per_group
+
+
+class TestSeq2HLAMethods(unittest.TestCase):
+    def test_get_best_allele_per_group(self):
+        """
+        Test that it can sum a list of integers
+        """
+        readspergroup = {
+            'A*01': 0,
+            'A*02': 0,
+            'B*45': 0,
+            'C*04': 0,
+            'G*29': 0
+        }
+        readcount = {
+            'A*01:05': 90,
+            'A*01:01:02': 122,
+            'B*45:07': 116,
+            'C*04:02': 74
+        }
+        
+        result = get_best_allele_per_group(readspergroup, readcount)
+        print(result)
+        self.assertEqual(result, ({'A*01': 122, 'A*02': 0, 'B*45': 116, 'C*04': 74, 'G*29': 0}, {'A*01': 'A*01:01:02', 'B*45': 'B*45:07', 'C*04': 'C*04:02'}))
+
+        
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test_data/sample_R1.fastq.gz b/test_data/input/sample_R1.fastq.gz
similarity index 100%
rename from test_data/sample_R1.fastq.gz
rename to test_data/input/sample_R1.fastq.gz
diff --git a/test_data/sample_R2.fastq.gz b/test_data/input/sample_R2.fastq.gz
similarity index 100%
rename from test_data/sample_R2.fastq.gz
rename to test_data/input/sample_R2.fastq.gz
diff --git a/test_data/output/hla_1_classical.HLAgenotype2digits b/test_data/output/hla_1_classical.HLAgenotype2digits
new file mode 100644
index 0000000..d903c0f
--- /dev/null
+++ b/test_data/output/hla_1_classical.HLAgenotype2digits
@@ -0,0 +1,4 @@
+#Locus	Allele 1	Confidence	Allele 2	Confidence
+A	A*24	0.0009938365	A*02	0.0009960293
+B	B*35	0.02892281	B*15	0.0008318822
+C	C*04	3.766758e-07	hoz("C*03")	0.0651378
diff --git a/test_data/output/hla_1_classical.HLAgenotype4digits b/test_data/output/hla_1_classical.HLAgenotype4digits
new file mode 100644
index 0000000..e351b6b
--- /dev/null
+++ b/test_data/output/hla_1_classical.HLAgenotype4digits
@@ -0,0 +1,4 @@
+#Locus	Allele 1	Confidence	Allele 2	Confidence
+A	A*24:02	0.0009938365	A*02:01	0.001056695
+B	B*35:14'	0.02892281	B*15:01'	0.0008609877
+C	C*04:01	3.766758e-07	C*04:01	0.0651378
diff --git a/test_data/output/hla_1_classical.expression b/test_data/output/hla_1_classical.expression
new file mode 100644
index 0000000..cbff77f
--- /dev/null
+++ b/test_data/output/hla_1_classical.expression
@@ -0,0 +1,3 @@
+A: 153474.83 RPKM
+C: 299860.82 RPKM
+B: 223649.0 RPKM
diff --git a/test_data/output/hla_1_nonclassical.HLAgenotype2digits b/test_data/output/hla_1_nonclassical.HLAgenotype2digits
new file mode 100644
index 0000000..c209090
--- /dev/null
+++ b/test_data/output/hla_1_nonclassical.HLAgenotype2digits
@@ -0,0 +1,10 @@
+#Locus	Allele 1	Confidence	Allele 2	Confidence
+E	E*01	NA	hoz("E*01")	NA
+F	F*01	NA	hoz("F*01")	NA
+G	no	NA	hoz("G*01")	NA
+H	H*02	2.220446e-16	hoz("H*01")	0.0259602
+J	J*01	NA	hoz("J*01")	NA
+K	K*01	NA	hoz("K*01")	NA
+L	L*01	NA	hoz("L*01")	NA
+P	no	NA	hoz("P*02")	NA
+V	no	NA	hoz("V*01")	NA
diff --git a/test_data/output/hla_1_nonclassical.HLAgenotype4digits b/test_data/output/hla_1_nonclassical.HLAgenotype4digits
new file mode 100644
index 0000000..faef3a4
--- /dev/null
+++ b/test_data/output/hla_1_nonclassical.HLAgenotype4digits
@@ -0,0 +1,10 @@
+#Locus	Allele 1	Confidence	Allele 2	Confidence
+E	E*01:03	NA	E*01:03	NA
+F	F*01:01	NA	F*01:01	NA
+G	no	NA	no	NA
+H	H*02:06	2.220446e-16	H*02:06	0.0259602
+J	J*01:01	NA	J*01:01	NA
+K	K*01:01	NA	K*01:01	NA
+L	L*01:02	NA	L*01:02	NA
+P	no	NA	no	NA
+V	no	NA	no	NA
diff --git a/test_data/output/hla_1_nonclassical.expression b/test_data/output/hla_1_nonclassical.expression
new file mode 100644
index 0000000..d2b299c
--- /dev/null
+++ b/test_data/output/hla_1_nonclassical.expression
@@ -0,0 +1,9 @@
+E: 248284.76 RPKM
+G: 0.0 RPKM
+F: 43661.47 RPKM
+H: 55744.05 RPKM
+K: 2619.91 RPKM
+J: 854.15 RPKM
+L: 853.37 RPKM
+P: 0.0 RPKM
+V: 0.0 RPKM
diff --git a/test_data/output/hla_2_classical.HLAgenotype2digits b/test_data/output/hla_2_classical.HLAgenotype2digits
new file mode 100644
index 0000000..9976684
--- /dev/null
+++ b/test_data/output/hla_2_classical.HLAgenotype2digits
@@ -0,0 +1,7 @@
+#Locus	Allele 1	Confidence	Allele 2	Confidence
+DQA1	DQA1*01	0	hoz("DQA1*04")	NA
+DQB1	DQB1*05	0.3246214	DQB1*06	0.3701177
+DRB1	DRB1*01	0.005774051	DRB1*15	0.02749135
+DRA	DRA*01	NA	hoz("DRA*01")	NA
+DPA1	DPA1*01	0.1600902	hoz("DPA1*02")	0.0574801
+DPB1	DPB1*02	8.582919e-10	DPB1*04	0.0109329
diff --git a/test_data/output/hla_2_classical.HLAgenotype4digits b/test_data/output/hla_2_classical.HLAgenotype4digits
new file mode 100644
index 0000000..e4740ab
--- /dev/null
+++ b/test_data/output/hla_2_classical.HLAgenotype4digits
@@ -0,0 +1,7 @@
+#Locus	Allele 1	Confidence	Allele 2	Confidence
+DQA1	DQA1*01:02	0.0	DQA1*01:02	NA
+DQB1	DQB1*05:01	0.3246214	DQB1*06:02'	0.3701177
+DRB1	DRB1*01:01	0.005774051	DRB1*15:01	0.02749135
+DRA	DRA*01:01	NA	DRA*01:01	NA
+DPA1	DPA1*01:03	0.1600902	DPA1*01:03	0.0574801
+DPB1	DPB1*02:01'	8.582919e-10	DPB1*04:01	0.0109329
diff --git a/test_data/output/hla_2_classical.expression b/test_data/output/hla_2_classical.expression
new file mode 100644
index 0000000..8775696
--- /dev/null
+++ b/test_data/output/hla_2_classical.expression
@@ -0,0 +1,6 @@
+DQB1: 9352.96 RPKM
+DQA1: 8557.96 RPKM
+DRB1: 45209.69 RPKM
+DPB1: 17124.51 RPKM
+DRA: 75562.7 RPKM
+DPA1: 37577.18 RPKM

From a7dd26db6334211f71809f2670ab7fa26129400c Mon Sep 17 00:00:00 2001
From: Patrick Sorn <patrick.sorn@tron-mainz.de>
Date: Thu, 24 Aug 2023 12:35:39 +0200
Subject: [PATCH 2/2] Restructured code and added test

---
 confidence.py                     | 105 ------------------------------
 fourdigits.py => determine_hla.py |  40 ++++++++----
 seq2HLA.py                        |  24 ++++---
 test.py                           |  35 +++++++++-
 4 files changed, 70 insertions(+), 134 deletions(-)
 delete mode 100644 confidence.py
 rename fourdigits.py => determine_hla.py (90%)

diff --git a/confidence.py b/confidence.py
deleted file mode 100644
index 8c52038..0000000
--- a/confidence.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/usr/bin/env python
-
-import numpy as np
-import scipy.stats as st
-
-np.seterr("raise")
-
-def calculate_confidence(allele_count, allele_vector):
-    if not allele_vector or len(allele_vector) < 2:
-        return 1.0
-    mean = np.mean(allele_vector)
-    std = np.std(allele_vector, ddof = 1)
-    paril = 1.0 - st.norm.cdf(allele_count, mean, std)
-    poutlier = st.binom.pmf(0, len(allele_vector), paril)
-    return (1.0 - poutlier)
-
-
-def main():
-    val = [8498, 6256, 25164]
-    types = ["A*24", "B*35", "C*04"]
-    val_list = [
-        [
-            1244,
-            5940,
-            1722,
-            660,
-            5770,
-            222,
-            688,
-            374,
-            544,
-            1890,
-            636,
-            4272,
-            192,
-            132,
-            212,
-            212,
-            1150,
-            2426,
-            596,
-            160
-        ],
-        [
-            2718,
-            782,
-            1754,
-            1284,
-            5832,
-            2436,
-            1088,
-            654,
-            630,
-            924,
-            2086,
-            420,
-            344,
-            1316,
-            1314,
-            4534,
-            310,
-            3386,
-            1664,
-            896,
-            2002,
-            1372,
-            4500,
-            3398,
-            1374,
-            3508,
-            2194,
-            3488,
-            1192,
-            300,
-            436,
-            2846,
-            78,
-            276,
-            322
-        ],
-        [
-            6002,
-            2068,
-            2332,
-            6386,
-            3518,
-            3078,
-            5418,
-            3028,
-            4836,
-            2232,
-            2034,
-            2096,
-            16068
-        ]
-    ]
-
-    print("['[1] A*24', '[1] 8498', '[1] 1452.1', '[1] 1810.365', '[1] 0.0009938365', '[1] B*35', '[1] 6256', '[1] 1761.657', '[1] 1430.283', '[1] 0.02892281', '[1] C*04', '[1] 25164', '[1] 4545.846', '[1] 3800.572', '[1] 3.766758e-07', '']")
-
-    
-    print(calculate_confidence(val[0], val_list[0]))
-              
-
-if __name__ == "__main__":
-    main()
diff --git a/fourdigits.py b/determine_hla.py
similarity index 90%
rename from fourdigits.py
rename to determine_hla.py
index 6c47f64..5de807e 100755
--- a/fourdigits.py
+++ b/determine_hla.py
@@ -5,17 +5,29 @@
 
 
 import numpy as np
+import scipy.stats as st
 
-from confidence import calculate_confidence
 import config as cfg
 
-"""
-Determination of the 4-digit HLA type based on the 2-digit determination by seq2HLA.py
-"""
 
 top_alleles = {}
 
+np.seterr("raise")
+
+def calculate_confidence(allele_max, allele_count_list):
+    if not allele_count_list or len(allele_count_list) < 2:
+        return 1.0
+    mean = np.mean(allele_count_list)
+    std = np.std(allele_count_list, ddof = 1)
+    paril = 1.0 - st.norm.cdf(allele_max, mean, std)
+    poutlier = st.binom.pmf(0, len(allele_count_list), paril)
+    return (1.0 - poutlier)
+
+
 def determine_four_digits_main(infile, bgVec, hla_class, hla_classification, ambiguityfile):
+    """
+    Determination of the 4-digit HLA type based on the 2-digit determination by seq2HLA.py
+    """
     if hla_classification == "nonclassical":
         result = determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile)
     else:
@@ -90,7 +102,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
         if 0.0 < confidence < 0.001:
             for item in result_dict:
                 if result_dict[item] == top:
-                    writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec)))
+                    writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec)))
                     repeat = False
                     mostProbableAllele = str(item.split(":")[0]) + ":" + str(item.split(":")[1])
                     numbersolutions = 1
@@ -113,7 +125,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
             count = 0
             for item in result_dict:
                 if result_dict[item] >= perc:
-                    writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec)))
+                    writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec)))
                     repeat = False
                     item_split = item.split(":")
                     mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
@@ -133,7 +145,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
                     #for item in result_dict:
                     for item in result_dict2:
                         item_split = item.split(":")
-                        writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec)))
+                        writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec)))
                         repeat = False
                         mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
                         numbersolutions = 1
@@ -157,7 +169,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
                         ambiguity_handle.write("#Ambiguity:\n")
                         ambiguity_handle.write("#Based on the RNA-Seq reads and the dbMHC table, the following 4-digits alleles are possible:\n")
                         for ambi4DigitAllele in ambiguity_dict:
-                            ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, 1 - calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec)))
+                            ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec)))
 				
                         ambiguity_handle.write("#However, by taking into account the read data, the most probable 4-digit allele is:\n")
                         ambiguity_handle.write("{}\n\n".format(mostProbableAllele))
@@ -167,7 +179,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
                             item_split = item.split(":")
                             item4digit = "{}:{}".format(item_split[0], item_split[1])
                             if result_dict2[item] == max:
-                                writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec)))
+                                writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec)))
                                 repeat = False
     writehandle.close()
     return "{},{}".format(mostProbableAllele, numbersolutions)
@@ -221,7 +233,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
         if 0.0 < confidence < 0.001:
             for item in result_dict:
                 if result_dict[item] == top:
-                    writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec)))
+                    writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec)))
                     repeat = False
                     item_split = item.split(":")
                     mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
@@ -245,7 +257,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
             count = 0
             for item in result_dict:
                 if result_dict[item] >= perc:
-                    writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec)))
+                    writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec)))
                     repeat = False
                     item_split = item.split(":")
                     mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
@@ -264,7 +276,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
             if len(set(result)) == 1:
                 #for item in result_dict:
                 for item in result_dict2:
-                    writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec)))
+                    writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec)))
                     repeat = False
                     item_split = item.split(":")
                     mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
@@ -294,7 +306,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
                         ambiguity_handle.write("#Ambiguity:\n")
                         ambiguity_handle.write("#Based on the RNA-Seq reads and the dbMHC table, the following 4-digits alleles are possible:\n")
                         for ambi4DigitAllele in ambiguity_dict:
-                            ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, 1 - calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec)))
+                            ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec)))
                             
                         ambiguity_handle.write("#However, by taking into account the read data, the most probable 4-digit allele is:\n")
                         ambiguity_handle.write("{}\n\n".format(mostProbableAllele))
@@ -306,7 +318,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
                             #if np.mean(dbmhc_prob[item4digit]) == max:
                             #if meanPanPopFreq(dbmhc_prob[item4digit]) == max:
                             if result_dict2[item] == max:
-                                writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec)))
+                                writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec)))
                                 repeat = False
     writehandle.close()			
     return "{},{}".format(mostProbableAllele, numbersolutions)
diff --git a/seq2HLA.py b/seq2HLA.py
index 01bab62..c0b58d6 100755
--- a/seq2HLA.py
+++ b/seq2HLA.py
@@ -26,8 +26,9 @@
 
 # Internal modules
 import config as cfg
-from confidence import calculate_confidence
-import fourdigits
+
+from determine_hla import calculate_confidence
+from determine_hla import determine_four_digits_main
 from version import version
 
 
@@ -55,8 +56,11 @@ def get_read_len(fq_file, gzipped):
     return read_len
 
 
-#Return the p value of a prediction, which is stored in the intermediate textfile
+
 def get_P(locus, finaloutput):
+    """
+    Return the p value of a prediction, which is stored in the intermediate textfile.
+    """
     locus_dict = {
         "A": 2, "DQA1": 2, "E": 2,
         "B": 3, "DQB1": 3, "F": 3,
@@ -214,7 +218,6 @@ def run(self):
         )
 
 
-    #---------------Class I-------------------------------
     def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_classification, length_dict):
         """This method calls HLA types for class I."""
         twodigits1 = {}
@@ -227,7 +230,6 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
         twodigits3 = {}
         finalAlleles = {}
 
-        #-------1st iteration-----------------------------------
         if hla_class == 1:
             self.logger.info("----------HLA class I------------")
             if hla_classification == "classical":
@@ -239,7 +241,6 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
         prefix = "hla_{}_{}".format(hla_class, hla_classification)
 
         
-        #iteration = 1
         fq1_2 = os.path.join(self.working_dir, "{}_iteration_2_1.fq".format(prefix))
         fq2_2 = os.path.join(self.working_dir, "{}_iteration_2_2.fq".format(prefix))
         sam1 = os.path.join(self.working_dir, "{}_iteration_1.sam".format(prefix))
@@ -281,7 +282,6 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
         self.logger.info("Starting 2nd iteration:")
         self.logger.info("Mapping reads...")
         medians = {}
-        #iteration = 2
         
         self.mapping(sam2, aligned, bowtielog, fq1_2, fq2_2, bowtiebuild, 2, mapopt)
 
@@ -302,9 +302,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
         try:
             self.expression(aligned_1, sam1, expressionfile, bowtielog, output1, locus_list, length_dict, hla_dict)
         except IOError:
-            #tmp = ""
             for locus in locus_list:
-                #tmp += "{}: 0 RPKM\n".format(locus)
                 self.logger.debug("{}: 0 RPKM".format(locus))
 	
         #-----3rd iteration in case of at least one homozygous call-----------
@@ -387,7 +385,7 @@ def call_HLA(self, bowtiebuild, hla1fasta, mapopt, locus_list, hla_class, hla_cl
                 finalAlleles[locus] = "no\tNA\tno\tNA"
             self.report_HLA_four_digit_genotype(finalAlleles, finaloutput4digit, locus_list)
 
-        #self.cleanup()
+        self.cleanup()
 
 
     def cleanup(self):
@@ -570,7 +568,7 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r
         for locus in locus_list:
             allele_list = [float(x) for x in alleleVector[locus]]
             confidence_val = calculate_confidence(float(alleleCount[locus]), allele_list)
-            confidence_vals.append((maxkey[locus], confidence_val, 1 - confidence_val))
+            confidence_vals.append((maxkey[locus], confidence_val))
 
         fourDigitString = ""
         if medianflag:
@@ -581,7 +579,7 @@ def predict_HLA(self, sam, medians, output, medianflag, locus_list, hla_class, r
         self.logger.info("Determining 4 digits HLA type...")
         for locus in locus_list:
             if not allele_dict[locus] == "no":
-                fourDigitString += "{},{},".format(allele_dict[locus], fourdigits.determine_four_digits_main("{}.4digits{}{}".format(sam, locus, iteration), alleleVector[locus], hla_class, hla_classification, ambiguityfile))
+                fourDigitString += "{},{},".format(allele_dict[locus], determine_four_digits_main("{}.4digits{}{}".format(sam, locus, iteration), alleleVector[locus], hla_class, hla_classification, ambiguityfile))
             else:
                 fourDigitString += "{},,,".format(allele_dict[locus])
 
@@ -599,7 +597,7 @@ def pred2file(self, entries, readspergroup_dict, output, allele_dict, HLAclass,
                 outf.write("{}\t{}\t".format(locus, allele_dict[locus]))
                 #compute the decision threshold (median) for homozygosity vs. heterozygosity for the second iteration
                 outf.write("{}\t".format(np.median(list(readspergroup_dict[locus].values()))))
-                outf.write("{}\t{}\n".format(entries[index][0], entries[index][2]))
+                outf.write("{}\t{}\n".format(entries[index][0], entries[index][1]))
                 index += 1
 
         self.logger.info("The digital haplotype is written into {}".format(output))
diff --git a/test.py b/test.py
index a7307ac..f2e7d5a 100644
--- a/test.py
+++ b/test.py
@@ -1,12 +1,13 @@
 import unittest
 
+from determine_hla import calculate_confidence
 from seq2HLA import get_best_allele_per_group
 
 
 class TestSeq2HLAMethods(unittest.TestCase):
     def test_get_best_allele_per_group(self):
         """
-        Test that it can sum a list of integers
+        Test that best alleles are selected per locus.
         """
         readspergroup = {
             'A*01': 0,
@@ -23,9 +24,39 @@ def test_get_best_allele_per_group(self):
         }
         
         result = get_best_allele_per_group(readspergroup, readcount)
-        print(result)
+        #print(result)
         self.assertEqual(result, ({'A*01': 122, 'A*02': 0, 'B*45': 116, 'C*04': 74, 'G*29': 0}, {'A*01': 'A*01:01:02', 'B*45': 'B*45:07', 'C*04': 'C*04:02'}))
 
         
+    def test_confidence_calculation_low_conf(self):
+        """
+        Test to make sure that confidence calculation is correct.
+        """
+        allele_max = 500
+        allele_count_list = [500, 400, 300]
+        result = calculate_confidence(allele_max, allele_count_list)
+        print(result)
+        self.assertEqual(result, 0.40444488206853557)
+
+
+    def test_confidence_calculation_high_conf(self):
+        """
+        Test to make sure that confidence calculation is correct.
+        """
+        allele_max = 5000
+        allele_count_list = [500, 400, 300]
+        result = calculate_confidence(allele_max, allele_count_list)
+        print(result)
+        self.assertEqual(result, 0.0)
+
+    def test_confidence_calculation_too_few_elements(self):
+        """
+        Test to make sure that too few elements in allele_count_list are not supported.
+        """
+        allele_max = 500
+        allele_count_list = [500]
+        result = calculate_confidence(allele_max, allele_count_list)
+        self.assertEqual(result, 1.0)
+        
 if __name__ == '__main__':
     unittest.main()