Skip to content

Add unit/integration tests #14

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ as input, uses a bowtie index comprising all HLA alleles and outputs the most li
- biopython (>= 1.79)
- numpy (>= 1.21.5)
- scipy (>= 1.7.3)
- bowtie (>= 1.0.0)
- bowtie (>= 1.3.1)


## Installation
Expand Down Expand Up @@ -50,13 +50,13 @@ optional arguments:
## Output

The results are output to stdout and to textfiles. Most important are:
- <prefix>-ClassI.HLAgenotype2digits => 2 digit result of Class I
- <prefix>-ClassII.HLAgenotype2digits => 2 digit result of Class II
- <prefix>-ClassI.HLAgenotype4digits => 4 digit result of Class I
- <prefix>-ClassII.HLAgenotype4digits => 4 digit result of Class II
- <prefix>.ambiguity => reports typing ambuigities (more than one solution for an allele possible)
- <prefix>-ClassI.expression => expression of Class I alleles
- <prefix>-ClassII.expression => expression of Class II alleles
- `hla_1_*.HLAgenotype2digits` - 2 digit result of Class I
- `hla_2_classical.HLAgenotype2digits` - 2 digit result of Class II
- `hla_1_*.HLAgenotype4digits` - 4 digit result of Class I
- `hla_2_classical.HLAgenotype4digits` - 4 digit result of Class II
- `hla.ambiguity` - reports typing ambuigities (more than one solution for an allele possible)
- `hla_1_*.expression` - expression of Class I alleles
- `hla_2_classical.expression` - expression of Class II alleles


## Version history
Expand Down
100 changes: 0 additions & 100 deletions confidence.py

This file was deleted.

44 changes: 28 additions & 16 deletions fourdigits.py → determine_hla.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,29 @@


import numpy as np
import scipy.stats as st

from confidence import calculate_confidence
import config as cfg

"""
Determination of the 4-digit HLA type based on the 2-digit determination by seq2HLA.py
"""

top_alleles = {}

np.seterr("raise")

def calculate_confidence(allele_max, allele_count_list):
if not allele_count_list or len(allele_count_list) < 2:
return 1.0
mean = np.mean(allele_count_list)
std = np.std(allele_count_list, ddof = 1)
paril = 1.0 - st.norm.cdf(allele_max, mean, std)
poutlier = st.binom.pmf(0, len(allele_count_list), paril)
return (1.0 - poutlier)


def determine_four_digits_main(infile, bgVec, hla_class, hla_classification, ambiguityfile):
"""
Determination of the 4-digit HLA type based on the 2-digit determination by seq2HLA.py
"""
if hla_classification == "nonclassical":
result = determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile)
else:
Expand Down Expand Up @@ -90,7 +102,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
if 0.0 < confidence < 0.001:
for item in result_dict:
if result_dict[item] == top:
writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec)))
writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec)))
repeat = False
mostProbableAllele = str(item.split(":")[0]) + ":" + str(item.split(":")[1])
numbersolutions = 1
Expand All @@ -113,7 +125,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
count = 0
for item in result_dict:
if result_dict[item] >= perc:
writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec)))
writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec)))
repeat = False
item_split = item.split(":")
mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
Expand All @@ -133,7 +145,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
#for item in result_dict:
for item in result_dict2:
item_split = item.split(":")
writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec)))
writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec)))
repeat = False
mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
numbersolutions = 1
Expand All @@ -157,7 +169,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
ambiguity_handle.write("#Ambiguity:\n")
ambiguity_handle.write("#Based on the RNA-Seq reads and the dbMHC table, the following 4-digits alleles are possible:\n")
for ambi4DigitAllele in ambiguity_dict:
ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, 1 - calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec)))
ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec)))

ambiguity_handle.write("#However, by taking into account the read data, the most probable 4-digit allele is:\n")
ambiguity_handle.write("{}\n\n".format(mostProbableAllele))
Expand All @@ -167,7 +179,7 @@ def determine_four_digits(dbmhc, infile, bgVec, ambiguityfile):
item_split = item.split(":")
item4digit = "{}:{}".format(item_split[0], item_split[1])
if result_dict2[item] == max:
writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec)))
writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec)))
repeat = False
writehandle.close()
return "{},{}".format(mostProbableAllele, numbersolutions)
Expand All @@ -176,13 +188,13 @@ def calculate_outlier(top, bgVec):
"""Call R-script "commmand_fourdigit.R" to calculate the confidence of the top-scoring allele (four-digits)."""

# Convert bgVec into md5
md5sum = hashlib.md5(bgVec.encode("utf-8")).hexdigest()
md5sum = hashlib.md5(','.join(str(x) for x in bgVec).encode("utf-8")).hexdigest()
if not md5sum in top_alleles:
top_alleles[md5sum] = {}
if top in top_alleles[md5sum]:
return top_alleles[md5sum][top]
else:
allele_list = [float(x) for x in bgVec.rstrip(",").split(",")]
allele_list = [float(x) for x in bgVec]
confidence = calculate_confidence(float(top), allele_list)
top_alleles[md5sum][top] = confidence
return confidence
Expand Down Expand Up @@ -221,7 +233,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
if 0.0 < confidence < 0.001:
for item in result_dict:
if result_dict[item] == top:
writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec)))
writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec)))
repeat = False
item_split = item.split(":")
mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
Expand All @@ -245,7 +257,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
count = 0
for item in result_dict:
if result_dict[item] >= perc:
writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict[item], bgVec)))
writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict[item], bgVec)))
repeat = False
item_split = item.split(":")
mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
Expand All @@ -264,7 +276,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
if len(set(result)) == 1:
#for item in result_dict:
for item in result_dict2:
writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec)))
writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec)))
repeat = False
item_split = item.split(":")
mostProbableAllele = "{}:{}".format(item_split[0], item_split[1])
Expand Down Expand Up @@ -294,7 +306,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
ambiguity_handle.write("#Ambiguity:\n")
ambiguity_handle.write("#Based on the RNA-Seq reads and the dbMHC table, the following 4-digits alleles are possible:\n")
for ambi4DigitAllele in ambiguity_dict:
ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, 1 - calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec)))
ambiguity_handle.write("{}\t{}\n".format(ambi4DigitAllele, calculate_outlier(ambiguity_dict[ambi4DigitAllele], bgVec)))

ambiguity_handle.write("#However, by taking into account the read data, the most probable 4-digit allele is:\n")
ambiguity_handle.write("{}\n\n".format(mostProbableAllele))
Expand All @@ -306,7 +318,7 @@ def determine_four_digits_minor_alleles(infile, bgVec, ambiguityfile):
#if np.mean(dbmhc_prob[item4digit]) == max:
#if meanPanPopFreq(dbmhc_prob[item4digit]) == max:
if result_dict2[item] == max:
writehandle.write("{}\t{}\n".format(item, 1 - calculate_outlier(result_dict2[item], bgVec)))
writehandle.write("{}\t{}\n".format(item, calculate_outlier(result_dict2[item], bgVec)))
repeat = False
writehandle.close()
return "{},{}".format(mostProbableAllele, numbersolutions)
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ channels:
- defaults
dependencies:
- biopython=1.79
- bowtie=1.0.0
- bowtie=1.3.1
- numpy=1.21.5
- scipy=1.7.3
- python=3.7.16
2 changes: 1 addition & 1 deletion run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

rm -rf test_out/

python seq2HLA.py -1 test_data/sample_R1.fastq.gz -2 test_data/sample_R2.fastq.gz -o test_out/ -p 12
python seq2HLA.py -1 test_data/input/sample_R1.fastq.gz -2 test_data/input/sample_R2.fastq.gz -o test_out/ -p 12
Loading