@@ -35,7 +35,7 @@ def setup(self):
35
35
36
36
if check_imports () == False :
37
37
return
38
- check_ensembl ()
38
+ # check_ensembl()
39
39
pd .set_option ('display.width' , 120 )
40
40
base .iedbmhc1path = self .iedbmhc1_path
41
41
base .iedbmhc2path = self .iedbmhc2_path
@@ -108,7 +108,7 @@ def run(self):
108
108
variants = load_variants (vcf_file = infile )
109
109
labels [f ]['variants' ] = len (variants )
110
110
print ('getting variant effects' )
111
- effects = get_variant_effects (variants , self .verbose )
111
+ effects = get_variants_effects (variants , self .verbose )
112
112
#serialize variant effects
113
113
effects_to_pickle (effects , eff_obj )
114
114
else :
@@ -360,8 +360,8 @@ def peptides_from_effect(eff, length=11, peptides=True, verbose=False):
360
360
wt = orig [st :end ]
361
361
else :
362
362
wt = None
363
- if verbose == True :
364
- print (type (eff ), len (orig ), len (mut ), vloc , st , end , len (mutpep ))
363
+ # if verbose == True:
364
+ # print (type(eff), len(orig), len(mut), vloc, st, end, len(mutpep))
365
365
if len (mutpep )< length :
366
366
if verbose == True :
367
367
print ('peptide length too small' )
@@ -562,26 +562,30 @@ def make_blastdb(url, name=None, filename=None, overwrite=False):
562
562
563
563
def make_human_blastdb ():
564
564
"""Human proteome blastdb"""
565
+
565
566
url = 'ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz'
566
567
filename = 'Homo_sapiens.GRCh38.pep.all.fa.gz'
567
568
blastdb = make_blastdb (url , name = 'GRCh38' , filename = filename )
568
569
return blastdb
569
570
570
571
def make_virus_blastdb ():
571
572
"""Human virus blastdb"""
573
+
572
574
url = 'http://www.uniprot.org/uniprot/?sort=score&desc=&compress=no&query=taxonomy:%22Viruses%20[10239]%22%20\
573
575
keyword:%22Reference%20proteome%20[KW-1185]%22%20host:%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no&preview=true&format=fasta'
574
576
filename = 'uniprot_human_virus_proteome.fa.gz'
575
577
blastdb = make_blastdb (url , name = 'human_virus' , filename = filename )
576
578
return blastdb
577
579
578
580
def self_matches (df , ** kwargs ):
581
+
579
582
blastdb = make_human_blastdb ()
580
583
x = find_matches (df , blastdb , ** kwargs )
581
584
x = x .rename (columns = {'sseq' :'self_match' ,'mismatch' :'self_mismatches' })
582
585
return x
583
586
584
587
def virus_matches (df , ** kwargs ):
588
+
585
589
blastdb = make_virus_blastdb ()
586
590
x = find_matches (df , blastdb , ** kwargs )
587
591
if 'sseq' in x .columns :
@@ -642,12 +646,14 @@ def check_mm(x):
642
646
return x
643
647
644
648
def wt_similarity (x , matrix = 'blosum62' ):
649
+
645
650
x1 = x .peptide
646
651
x2 = x .wt
647
652
matrix = tepitope .get_matrix (matrix )
648
653
return tepitope .similarity_score (matrix ,x1 ,x2 )
649
654
650
655
def self_similarity (x , matrix = 'blosum62' ):
656
+
651
657
if x .self_match is None :
652
658
return
653
659
x1 = x .peptide
@@ -656,6 +662,7 @@ def self_similarity(x, matrix='blosum62'):
656
662
return tepitope .similarity_score (matrix ,x1 ,x2 )
657
663
658
664
def virus_similarity (x , matrix = 'blosum62' ):
665
+
659
666
if x .virus_match is None :
660
667
return
661
668
x1 = x .peptide
@@ -676,6 +683,7 @@ def anchor_mutated(x):
676
683
677
684
def summary_plots (df ):
678
685
"""summary plots for testing results"""
686
+
679
687
f ,axs = plt .subplots (2 ,2 ,figsize = (10 ,10 ))
680
688
axs = axs .flat
681
689
g = df .groupby (['name' ]).size ().sort_values (ascending = False )[:20 ]
@@ -697,42 +705,38 @@ def show_predictors():
697
705
def check_imports ():
698
706
try :
699
707
import varcode
700
- except :
708
+ except Exception as e :
709
+ print (e )
701
710
print ('varcode required. please run pip install varcode' )
702
711
return False
703
712
return True
704
713
705
- def check_snap ():
706
- """Check if inside a snap"""
707
-
708
- if 'SNAP_COMMON' in os .environ :
709
- return True
710
- return False
711
-
712
714
def fetch_ensembl_release (path = None , release = '75' ):
713
- """get pyensembl genome files"""
715
+ """Get pyensembl genome files"""
714
716
715
717
from pyensembl import Genome ,EnsemblRelease
716
- if path is not None :
717
- os .environ ['PYENSEMBL_CACHE_DIR' ] = path
718
718
#this call should download the files
719
719
genome = EnsemblRelease (release , species = 'human' )
720
- genome .download ()
721
- genome .index ()
722
- #print ('pyensembl genome files cached in %s' %genome.cache_directory_path)
720
+ genome .download (overwrite = False )
721
+ genome .index (overwrite = False )
722
+ genome .cache_directory_path = path
723
+ print ('pyensembl genome files cached in %s' % genome .cache_directory_path )
724
+ #run_pyensembl_install()
723
725
return
724
726
725
- def check_ensembl ():
727
+ def check_ensembl (release = '75' ):
726
728
"""Check pyensembl ref genome cached. Needed for running in snap"""
727
729
728
730
#check if running inside a snap package so we can download
729
731
#the genome files for pyensembl
730
- if check_snap () is True :
731
- #print ('running inside snap')
732
- home = os .path .join ('/home' , os .environ ['USER' ])
732
+ cache_dir = None
733
+ if base .check_snap () is True :
734
+ #home = os.path.join('/home', os.environ['USER'])
735
+ home = os .environ ['SNAP_USER_COMMON' ]
733
736
cache_dir = os .path .join (home , '.cache' )
734
- print ('checking for ref human genome' )
735
- fetch_ensembl_release (cache_dir )
737
+ os .environ ['PYENSEMBL_CACHE_DIR' ] = cache_dir
738
+ print ('checking for ref human genome' )
739
+ fetch_ensembl_release (cache_dir , release )
736
740
return
737
741
738
742
def run_vep (vcf_file , out_format = 'vcf' , assembly = 'GRCh38' , cpus = 4 , path = None ):
@@ -760,6 +764,7 @@ def print_help():
760
764
print ("""use -h to get options""" )
761
765
762
766
def plot_variant_summary (data ):
767
+
763
768
from bokeh .plotting import figure
764
769
from bokeh .charts import Donut
765
770
d = Donut (df , label = ['abbr' , 'medal' ], values = 'medal_count' ,
@@ -775,12 +780,22 @@ def test_run():
775
780
options ['base' ]['predictors' ] = 'netmhcpan' #'mhcflurry'
776
781
options ['base' ]['mhc1_alleles' ] = 'HLA-A*02:01'
777
782
options ['base' ]['path' ] = 'neo_test'
783
+ options ['base' ]['overwrite' ] = True
778
784
#options['base']['mhc2_length'] = 11
779
785
#options['base']['verbose'] = True
780
786
#options['base']['cpus'] = 4
781
787
options ['neopredict' ]['vcf_files' ] = os .path .join (path , 'testing' ,'input.vcf' )
782
788
options = config .check_options (options )
783
789
#print (options)
784
790
W = NeoEpitopeWorkFlow (options )
791
+ check_ensembl (release = '75' )
785
792
st = W .setup ()
793
+ #check_ensembl()
786
794
W .run ()
795
+
796
+ def varcode_test ():
797
+ path = os .path .dirname (os .path .abspath (__file__ ))
798
+ infile = os .path .join (path , 'testing' ,'input.vcf' )
799
+ variants = load_variants (vcf_file = infile )
800
+ get_variants_effects (variants )
801
+ return
0 commit comments