fixes for running in a snap

dmnfarrell · dmnfarrell · commit e5a61b372e22 · 2019-02-10T22:22:57.000Z
diff --git a/CHANGES b/CHANGES
@@ -7,7 +7,7 @@ CHANGES
 ------
 
 major changes to neoepitope routines
-changes to app outputs
+some changes to app outputs
 added netmhcpan predictor
 added basicmhc1 predictor
 updated notebook examples
diff --git a/epitopepredict/app.py b/epitopepredict/app.py
@@ -26,8 +26,8 @@ def __init__(self, opts={}):
     def setup(self):
         """Setup main parameters"""
 
-        if check_snap() == True:
-            add_path()
+        #if base.check_snap() == True:
+            #add_path()
         pd.set_option('display.width', 120)
         #override base.defaults entries if provided in conf
         set_defaults(self.__dict__)
@@ -69,6 +69,8 @@ def setup(self):
             self.names=None
         else:
             self.names = self.names.split(',')
+        if self.names is not None:
+            print ('selected sequences:', self.names)
 
         if not os.path.exists(self.path) and self.path != '':
             os.mkdir(self.path)
@@ -359,13 +361,6 @@ def list_alleles():
         print ()
     return
 
-def check_snap():
-    """Check if inside a snap"""
-
-    if 'SNAP_COMMON' in os.environ:
-        return True
-    return False
-
 def add_path():
     """Add home dir to path for accessing tools from a snap"""
 
@@ -435,7 +430,9 @@ def main():
                         help="Analysis path", metavar="FILE")
     parser.add_option("-n", "--neoepitope", dest="neoepitope", action="store_true",
                         default=False, help="Neo-epitope pipeline")
-    parser.add_option("-s", "--server", dest="server",  action="store_true",
+    parser.add_option("-e", "--ensembl", dest="ensembl", action="store_true",
+                        default=False, help="Get ensembl files for a release")
+    parser.add_option("-s", "--server", dest="server",
                         default=False, help="Run web app")
     parser.add_option("-x", "--port", dest="port", default=8000,
                         help="Port for web app, default 8000")
@@ -467,7 +464,11 @@ def main():
     elif opts.neoepitope == True:
         if opts.test == True:
             neo.test_run()
+            #neo.varcode_test()
         else:
+            print (options)
+            release = options['ensembl_release']
+            neo.check_ensembl(release)
             W = neo.NeoEpitopeWorkFlow(options)
             st = W.setup()
             if st == True:
diff --git a/epitopepredict/base.py b/epitopepredict/base.py
@@ -402,6 +402,26 @@ def split_peptides(df,length=9,seqkey='sequence',newcol='peptide'):
     res = df.merge(res,on=seqkey)
     return res
 
+def check_snap():
+    """Check if inside a snap"""
+
+    if 'SNAP_COMMON' in os.environ:
+        print ('running in a snap')
+        return True
+    return False
+
+def set_netmhcpan_cmd():
+    """Setup the netmhcpan command for using inside snap. Avoids using
+    tcsh script."""
+
+    toolspath = os.path.join('/home', os.environ['USER'], 'tools')
+    netmhcpath = os.path.join(toolspath, 'netMHCpan-4.0/Linux_x86_64')
+    os.environ['NETMHCpan']=netmhcpath
+    os.environ['TMPDIR']='/tmp'
+    cmd = os.path.join(netmhcpath, 'bin/netMHCpan')
+    print ('netmhcpan cmd set to:', cmd)
+    return cmd
+
 class DataFrameIterator:
     """Simple iterator to get dataframes from a path out of memory"""
     def __init__(self, files):
@@ -924,6 +944,8 @@ def _predict_sequences(self, recs, path=None, overwrite=True, alleles=[], length
 
         results = []
         self.length = length
+        if compression == '':
+            compression = None
         for i,row in recs.iterrows():
             seq = row.translation
             seq = clean_sequence(seq) #clean the sequence of non-aa characters
@@ -1180,6 +1202,10 @@ def __init__(self, data=None, scoring='affinity'):
             self.rankascending = 0
         #load precalculated quantiles for sample peptides
         self.qf = self.get_quantile_data()
+        self.basecmd = 'netMHCpan'
+        #base command must point directly at the binary when running in a snap
+        if check_snap() is True:
+            self.basecmd = set_netmhcpan_cmd()
 
     def read_result(self, temp):
         """Read raw results from netMHCpan output"""
@@ -1225,9 +1251,9 @@ def predict(self, peptides, allele='HLA-A*01:01', name='temp',
                 f.write(p+'\n')
         f.close()
         if self.scoring =='affinity':
-            cmd = 'netMHCpan -BA -f %s -inptype 1 -a %s' %(pepfile , allele)
+            cmd = '%s -BA -f %s -inptype 1 -a %s' %(self.basecmd, pepfile , allele)
         else:
-            cmd = 'netMHCpan -f %s -inptype 1 -a %s' %(pepfile , allele)
+            cmd = '%s -f %s -inptype 1 -a %s' %(self.basecmd, pepfile , allele)
         if show_cmd is True:
             print (cmd)
         try:
diff --git a/epitopepredict/config.py b/epitopepredict/config.py
@@ -59,7 +59,7 @@
                             'iedb_mhc1_method':'IEDB_recommended',
                             'iedb_mhc2_method':'IEDB_recommended'}
 
-baseoptions['neopredict'] = {'vcf_files':'',
+baseoptions['neopredict'] = {'vcf_files':'', 'ensembl_release':'75',
                              'selection_method':'promiscuity'}
 
 def write_default_config():
diff --git a/epitopepredict/neo.py b/epitopepredict/neo.py
@@ -35,7 +35,7 @@ def setup(self):
 
         if check_imports() == False:
             return
-        check_ensembl()
+        #check_ensembl()
         pd.set_option('display.width', 120)
         base.iedbmhc1path = self.iedbmhc1_path
         base.iedbmhc2path = self.iedbmhc2_path
@@ -108,7 +108,7 @@ def run(self):
                 variants = load_variants(vcf_file=infile)
                 labels[f]['variants'] = len(variants)
                 print ('getting variant effects')
-                effects = get_variant_effects(variants, self.verbose)
+                effects = get_variants_effects(variants, self.verbose)
                 #serialize variant effects
                 effects_to_pickle(effects, eff_obj)
             else:
@@ -360,8 +360,8 @@ def peptides_from_effect(eff, length=11, peptides=True, verbose=False):
             wt = orig[st:end]
         else:
             wt = None
-    if verbose == True:
-        print (type(eff), len(orig), len(mut), vloc, st, end, len(mutpep))
+    #if verbose == True:
+    #    print (type(eff), len(orig), len(mut), vloc, st, end, len(mutpep))
     if len(mutpep)<length:
         if verbose == True:
             print ('peptide length too small')
@@ -562,26 +562,30 @@ def make_blastdb(url, name=None, filename=None, overwrite=False):
 
 def make_human_blastdb():
     """Human proteome blastdb"""
+
     url = 'ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz'
     filename = 'Homo_sapiens.GRCh38.pep.all.fa.gz'
     blastdb = make_blastdb(url, name='GRCh38', filename=filename)
     return blastdb
 
 def make_virus_blastdb():
     """Human virus blastdb"""
+
     url = 'http://www.uniprot.org/uniprot/?sort=score&desc=&compress=no&query=taxonomy:%22Viruses%20[10239]%22%20\
     keyword:%22Reference%20proteome%20[KW-1185]%22%20host:%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no&preview=true&format=fasta'
     filename = 'uniprot_human_virus_proteome.fa.gz'
     blastdb = make_blastdb(url, name='human_virus', filename=filename)
     return blastdb
 
 def self_matches(df, **kwargs):
+
     blastdb = make_human_blastdb()
     x = find_matches(df, blastdb, **kwargs)
     x = x.rename(columns={'sseq':'self_match','mismatch':'self_mismatches'})
     return x
 
 def virus_matches(df, **kwargs):
+
     blastdb = make_virus_blastdb()
     x = find_matches(df, blastdb, **kwargs)
     if 'sseq' in x.columns:
@@ -642,12 +646,14 @@ def check_mm(x):
     return x
 
 def wt_similarity(x, matrix='blosum62'):
+
     x1 = x.peptide
     x2 = x.wt
     matrix = tepitope.get_matrix(matrix)
     return tepitope.similarity_score(matrix,x1,x2)
 
 def self_similarity(x, matrix='blosum62'):
+
     if x.self_match is None:
         return
     x1 = x.peptide
@@ -656,6 +662,7 @@ def self_similarity(x, matrix='blosum62'):
     return tepitope.similarity_score(matrix,x1,x2)
 
 def virus_similarity(x, matrix='blosum62'):
+
     if x.virus_match is None:
         return
     x1 = x.peptide
@@ -676,6 +683,7 @@ def anchor_mutated(x):
 
 def summary_plots(df):
     """summary plots for testing results"""
+
     f,axs=plt.subplots(2,2,figsize=(10,10))
     axs=axs.flat
     g = df.groupby(['name']).size().sort_values(ascending=False)[:20]
@@ -697,42 +705,38 @@ def show_predictors():
 def check_imports():
     try:
         import varcode
-    except:
+    except Exception as e:
+        print (e)
         print ('varcode required. please run pip install varcode')
         return False
     return True
 
-def check_snap():
-    """Check if inside a snap"""
-
-    if 'SNAP_COMMON' in os.environ:
-        return True
-    return False
-
 def fetch_ensembl_release(path=None, release='75'):
-    """get pyensembl genome files"""
+    """Get pyensembl genome files"""
 
     from pyensembl import Genome,EnsemblRelease
-    if path is not None:
-        os.environ['PYENSEMBL_CACHE_DIR'] = path
     #this call should download the files
     genome = EnsemblRelease(release, species='human')
-    genome.download()
-    genome.index()
-    #print ('pyensembl genome files cached in %s' %genome.cache_directory_path)
+    genome.download(overwrite=False)
+    genome.index(overwrite=False)
+    genome.cache_directory_path = path
+    print ('pyensembl genome files cached in %s' %genome.cache_directory_path)
+    #run_pyensembl_install()
     return
 
-def check_ensembl():
+def check_ensembl(release='75'):
     """Check pyensembl ref genome cached. Needed for running in snap"""
 
     #check if running inside a snap package so we can download
     #the genome files for pyensembl
-    if check_snap() is True:
-        #print ('running inside snap')
-        home = os.path.join('/home', os.environ['USER'])
+    cache_dir=None
+    if base.check_snap() is True:
+        #home = os.path.join('/home', os.environ['USER'])
+        home = os.environ['SNAP_USER_COMMON']
         cache_dir = os.path.join(home, '.cache')
-        print ('checking for ref human genome')
-        fetch_ensembl_release(cache_dir)
+        os.environ['PYENSEMBL_CACHE_DIR'] = cache_dir
+    print ('checking for ref human genome')
+    fetch_ensembl_release(cache_dir, release)
     return
 
 def run_vep(vcf_file, out_format='vcf', assembly='GRCh38', cpus=4, path=None):
@@ -760,6 +764,7 @@ def print_help():
     print ("""use -h to get options""")
 
 def plot_variant_summary(data):
+
     from bokeh.plotting import figure
     from bokeh.charts import Donut
     d = Donut(df, label=['abbr', 'medal'], values='medal_count',
@@ -775,12 +780,22 @@ def test_run():
     options['base']['predictors'] = 'netmhcpan' #'mhcflurry'
     options['base']['mhc1_alleles'] = 'HLA-A*02:01'
     options['base']['path'] = 'neo_test'
+    options['base']['overwrite'] = True
     #options['base']['mhc2_length'] = 11
     #options['base']['verbose'] = True
     #options['base']['cpus'] = 4
     options['neopredict']['vcf_files'] = os.path.join(path, 'testing','input.vcf')
     options = config.check_options(options)
     #print (options)
     W = NeoEpitopeWorkFlow(options)
+    check_ensembl(release='75')
     st = W.setup()
+    #check_ensembl()
     W.run()
+
+def varcode_test():
+    path = os.path.dirname(os.path.abspath(__file__))
+    infile = os.path.join(path, 'testing','input.vcf')
+    variants = load_variants(vcf_file=infile)
+    get_variants_effects(variants)
+    return
diff --git a/epitopepredict/peptutils.py b/epitopepredict/peptutils.py
@@ -143,6 +143,8 @@ def net_charge(seq):
 def compare_anchor_positions(x1, x2):
     """Check if anchor positions in 9-mers are mutated"""
 
+    if x1 is None or x2 is None:
+        return 0
     p1 = list(get_fragments(x1, length=9).peptide)
     p2 = list(get_fragments(x2, length=9).peptide)
     #is mutation in anchor residue
diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml
@@ -8,20 +8,21 @@ description: |
  each method can then be processed and visualized in a consistent manner.
 
 grade: stable
-confinement: devmode
+confinement: strict
 icon: gui/icon.png
 
 apps:
   epitopepredict:
     command: bin/epitopepredict
     plugs: [home,network-bind]
-
+    environment:
+      LD_LIBRARY_PATH: $SNAP/usr/lib/ncbi-blast+  
 parts:
   epitopepredict:
     plugin: python
     python-version: python3
     source: ../
     python-packages:
-      [mhcflurry,gtfparse==0.0.6,pyensembl==1.1.0,varcode==0.5.15]
+      [mhcflurry,gtfparse==1.2,pyensembl==1.7.3,varcode==0.8.0]
     stage-packages:
-      [python-setuptools,tcsh,gawk,ncbi-blast+,bowtie2]
+      [tcsh,gawk,ncbi-blast+]