FROM mambaorg/micromamba:1.5.10 AS app

# The micromamba base image sets 'mambauser' as the default user;
# this image is built and run as root instead.
USER root

# Build from the filesystem root; the working directory becomes /data at the end of this stage.
WORKDIR /

# Build-time version pins. ARG values exist only while the image is being built.
# Several values carry a leading "v" because they have to match the upstream GitHub tags.
# PANGOLIN_DATA_VER is the pangolin-data GitHub tag, NOT the version shown in the
# GitHub release title (e.g. "v1.2.133").
ARG CONSTELLATIONS_VER="v0.1.12"
ARG PANGOLIN_DATA_VER="v1.30"
ARG PANGOLIN_VER="v4.3.1"
ARG SCORPIO_VER="v0.3.19"
ARG USHER_VER="0.6.3"

# Image metadata, grouped into a single LABEL instruction.
LABEL base.image="mambaorg/micromamba:1.5.10" \
      dockerfile.version="1" \
      software="pangolin" \
      software.version="${PANGOLIN_VER}" \
      description="Conda environment for Pangolin. Pangolin: Software package for assigning SARS-CoV-2 genome sequences to global lineages." \
      website="https://github.com/cov-lineages/pangolin" \
      license="GNU General Public License v3.0" \
      license.url="https://github.com/cov-lineages/pangolin/blob/master/LICENSE.txt" \
      maintainer="Curtis Kapsak" \
      maintainer.email="kapsakcj@gmail.com"

# OS-level dependencies (listed alphabetically).
# The apt cache and package lists are cleaned up in the same layer that created them.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
      bsdmainutils \
      ca-certificates \
      git \
      procps \
      wget && \
    apt-get autoclean && \
    rm -rf /var/lib/apt/lists/*

# Download the pangolin source tarball for the pinned tag, unpack it, delete the
# tarball, and rename the extracted pangolin-* directory to "pangolin".
RUN tarball="${PANGOLIN_VER}.tar.gz" && \
    wget "https://github.com/cov-lineages/pangolin/archive/${tarball}" && \
    tar -xf "${tarball}" && \
    rm -v "${tarball}" && \
    mv -v pangolin-* pangolin

# Set the locale. The PATH assignment is a no-op that is intentionally left in place;
# PATH is set again later in this Dockerfile.
ENV LC_ALL=C.UTF-8 \
    PATH="$PATH"

# Pin dependency versions inside pangolin's environment.yml, then create the conda env from it:
#   - usher is pinned to USHER_VER
#   - the scorpio, pangolin-data and constellations git URLs get an @<tag> suffix
#   - a pulp=2.7.0 entry is appended after line 12 of the file
# NOTE(review): the pulp insert is positional (line 12) and its leading whitespace has to
# match the list indentation used in environment.yml -- confirm against the upstream file.
# The micromamba caches are cleaned in the same layer.
RUN env_yml=/pangolin/environment.yml && \
    sed -i "s|usher.*|usher=${USHER_VER}|" "${env_yml}" && \
    sed -i "s|scorpio.git|scorpio.git@${SCORPIO_VER}|" "${env_yml}" && \
    sed -i "s|pangolin-data.git|pangolin-data.git@${PANGOLIN_DATA_VER}|" "${env_yml}" && \
    sed -i "s|constellations.git|constellations.git@${CONSTELLATIONS_VER}|" "${env_yml}" && \
    sed -i "12 a\ - pulp=2.7.0" "${env_yml}" && \
    micromamba create -n pangolin -y -f "${env_yml}" && \
    micromamba clean -a -y -f

# Make the "pangolin" mamba/conda environment active for the commands that follow.
ARG MAMBA_DOCKERFILE_ACTIVATE=1
ENV ENV_NAME="pangolin"

# The next build steps run from inside the unpacked pangolin source tree.
WORKDIR /pangolin

# Install pangolin from the source tree, then download the optional pre-computed
# assignment hashes for UShER (useful when running on large batches of samples;
# best to skip the assignment-cache when running a single sample, for speed).
# Finally create /data and print the versions of pangolin's components and usher.
# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
RUN pip install --no-cache-dir . && \
    pangolin --add-assignment-cache && \
    mkdir /data && \
    pangolin --all-versions && \
    usher --version

# /data is the final working directory of the "app" stage, used for passing data
# into and out of the container.
WORKDIR /data

# Append the pangolin environment's bin directory to PATH so the executable is found,
# and point XDG_CACHE_HOME at /tmp.
ENV PATH="${PATH}:/opt/conda/envs/pangolin/bin/" \
    XDG_CACHE_HOME=/tmp

# Show pangolin's help text by default; the command can be overridden at run time.
CMD ["pangolin", "-h"]

# Separate stage for tests, built on top of the finished "app" stage.
FROM app AS test

# Keep the mamba/conda environment active for the RUN instructions in this stage.
ARG MAMBA_DOCKERFILE_ACTIVATE=1
ENV ENV_NAME="pangolin"

# Run pangolin on the test sequences supplied with the pangolin code and print
# the resulting lineage report as an aligned table.
RUN outdir=/data/test_seqs-output-pusher && \
    pangolin /pangolin/pangolin/test/test_seqs.fasta -o "${outdir}" && \
    column -t -s, "${outdir}/lineage_report.csv"

# Exercise the assignment-cache option on the same sequences.
RUN pangolin --use-assignment-cache /pangolin/pangolin/test/test_seqs.fasta

# Fetch a B.1.1.7 genome (from Utah) out of the StaPH-B docker-builds repository.
# NOTE(review): this remote ADD is not checksum-verified and tracks the master branch.
ADD https://raw.githubusercontent.com/StaPH-B/docker-builds/master/tests/SARS-CoV-2/SRR13957123.consensus.fa /test-data/SRR13957123.consensus.fa

# Run pangolin on the B.1.1.7 genome and print the lineage report.
RUN sample=/test-data/SRR13957123 && \
    pangolin "${sample}.consensus.fa" -o "${sample}-pusher" && \
    column -t -s, "${sample}-pusher/lineage_report.csv"

# Install unzip, needed to unpack the zip archives downloaded from NCBI.
# The apt package lists are removed in the same layer so they do not persist in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends unzip && \
    rm -rf /var/lib/apt/lists/*

# Install the NCBI datasets tool (pre-compiled binary) and place it in /usr/local/bin.
# NOTE(review): the URL points at LATEST for linux-amd64, so the tool version is not pinned.
RUN datasets_url="https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets" && \
    wget "${datasets_url}" && \
    chmod +x datasets && \
    mv -v datasets /usr/local/bin

# Download the assembly for a BA.1 from Florida
# (https://www.ncbi.nlm.nih.gov/biosample?term=SAMN29506515 and https://www.ncbi.nlm.nih.gov/nuccore/ON924087),
# then run pangolin in usher analysis mode and print the lineage report.
# unzip -o overwrites existing files without prompting, as the other dataset tests do.
RUN acc="ON924087.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

# Test specific to the lineage XBB.1.16, introduced in pangolin-data v1.19.
# Assembly: https://www.ncbi.nlm.nih.gov/nuccore/2440446687
# Biosample: https://www.ncbi.nlm.nih.gov/biosample?term=SAMN33060589
# One of the samples included in the initial pango-designation issue: https://github.com/cov-lineages/pango-designation/issues/1723
RUN acc="OQ381818.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

# Another XBB.1.16, meant to test scorpio functionality: pangolin should NOT assign the
# lineage based on a pango hash match.
# This test runs as expected: scorpio checks for the constellation of mutations, then the
# lineage is assigned using PUSHER placement.
RUN acc="OR177999.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

## Test for BA.2.86
# Virus identified in MI: https://www.ncbi.nlm.nih.gov/nuccore/OR461132.1
RUN acc="OR461132.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

## Test for JN.2 (BA.2.86 sublineage); JN.2 is an alias of B.1.1.529.2.86.1.2
# NY CDC Quest sample: https://www.ncbi.nlm.nih.gov/nuccore/OR598183
RUN acc="OR598183.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

## Test for JQ.1 (BA.2.86.3 sublineage); JQ.1 is an alias of B.1.1.529.2.86.3.1
# THANK YOU ERIN AND UPHL!! https://www.ncbi.nlm.nih.gov/nuccore/OR716684
# This test matters because the lineage was included in the UShER tree despite being
# designated after the pangolin-designation 1.23 release.
# It previously caused an error/bug in pangolin, which is now fixed.
RUN acc="OR716684.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

## Test for JN.1.22 (BA.2.86.x sublineage; full unaliased lineage is B.1.1.529.2.86.1.1.22)
# Commit where it was designated: https://github.com/cov-lineages/pango-designation/commit/a90c8e31c154621ed86c985debfea09e17541cda
# The genome downloaded here is the one on NCBI that was used to designate the JN.1.22 lineage.
RUN acc="PP189069.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

## Test for JN.1.48 (BA.2.86.x sublineage; full unaliased lineage is B.1.1.529.2.86.1.1.48)
# This lineage was designated in pango-designation v1.27: https://github.com/cov-lineages/pango-designation/releases/tag/v1.27
# Commit where it was designated: https://github.com/cov-lineages/pango-designation/commit/67f48bf24283999f1940f3aee8159f404124ff3f
# Genome on NCBI: https://www.ncbi.nlm.nih.gov/nuccore/PP218754
RUN acc="PP218754.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

# New lineage LK.1, introduced in pango-designation v1.28: https://github.com/cov-lineages/pango-designation/commit/922795c90de355e67200cf4d379e8e5ff22472e4
# Thank you Luis, Lorraine, Marcos & team from PR Sci Trust for sharing your data!
# Genome on NCBI: https://www.ncbi.nlm.nih.gov/nuccore/2728145425
RUN acc="PP770375.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

# New lineage KP.3.3.2, introduced in pango-designation v1.29: https://github.com/cov-lineages/pango-designation/commit/7125e606818312b78f0756d7fcab6dba92dd0a9e
# Genome on NCBI: https://www.ncbi.nlm.nih.gov/nuccore/PQ073669
RUN acc="PQ073669.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"

# New lineage MC.2, introduced in pango-designation v1.30: https://github.com/cov-lineages/pango-designation/commit/c64dbc47fbfbfd7f4da011deeb1a88dd6baa45f1#diff-a121ea4b8cbeb4c0020511b5535bf24489f0223cc83511df7b8209953115d329R2564181
# Genome on NCBI: https://www.ncbi.nlm.nih.gov/nuccore/PQ034842.1
RUN acc="PQ034842.1" && \
    datasets download virus genome accession "${acc}" --filename "${acc}.zip" && \
    unzip -o "${acc}.zip" && \
    rm "${acc}.zip" && \
    mv -v ncbi_dataset/data/genomic.fna "${acc}.genomic.fna" && \
    rm -vr ncbi_dataset/ README.md && \
    pangolin "${acc}.genomic.fna" -o "${acc}-usher" && \
    column -t -s, "${acc}-usher/lineage_report.csv"