Merge pull request #133 from fmalmeida/dev

fmalmeida · web-flow · commit 2a9be84eba04 · 2025-03-29T16:59:46.000+01:00
Release v3.4.2
diff --git a/.zenodo.json b/.zenodo.json
@@ -2,7 +2,7 @@
     "description": "<p>The pipeline</p>\n\n<p>bacannot, is a customisable, easy to use, pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more, while providing nice an beautiful interactive documents for results exploration.</p>", 
     "license": "other-open", 
     "title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline", 
-    "version": "v3.4.0", 
+    "version": "v3.4.2", 
     "upload_type": "software",
     "creators": [
         {
diff --git a/bin/run_blasts.py b/bin/run_blasts.py
@@ -151,9 +151,9 @@ def summary(output):
             prodc=line["sseqid"].split('~~~')[3]
             desc=line["sseqid"].split('~~~')[4]
         else:
-            prodc=line["sseqid"].split('~~~')[3].split(' ')[0]
+            prodc=line["sseqid"].split('~~~')[3].split(' ')[0].replace(' ', '')
             try:
-                desc=' '.join(line["stitle"].split('~~~')[3].split(' ')[1:-1])
+                desc=''.join(line["stitle"].split('~~~')[3])
             except:
                 desc='Not found'
         # Subject coverage
diff --git a/conf/docker.config b/conf/docker.config
@@ -24,7 +24,7 @@ process {
 
     // container for R tools
     withLabel: 'renv' {
-      container = 'fmalmeida/bacannot@sha256:23a0713d3694a10ee4c570a4e65a471045781a73711495aa08ae7d40f9b65097'
+      container = 'fmalmeida/bacannot@sha256:fc0d5060474d709e4b96ad0b97bc2a5354d4da7d12afe549fb9d6446a7be9562'
     }
 
     // container for bacannot server
diff --git a/docker/renv/Dockerfile b/docker/renv/Dockerfile
@@ -1,6 +1,6 @@
 FROM ubuntu:22.04
 
-LABEL MAINTAINER Felipe Marques de Almeida <marques.felipe@aluno.unb.br>
+LABEL MAINTAINER="Felipe Marques de Almeida <marques.felipe@aluno.unb.br>"
 
 # Workdir
 WORKDIR /work
diff --git a/docker/renv/reports/report_custom_blast.Rmd b/docker/renv/reports/report_custom_blast.Rmd
@@ -64,7 +64,7 @@ blast_gff    <- try(read.delim(params$blast_gff, header = FALSE, col.names=c("Co
 ## Check for emptiness
 if ( class(blast_gff) == "try-error" || check_lines(custom_blast) == 0 ) {
   custom_blast <- data.frame(
-    matrix(ncol = 14, nrow = 0)
+    matrix(ncol = 13, nrow = 0)
   )
   blast_gff <- data.frame(
     matrix(ncol = 9, nrow = 0)
diff --git a/markdown/CHANGELOG.md b/markdown/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 The tracking for changes started in v2.1
 
+## v3.4.2 [29-March-2025]
+
+* [[[#131](https://github.com/fmalmeida/bacannot/issues/131)]]
+    * Add a small sed command to correct wrong fasta headers from public databases
+* Correct the `zenodo_get` command to always download the latest version
+* Correct channel parsing for generating HTML reports of custom databases
+* Correct the description field of the `run_blasts.py` script
+
 ## v3.4.1 [14-November-2024]
 
 * [[[#127](https://github.com/fmalmeida/bacannot/issues/127)]]
diff --git a/modules/MGEs/iceberg.nf b/modules/MGEs/iceberg.nf
@@ -30,7 +30,7 @@ process ICEBERG {
         --mincov ${params.blast_MGEs_mincov} \\
         --threads $task.cpus \\
         --out ${prefix}_iceberg_blastp_onGenes.txt --2way | \\
-    sed -e 's/GENE/ICEBERG_ID/g' > ${prefix}_iceberg_blastp_onGenes.summary.txt ;
+    sed -e 's/GENE/ICEBERG_ID/g' -e 's/;//g' > ${prefix}_iceberg_blastp_onGenes.summary.txt ;
 
     ## Checking for full-length ICEs
     ### The blast db was throwing errors
@@ -46,6 +46,6 @@ process ICEBERG {
         --mincov 0 \\
         --threads $task.cpus \\
         --out ${prefix}_iceberg_blastn_onGenome.txt | \\
-    sed -e 's/GENE/ICEBERG_ID/g' > ${prefix}_iceberg_blastn_onGenome.summary.txt ;
+    sed -e 's/GENE/ICEBERG_ID/g' -e 's/;//g' > ${prefix}_iceberg_blastn_onGenome.summary.txt ;
     """
 }
diff --git a/modules/bacannot_dbs/get_zenodo.nf b/modules/bacannot_dbs/get_zenodo.nf
@@ -10,7 +10,7 @@ process GET_ZENODO_DB {
     script:
     """
     # download database from zenodo
-    zenodo_get https://doi.org/10.5281/zenodo.7615811
+    zenodo_get 12674473
 
     # organize data
     tar zxvf *.tar.gz && rm *.tar.gz
diff --git a/modules/bacannot_dbs/phast.nf b/modules/bacannot_dbs/phast.nf
@@ -13,7 +13,7 @@ process PHAST_DB {
         sed -e 's/ >/ /g' -e 's/~ /~/g' | \\
         awk -F "~~~" ' { if (\$0 ~ />/) { gsub(" ", "_", \$2); print \$1 "~~~" \$2 "~~~" \$3 "~~~" \$4 "~~~" \$5 } else { print \$0 }}' | \\
         awk -F "~~~" ' { if (\$0 ~ />/) { gsub("-", "_", \$2); print \$1 "~~~" \$2 "~~~" \$3 "~~~" \$4 "~~~" \$5 } else { print \$0 }}' | \\
-        sed -e 's/~~~>/~~~/g' > sequences && \\
+        sed -e 's/~~~>/~~~/g' -e 's/;//g' > sequences && \\
         rm phast_prot.fasta && \\
         diamond makedb --in sequences -d diamond && \\
         makeblastdb -in sequences -title 'PHAST' -dbtype prot -logfile /dev/null
diff --git a/modules/bacannot_dbs/sourmash.nf b/modules/bacannot_dbs/sourmash.nf
@@ -8,8 +8,8 @@ process SOURMASH_DB {
     script:
     """
     # download sourmash database
-    curl -L -o genbank-21.lca.json.gz https://osf.io/gk2za/download
-    curl -L -o genbank-31.lca.json.gz https://osf.io/ypsjq/download
-    curl -L -o genbank-51.lca.json.gz https://osf.io/297dp/download
+    curl -L -o genbank-21.lca.json.gz https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs207/gtdb-rs207.genomic-reps.dna.k21.lca.json.gz
+    curl -L -o genbank-31.lca.json.gz https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs207/gtdb-rs207.genomic-reps.dna.k31.lca.json.gz
+    curl -L -o genbank-51.lca.json.gz https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs207/gtdb-rs207.genomic-reps.dna.k51.lca.json.gz
     """
 }
diff --git a/modules/prophages/phast.nf b/modules/prophages/phast.nf
@@ -23,6 +23,6 @@ process PHAST {
       --mincov ${params.blast_MGEs_mincov} \\
       --threads $task.cpus \\
       --out ${prefix}_phast_blastp_onGenes.txt --2way | \\
-  sed -e 's/PRODUCT/PHAST_ID/g' > ${prefix}_phast_blastp_onGenes.summary.txt ;
+  sed -e 's/PRODUCT/PHAST_ID/g' -e 's/;//g' > ${prefix}_phast_blastp_onGenes.summary.txt ;
   """
 }
diff --git a/modules/resistance/argminer.nf b/modules/resistance/argminer.nf
@@ -22,6 +22,7 @@ process ARGMINER {
       --mincov ${params.blast_resistance_mincov} \\
       --threads $task.cpus \\
       --out ${prefix}_argminer_blastp_onGenes.txt \\
-      --2way > ${prefix}_argminer_blastp_onGenes.summary.txt ;
+      --2way | \\
+      sed -e 's/;//g' > ${prefix}_argminer_blastp_onGenes.summary.txt ;
   """
 }
diff --git a/modules/virulence/vfdb.nf b/modules/virulence/vfdb.nf
@@ -24,6 +24,6 @@ process VFDB {
       --threads $task.cpus \\
       --out ${prefix}_vfdb_blastn_onGenes.txt \\
       --2way | \\
-  sed -e 's/ACCESSION/VFDB_ID/g' > ${prefix}_vfdb_blastn_onGenes.summary.txt ;
+  sed -e 's/ACCESSION/VFDB_ID/g' -e 's/;//g' > ${prefix}_vfdb_blastn_onGenes.summary.txt ;
   """
 }
diff --git a/modules/virulence/victors.nf b/modules/virulence/victors.nf
@@ -24,6 +24,6 @@ process VICTORS {
       --threads $task.cpus \\
       --out ${prefix}_victors_blastp_onGenes.txt \\
       --2way | \\
-  sed -e 's/PRODUCT/VICTORS_ID/g' > ${prefix}_victors_blastp_onGenes.summary.txt ;
+  sed -e 's/PRODUCT/VICTORS_ID/g' -e 's/;//g' > ${prefix}_victors_blastp_onGenes.summary.txt ;
   """
 }
diff --git a/nextflow.config b/nextflow.config
@@ -108,7 +108,7 @@ manifest {
     homePage        = "https://github.com/fmalmeida/bacannot"
     mainScript      = "main.nf"
     nextflowVersion = "!>=22.10.1"
-    version         = '3.4.1'
+    version         = '3.4.2'
 }
 
 // Function to ensure that resource requirements don't go beyond
diff --git a/workflows/bacannot.nf b/workflows/bacannot.nf
@@ -391,8 +391,17 @@ workflow BACANNOT {
 
       // Render reports
       if (params.custom_db || params.ncbi_proteins) {
+        // parse GFFs
+        custom_db_gffs_ch = 
+        MERGE_ANNOTATIONS.out.customdb_gff
+        .map{ id, file ->
+            def db = file.baseName.replaceAll('^custom_database_', '')
+            [ id, db, file ]
+        }
+
+        // report
         CUSTOM_DATABASE_REPORT( 
-          CUSTOM_DATABASE.out.summary.join( MERGE_ANNOTATIONS.out.customdb_gff, remainder:true ) 
+          CUSTOM_DATABASE.out.summary.join( custom_db_gffs_ch, by: [0, 1], remainder:true )
         )
       }
       REPORT(

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`"description": "<p>The pipeline</p>\n\n<p>bacannot, is a customisable, easy to use, pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more, while providing nice an beautiful interactive documents for results exploration.</p>",`
`3`	`3`	`"license": "other-open",`
`4`	`4`	`"title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline",`
`5`		`- "version": "v3.4.0",`
	`5`	`+ "version": "v3.4.2",`
`6`	`6`	`"upload_type": "software",`
`7`	`7`	`"creators": [`
`8`	`8`	`{`
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ process {`
`24`	`24`
`25`	`25`	`// container for R tools`
`26`	`26`	`withLabel: 'renv' {`
`27`		`- container = 'fmalmeida/bacannot@sha256:23a0713d3694a10ee4c570a4e65a471045781a73711495aa08ae7d40f9b65097'`
	`27`	`+ container = 'fmalmeida/bacannot@sha256:fc0d5060474d709e4b96ad0b97bc2a5354d4da7d12afe549fb9d6446a7be9562'`
`28`	`28`	`}`
`29`	`29`
`30`	`30`	`// container for bacannot server`
Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ blast_gff <- try(read.delim(params$blast_gff, header = FALSE, col.names=c("Co`
`64`	`64`	`## Check for emptiness`
`65`	`65`	`if ( class(blast_gff) == "try-error" \|\| check_lines(custom_blast) == 0 ) {`
`66`	`66`	`custom_blast <- data.frame(`
`67`		`- matrix(ncol = 14, nrow = 0)`
	`67`	`+ matrix(ncol = 13, nrow = 0)`
`68`	`68`	`)`
`69`	`69`	`blast_gff <- data.frame(`
`70`	`70`	`matrix(ncol = 9, nrow = 0)`
Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,6 @@ process PHAST {`
`23`	`23`	`--mincov ${params.blast_MGEs_mincov} \\`
`24`	`24`	`--threads $task.cpus \\`
`25`	`25`	`--out ${prefix}_phast_blastp_onGenes.txt --2way \| \\`
`26`		`- sed -e 's/PRODUCT/PHAST_ID/g' > ${prefix}_phast_blastp_onGenes.summary.txt ;`
	`26`	`+ sed -e 's/PRODUCT/PHAST_ID/g' -e 's/;//g' > ${prefix}_phast_blastp_onGenes.summary.txt ;`
`27`	`27`	`"""`
`28`	`28`	`}`
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ process ARGMINER {`
`22`	`22`	`--mincov ${params.blast_resistance_mincov} \\`
`23`	`23`	`--threads $task.cpus \\`
`24`	`24`	`--out ${prefix}_argminer_blastp_onGenes.txt \\`
`25`		`- --2way > ${prefix}_argminer_blastp_onGenes.summary.txt ;`
	`25`	`+ --2way \| \\`
	`26`	`+ sed -e 's/;//g' > ${prefix}_argminer_blastp_onGenes.summary.txt ;`
`26`	`27`	`"""`
`27`	`28`	`}`
Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,6 @@ process VFDB {`
`24`	`24`	`--threads $task.cpus \\`
`25`	`25`	`--out ${prefix}_vfdb_blastn_onGenes.txt \\`
`26`	`26`	`--2way \| \\`
`27`		`- sed -e 's/ACCESSION/VFDB_ID/g' > ${prefix}_vfdb_blastn_onGenes.summary.txt ;`
	`27`	`+ sed -e 's/ACCESSION/VFDB_ID/g' -e 's/;//g' > ${prefix}_vfdb_blastn_onGenes.summary.txt ;`
`28`	`28`	`"""`
`29`	`29`	`}`
Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,6 @@ process VICTORS {`
`24`	`24`	`--threads $task.cpus \\`
`25`	`25`	`--out ${prefix}_victors_blastp_onGenes.txt \\`
`26`	`26`	`--2way \| \\`
`27`		`- sed -e 's/PRODUCT/VICTORS_ID/g' > ${prefix}_victors_blastp_onGenes.summary.txt ;`
	`27`	`+ sed -e 's/PRODUCT/VICTORS_ID/g' -e 's/;//g' > ${prefix}_victors_blastp_onGenes.summary.txt ;`
`28`	`28`	`"""`
`29`	`29`	`}`
Original file line number	Diff line number	Diff line change
`@@ -108,7 +108,7 @@ manifest {`
`108`	`108`	`homePage = "https://github.com/fmalmeida/bacannot"`
`109`	`109`	`mainScript = "main.nf"`
`110`	`110`	`nextflowVersion = "!>=22.10.1"`
`111`		`- version = '3.4.1'`
	`111`	`+ version = '3.4.2'`
`112`	`112`	`}`
`113`	`113`
`114`	`114`	`// Function to ensure that resource requirements don't go beyond`