- 
                Notifications
    
You must be signed in to change notification settings  - Fork 89
 
AWS mirrored SRA file download #355
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
8ec2d93
              d51006b
              bfb598c
              7b85915
              be52c3e
              63a34da
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| name: sra_aws_download | ||
| channels: | ||
| - conda-forge | ||
| - bioconda | ||
| - defaults | ||
| dependencies: | ||
| - conda-forge::awscli=2.15.0 | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| process SRA_AWS_DOWNLOAD { | ||
| tag "$meta.id" | ||
| label 'process_low' | ||
| label 'error_retry' | ||
| 
     | 
||
| conda "${moduleDir}/environment.yml" | ||
| container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? | ||
| 'https://depot.galaxyproject.org/singularity/awscli:1.8.3--py35_0' : | ||
| 'quay.io/biocontainers/awscli:1.8.3--py35_0' }" | ||
| 
     | 
||
| input: | ||
| tuple val(meta), val(run_accession) | ||
| 
     | 
||
| output: | ||
| tuple val(meta), path("*.sra"), emit: sra | ||
| path "versions.yml" , emit: versions | ||
| 
     | 
||
| when: | ||
| task.ext.when == null || task.ext.when | ||
| 
     | 
||
| script: | ||
| def args = task.ext.args ?: '' | ||
| def prefix = task.ext.prefix ?: "${run_accession}" | ||
| """ | ||
| # Download SRA file from AWS S3 Open Data Program | ||
| aws s3 cp \\ | ||
| --region us-east-1 \\ | ||
| --no-sign-request \\ | ||
| ${args} \\ | ||
| s3://sra-pub-run-odp/sra/${run_accession}/${run_accession} \\ | ||
| ${prefix}.sra | ||
| 
         
      Comment on lines
    
      +26
     to 
      +31
    
   
  There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nextflow has a native method of downloading from AWS using the SDK; do we think this will be more efficient than using that? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using Nextflow: 
 Using a process: 
  | 
||
| 
     | 
||
| # Verify download | ||
| if [ ! -f "${prefix}.sra" ]; then | ||
| echo "ERROR: Failed to download ${run_accession} from AWS S3" | ||
| exit 1 | ||
| fi | ||
| 
         
      Comment on lines
    
      +33
     to 
      +37
    
   
  There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This shouldn't be necessary,   | 
||
| 
     | 
||
| cat <<-END_VERSIONS > versions.yml | ||
| "${task.process}": | ||
| aws-cli: \$(aws --version 2>&1 | sed 's/aws-cli\\///; s/ Python.*//') | ||
| END_VERSIONS | ||
| """ | ||
| 
     | 
||
| stub: | ||
| def prefix = task.ext.prefix ?: "${run_accession}" | ||
| """ | ||
| touch ${prefix}.sra | ||
| 
     | 
||
| cat <<-END_VERSIONS > versions.yml | ||
| "${task.process}": | ||
| aws-cli: \$(aws --version 2>&1 | sed 's/aws-cli\\///; s/ Python.*//') | ||
| END_VERSIONS | ||
| """ | ||
| } | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| process { | ||
| withName: 'SRA_AWS_DOWNLOAD' { | ||
| publishDir = [ | ||
| path: { "${params.outdir}/sra" }, | ||
| enabled: false | ||
| ] | ||
| } | ||
| } | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| nextflow_process { | ||
| 
     | 
||
| name "Test Process SRA_AWS_DOWNLOAD" | ||
| script "../main.nf" | ||
| process "SRA_AWS_DOWNLOAD" | ||
| tag "modules" | ||
| tag "modules_local" | ||
| tag "sra_aws_download" | ||
| 
     | 
||
| test("Should download SRA file from AWS") { | ||
| 
     | 
||
| when { | ||
| process { | ||
| """ | ||
| input[0] = [ | ||
| [ id:'test', single_end:false ], | ||
| 'DRR028935' | ||
| ] | ||
| """ | ||
| } | ||
| } | ||
| 
     | 
||
| then { | ||
| assertAll( | ||
| { assert process.success }, | ||
| { assert snapshot(process.out).match() } | ||
| ) | ||
| } | ||
| 
     | 
||
| } | ||
| 
     | 
||
| test("Should download SRA file from AWS - stub") { | ||
| 
     | 
||
| options "-stub" | ||
| 
     | 
||
| when { | ||
| process { | ||
| """ | ||
| input[0] = [ | ||
| [ id:'test', single_end:false ], | ||
| 'DRR028935' | ||
| ] | ||
| """ | ||
| } | ||
| } | ||
| 
     | 
||
| then { | ||
| assertAll( | ||
| { assert process.success }, | ||
| { assert snapshot(process.out).match() } | ||
| ) | ||
| } | ||
| 
     | 
||
| } | ||
| 
     | 
||
| } | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| { | ||
| "Should download SRA file from AWS": { | ||
| "content": [ | ||
| { | ||
| "0": [ | ||
| [ | ||
| { | ||
| "id": "test", | ||
| "single_end": false | ||
| }, | ||
| "DRR028935.sra:md5,bc88b59c510081d85448416f05094ed5" | ||
| ] | ||
| ], | ||
| "1": [ | ||
| "versions.yml:md5,ce0676c62bd6864661cf98777e7c2896" | ||
| ], | ||
| "sra": [ | ||
| [ | ||
| { | ||
| "id": "test", | ||
| "single_end": false | ||
| }, | ||
| "DRR028935.sra:md5,bc88b59c510081d85448416f05094ed5" | ||
| ] | ||
| ], | ||
| "versions": [ | ||
| "versions.yml:md5,ce0676c62bd6864661cf98777e7c2896" | ||
| ] | ||
| } | ||
| ], | ||
| "meta": { | ||
| "nf-test": "0.9.2", | ||
| "nextflow": "25.04.6" | ||
| }, | ||
| "timestamp": "2025-08-14T14:59:02.578113" | ||
| }, | ||
| "Should download SRA file from AWS - stub": { | ||
| "content": [ | ||
| { | ||
| "0": [ | ||
| [ | ||
| { | ||
| "id": "test", | ||
| "single_end": false | ||
| }, | ||
| "DRR028935.sra:md5,d41d8cd98f00b204e9800998ecf8427e" | ||
| ] | ||
| ], | ||
| "1": [ | ||
| "versions.yml:md5,ce0676c62bd6864661cf98777e7c2896" | ||
| ], | ||
| "sra": [ | ||
| [ | ||
| { | ||
| "id": "test", | ||
| "single_end": false | ||
| }, | ||
| "DRR028935.sra:md5,d41d8cd98f00b204e9800998ecf8427e" | ||
| ] | ||
| ], | ||
| "versions": [ | ||
| "versions.yml:md5,ce0676c62bd6864661cf98777e7c2896" | ||
| ] | ||
| } | ||
| ], | ||
| "meta": { | ||
| "nf-test": "0.9.2", | ||
| "nextflow": "25.04.6" | ||
| }, | ||
| "timestamp": "2025-08-14T14:59:07.021124" | ||
| } | ||
| } | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main' | ||
| include { SRA_AWS_DOWNLOAD } from '../../../modules/local/sra_aws_download/main' | ||
| include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/sratools/fasterqdump/main' | ||
| 
     | 
||
| // | ||
| // Download FASTQ sequencing reads from AWS S3 SRA mirror | ||
| // | ||
| workflow FASTQ_DOWNLOAD_AWS_SRATOOLS { | ||
| take: | ||
| ch_sra_ids // channel: [ val(meta), val(id) ] | ||
| ch_dbgap_key // channel: [ path(dbgap_key) ] | ||
| 
     | 
||
| main: | ||
| 
     | 
||
| ch_versions = Channel.empty() | ||
| 
     | 
||
| // | ||
| // Detect existing NCBI user settings or create new ones. | ||
| // | ||
| CUSTOM_SRATOOLSNCBISETTINGS ( ch_sra_ids.collect() ) | ||
| ch_ncbi_settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings | ||
| ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions) | ||
| 
     | 
||
| // | ||
| // Download SRA files from AWS S3 | ||
| // | ||
| SRA_AWS_DOWNLOAD ( ch_sra_ids ) | ||
| ch_versions = ch_versions.mix(SRA_AWS_DOWNLOAD.out.versions.first()) | ||
| 
     | 
||
| // | ||
| // Convert the SRA format into one or more compressed FASTQ files. | ||
| // | ||
| SRATOOLS_FASTERQDUMP ( SRA_AWS_DOWNLOAD.out.sra, ch_ncbi_settings, ch_dbgap_key ) | ||
| ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) | ||
| 
     | 
||
| emit: | ||
| reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] | ||
| versions = ch_versions // channel: [ versions.yml ] | ||
| } | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| nextflow_workflow { | ||
| 
     | 
||
| name "Test workflow: fastq_download_aws_sratools/main.nf" | ||
| script "../main.nf" | ||
| workflow "FASTQ_DOWNLOAD_AWS_SRATOOLS" | ||
| 
     | 
||
| tag "CUSTOM_SRATOOLSNCBISETTINGS" | ||
| tag "SRA_AWS_DOWNLOAD" | ||
| tag "SRATOOLS_FASTERQDUMP" | ||
| 
     | 
||
| test("Parameters: default") { | ||
| 
     | 
||
| when { | ||
| workflow { | ||
| """ | ||
| input[0] = Channel.of( | ||
| [[ id:'test_single_end', single_end:true ], 'DRR000774'], | ||
| [[ id:'test_paired_end', single_end:false ], 'SRR11140744'] | ||
| ) | ||
| input[1] = [] | ||
| """ | ||
| } | ||
| } | ||
| 
     | 
||
| then { | ||
| def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip | ||
| def pelines2 = path(workflow.out.reads[0][1][1]).linesGzip | ||
| def selines = path(workflow.out.reads[1][1]).linesGzip | ||
| assertAll( | ||
| { assert workflow.success }, | ||
| { assert snapshot(pelines1[0..5]).match("test_pe_reads_1_lines") }, | ||
| { assert snapshot(pelines1.size()).match("test_pe_reads_1_size") }, | ||
| { assert snapshot(pelines2[0..5]).match("test_pe_reads_2_lines") }, | ||
| { assert snapshot(pelines2.size()).match("test_pe_reads_2_size") }, | ||
| { assert snapshot(selines[0..5]).match("test_se_reads_lines") }, | ||
| { assert snapshot(selines.size()).match("test_se_reads_size") }, | ||
| { assert snapshot(workflow.out.versions).match("versions") } | ||
| ) | ||
| } | ||
| } | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this is just doing `aws cp`, why not use the file operator in native Nextflow, e.g. `file/fromPath("s3://sra-pub-run-odp/sra/${run_accession}/${run_accession}")`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thinking about this, we can probably write a function to return `meta, file("s3://etc")` and make that into a pseudo-process, then call it exactly the same way. This wouldn't actually copy the file, just pass a pointer to it, so we would only ever move the file once, which will be infinitely more efficient than using a process (literally!). It would still copy the file to the publishDir via normal Nextflow mechanisms.