|  | 
#' Convenience function to download the assembly_summary.txt file from genbank
#'
#' Thin wrapper around download.file() that fetches the GenBank bacteria
#' assembly_summary.txt. The 'timeout' option is raised to at least 6000
#' seconds for the duration of the call and restored when the function exits.
#'
#' @param destfile passed to download.file()'s destfile, path to store the downloaded file
#'
#' @return (invisibly) the integer status code returned by download.file();
#'   0 indicates success.
#' @export
#'
#' @examples #not run download_gbk_assembly_summary(destfile='assembly_summary.txt')
download_gbk_assembly_summary <- function(destfile){
  # Only ever raise the timeout: a user who already configured a longer
  # timeout should not have it shortened by this helper.
  current_timeout <- base::suppressWarnings(
    base::as.numeric(base::getOption('timeout', default = 60))
  )
  original_options <- base::options(
    timeout = base::max(6000, current_timeout, na.rm = TRUE)
  )
  # add = TRUE so this handler never replaces another registered exit handler.
  base::on.exit(base::options(original_options), add = TRUE)

  status <- utils::download.file(
    'https://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt',
    destfile = destfile
  )

  base::invisible(status)
}
|  | 17 | + | 
#' Generate ftp site download urls for all SNP trees containing the provided isolates
#'
#' One url is produced for each unique, non-missing SNP cluster accession
#' (PDS_acc) found in the data. A message reports how many rows have a missing
#' PDS_acc and are therefore not represented in any SNP tree.
#'
#' @param organism a string ie 'Salmonella' or 'Campylobacter' etc
#' @param data a metadata table, must contain the column 'PDS_acc' from merging in the SNP cluster data
#' @param PDG The PDG version the metadata is from.
#'
#' @return returns a character vector of ftp download urls for each tar.gz file
#'   containing the SNP tree info; a zero-length vector when no row has a
#'   PDS_acc.
#' @export
#'
#' @examples make_SNPtree_urls(organism = 'Klebsiella',
#'  data = klebsiella_example_dat, PDG = 'PDG000000012.1053')
make_SNPtree_urls <- function(organism, data, PDG){
  if (!('PDS_acc' %in% base::names(data))) {
    base::stop('"data" must contain the column "PDS_acc"', call. = FALSE)
  }

  pds_acc <- data[['PDS_acc']]
  is_unclustered <- base::is.na(pds_acc)
  num_no_clust <- base::sum(is_unclustered)

  # One SNP tree per PDS (SNP cluster) represented in the data.
  PDSs <- base::unique(pds_acc[!is_unclustered])

  base::message(base::paste(num_no_clust, 'Isolates in the collection are not represented in SNP trees'))

  # paste0() treats a zero-length argument as "", which would otherwise
  # fabricate a single '.../SNP_trees/.tar.gz' url when nothing is clustered.
  if (base::length(PDSs) == 0) {
    return(base::character(0))
  }

  base::paste0('https://ftp.ncbi.nlm.nih.gov/pathogen/Results/', organism, '/',
               PDG, '/SNP_trees/', PDSs, '.tar.gz')
}
|  | 41 | + | 
|  | 42 | + | 
#' Generate ftp site paths for a selection of assembly accessions
#'
#' Reads the genbank assembly summary, keeps the accession and ftp_path
#' columns for rows whose ftp_path is an NCBI https url, and left-joins that
#' map onto the supplied data by 'asm_acc'.
#'
#' @param data a dataframe containing an asm_acc column
#'
#' @param assembly_summary_path path to genbank assembly_summary.txt, see download_gbk_assembly_summary()
#'
#' @return the input dataframe with an added 'ftp_path' column; rows whose
#'   asm_acc has no usable entry in the assembly summary get NA.
#' @export
#'
#' @examples #make_ftp_paths(klebsiella_example_data, './test/assembly_summary.txt')
make_ftp_paths <- function(data, assembly_summary_path){

  if (!('asm_acc' %in% base::names(data))) {
    base::stop('"data" must contain the column "asm_acc"', call. = FALSE)
  }

  # skip = 1 drops the first line so that the '# assembly_accession ...' line
  # is read as the column header.
  assembly_summary <- readr::read_tsv(assembly_summary_path, skip = 1)

  # Accept the accession header with or without a space after the '#'.
  acc_col <- base::grep('^#\\s*assembly_accession$',
                        base::names(assembly_summary), value = TRUE)
  if (base::length(acc_col) != 1 ||
      !('ftp_path' %in% base::names(assembly_summary))) {
    base::stop('could not find the assembly accession and ftp_path columns in "',
               assembly_summary_path, '"', call. = FALSE)
  }

  ftp_asm_map <- assembly_summary[, base::c(acc_col, 'ftp_path')]
  base::names(ftp_asm_map) <- base::c('asm_acc', 'ftp_path')

  # Drop entries without a real url (grepl() is FALSE for NA as well).
  has_url <- base::grepl('https://ftp.ncbi.nlm.nih.gov',
                         ftp_asm_map[['ftp_path']], fixed = TRUE)
  ftp_asm_map <- ftp_asm_map[has_url, ]

  # Explicit key: avoids the "Joining with `by = ...`" message and an
  # accidental join on any other column name the two tables happen to share.
  dplyr::left_join(data, ftp_asm_map, by = 'asm_acc')

}
|  | 71 | + | 
|  | 72 | + | 
#' Make specific ftp download paths for a dataframe with ftp_paths and assembly accessions
#'
#' Each url is built as "<ftp_path>/<last component of ftp_path><suffix>",
#' where the suffix for the requested type comes from
#' supported_download_types().
#'
#' @param type type of download path to generate, one of: 'fna', 'gbff', 'gff', 'gtf', 'faa', 'cds'
#' @param data a dataframe with the columns 'ftp_path' and 'asm_acc'
#' ftp_path should be a column produced by the function make_ftp_paths()
#'
#' @return returns the original dataframe with an added column, named "{type}_download";
#'   rows with a missing ftp_path get NA rather than a malformed url.
#' @export
#'
#' @examples make_download_urls(klebsiella_example_dat, type='fna')
make_download_urls <- function(data, type){
  suffixes <- supported_download_types(type)
  # `[[` errors on an unknown type instead of silently yielding an NA suffix.
  suffix <- suffixes[[type]]

  if (!('ftp_path' %in% base::names(data))) {
    base::stop('"data" must contain the column "ftp_path", see make_ftp_paths()',
               call. = FALSE)
  }

  # Trailing slashes would otherwise produce a '//' in the final url.
  ftp_path <- base::sub('/+$', '', base::as.character(data[['ftp_path']]))

  # Start from NA so assemblies without an ftp_path (e.g. unmatched rows of a
  # left join) do not become the literal string "NA/NA<suffix>".
  urls <- base::rep(NA_character_, base::length(ftp_path))
  has_path <- !base::is.na(ftp_path)
  urls[has_path] <- base::paste0(ftp_path[has_path], '/',
                                 base::basename(ftp_path[has_path]), suffix)

  data[[base::paste0(type, '_download')]] <- urls
  return(data)

}
|  | 97 | + | 
#' Make download destination paths
#'
#' Destination paths have the form "<dest_dir>/<asm_acc>.<type>.gz".
#'
#' @param data A dataframe containing an asm_acc column
#' @param type the type of files you want to download, one of: 'fna', 'gbff', 'gff', 'gtf', 'faa', 'cds'
#' @param dest_dir path to the directory you want to use, must exist; a
#'   trailing '/' is added when it is missing
#'
#' @return returns a dataframe with an added "{type}_dest" column containing the paths to pass to download.file;
#'   rows with a missing asm_acc get NA.
#' @export
#'
#' @examples # download_data %>% make_dest_paths(type='fna', dest_dir='./data/')
make_dest_paths <- function(data, type, dest_dir){

  supported_download_types(type)

  if (!base::is.character(dest_dir) || base::length(dest_dir) != 1 ||
      base::is.na(dest_dir)) {
    base::stop('"dest_dir" must be a single directory path', call. = FALSE)
  }

  # Actually enforce the documented "must exist" requirement. The path is also
  # tried without its trailing separator(s).
  # NOTE(review): some platforms may not resolve a directory path that ends in
  # a separator -- confirm whether the second check is needed.
  trimmed_dir <- base::sub('(?<=.)[/\\\\]+$', '', dest_dir, perl = TRUE)
  if (!(base::dir.exists(dest_dir) || base::dir.exists(trimmed_dir))) {
    base::stop('"dest_dir" does not exist: ', dest_dir, call. = FALSE)
  }

  if (!('asm_acc' %in% base::names(data))) {
    base::stop('"data" must contain the column "asm_acc"', call. = FALSE)
  }

  # Without a separator the directory name would be glued onto the file name.
  prefix <- dest_dir
  if (!base::grepl('[/\\\\]$', prefix)) {
    prefix <- base::paste0(prefix, '/')
  }

  asm_acc <- base::as.character(data[['asm_acc']])
  dests <- base::rep(NA_character_, base::length(asm_acc))
  has_acc <- !base::is.na(asm_acc)
  dests[has_acc] <- base::paste0(prefix, asm_acc[has_acc], '.', type, '.gz')

  data[[base::paste0(type, '_dest')]] <- dests
  data
}
|  | 117 | + | 
#' Download specified files from NCBI ftp site
#'
#' Calls download.file() once per row, using the "{type}_download" column as
#' the url and the "{type}_dest" column as the destination. Each call is
#' wrapped in purrr::safely() so one failed download does not abort the rest.
#'
#' @param data a dataframe with columns created by make_download_urls() and make_dest_paths()
#' @param type the type of files you want to download, one of: 'fna', 'gbff', 'gff', 'gtf', 'faa', 'cds'
#'
#' @return the results of attempting to download the specified files,
#'  A dataframe with two added columns:
#'    1. return of download.file, should be 0
#'    2. error column from purrr::safely(), should contain any error messages.
#' @export
#'
#' @importFrom rlang :=
#' @examples # download_data %>% download_genomes('fna')
download_genomes <-
  function(data, type){
    supported_download_types(type)
    url_var <- base::paste0(type, '_download')
    dest_var <- base::paste0(type, '_dest')

    # Fail early with a readable message instead of an error from deep inside
    # the pipeline below.
    missing_cols <- base::setdiff(base::c('asm_acc', url_var, dest_var),
                                  base::names(data))
    if (base::length(missing_cols) > 0) {
      base::stop('"data" is missing required column(s): ',
                 base::paste(missing_cols, collapse = ', '),
                 '; see make_download_urls() and make_dest_paths()',
                 call. = FALSE)
    }

    # Passed to unnest_wider() so the error component is converted with
    # as.character().
    err_var <- stats::setNames(base::list(base::as.character), glue::glue("{type}_dl_error"))

    safe_download <- purrr::safely(utils::download.file)

    data %>%
      # Column selected by name: the `.data` pronoun is deprecated inside
      # tidyselect contexts such as select().
      dplyr::select('asm_acc', dplyr::starts_with(type)) %>%
      dplyr::mutate(
        "{type}_dl" := purrr::map2(.x = !!rlang::sym(url_var),
                                   .y = !!rlang::sym(dest_var),
                                   .f = ~safe_download(.x, .y))
      ) %>%
      tidyr::unnest_wider(glue::glue('{type}_dl'),
                          names_sep = '_',
                          simplify = TRUE,
                          transform = err_var)

  }
|  | 149 | + | 
|  | 150 | + | 
|  | 151 | + | 
|  | 152 | + | 
|  | 153 | + | 
|  | 154 | + | 
|  | 155 | + | 
|  | 156 | + | 
#' Helper function to check if user supplied type is in the supported types
#'
#' Signals an error when `type` is not a single string naming one of the
#' supported download types.
#'
#' @param type a user input string to check
#'
#' @return returns a named character vector of acceptable file types and their
#'   appropriate suffixes
#'
#' @examples # supported_download_types('fna')
supported_download_types <-
  function(type){
    suffixes <- base::c(fna = '_genomic.fna.gz',
                        gbff = '_genomic.gbff.gz',
                        gff = '_genomic.gff.gz',
                        gtf = '_genomic.gtf.gz',
                        faa = '_protein.faa.gz',
                        cds = '_cds_from_genomic.fna.gz')

    # The scalar checks come first so `%in%` is only evaluated on one string.
    is_supported <- base::is.character(type) &&
      base::length(type) == 1 &&
      !base::is.na(type) &&
      type %in% base::names(suffixes)

    if (!is_supported){
      # stop() actually signals the error; errorCondition() would only build a
      # condition object and let execution continue.
      base::stop(base::paste0('"type" must be one of ', '"',
                              base::paste(base::names(suffixes), collapse = ' '),
                              '"'),
                 call. = FALSE)
    }
    return(suffixes)
  }
|  | 178 | + | 
|  | 179 | + | 
0 commit comments