Skip to content

Commit c90e799

Browse files
authored
Merge pull request #11 from Jtrachsel/dev
Merge Dev, update to v0.5.1
2 parents b2dba54 + baddd21 commit c90e799

23 files changed

+311
-157
lines changed

DESCRIPTION

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: pdtools
22
Title: Tools to interact with NCBI's Pathogen Detection project
3-
Version: 0.4.3
3+
Version: 0.5.1
44
Authors@R:
55
person("Julian", "Trachsel", , "julestrachsel@gmail.com", role = c("aut", "cre"),
66
comment = c(ORCID = "https://orcid.org/0000-0003-2357-7737"))
@@ -27,7 +27,8 @@ Imports:
2727
readr,
2828
furrr,
2929
future,
30-
magrittr
30+
magrittr,
31+
glue
3132
Suggests:
3233
testthat (>= 3.0.0)
3334
Config/testthat/edition: 3

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ export(build_ppanggolin_file_fastas)
55
export(check_complete_PDG)
66
export(download_PDD_metadata)
77
export(download_gbk_assembly_summary)
8+
export(download_genomes)
89
export(download_most_recent_complete)
910
export(extract_collection_agency)
1011
export(extract_consensus_ag_species)
@@ -18,6 +19,7 @@ export(get_pangenome_representatives2)
1819
export(list_PDGs)
1920
export(list_organisms)
2021
export(make_SNPtree_urls)
22+
export(make_dest_paths)
2123
export(make_download_urls)
2224
export(make_ftp_paths)
2325
export(pan_mat_to_gene_vec_tibble)

R/genome_download_tools.R

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#' Download the GenBank bacterial assembly_summary.txt file
#'
#' Thin wrapper around utils::download.file() that raises the 'timeout'
#' option to 6000 for the duration of the call and restores the previous
#' option value when the function exits.
#'
#' @param destfile path to store the downloaded file, passed to download.file()'s destfile
#'
#' @return the value returned by utils::download.file()
#' @export
#'
#' @examples #not run download_gbk_assembly_summary(destfile='assembly_summary.txt')
download_gbk_assembly_summary <- function(destfile){
  summary_url <- 'https://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt'

  # options() hands back the values it replaced; put them back on exit
  previous_opts <- base::options(timeout = 6000)
  base::on.exit(base::options(previous_opts))

  utils::download.file(url = summary_url, destfile = destfile)
}
17+
18+
#' Build download urls for every SNP tree that contains the provided isolates
#'
#' One url is produced per unique, non-missing 'PDS_acc' value in the data.
#' The number of rows with a missing 'PDS_acc' is reported with message().
#'
#' @param organism a string ie 'Salmonella' or 'Campylobacter' etc
#' @param data a metadata table, must contain the column 'PDS_acc' from merging in the SNP cluster data
#' @param PDG The PDG version the metadata is from.
#'
#' @return a character vector of download urls, one per SNP tree tar.gz file
#' @export
#'
#' @examples make_SNPtree_urls(organism = 'Klebsiella',
#' data = klebsiella_example_dat, PDG = 'PDG000000012.1053')
make_SNPtree_urls <- function(organism, data, PDG){
  # isolates without a SNP cluster accession cannot be mapped to a tree
  n_unclustered <- base::sum(base::is.na(data$PDS_acc))

  clustered <- dplyr::filter(data, !is.na(.data$PDS_acc))
  cluster_ids <- base::unique(dplyr::pull(clustered, .data$PDS_acc))

  tree_urls <- base::paste0(
    'https://ftp.ncbi.nlm.nih.gov/pathogen/Results/', organism, '/', PDG,
    '/SNP_trees/', cluster_ids, '.tar.gz'
  )

  base::message(base::paste(n_unclustered, 'Isolates in the collection are not represented in SNP trees'))
  tree_urls
}
41+
42+
43+
#' generate ftp site paths for a selection of assembly accessions
#'
#' Reads the genbank assembly summary, keeps the assembly accession and
#' ftp_path columns for rows whose ftp_path contains
#' 'https://ftp.ncbi.nlm.nih.gov', and left-joins that map onto `data`.
#'
#' @param data a dataframe containing an asm_acc column
#' @param assembly_summary_path path to genbank assembly_summary.txt, see download_gbk_assembly_summary()
#'
#' @return `data` with the matching ftp_path joined on; rows without a match
#'   in the assembly summary get NA in ftp_path
#' @export
#'
#' @examples #make_ftp_paths(klebsiella_example_data, './test/assembly_summary.txt')
make_ftp_paths <- function(data, assembly_summary_path){

  # TODO: check for NAs or oddly formatted asm_acc values before joining

  # skip = 1 drops the first line of the file so that the next line is used
  # as the header; the accession column is expected to be named
  # '# assembly_accession' (NOTE(review): confirm against the current file).
  ftp_asm_map <-
    readr::read_tsv(assembly_summary_path, skip = 1) %>%
    dplyr::transmute(asm_acc = .data$`# assembly_accession`,
                     ftp_path = .data$ftp_path) %>%
    # the pipe into filter() was missing, leaving the filter detached from
    # the table it was meant to subset
    dplyr::filter(base::grepl('https://ftp.ncbi.nlm.nih.gov', .data$ftp_path))

  result <- data %>% dplyr::left_join(ftp_asm_map)

  return(result)
}
71+
72+
73+
#' make specific ftp download paths for a dataframe with ftp_paths and assembly accessions
#'
#' The download url is built as "{ftp_path}/{last path component}{suffix}",
#' where the suffix is looked up from supported_download_types(). Rows with a
#' missing ftp_path get NA instead of a malformed "NA/NA..." url.
#'
#' @param data a dataframe with the columns 'ftp_path' and 'asm_acc'
#' ftp_path should be a column produced by the function make_ftp_paths()
#' @param type type of download path to generate, one of: 'fna', 'gbff', 'gff', 'gtf', 'faa', 'cds'
#'
#' @return returns the original dataframe with an added column, named "{type}_download"
#' @export
#'
#' @examples make_download_urls(klebsiella_example_dat, type='fna')
#' @importFrom rlang :=
make_download_urls <- function(data, type){
  suffixes <- supported_download_types(type)

  # fail loudly on an unknown type rather than building urls ending in "NA"
  if (!base::isTRUE(type %in% base::names(suffixes))){
    base::stop('"type" must be one of "',
               base::paste(base::names(suffixes), collapse = ' '), '"',
               call. = FALSE)
  }

  # [[ drops the name; trimws() guards against stray whitespace in a suffix
  suffix <- base::trimws(suffixes[[type]])

  result <-
    data %>%
    dplyr::mutate("{type}_download" :=
                    dplyr::if_else(
                      base::is.na(.data$ftp_path),
                      NA_character_,
                      base::paste0(.data$ftp_path,
                                   '/',
                                   # last path component of the ftp directory
                                   base::sub('https://ftp.ncbi.nlm.nih.gov/genomes/all/.*/.*/.*/.*/(.*)', '\\1',
                                             .data$ftp_path),
                                   suffix)))
  return(result)
}
97+
98+
#' Make download destination paths
#'
#' Destination paths have the form "{dest_dir}{asm_acc}.{type}.gz".
#'
#' @param data A dataframe containing an asm_acc column
#' @param type the type of files you want to download, one of: 'fna', 'gbff', 'gff', 'gtf', 'faa', 'cds'
#' @param dest_dir path to the directory you want to use, must exist. A
#'   trailing '/' is added when missing. An empty string is used as-is, giving
#'   paths relative to the working directory.
#'
#' @return returns a dataframe with an added "{type}_dest" column containing the paths to pass to download.file
#' @export
#' @importFrom rlang :=
#'
#' @examples # download_data %>% make_dest_paths(type='fna', dest_dir='./data/')
make_dest_paths <- function(data, type, dest_dir){

  suffixes <- supported_download_types(type)
  if (!base::isTRUE(type %in% base::names(suffixes))){
    base::stop('"type" must be one of "',
               base::paste(base::names(suffixes), collapse = ' '), '"',
               call. = FALSE)
  }

  if (base::nzchar(dest_dir)){
    # strip trailing separators before the existence check (keeping at least
    # one character so that '/' stays '/')
    dir_to_check <- base::sub('(?<=.)[/\\\\]+$', '', dest_dir, perl = TRUE)
    if (!base::dir.exists(dir_to_check)){
      base::stop('"dest_dir" does not exist: ', dest_dir, call. = FALSE)
    }
    # without a separator the directory name would fuse with the file name
    if (!base::grepl('[/\\\\]$', dest_dir)){
      dest_dir <- base::paste0(dest_dir, '/')
    }
  }

  data %>%
    dplyr::mutate("{type}_dest" := base::paste0(dest_dir, .data$asm_acc, '.', type, '.gz'))
}
117+
118+
#' Download specified files from NCBI ftp site
#'
#' Each "{type}_download" url is passed to utils::download.file() together
#' with the matching "{type}_dest" path. The call is wrapped in
#' purrr::safely() so a failed download is recorded instead of aborting the
#' remaining downloads.
#'
#' @param data a dataframe with columns created by make_download_urls() and make_dest_paths()
#' @param type the type of files you want to download, one of: 'fna', 'gbff', 'gff', 'gtf', 'faa', 'cds'
#'
#' @return the results of attempting to download the specified files,
#' A dataframe (asm_acc plus the columns starting with `type`) with two added columns:
#' 1. return of download.file, should be 0
#' 2. error column from purrr::safely(), should contain any error messages.
#' @export
#'
#' @importFrom rlang :=
#' @examples # download_data %>% download_genomes('fna')
download_genomes <-
  function(data, type){
    supported_download_types(type)
    url_var <- base::paste0(type, '_download')
    dest_var <- base::paste0(type, '_dest')

    # check the inputs up front so the user gets one clear message
    needed_cols <- base::c('asm_acc', url_var, dest_var)
    missing_cols <- base::setdiff(needed_cols, base::names(data))
    if (base::length(missing_cols) > 0){
      base::stop('"data" is missing required column(s): ',
                 base::paste(missing_cols, collapse = ', '),
                 call. = FALSE)
    }

    # transform handed to unnest_wider() so the error column is character
    err_var <- stats::setNames(base::list(base::as.character), glue::glue("{type}_dl_error"))

    safe_download <- purrr::safely(utils::download.file)

    data %>%
      # all_of() replaces the deprecated use of .data inside a selection
      dplyr::select(dplyr::all_of('asm_acc'), dplyr::starts_with(type)) %>%
      dplyr::mutate("{type}_dl" := purrr::map2(.x = .data[[url_var]],
                                               .y = .data[[dest_var]],
                                               .f = ~safe_download(.x, .y))) %>%
      tidyr::unnest_wider(glue::glue('{type}_dl'),
                          names_sep = '_',
                          simplify = TRUE,
                          transform = err_var)
  }
149+
150+
151+
152+
153+
154+
155+
156+
157+
#' helper function to check if user supplied type is in the supported types
#'
#' Signals an error when `type` is not a single supported file type.
#'
#' @param type a user input string to check
#'
#' @return returns a named character vector mapping each supported file type
#'   to its file name suffix
#'
#' @examples # supported_download_types('fna')
supported_download_types <-
  function(type){
    suffixes <- base::c(fna = '_genomic.fna.gz',
                        gbff = '_genomic.gbff.gz',
                        gff = '_genomic.gff.gz',
                        gtf = '_genomic.gtf.gz',
                        faa = '_protein.faa.gz',
                        cds = '_cds_from_genomic.fna.gz')

    # errorCondition() only builds a condition object; stop() signals it.
    # isTRUE() also rejects NULL and length > 1 input with the same message.
    if (!base::isTRUE(type %in% base::names(suffixes))){
      base::stop('"type" must be one of ', '"',
                 base::paste(base::names(suffixes), collapse = ' '), '"',
                 call. = FALSE)
    }
    return(suffixes)
  }
178+
179+
File renamed without changes.

0 commit comments

Comments
 (0)