Skip to content

Commit 78a7fcc

Browse files
authored
Merge pull request #3 from Jtrachsel/dev
Dev
2 parents 5c33d10 + 216d573 commit 78a7fcc

File tree

4 files changed

+49
-14
lines changed

4 files changed

+49
-14
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: pdtools
22
Title: Tools to interact with NCBI's Pathogen Detection project
3-
Version: 0.1.0
3+
Version: 0.1.2
44
Authors@R:
55
person("Julian", "Trachsel", , "julestrachsel@gmail.com", role = c("aut", "cre"),
66
comment = c(ORCID = "https://orcid.org/0000-0003-2357-7737"))

R/metadata_clean.R

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,3 +200,18 @@ extract_country <- function(meta, parallel=FALSE){
200200
return(finished)
201201

202202
}
203+
204+
205+
#### WORK ON THIS
206+
# extract_state <- function(data){
207+
#
208+
# pattern_vec <- state_vector
209+
#
210+
# first_pass <-
211+
# data |>
212+
# dplyr::transmute(target_acc=.data$target_acc,
213+
# search_vals=base::tolower(.data$geo_loc_name)) |>
214+
# dplyr::mutate(country=furrr::future_map_chr(.x = .data$search_vals, ~matches_from_vector_of_patterns(pattern_vec, search_string = .x)))
215+
#
216+
#
217+
# }

R/pangenome_tools.R

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -149,14 +149,15 @@ pan_mat_to_gene_vec_tibble <- function(pan_mat){
149149
#' @param pan_mat a presence/absence matrix of 1/0, rows are genes, columns are genomes
150150
#' @param desired_coverage proportion of the pangenome's gene content you want the reduced set to contain (.95)
151151
#' @param SEED random seed to use when selecting the first genome of the collection.
152+
#' @param verbose logical (TRUE/FALSE); if TRUE, progress updates are reported via print statements
152153
#'
153154
#' @return a list of length 3: (1) names of the selected genomes, (2) scores for each iteration, (3) proportion coverage for each iteration
154155
#' @export
155156
#'
156157
#' @examples #get_pangenome_representatives(pan_mat)
157158
#' @importFrom rlang .data
158159
get_pangenome_representatives <-
159-
function(pan_mat, desired_coverage=.95, SEED=3){
160+
function(pan_mat, desired_coverage=.95, SEED=3, verbose=FALSE){
160161
# hopefully get smallest set of genomes that gives desired coverage of pangenome
161162
# browser()
162163
genomes <- pan_mat_to_gene_vec_tibble(pan_mat)
@@ -173,35 +174,47 @@ get_pangenome_representatives <-
173174

174175
# best score = total number of genes in pangenome
175176
best_score <- base::nrow(pan_mat)
176-
tot_genomes <- base::col(pan_mat)
177+
tot_genomes <- base::ncol(pan_mat)
177178
desired_score <- best_score * desired_coverage
178179

179-
print(base::paste(tot_genomes, 'total genomes'))
180-
print(base::paste(best_score, '= best possible score'))
181-
print(base::paste(desired_score, ' = desired score'))
182-
183180
score <- base::length(cumulative_pan)
184181
scores <- base::c(score)
185-
print(base::paste0('starting score = ', score))
182+
183+
if (verbose) {
184+
print(base::paste(tot_genomes, 'total genomes'))
185+
print(base::paste(best_score, '= best possible score'))
186+
print(base::paste(desired_score, ' = desired score'))
187+
print(base::paste0('starting score = ', score))
188+
}
189+
186190
while (score < desired_score){
187191

188192
# calculates the number of new genes each genome would contribute to the cumulative pangenome
193+
194+
genomes <-
195+
genomes |>
196+
dplyr::mutate(num_new=purrr::map_int(.x = .data$gene_vec, .f= ~(base::sum(!(base::is.element(.x, cumulative_pan)))))) |>
197+
dplyr::filter(.data$num_new > 0) # removes genomes that do not contribute new information
198+
189199
# filters the genomes to only those that contain the max number of new genes for that iteration
190-
# selects the first one and adds it to the cumulative pangenome.
200+
# selects a random genome from those that contribute the max number of new genes
191201
best_addition_genome <-
192202
genomes |>
193-
dplyr::mutate(num_new=purrr::map_int(.x = .data$gene_vec, .f= ~(base::sum(!(base::is.element(.x, cumulative_pan)))))) |>
194203
dplyr::filter(.data$num_new == max(.data$num_new)) |>
195-
# dplyr::arrange(dplyr::desc(.data$num_new)) |>
196204
dplyr::slice_sample(n = 1)
197205

198206
cumulative_pan <- base::c(cumulative_pan, best_addition_genome$gene_vec[[1]]) |> base::unique()
199207
cumulative_genomes <- base::c(cumulative_genomes, best_addition_genome$genome_name[[1]])
200208
score <- base::length(cumulative_pan)
201209
scores <- base::c(scores, score)
202-
base::print(base::paste0('new score = ', score))
203210
proportion_coverages <- scores/best_score
204-
print(base::paste0('proportion covered = ', score/best_score))
211+
212+
if (verbose){
213+
214+
base::print(base::paste0('new score = ', score))
215+
base::print(base::paste0('proportion covered = ', score/best_score))
216+
217+
}
205218
}
206219
return(base::list(cumulative_genomes, scores, proportion_coverages))
207220
}

man/get_pangenome_representatives.Rd

Lines changed: 8 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)