Merge pull request #6 from bnicenboim/dev

bnicenboim · web-flow · commit dc508b2c8a94 · 2023-02-13T08:08:12.000+01:00
max length
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,7 +10,7 @@ Authors@R: c(
     person("Chris", "Emmerly", role = "ctb"),
     person("Giovanni", "Cassani", role = "ctb"))
 Description: Access to word predictability using large language (transformer) models.
-URL: <https://bruno.nicenboim.me/pangoling>, <https://github.com/bnicenboim/pangoling>
+URL: https://bruno.nicenboim.me/pangoling, https://github.com/bnicenboim/pangoling
 BugReports: https://github.com/bnicenboim/pangoling/issues
 License: MIT + file LICENSE
 Encoding: UTF-8
diff --git a/R/tr_utils.R b/R/tr_utils.R
@@ -220,28 +220,27 @@ create_tensor_lst <- function(texts,
     !is.null(tkzr$special_tokens_map$eos_token)) {
     tkzr$pad_token <- tkzr$eos_token
   }
-   max_length <- tkzr$model_max_length
   # If I runt the following line, some models such as
   # 'flax-community/gpt-2-spanish' give a weird error of
   # 'GPT2TokenizerFast' object has no attribute 'is_fast'
   # max_length <- tkzr$model_max_length
   # thus the ugly hack
-  ## max_length <- chr_match(utils::capture.output(tkzr),
-  ##                         pattern = "model_max_len=([0-9]*)") |>
-  ##   c() |>
-    ## (\(x) x[[2]])()
-  if (is.null(max_length) || is.na(max_length) || max_length < 1) {
-    message_verbose("Unknown maximum length of input. This might cause a problem for long inputs exceeding the maximum length.")
-    max_length <- Inf
-  }
+  # max_length <- chr_match(utils::capture.output(tkzr),
+  #                         pattern = "model_max_len=([0-9]*)") |>
+  #   c() |>
+  # (\(x) x[[2]])()
+  # if (is.null(max_length) || is.na(max_length) || max_length < 1) {
+  #   message_verbose("Unknown maximum length of input. This might cause a problem for long inputs exceeding the maximum length.")
+  #   max_length <- Inf
+  # }
   lapply(texts, function(text) {
     tensor <- encode(text,
       tkzr,
       add_special_tokens = add_special_tokens,
       stride = as.integer(stride),
-      truncation = is.finite(max_length),
-      return_overflowing_tokens = is.finite(max_length),
-      padding = is.finite(max_length)
+      truncation = TRUE, #is.finite(max_length),
+      return_overflowing_tokens = TRUE, #is.finite(max_length),
+      padding = TRUE #is.finite(max_length)
     )
     tensor
   })
diff --git a/R/zzz.R b/R/zzz.R
@@ -30,8 +30,9 @@ torch <- NULL
   lang_model <<- memoise::memoise(lang_model)
   transformer_vocab <<- memoise::memoise(transformer_vocab)
 
+  # avoid notes:
+  utils::globalVariables(c("mask_n"))
+
   invisible()
 }
 
-## avoid notes:
-utils::globalVariables(c("mask_n"))
diff --git a/README.Rmd b/README.Rmd
@@ -70,7 +70,7 @@ df_sent
 
 > Nicenboim B (2023). _pangoling: Access to
 >  language model predictions in R_. R package
->  version 0.0.0.9000,
+>  version `r packageVersion("pangoling")`,
 >  <https://github.com/bnicenboim/pangoling>.
 
 ## Code of conduct
diff --git a/README.md b/README.md
@@ -122,7 +122,7 @@ df_sent
 ## How to cite
 
 > Nicenboim B (2023). *pangoling: Access to language model predictions
-> in R*. R package version 0.0.0.9000,
+> in R*. R package version 0.0.0.9001,
 > <https://github.com/bnicenboim/pangoling>.
 
 ## Code of conduct
diff --git a/man/pangoling-package.Rd b/man/pangoling-package.Rd
diff --git a/tests/testthat.R b/tests/testthat.R
@@ -1,3 +1,11 @@
+# This file is part of the standard setup for testthat.
+# It is recommended that you do not modify it.
+#
+# Where should you do additional test configuration?
+# Learn more about the roles of various files in:
+# * https://r-pkgs.org/tests.html
+# * https://testthat.r-lib.org/reference/test_package.html#special-files
+
 library(testthat)
 library(pangoling)
 
diff --git a/tests/testthat/_snaps/tr_causal.md b/tests/testthat/_snaps/tr_causal.md
diff --git a/tests/testthat/_snaps/tr_masked.md b/tests/testthat/_snaps/tr_masked.md
diff --git a/tests/testthat/test-tr_causal.R b/tests/testthat/test-tr_causal.R
@@ -25,7 +25,6 @@ test_that("gpt2 get prob work", {
   cont <-
     causal_next_tokens_tbl("The apple doesn't fall far from the")
   expect_equal(sum(exp(cont$lp)),1,tolerance = .0001)
-  expect_snapshot(cont)
   expect_equal(cont[1]$token, "Ġtree")
   prov_words <- strsplit(prov, " ")[[1]]
   sent2_words <- strsplit(sent2, " ")[[1]]
@@ -36,7 +35,6 @@ test_that("gpt2 get prob work", {
   expect_equal(names(lp_sent2), sent2_words)
   lp_sent3 <- causal_lp(x = sent3_words)
   expect_equal(names(lp_sent3), sent3_words)
-  expect_snapshot(lp_prov)
   expect_equal(cont$lp[1], unname(lp_prov[[8]]), tolerance = .0001)
   lp_prov_mat <- causal_lp_mats(x = prov_words)
   mat <- lp_prov_mat[[1]]
@@ -56,16 +54,13 @@ test_that("gpt2 get prob work", {
   expect_equal(rownames(lp_prov_mat[[1]]), transformer_vocab())
   expect_equal(sum(exp(mat[, 2])), 1, tolerance = .0001) # sums to one
 
-  lp_prov2 <-
-    causal_lp(x = strsplit(paste0(prov, "."), " ")[[1]])
-  expect_snapshot(lp_prov2)
   # regex
-  lp_prov3 <-
+  lp_prov2 <-
     causal_lp(
       x = strsplit(paste0(prov, "."), " ")[[1]],
       ignore_regex = "[[:punct:]]"
     )
-  expect_equal(unname(lp_prov), unname(lp_prov3), tolerance = 0.001)
+  expect_equal(unname(lp_prov), unname(lp_prov2), tolerance = 0.001)
 
   ##
   sent <- "This is it, is it?"
@@ -102,15 +97,14 @@ test_that("can handle extra parameters", {
   word_1_prob <- causal_next_tokens_tbl("<|endoftext|>")
   prob1 <- word_1_prob[token == "This"]$lp
   names(prob1) <- "This"
-  expect_snapshot(probs)
-  expect_equal(probs[1], prob1)
+  expect_equal(probs[1], prob1, tolerance = 0.0001)
 
   probs_F <- causal_lp(x = c("This", "is", "it"), add_special_tokens = FALSE)
   expect_true(is.na(probs_F[1]))
   word_2_prob <- causal_next_tokens_tbl("This")
   prob2 <- word_2_prob[token == "Ġis"]$lp
   names(prob2) <- "is"
-  expect_equal(probs_F[2], prob2)
+  expect_equal(probs_F[2], prob2, tolerance = .0001)
 })
 
 
@@ -129,18 +123,16 @@ if (0) {
   })
 }
 
-test_that("other models using get prob work", {
+test_that("other models using get prob don't fail", {
   skip_if_no_python_stuff()
   tokenize("El bebé de cigüeña.", model = "flax-community/gpt-2-spanish")
 
-  expect_snapshot(
-    causal_lp(x = c("El", "bebé", "de", "cigüeña."), model = "flax-community/gpt-2-spanish")
-  )
+  expect_no_error(causal_lp(x = c("El", "bebé", "de", "cigüeña."),
+                            model = "flax-community/gpt-2-spanish"))
 
-  lp_provd <-
+  expect_no_error(
     causal_lp(
       x = strsplit(paste0(prov, "."), " ")[[1]],
       model = "distilgpt2"
-    )
-  expect_snapshot(lp_provd)
+    ))
 })
diff --git a/tests/testthat/test-tr_masked.R b/tests/testthat/test-tr_masked.R
@@ -17,13 +17,12 @@ test_that("bert masked works", {
     masked_tokens_tbl("The apple doesn't fall far from the [MASK].",
       model = "google/bert_uncased_L-2_H-128_A-2"
     )
-
-  expect_snapshot(mask_1)
+  expect_equal(colnames(mask_1),c("masked_sentence", "token", "lp", "mask_n"))
+  expect_equal(sum(exp(mask_1$lp)),1, tolerance = 0.0001)
   mask_2 <-
     masked_tokens_tbl("The apple doesn't fall far from [MASK] [MASK].",
       model = "google/bert_uncased_L-2_H-128_A-2"
     )
-  expect_snapshot(mask_2)
   mask_2_ <-
     masked_tokens_tbl(
       "[CLS] The apple doesn't fall far from [MASK] [MASK]. [SEP]",