pytorch-labs · larryliu0820 · Mar 14, 2025 · Mar 13, 2025 · Mar 13, 2025 · Mar 13, 2025
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,80 @@
+[flake8]
+select = B,C,E,F,P,W,B9,TOR0,TOR1,TOR2
+max-line-length = 80
+ignore =
+    # Checks that conflict with, or overlap with, what Black already enforces.
+    B950,
+    E111,
+    E115,
+    E117,
+    E121,
+    E122,
+    E123,
+    E124,
+    E125,
+    E126,
+    E127,
+    E128,
+    E129,
+    E131,
+    E201,
+    E202,
+    E203,
+    E221,
+    E222,
+    E225,
+    E226,
+    E227,
+    E231,
+    E241,
+    E251,
+    E252,
+    E261,
+    E262,
+    E265,
+    E271,
+    E272,
+    E301,
+    E302,
+    E303,
+    E305,
+    E306,
+    E501,
+    E502,
+    E701,
+    E702,
+    E703,
+    E704,
+    W291,
+    W292,
+    W293,
+    W391,
+    W504,
+
+    # Checks that are too opinionated to enforce here.
+    E265,
+    E266,
+    E402,
+    E722,
+    B001,
+    P207,
+    B003,
+    P208,
+    C403,
+    W503,
+
+    # Opinionated flake8-bugbear warnings: https://github.com/PyCQA/flake8-bugbear#opinionated-warnings
+    B904,
+    B905,
+    B906,
+    B907,
+exclude =
+    ./.git,
+    ./backends/xnnpack/third-party,
+    ./build,
+    ./configurations,
+    ./docs,
+    ./third_party,
+    *.pyi
+
+max-complexity = 12
diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -29,8 +29,8 @@ init_command = [
 [[linter]]
 code = 'UFMT'
 include_patterns = [
-    '**/*.py',
-    '**/*.pyi',
+    '*.py',
+    '*.pyi',
 ]
 exclude_patterns = [
     'third-party/**',
@@ -135,3 +135,33 @@ command = [
     '@{{PATHSFILE}}',
 ]
 is_formatter = true
+
+[[linter]]
+code = 'MYPY'
+include_patterns = [
+    '*.py',
+]
+exclude_patterns = [
+    'third-party/**',
+]
+command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'mypy_linter',
+    '--config=.mypy.ini',
+    '--show-disable',
+    '--',
+    '--explicit-package-bases',
+    '@{{PATHSFILE}}'
+]
+init_command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'pip_init',
+    '--dry-run={{DRYRUN}}',
+    '--requirement=requirements-lintrunner.txt',
+]
diff --git a/.mypy.ini b/.mypy.ini
@@ -0,0 +1,34 @@
+[mypy]
+allow_redefinition = True
+warn_unused_configs = True
+warn_redundant_casts = True
+show_error_codes = True
+show_column_numbers = True
+disallow_untyped_decorators = True
+follow_imports = normal
+local_partial_types = True
+enable_error_code = possibly-undefined
+warn_unused_ignores = False
+
+mypy_path = pytorch_tokenizers
+
+[mypy-buck_util]
+ignore_missing_imports = True
+
+[mypy-docutils.*]
+ignore_missing_imports = True
+
+[mypy-pandas]
+ignore_missing_imports = True
+
+[mypy-ruamel]
+ignore_missing_imports = True
+
+[mypy-tomllib]
+ignore_missing_imports = True
+
+[mypy-yaml]
+ignore_missing_imports = True
+
+[mypy-zstd]
+ignore_missing_imports = True
diff --git a/examples/tokenize_tool/main.cpp b/examples/tokenize_tool/main.cpp
@@ -25,7 +25,7 @@
 
 using namespace tokenizers;
 
-std::string help(char* argv[]) {
+std::string help(char *argv[]) {
   std::stringstream ss;
   ss << "Usage: " << argv[0] << " <type> <model> <input to tokenize...>"
      << std::endl
@@ -37,7 +37,7 @@ std::string help(char* argv[]) {
   return ss.str();
 }
 
-int main(int argc, char* argv[]) {
+int main(int argc, char *argv[]) {
   // Check for the right number of CLI args
   if (argc < 4) {
     std::cerr << help(argv) << std::endl;
@@ -95,7 +95,7 @@ int main(int argc, char* argv[]) {
   // Decode
   std::cout << "Decoding..." << std::endl;
   uint64_t prev = tok_ptr->bos_tok();
-  for (const auto& current : encoded) {
+  for (const auto &current : encoded) {
     const auto decoded_result = tok_ptr->decode(prev, current);
     std::cout << decoded_result.get();
     prev = current;

diff --git a/include/pytorch/tokenizers/base64.h b/include/pytorch/tokenizers/base64.h
@@ -36,7 +36,7 @@ namespace base64 {
 using tokenizers::Error;
 using tokenizers::Result;
 
-Result<std::string> decode(const std::string_view& input);
+Result<std::string> decode(const std::string_view &input);
 
 namespace detail {
 
@@ -68,12 +68,9 @@ inline Error validate(uint32_t v) {
   return Error::Ok;
 }
 
-inline Error decode(const std::string_view& input, std::string& output) {
-  TK_CHECK_OR_RETURN_ERROR(
-      input.size() == 4,
-      Base64DecodeFailure,
-      "input length must be 4, got %zu",
-      input.size());
+inline Error decode(const std::string_view &input, std::string &output) {
+  TK_CHECK_OR_RETURN_ERROR(input.size() == 4, Base64DecodeFailure,
+                           "input length must be 4, got %zu", input.size());
 
   uint32_t val = 0;
 
@@ -103,14 +100,10 @@ inline Error decode(const std::string_view& input, std::string& output) {
   return Error::Ok;
 }
 
-inline Error decode_1_padding(
-    const std::string_view& input,
-    std::string& output) {
-  TK_CHECK_OR_RETURN_ERROR(
-      input.size() == 3,
-      Base64DecodeFailure,
-      "input length must be 3, got %zu",
-      input.size());
+inline Error decode_1_padding(const std::string_view &input,
+                              std::string &output) {
+  TK_CHECK_OR_RETURN_ERROR(input.size() == 3, Base64DecodeFailure,
+                           "input length must be 3, got %zu", input.size());
 
   uint32_t val = 0;
 
@@ -134,14 +127,10 @@ inline Error decode_1_padding(
   return Error::Ok;
 }
 
-inline Error decode_2_padding(
-    const std::string_view& input,
-    std::string& output) {
-  TK_CHECK_OR_RETURN_ERROR(
-      input.size() == 2,
-      Base64DecodeFailure,
-      "input length must be 2, got %zu",
-      input.size());
+inline Error decode_2_padding(const std::string_view &input,
+                              std::string &output) {
+  TK_CHECK_OR_RETURN_ERROR(input.size() == 2, Base64DecodeFailure,
+                           "input length must be 2, got %zu", input.size());
 
   uint32_t val = 0;
 
@@ -161,13 +150,12 @@ inline Error decode_2_padding(
 
 } // namespace detail
 
-inline tokenizers::Result<std::string> decode(const std::string_view& input) {
+inline tokenizers::Result<std::string> decode(const std::string_view &input) {
   TK_CHECK_OR_RETURN_ERROR(!input.empty(), Base64DecodeFailure, "empty input");
 
   // Faster than `input.size() % 4`.
   TK_CHECK_OR_RETURN_ERROR(
-      (input.size() & 3) == 0 && input.size() >= 4,
-      Base64DecodeFailure,
+      (input.size() & 3) == 0 && input.size() >= 4, Base64DecodeFailure,
       "input length must be larger than 4 and is multiple of 4, got %zu",
       input.size());
 

diff --git a/include/pytorch/tokenizers/bpe_tokenizer_base.h b/include/pytorch/tokenizers/bpe_tokenizer_base.h
@@ -32,29 +32,27 @@ using Decoder = std::unordered_map<uint64_t, std::string>;
 using Re2UPtr = std::unique_ptr<re2::RE2>;
 
 class BPETokenizerBase : public Tokenizer {
- public:
-  Result<std::vector<uint64_t>>
-  encode(const std::string& input, int8_t bos, int8_t eos) const override;
+public:
+  Result<std::vector<uint64_t>> encode(const std::string &input, int8_t bos,
+                                       int8_t eos) const override;
 
-  Result<std::string> decode(uint64_t prev_token, uint64_t token)
-      const override;
+  Result<std::string> decode(uint64_t prev_token,
+                             uint64_t token) const override;
 
- protected:
+protected:
   explicit BPETokenizerBase() {}
-  virtual ~BPETokenizerBase() {}
+  virtual ~BPETokenizerBase() override {}
 
   std::pair<std::optional<std::string>, re2::StringPiece>
-  split_with_allowed_special_token_(
-      re2::StringPiece& input,
-      const Encoder& allowed_special) const;
+  split_with_allowed_special_token_(re2::StringPiece &input,
+                                    const Encoder &allowed_special) const;
 
-  Result<std::pair<std::vector<uint64_t>, uint64_t>> encode_with_special_token_(
-      const std::string& text,
-      const Encoder& allowed_special) const;
+  Result<std::pair<std::vector<uint64_t>, uint64_t>>
+  encode_with_special_token_(const std::string &text,
+                             const Encoder &allowed_special) const;
 
-  Result<std::vector<uint64_t>> byte_pair_encode_(
-      const std::string& piece,
-      const Encoder& encoder) const;
+  Result<std::vector<uint64_t>> byte_pair_encode_(const std::string &piece,
+                                                  const Encoder &encoder) const;
 
   // Protected members that can be overloaded by other BPE tokenizers
   Re2UPtr special_token_regex_;
@@ -63,13 +61,11 @@ class BPETokenizerBase : public Tokenizer {
   Decoder decoder_;
   Decoder special_token_decoder_;
 
- private:
-  virtual Error _encode(
-      re2::StringPiece& input,
-      std::vector<uint64_t>& ret,
-      uint64_t& last_piece_token_len) const = 0;
+private:
+  virtual Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
+                        uint64_t &last_piece_token_len) const = 0;
 
-  virtual void _decode(re2::StringPiece input, std::string& ret) const = 0;
+  virtual void _decode(re2::StringPiece input, std::string &ret) const = 0;
 };
 
 } // namespace detail