Skip to content

Commit 4a1d033

Browse files
authored
clang format the rest of code (#32)
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent 87eacf7 commit 4a1d033

File tree

6 files changed

+102
-54
lines changed

6 files changed

+102
-54
lines changed

examples/tokenize_tool/main.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
using namespace tokenizers;
2727

28-
std::string help(char *argv[]) {
28+
std::string help(char* argv[]) {
2929
std::stringstream ss;
3030
ss << "Usage: " << argv[0] << " <type> <model> <input to tokenize...>"
3131
<< std::endl
@@ -37,7 +37,7 @@ std::string help(char *argv[]) {
3737
return ss.str();
3838
}
3939

40-
int main(int argc, char *argv[]) {
40+
int main(int argc, char* argv[]) {
4141
// Check for the right number of CLI args
4242
if (argc < 4) {
4343
std::cerr << help(argv) << std::endl;
@@ -95,7 +95,7 @@ int main(int argc, char *argv[]) {
9595
// Decode
9696
std::cout << "Decoding..." << std::endl;
9797
uint64_t prev = tok_ptr->bos_tok();
98-
for (const auto &current : encoded) {
98+
for (const auto& current : encoded) {
9999
const auto decoded_result = tok_ptr->decode(prev, current);
100100
std::cout << decoded_result.get();
101101
prev = current;

test/test_base64.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#include "gtest/gtest.h"
109
#include <pytorch/tokenizers/base64.h>
10+
#include "gtest/gtest.h"
1111

1212
namespace tokenizers {
1313

test/test_llama2c_tokenizer.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ namespace tokenizers {
1212

1313
namespace {
1414
// Test case based on llama2.c tokenizer
15-
static inline std::string _get_resource_path(const std::string &name) {
15+
static inline std::string _get_resource_path(const std::string& name) {
1616
#ifdef TOKENIZERS_FB_BUCK
17-
return facebook::xplat::testing::getPathForTestResource("test/resources/" +
18-
name);
17+
return facebook::xplat::testing::getPathForTestResource(
18+
"test/resources/" + name);
1919
#else
2020
return std::getenv("RESOURCES_PATH") + std::string("/") + name;
2121
#endif
@@ -24,7 +24,7 @@ static inline std::string _get_resource_path(const std::string &name) {
2424
} // namespace
2525

2626
class Llama2cTokenizerTest : public Test {
27-
public:
27+
public:
2828
void SetUp() override {
2929
tokenizer_ = std::make_unique<Llama2cTokenizer>();
3030
modelPath_ = _get_resource_path("test_llama2c_tokenizer.bin");

test/test_pre_tokenizer.cpp

Lines changed: 62 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@ using namespace tokenizers;
1919

2020
// Helpers /////////////////////////////////////////////////////////////////////
2121

22-
static void assert_split_match(const PreTokenizer &ptok,
23-
const std::string &prompt,
24-
const std::vector<std::string> &expected) {
22+
static void assert_split_match(
23+
const PreTokenizer& ptok,
24+
const std::string& prompt,
25+
const std::vector<std::string>& expected) {
2526
re2::StringPiece prompt_view(prompt);
26-
const auto &got = ptok.pre_tokenize(prompt_view);
27+
const auto& got = ptok.pre_tokenize(prompt_view);
2728
EXPECT_EQ(expected.size(), got.size());
2829
for (auto i = 0; i < got.size(); ++i) {
2930
EXPECT_EQ(expected[i], got[i]);
@@ -34,14 +35,16 @@ static void assert_split_match(const PreTokenizer &ptok,
3435
class RegexPreTokenizerTest : public ::testing::Test {};
3536

3637
// Test the basic construction
37-
TEST_F(RegexPreTokenizerTest, Construct) { RegexPreTokenizer ptok("[0-9]+"); }
38+
TEST_F(RegexPreTokenizerTest, Construct) {
39+
RegexPreTokenizer ptok("[0-9]+");
40+
}
3841

3942
// Test basic splitting using the expression for Tiktoken
4043
TEST_F(RegexPreTokenizerTest, TiktokenExpr) {
4144
RegexPreTokenizer ptok(
4245
R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)");
43-
assert_split_match(ptok, "How are you doing?",
44-
{"How", " are", " you", " doing", "?"});
46+
assert_split_match(
47+
ptok, "How are you doing?", {"How", " are", " you", " doing", "?"});
4548
}
4649

4750
// DigitsPreTokenizer //////////////////////////////////////////////////////////
@@ -51,15 +54,18 @@ class DigitsPreTokenizerTest : public ::testing::Test {};
5154
TEST_F(DigitsPreTokenizerTest, IndividualDigits) {
5255
DigitsPreTokenizer ptok(true);
5356
assert_split_match(
54-
ptok, "The number 1 then 234 then 5.",
57+
ptok,
58+
"The number 1 then 234 then 5.",
5559
{"The number ", "1", " then ", "2", "3", "4", " then ", "5", "."});
5660
}
5761

5862
// Test digit splitting with contiguous digits
5963
TEST_F(DigitsPreTokenizerTest, ContiguousDigits) {
6064
DigitsPreTokenizer ptok(false);
61-
assert_split_match(ptok, "The number 1 then 234 then 5.",
62-
{"The number ", "1", " then ", "234", " then ", "5", "."});
65+
assert_split_match(
66+
ptok,
67+
"The number 1 then 234 then 5.",
68+
{"The number ", "1", " then ", "234", " then ", "5", "."});
6369
}
6470

6571
// ByteLevelPreTokenizer ///////////////////////////////////////////////////////
@@ -69,7 +75,8 @@ TEST_F(ByteLevelPreTokenizerTest, PreTokenizeDefault) {
6975
ByteLevelPreTokenizer ptok;
7076
assert_split_match(ptok, "Hello World", {"ĠHello", "ĠWorld"});
7177
assert_split_match(
72-
ptok, "The number 1 then 234 then 5.",
78+
ptok,
79+
"The number 1 then 234 then 5.",
7380
{"ĠThe", "Ġnumber", "Ġ1", "Ġthen", "Ġ234", "Ġthen", "Ġ5", "."});
7481
}
7582

@@ -90,9 +97,22 @@ TEST_F(SequencePreTokenizerTest, PreTokenizeDigitAndByteLevel) {
9097
PreTokenizer::Ptr dptok(new DigitsPreTokenizer(true));
9198
PreTokenizer::Ptr bptok(new ByteLevelPreTokenizer(false));
9299
SequencePreTokenizer ptok({dptok, bptok});
93-
assert_split_match(ptok, "The number 1 then 234 then 5.",
94-
{"The", "Ġnumber", "Ġ", "1", "Ġthen", "Ġ", "2", "3", "4",
95-
"Ġthen", "Ġ", "5", "."});
100+
assert_split_match(
101+
ptok,
102+
"The number 1 then 234 then 5.",
103+
{"The",
104+
"Ġnumber",
105+
"Ġ",
106+
"1",
107+
"Ġthen",
108+
"Ġ",
109+
"2",
110+
"3",
111+
"4",
112+
"Ġthen",
113+
"Ġ",
114+
"5",
115+
"."});
96116
}
97117

98118
// PreTokenizerConfig //////////////////////////////////////////////////////////
@@ -132,12 +152,14 @@ TEST_F(PreTokenizerConfigTest, AllTypesFailureCases) {
132152

133153
// Sequence
134154
EXPECT_THROW(PreTokenizerConfig("Sequence").create(), std::runtime_error);
135-
EXPECT_THROW(PreTokenizerConfig("Sequence").set_pretokenizers({}).create(),
136-
std::runtime_error);
137-
EXPECT_THROW(PreTokenizerConfig("Sequence")
138-
.set_pretokenizers({PreTokenizerConfig("Split")})
139-
.create(),
140-
std::runtime_error);
155+
EXPECT_THROW(
156+
PreTokenizerConfig("Sequence").set_pretokenizers({}).create(),
157+
std::runtime_error);
158+
EXPECT_THROW(
159+
PreTokenizerConfig("Sequence")
160+
.set_pretokenizers({PreTokenizerConfig("Split")})
161+
.create(),
162+
std::runtime_error);
141163

142164
// Unsupported
143165
EXPECT_THROW(PreTokenizerConfig("Unsupported").create(), std::runtime_error);
@@ -161,9 +183,22 @@ TEST_F(PreTokenizerConfigTest, ParseJson) {
161183
}},
162184
})
163185
.create();
164-
assert_split_match(*ptok, "The number 1 then 234 then 5.",
165-
{"The", "Ġnumber", "Ġ", "1", "Ġthen", "Ġ", "2", "3", "4",
166-
"Ġthen", "Ġ", "5", "."});
186+
assert_split_match(
187+
*ptok,
188+
"The number 1 then 234 then 5.",
189+
{"The",
190+
"Ġnumber",
191+
"Ġ",
192+
"1",
193+
"Ġthen",
194+
"Ġ",
195+
"2",
196+
"3",
197+
"4",
198+
"Ġthen",
199+
"Ġ",
200+
"5",
201+
"."});
167202
}
168203

169204
TEST_F(PreTokenizerConfigTest, ParseJsonOptionalKey) {
@@ -173,8 +208,10 @@ TEST_F(PreTokenizerConfigTest, ParseJsonOptionalKey) {
173208
{"type", "Digits"},
174209
})
175210
.create();
176-
assert_split_match(*ptok, "The number 1 then 234 then 5.",
177-
{"The number ", "1", " then ", "234", " then ", "5", "."});
211+
assert_split_match(
212+
*ptok,
213+
"The number 1 then 234 then 5.",
214+
{"The number ", "1", " then ", "234", " then ", "5", "."});
178215
}
179216

180217
TEST_F(PreTokenizerConfigTest, Split) {

test/test_sentencepiece.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
namespace tokenizers {
1414

1515
namespace {
16-
static inline std::string _get_resource_path(const std::string &name) {
16+
static inline std::string _get_resource_path(const std::string& name) {
1717
return std::getenv("RESOURCES_PATH") + std::string("/") + name;
1818
}
1919
} // namespace

test/test_tiktoken.cpp

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,30 +20,35 @@ static constexpr int32_t kSpecialTokensSize = 256;
2020
static inline std::unique_ptr<std::vector<std::string>> _get_special_tokens() {
2121
auto special_tokens =
2222
std::make_unique<std::vector<std::string>>(std::vector<std::string>{
23-
"<|begin_of_text|>", "<|end_of_text|>",
24-
"<|reserved_special_token_0|>", "<|reserved_special_token_1|>",
25-
"<|reserved_special_token_2|>", "<|reserved_special_token_3|>",
26-
"<|start_header_id|>", "<|end_header_id|>",
27-
"<|reserved_special_token_4|>", "<|eot_id|>"});
23+
"<|begin_of_text|>",
24+
"<|end_of_text|>",
25+
"<|reserved_special_token_0|>",
26+
"<|reserved_special_token_1|>",
27+
"<|reserved_special_token_2|>",
28+
"<|reserved_special_token_3|>",
29+
"<|start_header_id|>",
30+
"<|end_header_id|>",
31+
"<|reserved_special_token_4|>",
32+
"<|eot_id|>"});
2833

2934
// pad the rest of the special tokens with reserved tokens
3035
ssize_t reserved_special_token_num = 5;
3136
while (special_tokens->size() < kSpecialTokensSize) {
32-
special_tokens->emplace_back("<|reserved_special_token_" +
33-
std::to_string(reserved_special_token_num++) +
34-
"|>");
37+
special_tokens->emplace_back(
38+
"<|reserved_special_token_" +
39+
std::to_string(reserved_special_token_num++) + "|>");
3540
}
3641
return special_tokens;
3742
}
3843

39-
static inline std::string _get_resource_path(const std::string &name) {
44+
static inline std::string _get_resource_path(const std::string& name) {
4045
return std::getenv("RESOURCES_PATH") + std::string("/") + name;
4146
}
4247

4348
} // namespace
4449

4550
class TiktokenTest : public Test {
46-
public:
51+
public:
4752
void SetUp() override {
4853
tokenizer_ = std::make_unique<Tiktoken>(_get_special_tokens(), 0, 1);
4954
modelPath_ = _get_resource_path("test_tiktoken_tokenizer.model");
@@ -110,23 +115,29 @@ TEST_F(TiktokenTest, ConstructionWithInvalidBOSIndex) {
110115
// gtest death test doesn't work on iOS:
111116
// https://github.com/google/googletest/issues/2834
112117
#if !GTEST_OS_IOS
113-
EXPECT_EXIT(std::make_unique<Tiktoken>(
114-
std::make_unique<std::vector<std::string>>(
115-
std::vector<std::string>{"<|end_of_text|>"}),
116-
1, 0),
117-
::testing::KilledBySignal(SIGABRT), "");
118+
EXPECT_EXIT(
119+
std::make_unique<Tiktoken>(
120+
std::make_unique<std::vector<std::string>>(
121+
std::vector<std::string>{"<|end_of_text|>"}),
122+
1,
123+
0),
124+
::testing::KilledBySignal(SIGABRT),
125+
"");
118126
#endif
119127
}
120128

121129
TEST_F(TiktokenTest, ConstructionWithInvalidEOSIndex) {
122130
// gtest death test doesn't work on iOS:
123131
// https://github.com/google/googletest/issues/2834
124132
#if !GTEST_OS_IOS
125-
EXPECT_EXIT(std::make_unique<Tiktoken>(
126-
std::make_unique<std::vector<std::string>>(
127-
std::vector<std::string>{"<|begin_of_text|>"}),
128-
0, 1),
129-
::testing::KilledBySignal(SIGABRT), "");
133+
EXPECT_EXIT(
134+
std::make_unique<Tiktoken>(
135+
std::make_unique<std::vector<std::string>>(
136+
std::vector<std::string>{"<|begin_of_text|>"}),
137+
0,
138+
1),
139+
::testing::KilledBySignal(SIGABRT),
140+
"");
130141
#endif
131142
}
132143

0 commit comments

Comments (0)