
Commit 318d25d

Format (#14)
* Format
* Fix pull and trunk
1 parent f1ce222 commit 318d25d

25 files changed: +498, -369 lines

.github/workflows/pull.yml

Lines changed: 3 additions & 2 deletions
@@ -18,11 +18,12 @@ jobs:
     strategy:
       fail-fast: false
     with:
-      runner: linux.2xlarge
+      runner: linux.large
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       script: |
+        set -ex
         cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
         cmake --build build -j9 --config Debug
-        ctest
+        cd build && ctest

.github/workflows/trunk.yml

Lines changed: 2 additions & 1 deletion
@@ -26,6 +26,7 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       script: |
+        set -ex
         cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
         cmake --build build -j9 --config Debug
-        ctest
+        cd build && ctest

CMakeLists.txt

Lines changed: 1 addition & 4 deletions
@@ -1,7 +1,4 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the LICENSE
-# file in the root directory of this source tree.
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
 #
 # Build tokenizers.

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -28,4 +28,4 @@ outlined on that page and do not file a public issue.
 
 ## License
 By contributing to tokenizers, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.
+under the LICENSE file in the root directory of this source tree.

include/base64.h

Lines changed: 26 additions & 14 deletions
@@ -36,7 +36,7 @@ namespace base64 {
 using tokenizers::Error;
 using tokenizers::Result;
 
-Result<std::string> decode(const std::string_view &input);
+Result<std::string> decode(const std::string_view& input);
 
 namespace detail {
 
@@ -68,9 +68,12 @@ inline Error validate(uint32_t v) {
   return Error::Ok;
 }
 
-inline Error decode(const std::string_view &input, std::string &output) {
-  TK_CHECK_OR_RETURN_ERROR(input.size() == 4, Base64DecodeFailure,
-                           "input length must be 4, got %zu", input.size());
+inline Error decode(const std::string_view& input, std::string& output) {
+  TK_CHECK_OR_RETURN_ERROR(
+      input.size() == 4,
+      Base64DecodeFailure,
+      "input length must be 4, got %zu",
+      input.size());
 
   uint32_t val = 0;
 
@@ -100,10 +103,14 @@ inline Error decode(const std::string_view &input, std::string &output) {
   return Error::Ok;
 }
 
-inline Error decode_1_padding(const std::string_view &input,
-                              std::string &output) {
-  TK_CHECK_OR_RETURN_ERROR(input.size() == 3, Base64DecodeFailure,
-                           "input length must be 3, got %zu", input.size());
+inline Error decode_1_padding(
+    const std::string_view& input,
+    std::string& output) {
+  TK_CHECK_OR_RETURN_ERROR(
+      input.size() == 3,
+      Base64DecodeFailure,
+      "input length must be 3, got %zu",
+      input.size());
 
   uint32_t val = 0;
 
@@ -127,10 +134,14 @@ inline Error decode_1_padding(const std::string_view &input,
   return Error::Ok;
 }
 
-inline Error decode_2_padding(const std::string_view &input,
-                              std::string &output) {
-  TK_CHECK_OR_RETURN_ERROR(input.size() == 2, Base64DecodeFailure,
-                           "input length must be 2, got %zu", input.size());
+inline Error decode_2_padding(
+    const std::string_view& input,
+    std::string& output) {
+  TK_CHECK_OR_RETURN_ERROR(
+      input.size() == 2,
+      Base64DecodeFailure,
+      "input length must be 2, got %zu",
+      input.size());
 
   uint32_t val = 0;
 
@@ -150,12 +161,13 @@ inline Error decode_2_padding(const std::string_view &input,
 
 } // namespace detail
 
-inline tokenizers::Result<std::string> decode(const std::string_view &input) {
+inline tokenizers::Result<std::string> decode(const std::string_view& input) {
   TK_CHECK_OR_RETURN_ERROR(!input.empty(), Base64DecodeFailure, "empty input");
 
   // Faster than `input.size() % 4`.
   TK_CHECK_OR_RETURN_ERROR(
-      (input.size() & 3) == 0 && input.size() >= 4, Base64DecodeFailure,
+      (input.size() & 3) == 0 && input.size() >= 4,
+      Base64DecodeFailure,
       "input length must be larger than 4 and is multiple of 4, got %zu",
       input.size());
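
The reformatted decode() entry point keeps its original contract: the input must be non-empty, at least 4 bytes long, and a multiple of 4. Below is a minimal usage sketch; the Result accessors ok() and get() and the include path are assumptions, not something shown in this diff.

#include <iostream>
#include <string>
#include <string_view>

#include "base64.h"  // assumed include path for include/base64.h

int main() {
  // "aGVsbG8=" is base64 for "hello"; its length (8) satisfies the
  // multiple-of-4 check that decode() enforces.
  auto decoded = base64::decode("aGVsbG8=");
  if (!decoded.ok()) {  // assumed Result<T> accessor
    std::cerr << "base64 decode failed" << std::endl;
    return 1;
  }
  std::cout << decoded.get() << std::endl;  // assumed accessor; prints "hello"
  return 0;
}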

include/detail/bpe_tokenizer_base.h

Lines changed: 21 additions & 17 deletions
@@ -28,27 +28,29 @@ using Decoder = std::unordered_map<uint64_t, std::string>;
 using Re2UPtr = std::unique_ptr<re2::RE2>;
 
 class BPETokenizerBase : public Tokenizer {
-public:
-  Result<std::vector<uint64_t>> encode(const std::string &input, int8_t bos,
-                                       int8_t eos) const override;
+ public:
+  Result<std::vector<uint64_t>>
+  encode(const std::string& input, int8_t bos, int8_t eos) const override;
 
-  Result<std::string> decode(uint64_t prev_token,
-                             uint64_t token) const override;
+  Result<std::string> decode(uint64_t prev_token, uint64_t token)
+      const override;
 
-protected:
+ protected:
   explicit BPETokenizerBase() {}
   virtual ~BPETokenizerBase() {}
 
   std::pair<std::optional<std::string>, re2::StringPiece>
-  split_with_allowed_special_token_(re2::StringPiece &input,
-                                    const Encoder &allowed_special) const;
+  split_with_allowed_special_token_(
+      re2::StringPiece& input,
+      const Encoder& allowed_special) const;
 
-  Result<std::pair<std::vector<uint64_t>, uint64_t>>
-  encode_with_special_token_(const std::string &text,
-                             const Encoder &allowed_special) const;
+  Result<std::pair<std::vector<uint64_t>, uint64_t>> encode_with_special_token_(
+      const std::string& text,
+      const Encoder& allowed_special) const;
 
-  Result<std::vector<uint64_t>> byte_pair_encode_(const std::string &piece,
-                                                  const Encoder &encoder) const;
+  Result<std::vector<uint64_t>> byte_pair_encode_(
+      const std::string& piece,
+      const Encoder& encoder) const;
 
   // Protected members that can be overloaded by other BPE tokenizers
   Re2UPtr special_token_regex_;
@@ -57,11 +59,13 @@ class BPETokenizerBase : public Tokenizer {
   Decoder decoder_;
   Decoder special_token_decoder_;
 
-private:
-  virtual Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
-                        uint64_t &last_piece_token_len) const = 0;
+ private:
+  virtual Error _encode(
+      re2::StringPiece& input,
+      std::vector<uint64_t>& ret,
+      uint64_t& last_piece_token_len) const = 0;
 
-  virtual void _decode(re2::StringPiece input, std::string &ret) const = 0;
+  virtual void _decode(re2::StringPiece input, std::string& ret) const = 0;
 };
 
 } // namespace detail
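
The restyled access specifiers and wrapped signatures are easiest to read from the point of view of a subclass. Here is a minimal, hypothetical sketch of the override points; the class name and stub bodies are illustrative only, and the load() override is assumed from the Tokenizer interface (as hf_tokenizer.h below suggests).

#include <cstdint>
#include <string>
#include <vector>

#include "detail/bpe_tokenizer_base.h"  // assumed include path

// Hypothetical tokenizer showing which hooks a BPETokenizerBase subclass fills in.
class StubBPETokenizer : public tokenizers::detail::BPETokenizerBase {
 public:
  tokenizers::Error load(const std::string& tokenizer_path) override {
    // A real implementation would populate the token tables (e.g. decoder_) here.
    (void)tokenizer_path;
    return tokenizers::Error::Ok;
  }

 private:
  tokenizers::Error _encode(
      re2::StringPiece& input,
      std::vector<uint64_t>& ret,
      uint64_t& last_piece_token_len) const override {
    // A real implementation would pre-tokenize `input` and run
    // byte_pair_encode_() on each piece; this stub emits no tokens.
    (void)input;
    ret.clear();
    last_piece_token_len = 0;
    return tokenizers::Error::Ok;
  }

  void _decode(re2::StringPiece input, std::string& ret) const override {
    // A real implementation would map token text back to bytes; this stub copies.
    ret.append(input.data(), input.size());
  }
};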

include/error.h

Lines changed: 34 additions & 34 deletions
@@ -13,8 +13,8 @@
 
 #pragma once
 
-#include "log.h"
 #include <stdint.h>
+#include "log.h"
 
 namespace tokenizers {
 
@@ -70,12 +70,12 @@ enum class Error : error_code_t {
  * @param[in] message__ Format string for the log error message.
  * @param[in] ... Optional additional arguments for the format string.
  */
-#define TK_CHECK_OR_RETURN_ERROR(cond__, error__, message__, ...) \
-  { \
-    if (!(cond__)) { \
-      TK_LOG(Error, message__, ##__VA_ARGS__); \
-      return ::tokenizers::Error::error__; \
-    } \
+#define TK_CHECK_OR_RETURN_ERROR(cond__, error__, message__, ...) \
+  { \
+    if (!(cond__)) { \
+      TK_LOG(Error, message__, ##__VA_ARGS__); \
+      return ::tokenizers::Error::error__; \
+    } \
   }
 
@@ -86,13 +86,13 @@ enum class Error : error_code_t {
  * @param[in] ... Optional format string for the log error message and its
  * arguments.
  */
-#define TK_CHECK_OK_OR_RETURN_ERROR(error__, ...) \
+#define TK_CHECK_OK_OR_RETURN_ERROR(error__, ...) \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR(error__, ##__VA_ARGS__)
 
 // Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR(...) \
-  TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_SELECT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, \
-                                              4, 3, 2, 1) \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR(...) \
+  TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_SELECT( \
+      __VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) \
   (__VA_ARGS__)
 
 /**
@@ -119,43 +119,43 @@ enum class Error : error_code_t {
  * TK_CHECK_OK_OR_RETURN_ERROR(error_code); // Calls v1
  * TK_CHECK_OK_OR_RETURN_ERROR(error_code, "Error message", ...); // Calls v2
  */
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_SELECT(_1, _2, _3, _4, _5, _6, \
-                                                    _7, _8, _9, _10, N, ...) \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_SELECT( \
+    _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_##N
 
 // Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_1(error__) \
-  do { \
-    const auto et_error__ = (error__); \
-    if (et_error__ != ::tokenizers::Error::Ok) { \
-      return et_error__; \
-    } \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_1(error__) \
+  do { \
+    const auto et_error__ = (error__); \
+    if (et_error__ != ::tokenizers::Error::Ok) { \
+      return et_error__; \
+    } \
   } while (0)
 
 // Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2(error__, message__, ...) \
-  do { \
-    const auto et_error__ = (error__); \
-    if (et_error__ != ::tokenizers::Error::Ok) { \
-      TK_LOG(Error, message__, ##__VA_ARGS__); \
-      return et_error__; \
-    } \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2(error__, message__, ...) \
+  do { \
+    const auto et_error__ = (error__); \
+    if (et_error__ != ::tokenizers::Error::Ok) { \
+      TK_LOG(Error, message__, ##__VA_ARGS__); \
+      return et_error__; \
+    } \
   } while (0)
 
 // Internal only: Use ET_CHECK_OK_OR_RETURN_ERROR() instead.
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_3 \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_3 \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_4 \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_4 \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_5 \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_5 \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_6 \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_6 \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_7 \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_7 \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_8 \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_8 \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_9 \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_9 \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
-#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_10 \
+#define TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_10 \
   TK_INTERNAL_CHECK_OK_OR_RETURN_ERROR_2
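
These macros are easiest to read next to a call site. The sketch below mirrors how include/base64.h in this same commit uses TK_CHECK_OR_RETURN_ERROR, and exercises both arities of TK_CHECK_OK_OR_RETURN_ERROR; the helper function names are hypothetical.

#include <cstddef>

#include "error.h"  // assumed include path for include/error.h

using tokenizers::Error;

// Hypothetical check: fail with Base64DecodeFailure unless size is a multiple of 4.
Error check_size(std::size_t size) {
  TK_CHECK_OR_RETURN_ERROR(
      (size & 3) == 0,
      Base64DecodeFailure,
      "size must be a multiple of 4, got %zu",
      size);
  return Error::Ok;
}

// Hypothetical caller: propagates any failure from check_size().
Error check_and_log(std::size_t size) {
  TK_CHECK_OK_OR_RETURN_ERROR(check_size(size));                         // 1-argument form
  TK_CHECK_OK_OR_RETURN_ERROR(check_size(size), "second check failed");  // logging form
  return Error::Ok;
}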

include/hf_tokenizer.h

Lines changed: 8 additions & 6 deletions
@@ -27,7 +27,7 @@
 
 namespace tokenizers {
 class HFTokenizer : public detail::BPETokenizerBase {
-public:
+ public:
   /*-- Public Interface --*/
 
   /**
@@ -39,13 +39,15 @@ class HFTokenizer : public detail::BPETokenizerBase {
   /**
   * Load the model data into the
   */
-  Error load(const std::string &tokenizer_path) override;
+  Error load(const std::string& tokenizer_path) override;
 
-private:
-  Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
-                uint64_t &last_piece_token_len) const override;
+ private:
+  Error _encode(
+      re2::StringPiece& input,
+      std::vector<uint64_t>& ret,
+      uint64_t& last_piece_token_len) const override;
 
-  void _decode(re2::StringPiece input, std::string &ret) const override;
+  void _decode(re2::StringPiece input, std::string& ret) const override;
 
   PreTokenizer::Ptr _pretokenizer;
   TokenDecoder::Ptr _decoder;
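
Finally, a minimal sketch of driving the HFTokenizer surface restyled above. The tokenizer.json path is a placeholder, default construction and the Result accessors ok()/get() are assumptions not shown in this diff, and the bos/eos flags follow the encode(input, bos, eos) signature inherited from BPETokenizerBase.

#include <iostream>

#include "hf_tokenizer.h"  // assumed include path for include/hf_tokenizer.h

int main() {
  tokenizers::HFTokenizer tok;  // assumes a default constructor

  // Placeholder path; point this at a real HuggingFace tokenizer.json.
  if (tok.load("tokenizer.json") != tokenizers::Error::Ok) {
    std::cerr << "failed to load tokenizer" << std::endl;
    return 1;
  }

  // One BOS token, no EOS token, per the encode(input, bos, eos) signature.
  auto ids = tok.encode("hello world", /*bos=*/1, /*eos=*/0);
  if (ids.ok()) {  // assumed Result<T> accessor
    std::cout << "token count: " << ids.get().size() << std::endl;
  }
  return 0;
}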
