Skip to content

Add pcre2 as re2 fallback #50

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@
[submodule "third-party/json"]
path = third-party/json
url = https://github.com/nlohmann/json.git
[submodule "third-party/pcre2"]
path = third-party/pcre2
url = https://github.com/PCRE2Project/pcre2.git
21 changes: 18 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,19 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)

# Configure PCRE2
set(PCRE2_BUILD_PCRE2_8 ON)
set(PCRE2_BUILD_PCRE2_16 OFF)
set(PCRE2_BUILD_PCRE2_32 OFF)
set(PCRE2_BUILD_TESTS OFF)
set(PCRE2_BUILD_PCRE2GREP OFF)
set(PCRE2_BUILD_PCRE2TEST OFF)
set(PCRE2_BUILD_PCRE2GPERF OFF)
set(PCRE2_BUILD_DOCS OFF)
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)

set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
Expand All @@ -45,9 +58,10 @@ target_include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)

target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)

# Build test
if(TOKENIZERS_BUILD_TEST)
Expand Down Expand Up @@ -77,7 +91,8 @@ if(TOKENIZERS_BUILD_TEST)
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include)
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
target_link_libraries(${test_name} gtest_main GTest::gmock tokenizers)
add_test(${test_name} "${test_name}")
set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ${test_env})
Expand Down
52 changes: 52 additions & 0 deletions include/pytorch/tokenizers/pcre2_regex.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <memory>
#include <string>

// Define PCRE2 code unit width before including pcre2.h
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>

#include <pytorch/tokenizers/regex.h>

namespace tokenizers {

/**
 * @brief PCRE2-based implementation of IRegex.
 *
 * Owns a compiled PCRE2 pattern (`regex_`) and a match-data block
 * (`match_data_`), both released in the destructor. Because the handles are
 * raw owning pointers, the class is non-copyable: a member-wise copy would
 * free the same handles twice.
 *
 * NOTE(review): find_all() is const but reuses the single match_data_ block,
 * so concurrent calls on one instance presumably are not safe -- confirm
 * before sharing an instance across threads.
 */
class Pcre2Regex : public IRegex {
 public:
  /**
   * @brief Construct a PCRE2 regex with the given pattern.
   *
   * Compilation failure does not throw: the error is written to stderr and
   * the object is left with null handles, in which case find_all() returns
   * no matches.
   *
   * @param pattern The regex pattern to compile.
   */
  explicit Pcre2Regex(const std::string& pattern);

  /**
   * @brief Destructor to clean up PCRE2 resources.
   */
  ~Pcre2Regex();

  // Copying would duplicate the raw owning handles and double-free them.
  Pcre2Regex(const Pcre2Regex&) = delete;
  Pcre2Regex& operator=(const Pcre2Regex&) = delete;

  /**
   * @brief Return all non-overlapping matches found in the input string.
   *
   * @param text The subject string to scan.
   * @return The matches in order of appearance; empty if the pattern failed
   * to compile or nothing matched.
   */
  virtual std::vector<Match> find_all(const std::string& text) const override;

 private:
  pcre2_code* regex_; // Compiled pattern; nullptr if compilation failed.
  pcre2_match_data* match_data_; // Match result storage; nullptr on failure.

  // The factory inspects regex_/match_data_ to decide whether to fall back.
  friend Result<std::unique_ptr<IRegex>> create_regex(
      const std::string& pattern);
};

} // namespace tokenizers
109 changes: 109 additions & 0 deletions src/pcre2_regex.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <iostream>
#include <vector>

#include <pytorch/tokenizers/pcre2_regex.h>

namespace tokenizers {

/**
 * Compile `pattern` and allocate the match-data block used by find_all().
 *
 * Failures are reported on stderr rather than thrown; in that case both
 * regex_ and match_data_ end up null.
 */
Pcre2Regex::Pcre2Regex(const std::string& pattern)
    : regex_(nullptr), match_data_(nullptr) {
  int compile_status{0};
  PCRE2_SIZE failure_pos{0};

  // Compile the pattern with Unicode support and UTF-8 mode enabled.
  regex_ = pcre2_compile(
      reinterpret_cast<PCRE2_SPTR>(pattern.c_str()),
      pattern.length(),
      PCRE2_UCP | PCRE2_UTF,
      &compile_status,
      &failure_pos,
      nullptr);

  if (!regex_) {
    // Translate the numeric error code into a readable message.
    PCRE2_UCHAR message[256];
    pcre2_get_error_message(compile_status, message, sizeof(message));
    std::cerr << "PCRE2 compilation failed at offset " << failure_pos << ": "
              << message << std::endl;
    return;
  }

  // Size the match data from the compiled pattern.
  match_data_ = pcre2_match_data_create_from_pattern(regex_, nullptr);
  if (match_data_) {
    return; // Fully constructed.
  }

  // Allocation failed: drop the compiled pattern so both handles are null.
  pcre2_code_free(regex_);
  regex_ = nullptr;
  std::cerr << "Failed to create PCRE2 match data" << std::endl;
}

/**
 * Release the PCRE2 handles. Either one may be null when construction
 * failed, so each is checked before being freed.
 */
Pcre2Regex::~Pcre2Regex() {
  if (match_data_ != nullptr) {
    pcre2_match_data_free(match_data_);
  }

  if (regex_ != nullptr) {
    pcre2_code_free(regex_);
  }
}

std::vector<Match> Pcre2Regex::find_all(const std::string& text) const {
std::vector<Match> result;

if (!regex_ || !match_data_) {
return result;
}

PCRE2_SIZE* ovector;
PCRE2_SPTR subject = reinterpret_cast<PCRE2_SPTR>(text.c_str());
PCRE2_SIZE subject_length = text.length();
PCRE2_SIZE offset = 0;

while (offset < subject_length) {
int rc = pcre2_match(
regex_,
subject,
subject_length,
offset,
0, // Default options
match_data_,
nullptr);

if (rc < 0) {
if (rc == PCRE2_ERROR_NOMATCH) {
break; // No more matches
} else {
// Error occurred
PCRE2_UCHAR error_buffer[256];
pcre2_get_error_message(rc, error_buffer, sizeof(error_buffer));
std::cerr << "PCRE2 matching error: " << error_buffer << std::endl;
break;
}
}

ovector = pcre2_get_ovector_pointer(match_data_);

// Add the match to the result
result.push_back({ovector[0], ovector[1]});

// Move to the next position after the match
offset = ovector[1];

// If the match was empty, move forward by one character to avoid infinite
// loop
if (ovector[0] == ovector[1]) {
offset++;
}
}

return result;
}

} // namespace tokenizers
19 changes: 15 additions & 4 deletions src/regex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <pytorch/tokenizers/pcre2_regex.h>
#include <pytorch/tokenizers/re2_regex.h>
#include <pytorch/tokenizers/regex.h>
#include <pytorch/tokenizers/std_regex.h>
Expand All @@ -18,8 +19,8 @@ namespace tokenizers {

/**
* @brief Factory function that creates a regex object using RE2 if possible.
* Falls back to std::regex if RE2 rejects the pattern with
* ErrorBadPerlOp.
* Falls back to PCRE2 if RE2 rejects the pattern, then to std::regex if
* PCRE2 fails.
*/
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
// Try RE2 first
Expand All @@ -30,10 +31,20 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
}

if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
try {
// RE2 doesn't support some Perl features, try PCRE2
auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");

if (pcre2->regex_ != nullptr && pcre2->match_data_ != nullptr) {
std::cout
<< "RE2 is unable to support things such as negative lookaheads in "
<< pattern << ", defaulting to std::regex.";
<< pattern << ", using PCRE2 instead." << std::endl;
return static_cast<std::unique_ptr<IRegex>>(std::move(pcre2));
}

// If PCRE2 also fails, fall back to std::regex
try {
std::cout
<< "PCRE2 failed to compile pattern, falling back to std::regex.";
auto std_regex = std::make_unique<StdRegex>("(" + pattern + ")");
return static_cast<std::unique_ptr<IRegex>>(std::move(std_regex));
} catch (const std::regex_error& e) {
Expand Down
3 changes: 3 additions & 0 deletions targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ def define_common_targets():
srcs = ["src/regex.cpp"] + glob([
"src/*_regex.cpp",
]),
deps = [
"fbsource//third-party/pcre2:pcre2-8",
],
exported_headers = subdir_glob([
("include", "pytorch/tokenizers/regex.h"),
("include", "pytorch/tokenizers/*_regex.h"),
Expand Down
107 changes: 107 additions & 0 deletions test/test_regex.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <gtest/gtest.h>

#include "pytorch/tokenizers/pcre2_regex.h"
#include "pytorch/tokenizers/re2_regex.h"
#include "pytorch/tokenizers/regex.h"

using namespace tokenizers;

// Fixture for the regex tests below; it adds no shared state or setup.
class RegexTest : public ::testing::Test {};

// Basic functionality: "\w+" should produce one match per word, with byte
// offsets that slice the original text back into those words.
TEST_F(RegexTest, BasicMatching) {
  const std::string text = "Hello world";

  auto regex = TK_UNWRAP_THROW(create_regex("\\w+"));
  const auto matches = regex->find_all(text);

  ASSERT_EQ(matches.size(), 2);

  const auto& first = matches[0];
  EXPECT_EQ(first.start, 0);
  EXPECT_EQ(first.end, 5);
  EXPECT_EQ(text.substr(first.start, first.end - first.start), "Hello");

  const auto& second = matches[1];
  EXPECT_EQ(second.start, 6);
  EXPECT_EQ(second.end, 11);
  EXPECT_EQ(text.substr(second.start, second.end - second.start), "world");
}

// Test pattern that only PCRE2 supports (lookbehind)
TEST_F(RegexTest, Pcre2Specific) {
  const std::string pattern = "(?<=@)\\w+";

  // Verify that the factory function falls back to a PCRE2 regex
  auto regex = TK_UNWRAP_THROW(create_regex(pattern));
  EXPECT_NE(dynamic_cast<Pcre2Regex*>(regex.get()), nullptr);

  // The '@' sits at index 4, so the lookbehind matches "example" at [5, 12).
  std::string text = "user@example.com";
  auto matches = regex->find_all(text);
  ASSERT_EQ(matches.size(), 1);
  EXPECT_EQ(matches[0].start, 5);
  EXPECT_EQ(matches[0].end, 12);
  EXPECT_EQ(
      text.substr(matches[0].start, matches[0].end - matches[0].start),
      "example");
}

// Test complex pattern with negative lookahead that should fall back to PCRE2.
// This specific pattern is from the Qwen2.5 1.5B pretokenizer.
// https://huggingface.co/Qwen/Qwen2.5-1.5B/raw/main/tokenizer.json
TEST_F(RegexTest, ComplexPatternWithNegativeLookahead) {
  const std::string complex_pattern =
      "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";

  // Now verify that the factory function falls back to a PCRE2 regex
  auto regex = TK_UNWRAP_THROW(create_regex(complex_pattern));
  EXPECT_NE(dynamic_cast<Pcre2Regex*>(regex.get()), nullptr);

  // Test the pattern with some sample text. There are TWO spaces after the
  // newline (20 bytes total): the first is matched alone by `\s+(?!\S)`, the
  // second becomes the leading space of " test".
  std::string text = "Hello's world\n  test";
  auto matches = regex->find_all(text);

  // We expect to match:
  // 1. "Hello" (word)
  // 2. "'s" (contraction)
  // 3. " world" (word with leading space)
  // 4. "\n" (newline)
  // 5. " " (whitespace)
  // 6. " test" (word with leading space)
  ASSERT_EQ(matches.size(), 6);

  EXPECT_EQ(matches[0].start, 0);
  EXPECT_EQ(matches[0].end, 5);
  EXPECT_EQ(
      text.substr(matches[0].start, matches[0].end - matches[0].start),
      "Hello");
  EXPECT_EQ(matches[1].start, 5);
  EXPECT_EQ(matches[1].end, 7);
  EXPECT_EQ(
      text.substr(matches[1].start, matches[1].end - matches[1].start), "'s");
  EXPECT_EQ(matches[2].start, 7);
  EXPECT_EQ(matches[2].end, 13);
  EXPECT_EQ(
      text.substr(matches[2].start, matches[2].end - matches[2].start),
      " world");
  EXPECT_EQ(matches[3].start, 13);
  EXPECT_EQ(matches[3].end, 14);
  EXPECT_EQ(
      text.substr(matches[3].start, matches[3].end - matches[3].start), "\n");
  EXPECT_EQ(matches[4].start, 14);
  EXPECT_EQ(matches[4].end, 15);
  EXPECT_EQ(
      text.substr(matches[4].start, matches[4].end - matches[4].start), " ");
  EXPECT_EQ(matches[5].start, 15);
  EXPECT_EQ(matches[5].end, 20);
  EXPECT_EQ(
      text.substr(matches[5].start, matches[5].end - matches[5].start),
      " test");
}
1 change: 1 addition & 0 deletions third-party/pcre2
Submodule pcre2 added at 2e03e3