Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cmake/re2.txt.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Download-only helper project for RE2.
#
# ExternalProject is used purely as a fetch mechanism here: the configure,
# build, install and test steps are all replaced by empty commands, so this
# project only checks out the RE2 sources at the pinned tag.
cmake_minimum_required(VERSION 3.7.2)

# NONE: no language is enabled for this helper project.
project(re2-download NONE)

include(ExternalProject)

ExternalProject_Add(
    re2

    # Where to fetch from, and the release tag the checkout is pinned to.
    GIT_REPOSITORY https://github.com/google/re2.git
    GIT_TAG 2021-09-01

    # Source checkout and build directories, both under the binary dir.
    SOURCE_DIR "${CMAKE_BINARY_DIR}/re2-src"
    BINARY_DIR "${CMAKE_BINARY_DIR}/re2-build"

    # Every step after the download is explicitly disabled.
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
    INSTALL_COMMAND ""
    TEST_COMMAND ""

    # NOTE(review): CMAKE_ARGS is normally consumed by the default configure
    # step; with CONFIGURE_COMMAND empty it is presumably unused - confirm.
    CMAKE_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}"
)
9 changes: 7 additions & 2 deletions cpp/perspective/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
#########################
# PYTHON BINDINGS BUILD #
#########################
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
include_directories("${PSP_PYTHON_SRC}/perspective/include")

# Set CMP0094 to NEW - find the first version that matches constraints,
Expand Down Expand Up @@ -426,6 +427,9 @@ endif()
# Build minimal arrow itself
psp_build_dep("arrow" "${PSP_CMAKE_MODULE_PATH}/arrow.txt.in")

# Build re2 as our regex library
psp_build_dep("re2" "${PSP_CMAKE_MODULE_PATH}/re2.txt.in")

find_package(Flatbuffers)
if(NOT FLATBUFFERS_FOUND)
# Abort configuration when Flatbuffers is missing. The space after FATAL_ERROR
# is required: without it the keyword and the quoted text are parsed as one
# unquoted argument, so the message is printed as a plain notice and the
# configure step carries on.
message(FATAL_ERROR "${Red}Flatbuffers could not be located${ColorReset}")
Expand Down Expand Up @@ -508,6 +512,7 @@ set (SOURCE_FILES
${PSP_CPP_SRC}/src/cpp/raii_impl_osx.cpp
${PSP_CPP_SRC}/src/cpp/raii_impl_win.cpp
${PSP_CPP_SRC}/src/cpp/range.cpp
${PSP_CPP_SRC}/src/cpp/regex.cpp
${PSP_CPP_SRC}/src/cpp/rlookup.cpp
${PSP_CPP_SRC}/src/cpp/scalar.cpp
${PSP_CPP_SRC}/src/cpp/schema_column.cpp
Expand Down Expand Up @@ -580,7 +585,7 @@ if (PSP_WASM_BUILD)
add_library(psp ${WASM_SOURCE_FILES})
target_compile_definitions(psp PRIVATE PSP_ENABLE_WASM=1)
set_target_properties(psp PROPERTIES COMPILE_FLAGS "")
target_link_libraries(psp arrow)
target_link_libraries(psp arrow re2)

# "esm/perspective.cpp.js" from CMAKE_EXECUTABLE_SYNTAX
add_executable(perspective_esm src/cpp/emscripten.cpp)
Expand Down Expand Up @@ -652,7 +657,7 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
endif()

# Link against minimal arrow static library
target_link_libraries(psp arrow)
target_link_libraries(psp arrow re2)

target_link_libraries(binding psp)

Expand Down
92 changes: 48 additions & 44 deletions cpp/perspective/src/cpp/arrow_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,47 +20,55 @@
#include <arrow/csv/reader.h>
#endif


template <class TimePoint>
static inline arrow::TimestampType::c_type ConvertTimePoint(TimePoint tp, arrow::TimeUnit::type unit) {
auto duration = tp.time_since_epoch();
switch (unit) {
case arrow::TimeUnit::SECOND:
return std::chrono::duration_cast<std::chrono::seconds>(duration).count();
case arrow::TimeUnit::MILLI:
return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
case arrow::TimeUnit::MICRO:
return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
case arrow::TimeUnit::NANO:
return std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
default:
// Compiler errors without default case even though all enum cases are handled
assert(0);
return 0;
}
// Converts a time point into a count of `unit`-sized ticks measured from the
// time point's epoch (via time_since_epoch()).
//
// tp   - the time point to convert.
// unit - the arrow::TimeUnit resolution of the returned count.
//
// Returns the tick count; for a unit that is not one of the four handled
// values, asserts and returns 0.
static inline arrow::TimestampType::c_type
ConvertTimePoint(TimePoint tp, arrow::TimeUnit::type unit) {
    namespace chrono = std::chrono;
    const auto since_epoch = tp.time_since_epoch();

    switch (unit) {
        case arrow::TimeUnit::SECOND:
            return chrono::duration_cast<chrono::seconds>(since_epoch).count();
        case arrow::TimeUnit::MILLI:
            return chrono::duration_cast<chrono::milliseconds>(since_epoch)
                .count();
        case arrow::TimeUnit::MICRO:
            return chrono::duration_cast<chrono::microseconds>(since_epoch)
                .count();
        case arrow::TimeUnit::NANO:
            return chrono::duration_cast<chrono::nanoseconds>(since_epoch)
                .count();
        default:
            // A default label is kept because the compiler errors without
            // one, even though every enum case is handled above.
            break;
    }

    assert(0);
    return 0;
}


static inline bool ParseYYYY_MM_DD(const char* s,
arrow_vendored::date::year_month_day* out) {
uint16_t year = 0;
uint8_t month = 0;
uint8_t day = 0;
if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 0, 4, &year))) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 5, 2, &month))) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 8, 2, &day))) {
return false;
}
*out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month},
arrow_vendored::date::day{day}};
return out->ok();
static inline bool
ParseYYYY_MM_DD(const char* s, arrow_vendored::date::year_month_day* out) {
uint16_t year = 0;
uint8_t month = 0;
uint8_t day = 0;
if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 0, 4, &year))) {
return false;
}
if (ARROW_PREDICT_FALSE(
!arrow::internal::ParseUnsigned(s + 5, 2, &month))) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 8, 2, &day))) {
return false;
}
*out = {arrow_vendored::date::year{year},
arrow_vendored::date::month{month}, arrow_vendored::date::day{day}};
return out->ok();
}

namespace perspective {
Expand Down Expand Up @@ -143,9 +151,7 @@ namespace apachearrow {
if (length == 23) {
// "YYYY-MM-DD[ T]hh:mm:ss.sss"
arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(
!ParseYYYY_MM_DD(
s, &ymd))) {
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::seconds seconds;
Expand All @@ -166,9 +172,7 @@ namespace apachearrow {
} else if (length == 25) {
// "2008-09-15[ T]15:53:00+05:00"
arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(
!ParseYYYY_MM_DD(
s, &ymd))) {
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::seconds seconds;
Expand Down
49 changes: 34 additions & 15 deletions cpp/perspective/src/cpp/computed_expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ t_computed_expression::t_computed_expression(

void
t_computed_expression::compute(std::shared_ptr<t_data_table> source_table,
std::shared_ptr<t_data_table> destination_table, t_expression_vocab& vocab) const {
std::shared_ptr<t_data_table> destination_table, t_expression_vocab& vocab,
t_regex_mapping& regex_mapping) const {
// TODO: share symtables across pre/re/compute
exprtk::symbol_table<t_tscalar> sym_table;

Expand All @@ -100,7 +101,7 @@ t_computed_expression::compute(std::shared_ptr<t_data_table> source_table,

// Create a function store, with is_type_validator set to false as we
// are calculating values, not type-checking.
t_computed_function_store function_store(vocab, false);
t_computed_function_store function_store(vocab, regex_mapping, false);
function_store.register_computed_functions(sym_table);

exprtk::expression<t_tscalar> expr_definition;
Expand Down Expand Up @@ -209,13 +210,14 @@ t_computed_expression_parser::precompute(const std::string& expression_alias,
const std::string& expression_string,
const std::string& parsed_expression_string,
const std::vector<std::pair<std::string, std::string>>& column_ids,
std::shared_ptr<t_schema> schema, t_expression_vocab& vocab) {
std::shared_ptr<t_schema> schema, t_expression_vocab& vocab,
t_regex_mapping& regex_mapping) {
exprtk::symbol_table<t_tscalar> sym_table;
sym_table.add_constants();

// Create a function store, with is_type_validator set to true as we are
// just getting the output types.
t_computed_function_store function_store(vocab, true);
t_computed_function_store function_store(vocab, regex_mapping, true);
function_store.register_computed_functions(sym_table);

std::vector<t_tscalar> values;
Expand Down Expand Up @@ -271,15 +273,16 @@ t_computed_expression_parser::get_dtype(const std::string& expression_alias,
const std::string& expression_string,
const std::string& parsed_expression_string,
const std::vector<std::pair<std::string, std::string>>& column_ids,
const t_schema& schema, t_expression_error& error, t_expression_vocab& vocab) {
const t_schema& schema, t_expression_error& error,
t_expression_vocab& vocab, t_regex_mapping& regex_mapping) {
exprtk::symbol_table<t_tscalar> sym_table;
sym_table.add_constants();

std::vector<t_tscalar> values;

// Create a function store, with is_type_validator set to true as we are
// just validating the output types.
t_computed_function_store function_store(vocab, true);
t_computed_function_store function_store(vocab, regex_mapping, true);
function_store.register_computed_functions(sym_table);

auto num_input_columns = column_ids.size();
Expand Down Expand Up @@ -418,8 +421,8 @@ t_validated_expression_map::get_expression_errors() const {
return m_expression_errors;
}

t_computed_function_store::t_computed_function_store(
t_expression_vocab& vocab, bool is_type_validator)
t_computed_function_store::t_computed_function_store(t_expression_vocab& vocab,
t_regex_mapping& regex_mapping, bool is_type_validator)
: m_day_of_week_fn(computed_function::day_of_week(vocab, is_type_validator))
, m_month_of_year_fn(
computed_function::month_of_year(vocab, is_type_validator))
Expand All @@ -428,34 +431,44 @@ t_computed_function_store::t_computed_function_store(
, m_order_fn(computed_function::order(is_type_validator))
, m_upper_fn(computed_function::upper(vocab, is_type_validator))
, m_lower_fn(computed_function::lower(vocab, is_type_validator))
, m_to_string_fn(computed_function::to_string(vocab, is_type_validator)) {}
, m_to_string_fn(computed_function::to_string(vocab, is_type_validator))
, m_match_fn(computed_function::match(regex_mapping))
, m_fullmatch_fn(computed_function::fullmatch(regex_mapping))
, m_search_fn(
computed_function::search(vocab, regex_mapping, is_type_validator)) {}

void
t_computed_function_store::register_computed_functions(
exprtk::symbol_table<t_tscalar>& sym_table) {
// General/numeric functions
sym_table.add_function("bucket", t_computed_expression_parser::BUCKET_FN);
sym_table.add_reserved_function(
"inrange", t_computed_expression_parser::INRANGE_FN);
sym_table.add_reserved_function(
"min", t_computed_expression_parser::MIN_FN);
sym_table.add_reserved_function(
"max", t_computed_expression_parser::MAX_FN);
sym_table.add_function(
"percent_of", t_computed_expression_parser::PERCENT_OF_FN);
sym_table.add_function("is_null", t_computed_expression_parser::IS_NULL_FN);
sym_table.add_function(
"is_not_null", t_computed_expression_parser::IS_NOT_NULL_FN);

// Date/datetime functions
sym_table.add_function(
"hour_of_day", t_computed_expression_parser::HOUR_OF_DAY_FN);
sym_table.add_function("day_of_week", m_day_of_week_fn);
sym_table.add_function("month_of_year", m_month_of_year_fn);

// String functions
sym_table.add_function("intern", m_intern_fn);
sym_table.add_function("concat", m_concat_fn);
sym_table.add_function("order", m_order_fn);
sym_table.add_function("upper", m_upper_fn);
sym_table.add_function("lower", m_lower_fn);
sym_table.add_function("length", t_computed_expression_parser::LENGTH_FN);
sym_table.add_function("string", m_to_string_fn);
sym_table.add_function(
"percent_of", t_computed_expression_parser::PERCENT_OF_FN);
sym_table.add_function("is_null", t_computed_expression_parser::IS_NULL_FN);
sym_table.add_function(
"is_not_null", t_computed_expression_parser::IS_NOT_NULL_FN);

// Type conversion functions
sym_table.add_function(
"integer", t_computed_expression_parser::TO_INTEGER_FN);
sym_table.add_function("float", t_computed_expression_parser::TO_FLOAT_FN);
Expand All @@ -464,6 +477,12 @@ t_computed_function_store::register_computed_functions(
sym_table.add_function("date", t_computed_expression_parser::MAKE_DATE_FN);
sym_table.add_function(
"datetime", t_computed_expression_parser::MAKE_DATETIME_FN);
sym_table.add_function("string", m_to_string_fn);

// Regex functions
sym_table.add_function("match", m_match_fn);
sym_table.add_function("fullmatch", m_fullmatch_fn);
sym_table.add_function("search", m_search_fn);

// Register static free functions as well
sym_table.add_function("today", computed_function::today);
Expand Down
Loading