From 63d64b07784c25010948c19417e675b779dae0e1 Mon Sep 17 00:00:00 2001 From: Laria Carolin Chabowski Date: Mon, 3 Jul 2023 23:39:29 +0200 Subject: [PATCH] Add regex module "re" This uses the PCRE2 library to implement regexes in apfl --- src/CMakeLists.txt | 25 +- src/apfl.h | 16 + src/context.c | 148 +++++++-- src/context.h | 12 +- src/eval.c | 4 + src/functional-tests/re.at | 69 ++++ src/globals.apfl | 25 +- src/modules.c | 21 ++ src/modules.h | 10 + src/re.apfl | 97 ++++++ src/re.c | 642 +++++++++++++++++++++++++++++++++++++ webpage/.gitignore | 1 + webpage/build.sh | 22 +- 13 files changed, 1049 insertions(+), 43 deletions(-) create mode 100644 src/functional-tests/re.at create mode 100644 src/modules.c create mode 100644 src/re.apfl create mode 100644 src/re.c diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f7db914..d95e454 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,11 @@ set(CMAKE_C_EXTENSIONS OFF) option(BUILD_SHARED_LIBS "Build dynamic / shared libraries" ON) option(TEST_WITH_VALGRIND_MEMCHECK "Also run tests with valgrind / memcheck" ON) +include(FindPkgConfig) + +pkg_check_modules(PCRE2 REQUIRED libpcre2-8) + +include_directories(${PCRE2_INCLUDE_DIRS}) set(commonfiles alloc.c bytecode.c @@ -32,13 +37,17 @@ add_library(apfl builtins.c context.c eval.c + modules.c + re.c registry.c scope.c symbols.c mod_globals.c + mod_re.c ) target_link_libraries(apfl PUBLIC m) +target_link_libraries(apfl PUBLIC ${PCRE2_LIBRARIES}) add_executable(apfl-bin main.c) target_link_libraries(apfl-bin PUBLIC apfl) @@ -51,11 +60,16 @@ else() export(TARGETS apflc FILE "${CMAKE_BINARY_DIR}/ApflApflcNativeConfig.cmake") endif() -add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/mod_globals.c" - COMMAND apflc -c apfl_mod_globals "${CMAKE_CURRENT_SOURCE_DIR}/globals.apfl" "${CMAKE_CURRENT_BINARY_DIR}/mod_globals.c" - DEPENDS apflc "${CMAKE_CURRENT_SOURCE_DIR}/globals.apfl" -) +function(apfl_to_c apflfile cfile cfuncname) + add_custom_command( 
+ OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${cfile}" + COMMAND apflc -c ${cfuncname} "${CMAKE_CURRENT_SOURCE_DIR}/${apflfile}" "${CMAKE_CURRENT_BINARY_DIR}/${cfile}" + DEPENDS apflc "${CMAKE_CURRENT_SOURCE_DIR}/${apflfile}" + ) +endfunction() + +apfl_to_c(globals.apfl mod_globals.c apfl_mod_globals) +apfl_to_c(re.apfl mod_re.c apfl_mod_re) add_executable(functional-test-runner functional-test-runner.c) target_link_libraries(functional-test-runner PUBLIC apfl) @@ -121,6 +135,7 @@ functionaltest("symbols") functionaltest("get-optional") functionaltest("has-key") functionaltest("tonumber") +functionaltest("re") install(TARGETS apfl DESTINATION lib) install(TARGETS apfl-bin DESTINATION bin) diff --git a/src/apfl.h b/src/apfl.h index 9880dec..fcee367 100644 --- a/src/apfl.h +++ b/src/apfl.h @@ -687,6 +687,7 @@ enum apfl_result { struct apfl_config { struct apfl_allocator allocator; struct apfl_io_writer output_writer; + bool no_standard_modules; }; apfl_ctx apfl_ctx_new(struct apfl_config); @@ -697,6 +698,8 @@ typedef void (*apfl_panic_callback)(apfl_ctx, void *, enum apfl_result); void apfl_ctx_set_panic_callback(apfl_ctx, apfl_panic_callback, void *); +struct apfl_allocator apfl_get_allocator(apfl_ctx); + typedef struct apfl_iterative_runner_data *apfl_iterative_runner; apfl_iterative_runner apfl_iterative_runner_new(apfl_ctx, struct apfl_source_reader); @@ -741,8 +744,15 @@ void apfl_push_number(apfl_ctx, apfl_number); void apfl_push_string_view_copy(apfl_ctx, struct apfl_string_view); // Push a constant string. void apfl_push_const_string(apfl_ctx, const char *); +// Move a string onto the stack as a new string value. +// Returns false on error, you should then clean up string manually and throw an +// allocation error. +bool apfl_move_string_onto_stack(apfl_ctx ctx, struct apfl_string string); // Push a C symbol onto the stack. void apfl_push_csymbol(apfl_ctx, apfl_cfunc, const char *); +// Returns the csymbol function or NULL, if the element was no csymbol. 
+// The value is popped from the stack in any case. +apfl_cfunc apfl_pop_csymbol(apfl_ctx, apfl_stackidx); // Push a symbol onto the stack. s is a string value that will be popped from the stack. void apfl_push_symbol(apfl_ctx, apfl_stackidx s); // Push an anonymous symbol onto the stack. @@ -804,6 +814,12 @@ void apfl_cfunc_self_getslot(apfl_ctx, apfl_slotidx); void apfl_cfunc_setslot(apfl_ctx, apfl_stackidx cfunc, apfl_slotidx, apfl_stackidx value); void apfl_cfunc_self_setslot(apfl_ctx, apfl_slotidx, apfl_stackidx value); +typedef void (*apfl_cfunc_defer_callback)(apfl_ctx, void *); + +// Run a callback at the end of a cfunc. Callbacks added with this will be run +// in the reverse order upon returning from the cfunc. +void apfl_cfunc_defer(apfl_ctx, apfl_cfunc_defer_callback, void *); + void apfl_push_userdata(apfl_ctx, void *); void *apfl_get_userdata(apfl_ctx, apfl_stackidx); diff --git a/src/context.c b/src/context.c index 6618d67..decddc3 100644 --- a/src/context.c +++ b/src/context.c @@ -35,13 +35,46 @@ struct protected_errcallback_data { void (*errcallback)(apfl_ctx, void *); }; -void +static void protected_errcallback(apfl_ctx ctx, void *opaque) { struct protected_errcallback_data *data = opaque; data->errcallback(ctx, data->opaque_outer); } +static void +protected_run_deferreds(apfl_ctx ctx, void *opaque) +{ + struct call_stack_entry *cse = opaque; + apfl_cfunc_run_deferred(ctx, cse); +} + +static void +protected_in_error_handling( + apfl_ctx ctx, + void (*callback)(apfl_ctx, void *), + void *opaque, + enum apfl_result *result, + bool *with_error_on_stack +) { + switch (apfl_do_protected(ctx, callback, opaque, NULL)) { + case APFL_RESULT_OK: + break; + case APFL_RESULT_ERR: + *result = APFL_RESULT_ERRERR; + *with_error_on_stack = false; + break; + case APFL_RESULT_ERRERR: + *result = APFL_RESULT_ERRERR; + *with_error_on_stack = false; + break; + case APFL_RESULT_ERR_ALLOC: + *result = APFL_RESULT_ERR_ALLOC; + *with_error_on_stack = false; + break; + } 
+} + enum apfl_result apfl_do_protected( apfl_ctx ctx, @@ -72,22 +105,7 @@ apfl_do_protected( .opaque_outer = opaque, .errcallback = errcallback, }; - switch (apfl_do_protected(ctx, protected_errcallback, &data, NULL)) { - case APFL_RESULT_OK: - break; - case APFL_RESULT_ERR: - result = APFL_RESULT_ERRERR; - with_error_on_stack = false; - break; - case APFL_RESULT_ERRERR: - result = APFL_RESULT_ERRERR; - with_error_on_stack = false; - break; - case APFL_RESULT_ERR_ALLOC: - result = APFL_RESULT_ERR_ALLOC; - with_error_on_stack = false; - break; - } + protected_in_error_handling(ctx, protected_errcallback, &data, &result, &with_error_on_stack); } struct apfl_value err; @@ -104,8 +122,12 @@ apfl_do_protected( } assert(callstack_len <= ctx->call_stack.cap); - for (size_t i = callstack_len; i < ctx->call_stack.len; i++) { - apfl_call_stack_entry_deinit(ctx->gc.allocator, &ctx->call_stack.items[i]); + for (size_t i = ctx->call_stack.len; i-- > callstack_len; ) { + struct call_stack_entry *cse = &ctx->call_stack.items[i]; + if (cse->type == APFL_CSE_CFUNCTION) { + protected_in_error_handling(ctx, protected_run_deferreds, cse, &result, &with_error_on_stack); + } + apfl_call_stack_entry_deinit(ctx->gc.allocator, cse); } bool ok = apfl_resizable_resize( @@ -764,6 +786,12 @@ func_call_stack_entry_deinit(struct apfl_allocator allocator, struct func_call_s FREE_LIST(allocator, cse->matcher_stack.items, cse->matcher_stack.cap); } +static void +cfunc_call_stack_entry_deinit(struct apfl_allocator allocator, struct cfunc_call_stack_entry *cse) +{ + FREE_LIST(allocator, cse->deferred_list, cse->deferred_cap); +} + void apfl_matcher_call_stack_entry_deinit(struct apfl_allocator allocator, struct matcher_call_stack_entry *cse) { @@ -785,6 +813,8 @@ apfl_call_stack_entry_deinit(struct apfl_allocator allocator, struct call_stack_ func_call_stack_entry_deinit(allocator, &entry->func); break; case APFL_CSE_CFUNCTION: + cfunc_call_stack_entry_deinit(allocator, &entry->cfunc); + break; case 
APFL_CSE_FUNCTION_DISPATCH: break; case APFL_CSE_MATCHER: @@ -816,15 +846,8 @@ init_globals_protected(apfl_ctx ctx, void *opaque) { (void)opaque; - struct apfl_io_string_reader_data reader = apfl_io_string_reader_create(apfl_mod_globals()); - struct apfl_io_reader r = apfl_io_string_reader(&reader); - apfl_load_bytecode(ctx, r); - apfl_list_create(ctx, 0); - apfl_call(ctx, -2, -1); - apfl_list_create(ctx, 1); apfl_builtins(ctx); - apfl_list_append(ctx, -2, -1); - apfl_call(ctx, -2, -1); + apfl_build_native_and_bytecode_combined_module(ctx, -1, apfl_mod_globals()); struct apfl_value val = apfl_stack_must_get(ctx, -1); if (val.type != VALUE_DICT) { @@ -846,6 +869,15 @@ init_globals_protected(apfl_ctx ctx, void *opaque) } } +static void +init_standard_modules_protected(apfl_ctx ctx, void *opaque) +{ + (void)opaque; + + apfl_push_cfunc(ctx, apfl_module_re, 0); + apfl_modules_register(ctx, "re", -1); +} + #define DEBUG_INIT_GLOBALS 1 #if DEBUG_INIT_GLOBALS @@ -892,6 +924,12 @@ init_globals(apfl_ctx ctx) return apfl_do_protected(ctx, init_globals_protected, NULL, INIT_GLOBALS_ERRCALLBACK) == APFL_RESULT_OK; } +static bool +init_standard_modules(apfl_ctx ctx) +{ + return apfl_do_protected(ctx, init_standard_modules_protected, NULL, INIT_GLOBALS_ERRCALLBACK) == APFL_RESULT_OK; +} + apfl_ctx apfl_ctx_new(struct apfl_config config) { @@ -927,6 +965,10 @@ apfl_ctx_new(struct apfl_config config) goto error; } + if (!config.no_standard_modules && !init_standard_modules(ctx)) { + goto error; + } + return ctx; error: @@ -1311,6 +1353,16 @@ apfl_push_csymbol(apfl_ctx ctx, apfl_cfunc id, const char *name) }); } +apfl_cfunc +apfl_pop_csymbol(apfl_ctx ctx, apfl_stackidx idx) +{ + struct apfl_value val = apfl_stack_must_pop(ctx, idx); + + return val.type == VALUE_CSYMBOL + ? 
+ val.csymbol.id + : NULL; +} + static void push_symbol_inner(apfl_ctx ctx, apfl_stackidx idx) { @@ -2385,3 +2437,45 @@ apfl_load_bytecode(apfl_ctx ctx, struct apfl_io_reader r) load_bytecode_inner(ctx, r); apfl_gc_tmproots_restore(&ctx->gc, tmproots); } + +struct apfl_allocator +apfl_get_allocator(apfl_ctx ctx) +{ + return ctx->gc.allocator; +} + +void +apfl_cfunc_defer(apfl_ctx ctx, apfl_cfunc_defer_callback cb, void *opaque) +{ + struct call_stack_entry *cse = apfl_call_stack_cur_entry(ctx); + if (cse == NULL || cse->type != APFL_CSE_CFUNCTION) { + apfl_raise_const_error(ctx, "apfl_cfunc_defer must be called from within a cfunc"); + } + + if (!apfl_resizable_append( + ctx->gc.allocator, + sizeof(struct cfunc_deferred), + (void **)&cse->cfunc.deferred_list, + &cse->cfunc.deferred_len, + &cse->cfunc.deferred_cap, + &(struct cfunc_deferred) { + .cb = cb, + .opaque = opaque, + }, + 1 + )) { + apfl_raise_alloc_error(ctx); + } +} + +void +apfl_cfunc_run_deferred(apfl_ctx ctx, struct call_stack_entry *cse) +{ + assert(cse != NULL); + assert(cse->type == APFL_CSE_CFUNCTION); + + while (cse->cfunc.deferred_len > 0) { + struct cfunc_deferred deferred = cse->cfunc.deferred_list[--cse->cfunc.deferred_len]; + deferred.cb(ctx, deferred.opaque); + } +} diff --git a/src/context.h b/src/context.h index a2666ba..290fe59 100644 --- a/src/context.h +++ b/src/context.h @@ -64,8 +64,17 @@ struct func_call_stack_entry { bool matcher_result; }; +struct cfunc_deferred { + apfl_cfunc_defer_callback cb; + void *opaque; +}; + struct cfunc_call_stack_entry { struct cfunction *func; + + struct cfunc_deferred *deferred_list; + size_t deferred_len; + size_t deferred_cap; }; enum matcher_mode { @@ -162,6 +171,8 @@ struct apfl_ctx_data { void apfl_matcher_call_stack_entry_deinit(struct apfl_allocator, struct matcher_call_stack_entry *); void apfl_call_stack_entry_deinit(struct apfl_allocator, struct call_stack_entry *); +void apfl_cfunc_run_deferred(apfl_ctx ctx, struct call_stack_entry *cse); + 
struct stack apfl_stack_new(void); bool apfl_stack_push(apfl_ctx, struct apfl_value); @@ -176,7 +187,6 @@ bool apfl_stack_drop(apfl_ctx, apfl_stackidx); bool apfl_stack_drop_multi(apfl_ctx ctx, size_t count, apfl_stackidx *indices); void apfl_stack_clear(apfl_ctx); struct apfl_value *apfl_stack_push_placeholder(apfl_ctx); -bool apfl_move_string_onto_stack(apfl_ctx, struct apfl_string); // Like apfl_tostring, but ensures it's a dynamically allocated string and returns the underlying string. struct apfl_string *apfl_to_dynamic_string(apfl_ctx ctx, apfl_stackidx index); diff --git a/src/eval.c b/src/eval.c index a4805c4..fb42065 100644 --- a/src/eval.c +++ b/src/eval.c @@ -509,10 +509,14 @@ call_inner(apfl_ctx ctx, size_t tmproots, apfl_stackidx func_index, apfl_stackid .stack = apfl_stack_new(), .cfunc = { .func = func.cfunc, + .deferred_list = NULL, + .deferred_len = 0, + .deferred_cap = 0, }, }); func.cfunc->func(ctx); + apfl_cfunc_run_deferred(ctx, apfl_call_stack_cur_entry(ctx)); return_from_function(ctx); break; default: diff --git a/src/functional-tests/re.at b/src/functional-tests/re.at new file mode 100644 index 0000000..0a21b90 --- /dev/null +++ b/src/functional-tests/re.at @@ -0,0 +1,69 @@ +===== script ===== +re := import 're + +match := { ~args -> + m := re.match ~args + if (== nil m) { + print "-- no match --" + } { + print "-- match --" + keach m { + 0 _ -> + i s -> print (& i ":" s) + } + } +} + +match "^f(o*)" "foooo" +match "^f(o*)" "FoOoo" +match "^f(o*)"::'i "FoOoo" +match "^f(o*)" "f" +match "^f(o*)" 1 "foooo" + +print "" + +replace := { ~args -> + print (re.replace ~args) +} + +replace "(x+)" { _ x -> len x } "" +replace "(x+)" { _ x -> len x } "xxx" +replace "(x+)" { _ x -> len x } "xxxyxyxx" +replace "(x+)" { _ x -> len x } 2 "xxxyxyxx" +replace "\\[(\\w+)\\]" "<$1>" "[foo] [bar]" + +print "" + +match-all := { ~args -> + print "-----" + each (re.match-all ~args) { m -> + print (& "- [" (join " ; " m) "]") + } +} + +match-all "f(\\w+)" "afoobar 
fizz abcdefg f" +match-all "^(\\w+)\\s*=\\s*(.*)$"::'m "foo= bar\nbar = 123" + +===== output ===== +-- match -- +1:oooo +-- no match -- +-- match -- +1:oOoo +-- match -- +1: +-- no match -- + + +3 +3y1y2 +3y1yxx + + +----- +- [foobar ; oobar] +- [fizz ; izz] +- [fg ; g] +----- +- [foo= bar ; foo ; bar] +- [bar = 123 ; bar ; 123] diff --git a/src/globals.apfl b/src/globals.apfl index 3d64923..47ecce1 100644 --- a/src/globals.apfl +++ b/src/globals.apfl @@ -115,12 +115,12 @@ substr := { start s -> substr start (len s) s - start?(is < 0) end s -> - substr (+ start (len s)) end s - start end?(is < 0) s -> - substr start (+ end (- (len s) start)) s - start end s -> - -raw-substring s start end + start?(is < 0) newlen s -> + substr (+ start (len s)) newlen s + start newlen?(is < 0) s -> + substr start (+ newlen (- (len s) start)) s + start newlen s -> + -raw-substring s start newlen } -raw-stringsearch := builtins.stringsearch @@ -259,13 +259,15 @@ add-searcher { m -> unwrap-some (get-optional m loaded-modules) { mod -> - { Some::mod } + Some::{ Some::mod } } } add-searcher { m -> unwrap-some (builtins.cmod-searcher m) { loader -> - Some::(loader) + Some::{ + Some::(loader) + } } } @@ -301,6 +303,12 @@ modules }) + map := { + _ [] -> [] + f [x ~xs] -> + [(f x) ~(map f xs)] + } + # Dictionary of exported functions [ 'if -> if @@ -365,5 +373,6 @@ 'find-first -> find-first 'unwrap-some -> unwrap-some 'import -> modules.import + 'map -> map ] } diff --git a/src/modules.c b/src/modules.c new file mode 100644 index 0000000..b163d08 --- /dev/null +++ b/src/modules.c @@ -0,0 +1,21 @@ +#include "apfl.h" + +#include "context.h" + +void +apfl_build_native_and_bytecode_combined_module( + apfl_ctx ctx, + apfl_stackidx native, + struct apfl_string_view bytecode +) { + apfl_move_to_top_of_stack(ctx, native); + + struct apfl_io_string_reader_data reader = apfl_io_string_reader_create(bytecode); + struct apfl_io_reader r = apfl_io_string_reader(&reader); + apfl_load_bytecode(ctx, r); + 
apfl_list_create(ctx, 0); + apfl_call(ctx, -2, -1); + apfl_list_create(ctx, 1); + apfl_list_append(ctx, -1, -3); + apfl_call(ctx, -2, -1); +} diff --git a/src/modules.h b/src/modules.h index 9d36dc1..4c6c113 100644 --- a/src/modules.h +++ b/src/modules.h @@ -7,7 +7,17 @@ extern "C" { #include "apfl.h" +void +apfl_build_native_and_bytecode_combined_module( + apfl_ctx ctx, + apfl_stackidx native, + struct apfl_string_view bytecode +); + struct apfl_string_view apfl_mod_globals(void); +struct apfl_string_view apfl_mod_re(void); + +void apfl_module_re(apfl_ctx); #ifdef __cplusplus } diff --git a/src/re.apfl b/src/re.apfl new file mode 100644 index 0000000..7ed9011 --- /dev/null +++ b/src/re.apfl @@ -0,0 +1,97 @@ +{ C -> + + toflags := { s -> + flags := [{}] + add := { f -> flags = [~flags f] } + + for (len s) { i -> + { + 'i -> add C.CASELESS + 'm -> add C.MULTILINE + 'u -> add C.UTF + 'x -> add C.EXTENDED + 'U -> add C.UNGREEDY + _ -> + } (substr i 1 s) + } + } + + with := { + regex::flags?(has type 'string) body -> + with regex::(toflags flags) body + regex::flags body -> + r := C.compile regex flags + out := body r + C.close r + out + regex body -> + with regex::[] body + } + + match := { + regex off subject -> + with regex { r -> + C.match r subject off + } + regex subject -> + match regex 0 subject + } + + Plain := (symbol) + Var := (symbol) + + replacement-to-function := { + replacement?(has type 'string) -> + m := nil + parts = [] + while { m = match "^(.*?)\\$(\\d)(.*)$" replacement } { + [_ head n replacement] = m + parts = [~parts Plain::head Var::(tonumber n)] + } + parts = [~parts Plain::replacement] + + { ~m -> + join "" (map { + Plain:s -> s + Var:n -> m@n + } parts) + } + replacement -> replacement + } + + replace-aux := { regex replacement countcond subject -> + with regex { r -> + C.replace r replacement countcond subject + } + } + + replace := { + regex replacement count subject -> + replace-aux regex (replacement-to-function replacement) (is < count) 
subject + regex replacement subject -> + replace-aux regex (replacement-to-function replacement) {true} subject + } + + match-all := { + regex off subject -> + with regex { r -> + C.match-all r subject off + } + regex subject -> + match-all regex 0 subject + } + + [ + 'match -> match + 'replace -> replace + 'match-all -> match-all + + 'CASELESS -> C.CASELESS + 'DOTALL -> C.DOTALL + 'EXTENDED -> C.EXTENDED + 'MULTILINE -> C.MULTILINE + 'NEVER_UTF -> C.NEVER_UTF + 'UNGREEDY -> C.UNGREEDY + 'UTF -> C.UTF + ] +} diff --git a/src/re.c b/src/re.c new file mode 100644 index 0000000..063fcda --- /dev/null +++ b/src/re.c @@ -0,0 +1,642 @@ +#include +#include + +#include "apfl.h" + +#include "alloc.h" +#include "modules.h" + +#define PCRE2_CODE_UNIT_WIDTH 8 +#include + +struct contexts_for_pcre { + pcre2_general_context *gcontext; + pcre2_compile_context *ccontext; +}; + +static void +onbeforecollect_contexts(void *opaque) +{ + struct contexts_for_pcre *contexts = opaque; + + if (contexts->ccontext != NULL) { + pcre2_compile_context_free(contexts->ccontext); + } + + if (contexts->gcontext != NULL) { + pcre2_general_context_free(contexts->gcontext); + } +} + +static const struct apfl_native_object_type contexts_type = { + .size = sizeof(struct contexts_for_pcre), + .onbeforecollect = onbeforecollect_contexts, +}; + +static void +free_code(pcre2_code **codeptr) +{ + if (*codeptr != NULL) { + pcre2_code_free(*codeptr); + *codeptr = NULL; + } +} + +static void +onbeforecollect_code(void *opaque) +{ + pcre2_code **code = opaque; + free_code(code); +} + +static const struct apfl_native_object_type code_type = { + .size = sizeof(pcre2_code *), + .onbeforecollect = onbeforecollect_code, +}; + +static const size_t extrasize_for_allocsize = ((sizeof(size_t) / _Alignof(max_align_t)) + 1) * _Alignof(max_align_t); + +static void * +gcontext_malloc(PCRE2_SIZE size, void *opaque) +{ + apfl_ctx ctx = opaque; + char *mem = ALLOC_BYTES( + apfl_get_allocator(ctx), + extrasize_for_allocsize + 
size + ); + if (mem == NULL) { + return NULL; + } + *((size_t *)mem) = size; + return mem + extrasize_for_allocsize; +} + +static void +gcontext_free(void *ptr, void *opaque) +{ + if (ptr == NULL) { + return; + } + + apfl_ctx ctx = opaque; + + char *mem = ptr; + mem -= extrasize_for_allocsize; + + size_t size = *((size_t *)mem); + + FREE_BYTES(apfl_get_allocator(ctx), mem, size + extrasize_for_allocsize); +} + +static int contexts_registry_key; + +static APFL_DEFINE_CSYMBOL(sym_caseless, "CASELESS") +static APFL_DEFINE_CSYMBOL(sym_dotall, "DOTALL") +static APFL_DEFINE_CSYMBOL(sym_extended, "EXTENDED") +static APFL_DEFINE_CSYMBOL(sym_multiline, "MULTILINE") +static APFL_DEFINE_CSYMBOL(sym_never_utf, "NEVER_UTF") +static APFL_DEFINE_CSYMBOL(sym_ungreedy, "UNGREEDY") +static APFL_DEFINE_CSYMBOL(sym_utf, "UTF") + +static uint32_t +options_from_list(apfl_ctx ctx, apfl_stackidx list) +{ + uint32_t options = 0; + + apfl_move_to_top_of_stack(ctx, list); + + if (apfl_get_type(ctx, -1) != APFL_VALUE_LIST) { + apfl_raise_const_error(ctx, "Expected an options list"); + } + + size_t len = apfl_len(ctx, -1); + for (size_t i = 0; i < len; i++) { + apfl_get_list_member_by_index(ctx, -1, i); + apfl_cfunc sym = apfl_pop_csymbol(ctx, -1); + if (sym == sym_caseless) { + options |= PCRE2_CASELESS; + } else if (sym == sym_dotall) { + options |= PCRE2_DOTALL; + } else if (sym == sym_extended) { + options |= PCRE2_EXTENDED; + } else if (sym == sym_multiline) { + options |= PCRE2_MULTILINE; + } else if (sym == sym_never_utf) { + options |= PCRE2_NEVER_UTF; + } else if (sym == sym_ungreedy) { + options |= PCRE2_UNGREEDY; + } else if (sym == sym_utf) { + options |= PCRE2_UTF; + } + } + + apfl_drop(ctx, -1); + + return options; +} + +static struct contexts_for_pcre * +get_contexts(apfl_ctx ctx) +{ + if (!apfl_registry_try_get(ctx, &contexts_registry_key, 0)) { + apfl_raise_const_error(ctx, "Module not initialized correctly"); + } + + struct contexts_for_pcre *contexts = 
apfl_get_native_object(ctx, &contexts_type, -1); + + // We can safely drop the stack element and still have a valid pointer to + // the contexts, because it's still kept alive by the registry entry. + apfl_drop(ctx, -1); + + if (contexts->gcontext == NULL || contexts->ccontext == NULL) { + apfl_raise_const_error(ctx, "Module not initialized correctly"); + } + + return contexts; +} + +#define BUFSIZE 200 + +noreturn static void +raise_pcre2_error(apfl_ctx ctx, int errorcode) +{ + struct apfl_allocator allocator = apfl_get_allocator(ctx); + + unsigned char *buf = ALLOC_BYTES(allocator, BUFSIZE); + if (buf == NULL) { + apfl_raise_alloc_error(ctx); + } + int len = pcre2_get_error_message(errorcode, buf, BUFSIZE); + switch (len) { + case PCRE2_ERROR_BADDATA: + FREE_BYTES(allocator, buf, BUFSIZE); + apfl_raise_const_error(ctx, "Unknown PCRE error"); + break; + case PCRE2_ERROR_NOMEMORY: + FREE_BYTES(allocator, buf, BUFSIZE); + apfl_raise_const_error(ctx, "PCRE error does not fit"); + break; + default: + if (!apfl_move_string_onto_stack(ctx, (struct apfl_string) { + .bytes = buf, + .len = len, + .cap = BUFSIZE, + })) { + FREE_BYTES(allocator, buf, BUFSIZE); + apfl_raise_alloc_error(ctx); + } else { + apfl_raise_error(ctx, -1); + } + } +} + +static void +compile(apfl_ctx ctx) +{ + if (apfl_len(ctx, 0) != 2) { + apfl_raise_const_error(ctx, "compile needs 2 arguments"); + } + + apfl_get_list_member_by_index(ctx, 0, 0); + struct apfl_string_view s = apfl_get_string(ctx, -1); + + apfl_get_list_member_by_index(ctx, 0, 1); + uint32_t options = options_from_list(ctx, -1); + + apfl_drop(ctx, 0); + + struct contexts_for_pcre *contexts = get_contexts(ctx); + + int errorcode; + PCRE2_SIZE erroroffset; + pcre2_code *code = pcre2_compile( + s.bytes, + s.len, + options, + &errorcode, + &erroroffset, + contexts->ccontext + ); + + if (code == NULL) { + raise_pcre2_error(ctx, errorcode); + } + + pcre2_code **codeptr = apfl_push_native_object(ctx, &code_type); + *codeptr = code; +} + 
+static void +close_code(apfl_ctx ctx) +{ + if (apfl_len(ctx, 0) != 1) { + apfl_raise_const_error(ctx, "close needs exactly one argument"); + } + apfl_get_list_member_by_index(ctx, 0, 0); + apfl_drop(ctx, 0); + pcre2_code **codeptr = apfl_get_native_object(ctx, &code_type, -1); + free_code(codeptr); + apfl_drop(ctx, -1); +} + +static void +cleanup_match_data(apfl_ctx ctx, void *opaque) +{ + (void)ctx; + pcre2_match_data *md = opaque; + pcre2_match_data_free(md); +} + +static pcre2_code * +get_unclosed_code(apfl_ctx ctx, apfl_stackidx index) +{ + pcre2_code **codeptr = apfl_get_native_object(ctx, &code_type, index); + if (codeptr == NULL || *codeptr == NULL) { + apfl_raise_const_error(ctx, "pcre2 code already closed"); + } + + return *codeptr; +} + +static pcre2_match_data * +create_md(apfl_ctx ctx, pcre2_code *code) +{ + struct contexts_for_pcre *contexts = get_contexts(ctx); + pcre2_match_data *md = pcre2_match_data_create_from_pattern(code, contexts->gcontext); + if (md == NULL) { + apfl_raise_alloc_error(ctx); + } + apfl_cfunc_defer(ctx, cleanup_match_data, md); + + return md; +} + +static void +advance_utf8_rune(struct apfl_string_view sv, size_t *offset) +{ + while (*offset < sv.len && (sv.bytes[*offset] & 0xC0) == 0x80) { + (*offset)++; + } +} + +struct iter_match_patterninfo { + bool is_utf8; + bool crlf_is_newline; +}; + +static struct iter_match_patterninfo +prepare_iter_match(pcre2_code *code) +{ + uint32_t bits; + struct iter_match_patterninfo out; + + (void)pcre2_pattern_info(code, PCRE2_INFO_ALLOPTIONS, &bits); + out.is_utf8 = (bits & PCRE2_UTF) != 0; + + (void)pcre2_pattern_info(code, PCRE2_INFO_NEWLINE, &bits); + out.crlf_is_newline = bits == PCRE2_NEWLINE_ANY + || bits == PCRE2_NEWLINE_CRLF + || bits == PCRE2_NEWLINE_ANYCRLF; + + return out; +} + +static int +iter_match( + apfl_ctx ctx, + struct apfl_string_view subject, + pcre2_code *code, + pcre2_match_data *md, + PCRE2_SIZE offset, + struct iter_match_patterninfo patterninfo, + size_t i +) { + PCRE2_SIZE *ovector = 
pcre2_get_ovector_pointer(md); + PCRE2_SIZE offset_unadjusted; + + if (i > 0) { + offset_unadjusted = ovector[1]; + } +again:; + + uint32_t options = 0; + bool last_match_was_empty = false; + if (i > 0) { + // Handle subsequent runs. Pretty much copied from pcre2demo.c + + offset = offset_unadjusted; + + if (ovector[0] == offset_unadjusted) { + last_match_was_empty = true; + if (ovector[0] == subject.len) { + return -1; + } + options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + } else { + PCRE2_SIZE startchar = pcre2_get_startchar(md); + if (offset <= startchar) { + if (startchar >= subject.len) { + return -1; + } + offset = startchar + 1; + + if (patterninfo.is_utf8) { + advance_utf8_rune(subject, &offset); + } + } + } + } + + int rc = pcre2_match( + code, + subject.bytes, + subject.len, + (PCRE2_SIZE)offset, + options, + md, + NULL + ); + + if (rc > 0) { + return rc; + } + + switch(rc) { + case 0: + // Happens if the ovector was not large enough, which should never + // happen, as we've created the match data from the pattern. + apfl_raise_const_error(ctx, "ovector too small. 
This should not have happened :("); + return -1; + case PCRE2_ERROR_NOMATCH: + if (i == 0 || !last_match_was_empty) { + return -1; + } + + PCRE2_SIZE new_offset = ovector[1] + 1; + if ( + patterninfo.crlf_is_newline + && offset + 1 < subject.len + && subject.bytes[offset] == '\r' + && subject.bytes[offset+1] == '\n' + ) { + new_offset++; + } else if (patterninfo.is_utf8) { + advance_utf8_rune(subject, &new_offset); + } + + offset_unadjusted = new_offset; + + goto again; + default: + raise_pcre2_error(ctx, rc); + return -1; + } +} + +static void +build_matches_list( + apfl_ctx ctx, + struct apfl_string_view sv, + int rc, + PCRE2_SIZE *ovector +) { + apfl_list_create(ctx, rc); + + for (int i = 0; i < rc; i++) { + size_t a = (size_t)ovector[i*2]; + size_t b = (size_t)ovector[i*2+1]; + + if (a > b) { + apfl_push_const_string(ctx, ""); + } else { + size_t newlen = b - a; + apfl_push_string_view_copy( + ctx, + apfl_string_view_substr(sv, a, newlen) + ); + } + + apfl_list_append(ctx, -2, -1); + } +} + +static void +match(apfl_ctx ctx) +{ + if (apfl_len(ctx, 0) != 3) { + apfl_raise_const_error(ctx, "match expects exactly 3 arguments"); + } + + apfl_get_list_member_by_index(ctx, 0, 0); + pcre2_code *code = get_unclosed_code(ctx, -1); + + apfl_get_list_member_by_index(ctx, 0, 1); + struct apfl_string_view subject = apfl_get_string(ctx, -1); + + apfl_get_list_member_by_index(ctx, 0, 2); + PCRE2_SIZE offset = (PCRE2_SIZE)apfl_get_number(ctx, -1); + + apfl_drop(ctx, 0); + + pcre2_match_data *md = create_md(ctx, code); + + struct iter_match_patterninfo patterninfo = prepare_iter_match(code); + int rc = iter_match(ctx, subject, code, md, offset, patterninfo, 0); + if (rc < 0) { + apfl_push_nil(ctx); + return; + } + + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md); + assert(ovector[0] <= ovector[1]); + + build_matches_list(ctx, subject, rc, ovector); +} + +static void +cleanup_string_builder(apfl_ctx ctx, void *opaque) +{ + struct apfl_string_builder *sb = opaque; + 
apfl_string_builder_deinit(sb); + FREE_OBJ(apfl_get_allocator(ctx), sb); +} + +static struct apfl_string_builder * +create_string_builder(apfl_ctx ctx) +{ + struct apfl_allocator allocator = apfl_get_allocator(ctx); + struct apfl_string_builder *sb = ALLOC_OBJ(allocator, struct apfl_string_builder); + if (sb == NULL) { + apfl_raise_alloc_error(ctx); + } + *sb = apfl_string_builder_init(allocator); + apfl_cfunc_defer(ctx, cleanup_string_builder, sb); + return sb; +} + +static bool +do_countcheck(apfl_ctx ctx, apfl_stackidx func, size_t i) +{ + apfl_copy(ctx, func); + apfl_list_create(ctx, 1); + apfl_push_number(ctx, (apfl_number)i); + apfl_list_append(ctx, -2, -1); + apfl_call(ctx, -2, -1); + return apfl_is_truthy(ctx, -1); +} + +static void +replace(apfl_ctx ctx) +{ + if (apfl_len(ctx, 0) != 4) { + apfl_raise_const_error(ctx, "replace expects exactly 4 arguments"); + } + + apfl_get_list_member_by_index(ctx, 0, 0); + pcre2_code *code = get_unclosed_code(ctx, -1); + apfl_get_list_member_by_index(ctx, 0, 1); + const apfl_stackidx replace = 1; + apfl_get_list_member_by_index(ctx, 0, 2); + const apfl_stackidx countcheck = 2; + apfl_get_list_member_by_index(ctx, 0, 3); + struct apfl_string_view subject = apfl_get_string(ctx, -1); + apfl_drop(ctx, 0); + + struct apfl_string_builder *sb = create_string_builder(ctx); + + pcre2_match_data *md = create_md(ctx, code); + PCRE2_SIZE offset = 0; + PCRE2_SIZE old_offset; + struct iter_match_patterninfo patterninfo = prepare_iter_match(code); + for (size_t i = 0; do_countcheck(ctx, countcheck, i); i++) { + old_offset = offset; + int rc = iter_match(ctx, subject, code, md, offset, patterninfo, i); + + if (rc < 0) { + break; + } + + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md); + + if (!apfl_string_builder_append(sb, apfl_string_view_substr( + subject, + old_offset, + ovector[0] - old_offset + ))) { + apfl_raise_alloc_error(ctx); + } + + apfl_copy(ctx, replace); + build_matches_list(ctx, subject, rc, ovector); + apfl_call(ctx, 
-2, -1); + apfl_tostring(ctx, -1); + struct apfl_string_view replacement = apfl_get_string(ctx, -1); + + if (!apfl_string_builder_append(sb, replacement)) { + apfl_raise_alloc_error(ctx); + } + + offset = ovector[1]; + } + + if (!apfl_string_builder_append(sb, apfl_string_view_substr( + subject, + offset, + subject.len - offset + ))) { + apfl_raise_alloc_error(ctx); + } + + struct apfl_string str = apfl_string_builder_move_string(sb); + if (!apfl_move_string_onto_stack(ctx, str)) { + apfl_raise_alloc_error(ctx); + } +} + +static void +match_all(apfl_ctx ctx) +{ + if (apfl_len(ctx, 0) != 3) { + apfl_raise_const_error(ctx, "match-all expects exactly 3 arguments"); + } + + apfl_get_list_member_by_index(ctx, 0, 0); + pcre2_code *code = get_unclosed_code(ctx, -1); + + apfl_get_list_member_by_index(ctx, 0, 1); + struct apfl_string_view subject = apfl_get_string(ctx, -1); + + apfl_get_list_member_by_index(ctx, 0, 2); + PCRE2_SIZE offset = (PCRE2_SIZE)apfl_get_number(ctx, -1); + + apfl_drop(ctx, 0); + + apfl_list_create(ctx, 0); + + pcre2_match_data *md = create_md(ctx, code); + + struct iter_match_patterninfo patterninfo = prepare_iter_match(code); + int rc; + for ( + int i = 0; + (rc = iter_match(ctx, subject, code, md, offset, patterninfo, i)) >= 0; + i++ + ) { + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md); + build_matches_list(ctx, subject, rc, ovector); + apfl_list_append(ctx, -2, -1); + } +} + +static void +add_sym_to_mod(apfl_ctx ctx, apfl_cfunc sym, const char *name) +{ + sym(ctx); + apfl_push_const_string(ctx, name); + apfl_dict_set(ctx, -3, -1, -2); +} + +static void +add_func_to_mod(apfl_ctx ctx, apfl_cfunc func, const char *name) +{ + apfl_push_cfunc(ctx, func, 0); + apfl_push_const_string(ctx, name); + apfl_set_func_name(ctx, -2, -1); + apfl_push_const_string(ctx, name); + apfl_dict_set(ctx, -3, -1, -2); +} + +void +apfl_module_re(apfl_ctx ctx) +{ + struct contexts_for_pcre *contexts = apfl_push_native_object(ctx, &contexts_type); + contexts->gcontext 
= NULL; + contexts->ccontext = NULL; + + if ((contexts->gcontext = pcre2_general_context_create(gcontext_malloc, gcontext_free, ctx)) == NULL) { + apfl_raise_alloc_error(ctx); + } + if ((contexts->ccontext = pcre2_compile_context_create(contexts->gcontext)) == NULL) { + apfl_raise_alloc_error(ctx); + } + + apfl_registry_set(ctx, &contexts_registry_key, 0, -1); + + apfl_dict_create(ctx); + + add_sym_to_mod(ctx, sym_caseless, "CASELESS"); + add_sym_to_mod(ctx, sym_dotall, "DOTALL"); + add_sym_to_mod(ctx, sym_extended, "EXTENDED"); + add_sym_to_mod(ctx, sym_multiline, "MULTILINE"); + add_sym_to_mod(ctx, sym_never_utf, "NEVER_UTF"); + add_sym_to_mod(ctx, sym_ungreedy, "UNGREEDY"); + add_sym_to_mod(ctx, sym_utf, "UTF"); + + add_func_to_mod(ctx, compile, "compile"); + add_func_to_mod(ctx, close_code, "close"); + add_func_to_mod(ctx, match, "match"); + add_func_to_mod(ctx, replace, "replace"); + add_func_to_mod(ctx, match_all, "match-all"); + + apfl_build_native_and_bytecode_combined_module(ctx, -1, apfl_mod_re()); +} diff --git a/webpage/.gitignore b/webpage/.gitignore index 0aa5503..749f2ee 100644 --- a/webpage/.gitignore +++ b/webpage/.gitignore @@ -1,2 +1,3 @@ build/ build-native/ +deps/ diff --git a/webpage/build.sh b/webpage/build.sh index 3786509..f390c88 100755 --- a/webpage/build.sh +++ b/webpage/build.sh @@ -1,5 +1,8 @@ #!/bin/sh set -e + +PCRE2VER=10.42 + cd playground rm -rf build-native mkdir build-native @@ -7,10 +10,25 @@ cd build-native cmake ../../../CMakeLists.txt make -j"$(nproc)" apflc cd .. + +rm -rf deps +mkdir deps +cd deps +curl -L -o pcre2.tar.bz2 "https://github.com/PCRE2Project/pcre2/releases/download/pcre2-${PCRE2VER}/pcre2-${PCRE2VER}.tar.bz2" +tar xjf pcre2.tar.bz2 +cd "pcre2-${PCRE2VER}" +mkdir build +cd build +emcmake cmake -DCMAKE_INSTALL_PREFIX="$(pwd)/../../prefix" ../CMakeLists.txt +emmake make -j"$(nproc)" pcre2-8-static +emmake make -j"$(nproc)" install + +cd ../../.. 
+ rm -rf build mkdir build cd build -emcmake cmake -DCMAKE_C_FLAGS="-O2" -DBUILD_SHARED_LIBS=NO -DApflApflcNative_DIR="$(pwd)/../build-native/" ../../../CMakeLists.txt +emcmake cmake -DCMAKE_C_FLAGS="-O2" -DBUILD_SHARED_LIBS=NO -DApflApflcNative_DIR="$(pwd)/../build-native/" -DCMAKE_PREFIX_PATH="$(pwd)/../deps/prefix" ../../../CMakeLists.txt emmake make -j"$(nproc)" apfl cd .. -emcc -sASYNCIFY -O3 -oplayground.js playground.c build/src/libapfl.a +emcc -sASYNCIFY `PKG_CONFIG_PATH="$(pwd)/deps/prefix/lib/pkgconfig" pkg-config --static --cflags --libs libpcre2-8` -O3 -oplayground.js playground.c build/src/libapfl.a