pcre2: import version 10.35

Files removed as described in b418e36f2
This commit is contained in:
David Adam 2020-05-10 22:27:46 +08:00
parent 3252d0fd03
commit 73ecf1576b
21 changed files with 3039 additions and 2598 deletions

170
pcre2/CMakeLists.txt vendored
View file

@ -1,6 +1,5 @@
# CMakeLists.txt
#
#
# This file enables PCRE2 to be built with the CMake configuration and build
# tool. Download CMake in source or binary form from http://www.cmake.org/
# Converted to support PCRE2 from the original PCRE file, August 2014.
@ -85,6 +84,14 @@
# 2018-11-14 PH removed unnecessary checks for stdint.h and inttypes.h
# 2018-11-16 PH added PCRE2GREP_SUPPORT_CALLOUT_FORK support and tidied
# 2019-02-16 PH hacked to avoid CMP0026 policy issue (see comments below)
# 2020-03-16 PH renamed dftables as pcre2_dftables (as elsewhere)
# 2020-03-24 PH changed CMAKE_MODULE_PATH definition to add, not replace
# 2020-04-08 Carlo added function check for secure_getenv, fixed strerror
# 2020-04-16 enh added check for __attribute__((uninitialized))
# 2020-04-25 PH applied patches from Uwe Korn to support pkg-config and
# library versioning.
# 2020-04-25 Carlo added function check for mkostemp used in ProtExecAllocator
# 2020-04-28 PH added function check for memfd_create based on Carlo's patch
PROJECT(PCRE2 C)
@ -95,14 +102,26 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0)
# GET_TARGET_PROPERTY. This should no longer be required.
# CMAKE_POLICY(SET CMP0026 OLD)
SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake
# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
# on the command line.
# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR}/src")
# external packages
FIND_PACKAGE( BZip2 )
FIND_PACKAGE( ZLIB )
FIND_PACKAGE( Readline )
FIND_PACKAGE( Editline )
# Configuration checks
INCLUDE(CheckIncludeFile)
INCLUDE(CheckCSourceCompiles)
INCLUDE(CheckFunctionExists)
INCLUDE(CheckSymbolExists)
INCLUDE(CheckIncludeFile)
INCLUDE(CheckTypeSize)
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
@ -113,9 +132,19 @@ CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY)
CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE)
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR)
CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY)
CHECK_FUNCTION_EXISTS(memfd_create HAVE_MEMFD_CREATE)
CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE)
CHECK_FUNCTION_EXISTS(secure_getenv HAVE_SECURE_GETENV)
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR)
set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
CHECK_C_SOURCE_COMPILES(
"int main() { char buf[128] __attribute__((uninitialized)); (void)buf; return 0; }"
HAVE_ATTRIBUTE_UNINITIALIZED
)
set(CMAKE_REQUIRED_FLAGS ${ORIG_CMAKE_REQUIRED_FLAGS})
# User-configurable options
#
@ -171,8 +200,12 @@ SET(PCRE2_HEAP_MATCH_RECURSE OFF CACHE BOOL
SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
"Enable support for Just-in-time compiling.")
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL
"Enable SELinux compatible execmem allocator in JIT (experimental).")
IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL
"Enable SELinux compatible execmem allocator in JIT (experimental).")
ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
SET(PCRE2_SUPPORT_JIT_SEALLOC IGNORE)
ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
SET(PCRE2GREP_SUPPORT_JIT ON CACHE BOOL
"Enable use of Just-in-time compiling in pcre2grep.")
@ -298,7 +331,19 @@ IF(PCRE2_SUPPORT_JIT)
ENDIF(PCRE2_SUPPORT_JIT)
IF(PCRE2_SUPPORT_JIT_SEALLOC)
SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1)
SET(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
CHECK_SYMBOL_EXISTS(mkostemp stdlib.h REQUIRED)
UNSET(CMAKE_REQUIRED_DEFINITIONS)
IF(${REQUIRED})
IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
ADD_DEFINITIONS(-D_GNU_SOURCE)
SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1)
ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
MESSAGE(FATAL_ERROR "Your configuration is not supported")
ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
ELSE(${REQUIRED})
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF)
ENDIF(${REQUIRED})
ENDIF(PCRE2_SUPPORT_JIT_SEALLOC)
IF(PCRE2GREP_SUPPORT_JIT)
@ -394,12 +439,13 @@ file(STRINGS ${PROJECT_SOURCE_DIR}/configure.ac
LIMIT_COUNT 50 # Read only the first 50 lines of the file
)
set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date")
set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date"
"libpcre2_posix_version" "libpcre2_8_version" "libpcre2_16_version" "libpcre2_32_version")
foreach(configure_line ${configure_lines})
foreach(_substitution_variable ${SEARCHED_VARIABLES})
string(TOUPPER ${_substitution_variable} _substitution_variable_upper)
if (NOT ${_substitution_variable_upper})
string(REGEX MATCH "m4_define\\(${_substitution_variable}, \\[(.*)\\]" MACTHED_STRING ${configure_line})
string(REGEX MATCH "m4_define\\(${_substitution_variable}, *\\[(.*)\\]" MACTHED_STRING ${configure_line})
if (CMAKE_MATCH_1)
set(${_substitution_variable_upper} ${CMAKE_MATCH_1})
endif()
@ -407,21 +453,74 @@ foreach(configure_line ${configure_lines})
endforeach()
endforeach()
macro(PARSE_LIB_VERSION VARIABLE_PREFIX)
string(REPLACE ":" ";" ${VARIABLE_PREFIX}_VERSION_LIST ${${VARIABLE_PREFIX}_VERSION})
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 0 ${VARIABLE_PREFIX}_VERSION_CURRENT)
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 1 ${VARIABLE_PREFIX}_VERSION_REVISION)
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 2 ${VARIABLE_PREFIX}_VERSION_AGE)
math(EXPR ${VARIABLE_PREFIX}_SOVERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} - ${${VARIABLE_PREFIX}_VERSION_AGE}")
math(EXPR ${VARIABLE_PREFIX}_MACHO_COMPATIBILITY_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
math(EXPR ${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
set(${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION}.${${VARIABLE_PREFIX}_VERSION_REVISION}}")
set(${VARIABLE_PREFIX}_VERSION "${${VARIABLE_PREFIX}_SOVERSION}.${${VARIABLE_PREFIX}_VERSION_AGE}.${${VARIABLE_PREFIX}_VERSION_REVISION}")
endmacro()
PARSE_LIB_VERSION(LIBPCRE2_POSIX)
PARSE_LIB_VERSION(LIBPCRE2_8)
PARSE_LIB_VERSION(LIBPCRE2_16)
PARSE_LIB_VERSION(LIBPCRE2_32)
CONFIGURE_FILE(src/pcre2.h.in
${PROJECT_BINARY_DIR}/pcre2.h
@ONLY)
# What about pcre2-config and libpcre2.pc?
# Generate pkg-config files
SET(PACKAGE_VERSION "${PCRE2_MAJOR}.${PCRE2_MINOR}")
SET(prefix ${CMAKE_INSTALL_PREFIX})
SET(exec_prefix "\${prefix}")
SET(libdir "\${exec_prefix}/lib")
SET(includedir "\${prefix}/include")
CONFIGURE_FILE(libpcre2-posix.pc.in libpcre2-posix.pc @ONLY)
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-posix.pc")
IF(PCRE2_BUILD_PCRE2_8)
CONFIGURE_FILE(libpcre2-8.pc.in libpcre2-8.pc @ONLY)
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-8.pc")
SET(enable_pcre2_8 "yes")
ELSE()
SET(enable_pcre2_8 "no")
ENDIF()
IF(PCRE2_BUILD_PCRE2_16)
CONFIGURE_FILE(libpcre2-16.pc.in libpcre2-16.pc @ONLY)
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-16.pc")
SET(enable_pcre2_16 "yes")
ELSE()
SET(enable_pcre2_16 "no")
ENDIF()
IF(PCRE2_BUILD_PCRE2_32)
CONFIGURE_FILE(libpcre2-32.pc.in libpcre2-32.pc @ONLY)
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-32.pc")
SET(enable_pcre2_32 "yes")
ELSE()
SET(enable_pcre2_32 "no")
ENDIF()
CONFIGURE_FILE(pcre2-config.in pcre2-config @ONLY)
# Character table generation
OPTION(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF)
IF(PCRE2_REBUILD_CHARTABLES)
ADD_EXECUTABLE(dftables src/dftables.c)
ADD_EXECUTABLE(pcre2_dftables src/pcre2_dftables.c)
ADD_CUSTOM_COMMAND(
COMMENT "Generating character tables (pcre2_chartables.c) for current locale"
DEPENDS dftables
COMMAND dftables
DEPENDS pcre2_dftables
COMMAND pcre2_dftables
ARGS ${PROJECT_BINARY_DIR}/pcre2_chartables.c
OUTPUT ${PROJECT_BINARY_DIR}/pcre2_chartables.c
)
@ -446,6 +545,7 @@ SET(PCRE2_SOURCES
src/pcre2_error.c
src/pcre2_extuni.c
src/pcre2_find_bracket.c
src/pcre2_jit_compile.c
src/pcre2_maketables.c
src/pcre2_match.c
src/pcre2_match_data.c
@ -464,6 +564,9 @@ SET(PCRE2_SOURCES
src/pcre2_xclass.c
)
SET(PCRE2POSIX_HEADERS src/pcre2posix.h)
SET(PCRE2POSIX_SOURCES src/pcre2posix.c)
IF(MINGW AND NOT PCRE2_STATIC)
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
@ -531,12 +634,20 @@ SET(targets)
IF(PCRE2_BUILD_PCRE2_8)
ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET_PROPERTY(TARGET pcre2-8
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
SET_TARGET_PROPERTIES(pcre2-8 PROPERTIES
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
VERSION ${LIBPCRE2_8_VERSION}
SOVERSION ${LIBPCRE2_8_SOVERSION})
SET(targets ${targets} pcre2-8)
ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
SET_PROPERTY(TARGET pcre2-posix
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
SET_TARGET_PROPERTIES(pcre2-posix PROPERTIES
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
VERSION ${LIBPCRE2_POSIX_VERSION}
SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
SET(targets ${targets} pcre2-posix)
TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8)
@ -554,8 +665,12 @@ ENDIF(PCRE2_BUILD_PCRE2_8)
IF(PCRE2_BUILD_PCRE2_16)
ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET_PROPERTY(TARGET pcre2-16
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16)
SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
VERSION ${LIBPCRE2_16_VERSION}
SOVERSION ${LIBPCRE2_16_SOVERSION})
SET(targets ${targets} pcre2-16)
IF(MINGW AND NOT PCRE2_STATIC)
@ -572,8 +687,12 @@ ENDIF(PCRE2_BUILD_PCRE2_16)
IF(PCRE2_BUILD_PCRE2_32)
ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET_PROPERTY(TARGET pcre2-32
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32)
SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
VERSION ${LIBPCRE2_32_VERSION}
SOVERSION ${LIBPCRE2_32_SOVERSION})
SET(targets ${targets} pcre2-32)
IF(MINGW AND NOT PCRE2_STATIC)
@ -746,6 +865,11 @@ INSTALL(TARGETS ${targets}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)
INSTALL(FILES ${pkg_config_files} DESTINATION lib/pkgconfig)
INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/pcre2-config"
DESTINATION bin
# Set 0755 permissions
PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include)

6
pcre2/LICENCE vendored
View file

@ -26,7 +26,7 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
Copyright (c) 1997-2019 University of Cambridge
Copyright (c) 1997-2020 University of Cambridge
All rights reserved.
@ -37,7 +37,7 @@ Written by: Zoltan Herczeg
Email local part: hzmester
Email domain: freemail.hu
Copyright(c) 2010-2019 Zoltan Herczeg
Copyright(c) 2010-2020 Zoltan Herczeg
All rights reserved.
@ -48,7 +48,7 @@ Written by: Zoltan Herczeg
Email local part: hzmester
Email domain: freemail.hu
Copyright(c) 2009-2019 Zoltan Herczeg
Copyright(c) 2009-2020 Zoltan Herczeg
All rights reserved.

View file

@ -1,5 +1,6 @@
/* config.h for CMake builds */
#cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1
#cmakedefine HAVE_DIRENT_H 1
#cmakedefine HAVE_INTTYPES_H 1
#cmakedefine HAVE_STDINT_H 1
@ -10,7 +11,10 @@
#cmakedefine HAVE_WINDOWS_H 1
#cmakedefine HAVE_BCOPY 1
#cmakedefine HAVE_MEMFD_CREATE 1
#cmakedefine HAVE_MEMMOVE 1
#cmakedefine HAVE_SECURE_GETENV 1
#cmakedefine HAVE_STRERROR 1
#cmakedefine PCRE2_STATIC 1

12
pcre2/configure.ac vendored
View file

@ -1,7 +1,13 @@
m4_define(pcre2_major, [10])
m4_define(pcre2_minor, [34])
m4_define(pcre2_minor, [35])
m4_define(pcre2_prerelease, [])
m4_define(pcre2_date, [2019-11-21])
m4_define(pcre2_date, [2020-05-09])
# Libtool shared library interface versions (current:revision:age)
m4_define(libpcre2_8_version, [10:0:10])
m4_define(libpcre2_16_version, [10:0:10])
m4_define(libpcre2_32_version, [10:0:10])
m4_define(libpcre2_posix_version, [2:3:0])
# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.
# 50 lines of this file. Please update that if the variables above are moved.

View file

@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, second API, to be
#included by applications that call PCRE2 functions.
Copyright (c) 2016-2019 University of Cambridge
Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -181,6 +181,9 @@ pcre2_jit_match() ignores the latter since it bypasses all sanity checks). */
#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */
#define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */
#define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u
#define PCRE2_SUBSTITUTE_LITERAL 0x00008000u /* pcre2_substitute() only */
#define PCRE2_SUBSTITUTE_MATCHED 0x00010000u /* pcre2_substitute() only */
#define PCRE2_SUBSTITUTE_REPLACEMENT_ONLY 0x00020000u /* pcre2_substitute() only */
/* Options for pcre2_pattern_convert(). */
@ -445,6 +448,7 @@ released, the numbers must not be changed. */
#define PCRE2_CONFIG_HEAPLIMIT 12
#define PCRE2_CONFIG_NEVER_BACKSLASH_C 13
#define PCRE2_CONFIG_COMPILED_WIDTHS 14
#define PCRE2_CONFIG_TABLES_LENGTH 15
/* Types for code units in patterns and subject strings. */

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
Arguments:
code points to start of expression
utf TRUE if in UTF mode
ucp TRUE if in UCP mode
fcc points to the case-flipping table
list points to output list
list[0] will be filled with the opcode
@ -304,7 +305,7 @@ Returns: points to the start of the next opcode if *code is accepted
*/
static PCRE2_SPTR
get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
uint32_t *list)
{
PCRE2_UCHAR c = *code;
@ -316,7 +317,8 @@ uint32_t chr;
uint32_t *clist_dest;
const uint32_t *clist_src;
#else
(void)utf; /* Suppress "unused parameter" compiler warning */
(void)utf; /* Suppress "unused parameter" compiler warnings */
(void)ucp;
#endif
list[0] = c;
@ -396,7 +398,7 @@ switch(c)
list[2] = chr;
#ifdef SUPPORT_UNICODE
if (chr < 128 || (chr < 256 && !utf))
if (chr < 128 || (chr < 256 && !utf && !ucp))
list[3] = fcc[chr];
else
list[3] = UCD_OTHERCASE(chr);
@ -503,6 +505,7 @@ which case the base cannot be possessified.
Arguments:
code points to the byte code
utf TRUE in UTF mode
ucp TRUE in UCP mode
cb compile data block
base_list the data list of the base opcode
base_end the end of the base opcode
@ -512,7 +515,7 @@ Returns: TRUE if the auto-possessification is possible
*/
static BOOL
compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
{
PCRE2_UCHAR c;
@ -651,7 +654,7 @@ for(;;)
while (*next_code == OP_ALT)
{
if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
return FALSE;
code = next_code + 1 + LINK_SIZE;
next_code += GET(next_code, 1);
@ -672,7 +675,8 @@ for(;;)
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
next_code += 1 + LINK_SIZE;
if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
rec_limit))
return FALSE;
code += PRIV(OP_lengths)[c];
@ -688,7 +692,7 @@ for(;;)
/* We now have the next appropriate opcode to compare with the base. Check
for a supported opcode, and load its properties. */
code = get_chr_property_list(code, utf, cb->fcc, list);
code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */
/* If either opcode is a small character list, set pointers for comparing
@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.
Arguments:
code points to start of the byte code
utf TRUE in UTF mode
cb compile data block
Returns: 0 for success
@ -1108,13 +1111,15 @@ Returns: 0 for success
*/
int
PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
{
PCRE2_UCHAR c;
PCRE2_SPTR end;
PCRE2_UCHAR *repeat_opcode;
uint32_t list[8];
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
for (;;)
{
@ -1126,10 +1131,11 @@ for (;;)
{
c -= get_repeat_base(c) - OP_STAR;
end = (c <= OP_MINUPTO) ?
get_chr_property_list(code, utf, cb->fcc, list) : NULL;
get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
&rec_limit))
{
switch(c)
{
@ -1181,11 +1187,11 @@ for (;;)
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
{
/* end must not be NULL. */
end = get_chr_property_list(code, utf, cb->fcc, list);
end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
list[1] = (c & 1) == 0;
if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
{
switch (c)
{

View file

@ -2,17 +2,21 @@
* Perl-Compatible Regular Expressions *
*************************************************/
/* This file was automatically written by the dftables auxiliary
/* This file was automatically written by the pcre2_dftables auxiliary
program. It contains character tables that are used when no external
tables are passed to PCRE2 by the application that calls it. The tables
are used only for characters whose code values are less than 256. */
/*The dftables program (which is distributed with PCRE2) can be used to
build alternative versions of this file. This is necessary if you are
/* This set of tables was written in the C locale. */
/* The pcre2_ftables program (which is distributed with PCRE2) can be used
to build alternative versions of this file. This is necessary if you are
running in an EBCDIC environment, or if you want to default to a different
encoding, for example ISO-8859-1. When dftables is run, it creates these
tables in the current locale. This happens automatically if PCRE2 is
configured with --enable-rebuild-chartables. */
encoding, for example ISO-8859-1. When pcre2_dftables is run, it creates
these tables in the "C" locale by default. This happens automatically if
PCRE2 is configured with --enable-rebuild-chartables. However, you can run
pcre2_dftables manually with the -L option to build tables using the LC_ALL
locale. */
/* The following #include is present because without it gcc 4.x may remove
the array definition from the final binary if PCRE2 is built into a static
@ -102,54 +106,54 @@ const uint8_t PRIV(default_tables)[] = {
/* This table contains bit maps for various character classes. Each map is 32
bytes long and the bits run from the least significant end of each byte. The
classes that have their own maps are: space, xdigit, digit, upper, lower, word,
graph print, punct, and cntrl. Other classes are built from combinations. */
graph, print, punct, and cntrl. Other classes are built from combinations. */
0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, /* space */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* xdigit */
0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* digit */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* upper */
0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* lower */
0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* word */
0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff,
0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, /* graph */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,
0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, /* print */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc,
0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, /* punct */
0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,
0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, /* cntrl */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -1195,13 +1195,14 @@ if (code == NULL) return NULL;
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (newcode == NULL) return NULL;
memcpy(newcode, code, code->blocksize);
newcode->executable_jit = NULL;
/* If the code is one that has been deserialized, increment the reference count
in the decoded tables. */
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
{
ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
(*ref_count)++;
}
@ -1229,16 +1230,17 @@ if (code == NULL) return NULL;
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (newcode == NULL) return NULL;
memcpy(newcode, code, code->blocksize);
newcode->executable_jit = NULL;
newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE),
newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
code->memctl.memory_data);
if (newtables == NULL)
{
code->memctl.free((void *)newcode, code->memctl.memory_data);
return NULL;
}
memcpy(newtables, code->tables, tables_length);
ref_count = (PCRE2_SIZE *)(newtables + tables_length);
memcpy(newtables, code->tables, TABLES_LENGTH);
ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
*ref_count = 1;
newcode->tables = newtables;
@ -1259,6 +1261,8 @@ PCRE2_SIZE* ref_count;
if (code != NULL)
{
if (code->executable_jit != NULL)
PRIV(jit_free)(code->executable_jit, &code->memctl);
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
{
@ -1266,7 +1270,7 @@ if (code != NULL)
be freed when there are no more references to them. The *ref_count should
always be > 0. */
ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
if (*ref_count > 0)
{
(*ref_count)--;
@ -3649,7 +3653,7 @@ while (ptr < ptrend)
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
/* If ( is not followed by ? it is either a capture or a special verb or an
alpha assertion. */
alpha assertion or a positive non-atomic lookahead. */
if (*ptr != CHAR_QUESTION_MARK)
{
@ -3681,10 +3685,10 @@ while (ptr < ptrend)
break;
/* Handle "alpha assertions" such as (*pla:...). Most of these are
synonyms for the historical symbolic assertions, but the script run ones
are new. They are distinguished by starting with a lower case letter.
Checking both ends of the alphabet makes this work in all character
codes. */
synonyms for the historical symbolic assertions, but the script run and
non-atomic lookaround ones are new. They are distinguished by starting
with a lower case letter. Checking both ends of the alphabet makes this
work in all character codes. */
else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
{
@ -3743,9 +3747,7 @@ while (ptr < ptrend)
goto POSITIVE_LOOK_AHEAD;
case META_LOOKAHEAD_NA:
*parsed_pattern++ = meta;
ptr++;
goto POST_ASSERTION;
goto POSITIVE_NONATOMIC_LOOK_AHEAD;
case META_LOOKAHEADNOT:
goto NEGATIVE_LOOK_AHEAD;
@ -4434,6 +4436,12 @@ while (ptr < ptrend)
ptr++;
goto POST_ASSERTION;
case CHAR_ASTERISK:
POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
*parsed_pattern++ = META_LOOKAHEAD_NA;
ptr++;
goto POST_ASSERTION;
case CHAR_EXCLAMATION_MARK:
NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
*parsed_pattern++ = META_LOOKAHEADNOT;
@ -4443,20 +4451,23 @@ while (ptr < ptrend)
/* ---- Lookbehind assertions ---- */
/* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the
start of the name of a capturing group. */
/* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
is the start of the name of a capturing group. */
case CHAR_LESS_THAN_SIGN:
if (ptrend - ptr <= 1 ||
(ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK))
(ptr[1] != CHAR_EQUALS_SIGN &&
ptr[1] != CHAR_EXCLAMATION_MARK &&
ptr[1] != CHAR_ASTERISK))
{
terminator = CHAR_GREATER_THAN_SIGN;
goto DEFINE_NAME;
}
*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
META_LOOKBEHIND : META_LOOKBEHINDNOT;
META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
*has_lookbehind = TRUE;
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
PUTOFFSET(offset, parsed_pattern);
@ -4629,8 +4640,6 @@ while (ptr < ptrend)
*parsed_pattern++ = META_KET;
}
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
else top_nest--;
}
@ -4895,7 +4904,7 @@ range. */
if ((options & PCRE2_CASELESS) != 0)
{
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UTF) != 0)
if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
{
int rc;
uint32_t oc, od;
@ -5310,7 +5319,8 @@ dynamically as we process the pattern. */
#ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0;
#else /* No UTF support */
BOOL ucp = (options & PCRE2_UCP) != 0;
#else /* No Unicode support */
BOOL utf = FALSE;
#endif
@ -5555,12 +5565,12 @@ for (;; pptr++)
zerofirstcu = firstcu;
zerofirstcuflags = firstcuflags;
/* For caseless UTF mode, check whether this character has more than
one other case. If so, generate a special OP_NOTPROP item instead of
/* For caseless UTF or UCP mode, check whether this character has more
than one other case. If so, generate a special OP_NOTPROP item instead of
OP_NOTI. */
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_CASELESS) != 0 &&
if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
(d = UCD_CASESET(c)) != 0)
{
*code++ = OP_NOTPROP;
@ -5593,7 +5603,7 @@ for (;; pptr++)
uint32_t d;
#ifdef SUPPORT_UNICODE
if (utf && c > 127) d = UCD_OTHERCASE(c); else
if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
#endif
{
#if PCRE2_CODE_UNIT_WIDTH != 8
@ -6667,23 +6677,11 @@ for (;; pptr++)
}
/* For a back reference, update the back reference map and the
maximum back reference. Then, for each group, we must check to
see if it is recursive, that is, it is inside the group that it
references. A flag is set so that the group can be made atomic.
*/
maximum back reference. */
cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
if (groupnumber > cb->top_backref)
cb->top_backref = groupnumber;
for (oc = cb->open_caps; oc != NULL; oc = oc->next)
{
if (oc->number == groupnumber)
{
oc->flag = TRUE;
break;
}
}
}
}
@ -7077,15 +7075,18 @@ for (;; pptr++)
previous[GET(previous, 1)] != OP_ALT)
goto END_REPEAT;
/* There is no sense in actually repeating assertions. The only
potential use of repetition is in cases when the assertion is optional.
Therefore, if the minimum is greater than zero, just ignore the repeat.
If the maximum is not zero or one, set it to 1. */
/* Perl allows all assertions to be quantified, and when they contain
capturing parentheses and/or are optional there are potential uses for
this feature. PCRE2 used to force the maximum quantifier to 1 on the
invalid grounds that further repetition was never useful. This was
always a bit pointless, since an assertion could be wrapped with a
repeated group to achieve the effect. General repetition is now
permitted, but if the maximum is unlimited it is set to one more than
the minimum. */
if (op_previous < OP_ONCE) /* Assertion */
{
if (repeat_min > 0) goto END_REPEAT;
if (repeat_max > 1) repeat_max = 1;
if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
}
/* The case of a zero minimum is special because of the need to stick
@ -7678,19 +7679,6 @@ for (;; pptr++)
cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
/* Check to see if this back reference is recursive, that it, it
is inside the group that it references. A flag is set so that the
group can be made atomic. */
for (oc = cb->open_caps; oc != NULL; oc = oc->next)
{
if (oc->number == meta_arg)
{
oc->flag = TRUE;
break;
}
}
break;
@ -7836,11 +7824,12 @@ for (;; pptr++)
NORMAL_CHAR_SET: /* Character is already in meta */
matched_char = TRUE;
/* For caseless UTF mode, check whether this character has more than one
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
/* For caseless UTF or UCP mode, check whether this character has more than
one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
*/
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_CASELESS) != 0)
if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
{
uint32_t caseset = UCD_CASESET(meta);
if (caseset != 0)
@ -8049,7 +8038,6 @@ if (*code == OP_CBRA)
capnumber = GET2(code, 1 + LINK_SIZE);
capitem.number = capnumber;
capitem.next = cb->open_caps;
capitem.flag = FALSE;
capitem.assert_depth = cb->assert_depth;
cb->open_caps = &capitem;
}
@ -8178,26 +8166,9 @@ for (;;)
PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE;
/* If it was a capturing subpattern, check to see if it contained any
recursive back references. If so, we must wrap it in atomic brackets. In
any event, remove the block from the chain. */
/* If it was a capturing subpattern, remove the block from the chain. */
if (capnumber > 0)
{
if (cb->open_caps->flag)
{
(void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
CU2BYTES(code - start_bracket));
*start_bracket = OP_ONCE;
code += 1 + LINK_SIZE;
PUT(start_bracket, 1, (int)(code - start_bracket));
*code = OP_KET;
PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE;
length += 2 + 2*LINK_SIZE;
}
cb->open_caps = cb->open_caps->next;
}
if (capnumber > 0) cb->open_caps = cb->open_caps->next;
/* Set values to pass back */
@ -8832,9 +8803,10 @@ memset(slot + IMM2_SIZE + length, 0,
/* This function is called to skip parts of the parsed pattern when finding the
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
the end of the branch, it is called to skip over an internal lookaround, and it
is also called to skip to the end of a class, during which it will never
encounter nested groups (but there's no need to have special code for that).
the end of the branch, it is called to skip over an internal lookaround or
(DEFINE) group, and it is also called to skip to the end of a class, during
which it will never encounter nested groups (but there's no need to have
special code for that).
When called to find the end of a branch or group, pptr must point to the first
meta code inside the branch, not the branch-starting code. In other cases it
@ -9312,14 +9284,21 @@ for (;; pptr++)
itemlength = grouplength;
break;
/* Check nested groups - advance past the initial data for each type and
then seek a fixed length with get_grouplength(). */
/* A (DEFINE) group is never obeyed inline and so it does not contribute to
the length of this branch. Skip from the following item to the next
unpaired ket. */
case META_COND_DEFINE:
pptr = parsed_skip(pptr + 1, PSKIP_KET);
break;
/* Check other nested groups - advance past the initial data for each type
and then seek a fixed length with get_grouplength(). */
case META_COND_NAME:
case META_COND_NUMBER:
case META_COND_RNAME:
case META_COND_RNUMBER:
case META_COND_DEFINE:
pptr += 2 + SIZEOFFSET;
goto CHECK_GROUP;
@ -9576,6 +9555,10 @@ for (; *pptr != META_END; pptr++)
break;
case META_COND_DEFINE:
pptr += SIZEOFFSET;
nestlevel++;
break;
case META_COND_NAME:
case META_COND_NUMBER:
case META_COND_RNAME:
@ -9656,6 +9639,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
{
BOOL utf; /* Set TRUE for UTF mode */
BOOL ucp; /* Set TRUE for UCP mode */
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
pcre2_real_code *re = NULL; /* What we will return */
@ -9943,8 +9927,8 @@ if (utf)
/* Check UCP lockout. */
if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
(PCRE2_UCP|PCRE2_NEVER_UCP))
ucp = (cb.external_options & PCRE2_UCP) != 0;
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
{
errorcode = ERR75;
goto HAD_EARLY_ERROR;
@ -10154,6 +10138,7 @@ write to the last 8 bytes of the structure before setting the fields. */
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
re->memctl = ccontext->memctl;
re->tables = tables;
re->executable_jit = NULL;
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
re->blocksize = re_blocksize;
re->magic_number = MAGIC_NUMBER;
@ -10319,7 +10304,7 @@ function call. */
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
{
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
}
/* Failed to compile, or error while post-processing. */
@ -10367,21 +10352,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((firstcuflags & REQ_CASELESS) != 0)
{
if (firstcu < 128 || (!utf && firstcu < 255))
if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
{
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
}
/* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
8-bit UTF mode, codepoints in the range 128-255 are introductory code
points and cannot have another case. In 16-bit and 32-bit modes, we can
check wide characters when UTF (and therefore UCP) is supported. */
/* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
points and cannot have another case, but if UCP is set they may do. */
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
else if (firstcu <= MAX_UTF_CODE_POINT &&
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#else
else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#endif
#endif /* SUPPORT_UNICODE */
}
}
@ -10430,14 +10419,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((reqcuflags & REQ_CASELESS) != 0)
{
if (reqcu < 128 || (!utf && reqcu < 255))
if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
{
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
}
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#else
else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#endif
#endif /* SUPPORT_UNICODE */
}
}
}

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2017 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -43,7 +43,8 @@ POSSIBILITY OF SUCH DAMAGE.
#endif
/* Save the configured link size, which is in bytes. In 16-bit and 32-bit modes
its value gets changed by pcre2_internal.h to be in code units. */
its value gets changed by pcre2_intmodedep.h (included by pcre2_internal.h) to
be in code units. */
static int configured_link_size = LINK_SIZE;
@ -94,6 +95,7 @@ if (where == NULL) /* Requests a length */
case PCRE2_CONFIG_NEWLINE:
case PCRE2_CONFIG_PARENSLIMIT:
case PCRE2_CONFIG_STACKRECURSE: /* Obsolete */
case PCRE2_CONFIG_TABLES_LENGTH:
case PCRE2_CONFIG_UNICODE:
return sizeof(uint32_t);
@ -191,6 +193,10 @@ switch (what)
*((uint32_t *)where) = 0;
break;
case PCRE2_CONFIG_TABLES_LENGTH:
*((uint32_t *)where) = TABLES_LENGTH;
break;
case PCRE2_CONFIG_UNICODE_VERSION:
{
#if defined SUPPORT_UNICODE

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
#else
BOOL utf = FALSE;
#endif
@ -2190,7 +2191,7 @@ for (;;)
if (clen == 0) break;
#ifdef SUPPORT_UNICODE
if (utf)
if (utf_or_ucp)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{
@ -2204,7 +2205,7 @@ for (;;)
}
else
#endif /* SUPPORT_UNICODE */
/* Not UTF mode */
/* Not UTF or UCP mode */
{
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
{ ADD_NEW(state_offset + 2, 0); }
@ -2339,7 +2340,7 @@ for (;;)
{
uint32_t otherd;
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2374,7 +2375,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2417,7 +2418,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2458,7 +2459,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2491,7 +2492,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -2531,7 +2532,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127)
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#else
if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#endif
#endif /* SUPPORT_UNICODE */
}
}
else
@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#else
if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#endif
#endif /* SUPPORT_UNICODE */
}
}

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -76,6 +76,17 @@ typedef int BOOL;
#include <valgrind/memcheck.h>
#endif
/* -ftrivial-auto-var-init support supports initializing all local variables
to avoid some classes of bug, but this can cause an unacceptable slowdown
for large on-stack arrays in hot functions. This macro lets us annotate
such arrays. */
#ifdef HAVE_ATTRIBUTE_UNINITIALIZED
#define PCRE2_KEEP_UNINITIALIZED __attribute__((uninitialized))
#else
#define PCRE2_KEEP_UNINITIALIZED
#endif
/* Older versions of MSVC lack snprintf(). This define allows for
warning/error-free compilation and testing with MSVC compilers back to at least
MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
@ -579,7 +590,7 @@ total length of the tables. */
#define fcc_offset 256 /* Flip case */
#define cbits_offset 512 /* Character classes */
#define ctypes_offset (cbits_offset + cbit_length) /* Character types */
#define tables_length (ctypes_offset + 256)
#define TABLES_LENGTH (ctypes_offset + 256)
/* -------------------- Character and string names ------------------------ */
@ -1759,13 +1770,11 @@ typedef struct pcre2_memctl {
/* Structure for building a chain of open capturing subpatterns during
compiling, so that instructions to close them can be compiled when (*ACCEPT) is
encountered. This is also used to identify subpatterns that contain recursive
back references to themselves, so that they can be made atomic. */
encountered. */
typedef struct open_capitem {
struct open_capitem *next; /* Chain link */
uint16_t number; /* Capture number */
uint16_t flag; /* Set TRUE if recursive back ref */
uint16_t assert_depth; /* Assertion depth when opened */
} open_capitem;
@ -1954,7 +1963,7 @@ is available. */
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
extern int _pcre2_auto_possessify(PCRE2_UCHAR *,
const compile_block *);
extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
int *, uint32_t, uint32_t, BOOL, compile_block *);

View file

@ -618,6 +618,7 @@ here.) */
typedef struct pcre2_real_code {
pcre2_memctl memctl; /* Memory control fields */
const uint8_t *tables; /* The character tables */
void *executable_jit; /* Pointer to JIT code */
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */
uint32_t magic_number; /* Paranoid and endianness check */

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -41,10 +41,11 @@ POSSIBILITY OF SUCH DAMAGE.
/* This module contains the external function pcre2_maketables(), which builds
character tables for PCRE2 in the current locale. The file is compiled on its
own as part of the PCRE2 library. However, it is also included in the
compilation of dftables.c, in which case the macro DFTABLES is defined. */
own as part of the PCRE2 library. It is also included in the compilation of
pcre2_dftables.c as a freestanding program, in which case the macro
PCRE2_DFTABLES is defined. */
#ifndef DFTABLES
#ifndef PCRE2_DFTABLES /* Compiling the library */
# ifdef HAVE_CONFIG_H
# include "config.h"
# endif
@ -61,28 +62,29 @@ compilation of dftables.c, in which case the macro DFTABLES is defined. */
a pointer to them. They are build using the ctype functions, and consequently
their contents will depend upon the current locale setting. When compiled as
part of the library, the store is obtained via a general context malloc, if
supplied, but when DFTABLES is defined (when compiling the dftables auxiliary
program) malloc() is used, and the function has a different name so as not to
clash with the prototype in pcre2.h.
supplied, but when PCRE2_DFTABLES is defined (when compiling the pcre2_dftables
freestanding auxiliary program) malloc() is used, and the function has a
different name so as not to clash with the prototype in pcre2.h.
Arguments: none when DFTABLES is defined
else a PCRE2 general context or NULL
Arguments: none when PCRE2_DFTABLES is defined
else a PCRE2 general context or NULL
Returns: pointer to the contiguous block of data
else NULL if memory allocation failed
*/
#ifdef DFTABLES /* Included in freestanding dftables.c program */
#ifdef PCRE2_DFTABLES /* Included in freestanding pcre2_dftables program */
static const uint8_t *maketables(void)
{
uint8_t *yield = (uint8_t *)malloc(tables_length);
uint8_t *yield = (uint8_t *)malloc(TABLES_LENGTH);
#else /* Not DFTABLES, compiling the library */
#else /* Not PCRE2_DFTABLES, that is, compiling the library */
PCRE2_EXP_DEFN const uint8_t * PCRE2_CALL_CONVENTION
pcre2_maketables(pcre2_general_context *gcontext)
{
uint8_t *yield = (uint8_t *)((gcontext != NULL)?
gcontext->memctl.malloc(tables_length, gcontext->memctl.memory_data) :
malloc(tables_length));
#endif /* DFTABLES */
gcontext->memctl.malloc(TABLES_LENGTH, gcontext->memctl.memory_data) :
malloc(TABLES_LENGTH));
#endif /* PCRE2_DFTABLES */
int i;
uint8_t *p;
@ -103,8 +105,8 @@ exclusive ones - in some locales things may be different.
Note that the table for "space" includes everything "isspace" gives, including
VT in the default locale. This makes it work for the POSIX class [:space:].
From release 8.34 is is also correct for Perl space, because Perl added VT at
release 5.18.
From PCRE1 release 8.34 and for all PCRE2 releases it is also correct for Perl
space, because Perl added VT at release 5.18.
Note also that it is possible for a character to be alnum or alpha without
being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the
@ -114,24 +116,24 @@ test for alnum specially. */
memset(p, 0, cbit_length);
for (i = 0; i < 256; i++)
{
if (isdigit(i)) p[cbit_digit + i/8] |= 1u << (i&7);
if (isupper(i)) p[cbit_upper + i/8] |= 1u << (i&7);
if (islower(i)) p[cbit_lower + i/8] |= 1u << (i&7);
if (isalnum(i)) p[cbit_word + i/8] |= 1u << (i&7);
if (i == '_') p[cbit_word + i/8] |= 1u << (i&7);
if (isspace(i)) p[cbit_space + i/8] |= 1u << (i&7);
if (isxdigit(i))p[cbit_xdigit + i/8] |= 1u << (i&7);
if (isgraph(i)) p[cbit_graph + i/8] |= 1u << (i&7);
if (isprint(i)) p[cbit_print + i/8] |= 1u << (i&7);
if (ispunct(i)) p[cbit_punct + i/8] |= 1u << (i&7);
if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1u << (i&7);
if (isdigit(i)) p[cbit_digit + i/8] |= 1u << (i&7);
if (isupper(i)) p[cbit_upper + i/8] |= 1u << (i&7);
if (islower(i)) p[cbit_lower + i/8] |= 1u << (i&7);
if (isalnum(i)) p[cbit_word + i/8] |= 1u << (i&7);
if (i == '_') p[cbit_word + i/8] |= 1u << (i&7);
if (isspace(i)) p[cbit_space + i/8] |= 1u << (i&7);
if (isxdigit(i)) p[cbit_xdigit + i/8] |= 1u << (i&7);
if (isgraph(i)) p[cbit_graph + i/8] |= 1u << (i&7);
if (isprint(i)) p[cbit_print + i/8] |= 1u << (i&7);
if (ispunct(i)) p[cbit_punct + i/8] |= 1u << (i&7);
if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1u << (i&7);
}
p += cbit_length;
/* Finally, the character type table. In this, we used to exclude VT from the
white space chars, because Perl didn't recognize it as such for \s and for
comments within regexes. However, Perl changed at release 5.18, so PCRE changed
at release 8.34. */
comments within regexes. However, Perl changed at release 5.18, so PCRE1
changed at release 8.34 and it's always been this way for PCRE2. */
for (i = 0; i < 256; i++)
{
@ -147,7 +149,7 @@ for (i = 0; i < 256; i++)
return yield;
}
#ifndef DFTABLES
#ifndef PCRE2_DFTABLES /* Compiling the library */
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables)
{

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2015-2019 University of Cambridge
New API code Copyright (c) 2015-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -381,8 +381,12 @@ length = Fovector[offset+1] - Fovector[offset];
if (caseless)
{
#if defined SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UTF) != 0)
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
if (utf || (mb->poptions & PCRE2_UCP) != 0)
{
PCRE2_SPTR endptr = p + length;
/* Match characters up to the end of the reference. NOTE: the number of
code units matched may differ, because in UTF-8 there are some characters
whose upper and lower case codes have different numbers of bytes. For
@ -390,16 +394,25 @@ if (caseless)
bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
sequence of two of the latter. It is important, therefore, to check the
length along the reference, not along the subject (earlier code did this
wrong). */
wrong). UCP without uses Unicode properties but without UTF encoding. */
PCRE2_SPTR endptr = p + length;
while (p < endptr)
{
uint32_t c, d;
const ucd_record *ur;
if (eptr >= mb->end_subject) return 1; /* Partial match */
GETCHARINC(c, eptr);
GETCHARINC(d, p);
if (utf)
{
GETCHARINC(c, eptr);
GETCHARINC(d, p);
}
else
{
c = *eptr++;
d = *p++;
}
ur = GET_UCD(d);
if (c != d && c != (uint32_t)((int)d + ur->other_case))
{
@ -415,7 +428,7 @@ if (caseless)
else
#endif
/* Not in UTF mode */
/* Not in UTF or UCP mode */
{
for (; length > 0; length--)
{
@ -432,7 +445,8 @@ if (caseless)
}
/* In the caseful case, we can just compare the code units, whether or not we
are in UTF mode. When partial matching, we have to do this unit-by-unit. */
are in UTF and/or UCP mode. When partial matching, we have to do this unit by
unit. */
else
{
@ -574,8 +588,8 @@ match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
heapframe *F; /* Current frame pointer */
heapframe *N = NULL; /* Temporary frame pointers */
heapframe *P = NULL;
heapframe *assert_accept_frame; /* For passing back the frame with captures */
PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
/* Local variables that do not need to be preserved over calls to RRMATCH(). */
@ -598,12 +612,13 @@ BOOL condition; /* Used in conditional groups */
BOOL cur_is_word; /* Used in "word" tests */
BOOL prev_is_word; /* Used in "word" tests */
/* UTF flag */
/* UTF and UCP flags */
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
#else
BOOL utf = FALSE;
BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
#endif
/* This is the length of the last part of a backtracking frame that must be
@ -928,6 +943,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
else
#endif
/* Not UTF mode */
{
if (mb->end_subject - Feptr < 1)
@ -987,10 +1003,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
}
/* If UCP is set without UTF we must do the same as above, but with one
character per code unit. */
else if (ucp)
{
uint32_t cc = UCHAR21(Feptr);
fc = Fecode[1];
if (fc < 128)
{
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
}
else
{
if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
Feptr++;
Fecode += 2;
}
else
#endif /* SUPPORT_UNICODE */
/* Not UTF mode; use the table for characters < 256. */
/* Not UTF or UCP mode; use the table for characters < 256. */
{
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@ -1010,6 +1046,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
#ifdef SUPPORT_UNICODE
if (utf)
{
@ -1026,15 +1063,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (ch > 127)
ch = UCD_OTHERCASE(ch);
else
ch = TABLE_GET(ch, mb->fcc, ch);
ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH);
}
}
/* UCP without UTF is as above, but with one character per code unit. */
else if (ucp)
{
uint32_t ch;
fc = UCHAR21INC(Feptr);
ch = Fecode[1];
Fecode += 2;
if (ch == fc)
{
RRETURN(MATCH_NOMATCH); /* Caseful match */
}
else if (Fop == OP_NOTI) /* If caseless */
{
if (ch > 127)
ch = UCD_OTHERCASE(ch);
else
ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH);
}
}
else
#endif /* SUPPORT_UNICODE */
/* Neither UTF nor UCP is set */
{
uint32_t ch = Fecode[1];
fc = *Feptr++;
fc = UCHAR21INC(Feptr);
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
RRETURN(MATCH_NOMATCH);
Fecode += 2;
@ -1244,7 +1308,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
#endif /* SUPPORT_UNICODE */
/* When not in UTF mode, load a single-code-unit character. Then proceed as
above. */
above, using Unicode casing if either UTF or UCP is set. */
Lc = *Fecode++;
@ -1253,11 +1317,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_STARI)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
/* Lc must be < 128 in UTF-8 mode. */
#ifdef SUPPORT_UNICODE
if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
/* Lc will be < 128 in UTF-8 mode. */
Loc = mb->fcc[Lc];
#else /* 16-bit & 32-bit */
#ifdef SUPPORT_UNICODE
if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
Loc = TABLE_GET(Lc, mb->fcc, Lc);
@ -1490,7 +1558,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_NOTSTARI) /* Caseless */
{
#ifdef SUPPORT_UNICODE
if (utf && Lc > 127)
if ((utf || ucp) && Lc > 127)
Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
@ -6045,7 +6113,6 @@ BOOL firstline;
BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;
BOOL startline;
BOOL utf;
#if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE;
@ -6069,13 +6136,19 @@ PCRE2_SPTR match_partial;
BOOL use_jit;
#endif
/* This flag is needed even when Unicode is not supported for convenience
(it is used by the IS_NEWLINE macro). */
BOOL utf = FALSE;
#ifdef SUPPORT_UNICODE
BOOL ucp = FALSE;
BOOL allow_invalid;
uint32_t fragment_options = 0;
#ifdef SUPPORT_JIT
BOOL jit_checked_utf = FALSE;
#endif
#endif
#endif /* SUPPORT_UNICODE */
PCRE2_SIZE frame_size;
@ -6091,7 +6164,8 @@ proves to be too small, it is replaced by a larger one on the heap. To get a
vector of the size required that is aligned for pointers, allocate it as a
vector of pointers. */
PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)];
PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]
PCRE2_KEEP_UNINITIALIZED;
mb->stack_frames = (heapframe *)stack_frames_vector;
/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
@ -6147,12 +6221,13 @@ use_jit = (re->executable_jit != NULL &&
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
#endif
/* Initialize UTF parameters. */
/* Initialize UTF/UCP parameters. */
utf = (re->overall_options & PCRE2_UTF) != 0;
#ifdef SUPPORT_UNICODE
utf = (re->overall_options & PCRE2_UTF) != 0;
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
#endif
ucp = (re->overall_options & PCRE2_UCP) != 0;
#endif /* SUPPORT_UNICODE */
/* Convert the partial matching flags into an integer. */
@ -6589,9 +6664,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
#else
if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
#endif
#endif /* SUPPORT_UNICODE */
}
}
else
@ -6607,9 +6686,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
#else
if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
#endif
#endif /* SUPPORT_UNICODE */
}
}
@ -6756,15 +6839,16 @@ for(;;)
#endif
}
/* If we can't find the required code unit, having reached the true end
of the subject, break the bumpalong loop, to force a match failure,
except when doing partial matching, when we let the next cycle run at
the end of the subject. To see why, consider the pattern /(?<=abc)def/,
which partially matches "abc", even though the string does not contain
the starting character "d". If we have not reached the true end of the
subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
we also let the cycle run, because the matching string is legitimately
allowed to start with the first code unit of a newline. */
/* If we can't find the required first code unit, having reached the
true end of the subject, break the bumpalong loop, to force a match
failure, except when doing partial matching, when we let the next cycle
run at the end of the subject. To see why, consider the pattern
/(?<=abc)def/, which partially matches "abc", even though the string
does not contain the starting character "d". If we have not reached the
true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
temporarily modified) we also let the cycle run, because the matching
string is legitimately allowed to start with the first code unit of a
newline. */
if (mb->partial == 0 && start_match >= mb->end_subject)
{

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2018 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -90,7 +90,7 @@ if (codes == NULL || serialized_bytes == NULL || serialized_size == NULL)
if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA;
/* Compute total size. */
total_size = sizeof(pcre2_serialized_data) + tables_length;
total_size = sizeof(pcre2_serialized_data) + TABLES_LENGTH;
tables = NULL;
for (i = 0; i < number_of_codes; i++)
@ -121,8 +121,8 @@ data->number_of_codes = number_of_codes;
/* Copy all compiled code data. */
dst_bytes = bytes + sizeof(pcre2_serialized_data);
memcpy(dst_bytes, tables, tables_length);
dst_bytes += tables_length;
memcpy(dst_bytes, tables, TABLES_LENGTH);
dst_bytes += TABLES_LENGTH;
for (i = 0; i < number_of_codes; i++)
{
@ -142,7 +142,9 @@ for (i = 0; i < number_of_codes; i++)
(void)memset(dst_bytes + offsetof(pcre2_real_code, memctl), 0,
sizeof(pcre2_memctl));
(void)memset(dst_bytes + offsetof(pcre2_real_code, tables), 0,
sizeof(void *));
sizeof(void *));
(void)memset(dst_bytes + offsetof(pcre2_real_code, executable_jit), 0,
sizeof(void *));
dst_bytes += re->blocksize;
}
@ -187,12 +189,12 @@ src_bytes = bytes + sizeof(pcre2_serialized_data);
/* Decode tables. The reference count for the tables is stored immediately
following them. */
tables = memctl->malloc(tables_length + sizeof(PCRE2_SIZE), memctl->memory_data);
tables = memctl->malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), memctl->memory_data);
if (tables == NULL) return PCRE2_ERROR_NOMEMORY;
memcpy(tables, src_bytes, tables_length);
*(PCRE2_SIZE *)(tables + tables_length) = number_of_codes;
src_bytes += tables_length;
memcpy(tables, src_bytes, TABLES_LENGTH);
*(PCRE2_SIZE *)(tables + TABLES_LENGTH) = number_of_codes;
src_bytes += TABLES_LENGTH;
/* Decode the byte stream. We must not try to read the size from the compiled
code block in the stream, because it might be unaligned, which causes errors on
@ -238,6 +240,7 @@ for (i = 0; i < number_of_codes; i++)
/* At the moment only one table is supported. */
dst_re->tables = tables;
dst_re->executable_jit = NULL;
dst_re->flags |= PCRE2_DEREF_TABLES;
codes[i] = dst_re;

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -58,7 +58,7 @@ collecting data (e.g. minimum matching length). */
/* Returns from set_start_bits() */
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
/*************************************************
@ -772,15 +772,19 @@ Arguments:
p points to the first code unit of the character
caseless TRUE if caseless
utf TRUE for UTF mode
ucp TRUE for UCP mode
Returns: pointer after the character
*/
static PCRE2_SPTR
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
BOOL ucp)
{
uint32_t c = *p++; /* First code unit */
(void)utf; /* Stop compiler warning when UTF not supported */
(void)utf; /* Stop compiler warnings when UTF not supported */
(void)ucp;
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
0xff. */
@ -810,22 +814,26 @@ if (utf)
if (caseless)
{
#ifdef SUPPORT_UNICODE
if (utf)
if (utf || ucp)
{
c = UCD_OTHERCASE(c);
#if PCRE2_CODE_UNIT_WIDTH == 8
PCRE2_UCHAR buff[6];
c = UCD_OTHERCASE(c);
(void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
if (utf)
{
PCRE2_UCHAR buff[6];
(void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
else if (c < 256) SET_BIT(c);
#else /* 16-bit or 32-bit mode */
c = UCD_OTHERCASE(c);
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif
}
else
#endif /* SUPPORT_UNICODE */
/* Not UTF */
/* Not UTF or UCP */
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
}
@ -924,19 +932,26 @@ The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
must continue at the outer level to find at least one mandatory code unit. At
the outermost level, this function fails unless the result is SSB_DONE.
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
issues.
Arguments:
re points to the compiled regex block
code points to an expression
utf TRUE if in UTF mode
ucp TRUE if in UCP mode
depthptr pointer to recurse depth
Returns: SSB_FAIL => Failed to find any starting code units
SSB_DONE => Found mandatory starting code units
SSB_CONTINUE => Found optional starting code units
SSB_UNKNOWN => Hit an unrecognized opcode
SSB_TOODEEP => Recursion is too deep
*/
static int
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
int *depthptr)
{
uint32_t c;
int yield = SSB_DONE;
@ -947,6 +962,9 @@ int table_limit = utf? 16:32;
int table_limit = 32;
#endif
*depthptr += 1;
if (*depthptr > 1000) return SSB_TOODEEP;
do
{
BOOL try_next = TRUE;
@ -1103,13 +1121,17 @@ do
case OP_SCRIPT_RUN:
case OP_ASSERT:
case OP_ASSERT_NA:
rc = set_start_bits(re, tcode, utf);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
if (rc == SSB_DONE) try_next = FALSE; else
rc = set_start_bits(re, tcode, utf, ucp, depthptr);
if (rc == SSB_DONE)
{
try_next = FALSE;
}
else if (rc == SSB_CONTINUE)
{
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
}
else return rc; /* FAIL, UNKNOWN, or TOODEEP */
break;
/* If we hit ALT or KET, it means we haven't found anything mandatory in
@ -1155,8 +1177,8 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
rc = set_start_bits(re, ++tcode, utf);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
break;
@ -1177,7 +1199,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
tcode = set_table_bit(re, tcode + 1, FALSE, utf);
tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
break;
case OP_STARI:
@ -1186,7 +1208,7 @@ do
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
tcode = set_table_bit(re, tcode + 1, TRUE, utf);
tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
break;
/* Single-char upto sets the bit and tries the next */
@ -1194,13 +1216,13 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
break;
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
break;
/* At least one single char sets the bit and stops */
@ -1212,7 +1234,7 @@ do
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
(void)set_table_bit(re, tcode + 1, FALSE, utf);
(void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
try_next = FALSE;
break;
@ -1223,7 +1245,7 @@ do
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
(void)set_table_bit(re, tcode + 1, TRUE, utf);
(void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
try_next = FALSE;
break;
@ -1652,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
int count = 0;
PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
/* Find start of compiled code */
@ -1664,7 +1687,8 @@ code units. */
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{
int rc = set_start_bits(re, code, utf);
int depth = 0;
int rc = set_start_bits(re, code, utf, ucp, &depth);
if (rc == SSB_UNKNOWN) return 1;
/* If a list of starting code units was set up, scan the list to see if only
@ -1712,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
}
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
mode, only values < 128 can be used. */
mode, only values < 128 can be used. In all the other cases, c is a
character value. */
#if PCRE2_CODE_UNIT_WIDTH == 8
if (c > 127) goto DONE;
if (utf && c > 127) goto DONE;
#endif
if (a < 0) a = c; /* First one found */
if (a < 0) a = c; /* First one found, save in a */
else if (b < 0) /* Second one found */
{
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
#else /* 16-bit or 32-bit */
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
if (utf && c > 127) d = UCD_OTHERCASE(c);
#endif /* Code width */
if (utf || ucp)
{
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
if (c > 127) d = UCD_OTHERCASE(c);
}
#endif /* SUPPORT_UNICODE */
if (d != a) goto DONE; /* Not other case of a */
b = c;
if (d != a) goto DONE; /* Not the other case of a */
b = c; /* Save second in b */
}
else goto DONE; /* More than two characters found */
}

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -49,8 +49,9 @@ POSSIBILITY OF SUCH DAMAGE.
#define SUBSTITUTE_OPTIONS \
(PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_UNKNOWN_UNSET| \
PCRE2_SUBSTITUTE_UNSET_EMPTY)
PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
@ -194,6 +195,7 @@ overflow, either give an error immediately, or keep on, accumulating the
length. */
#define CHECKMEMCPY(from,length) \
{ \
if (!overflowed && lengthleft < length) \
{ \
if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
@ -209,7 +211,8 @@ length. */
memcpy(buffer + buff_offset, from, CU2BYTES(length)); \
buff_offset += length; \
lengthleft -= length; \
}
} \
}
/* Here's the function */
@ -226,11 +229,14 @@ int forcecasereset = 0;
uint32_t ovector_count;
uint32_t goptions = 0;
uint32_t suboptions;
BOOL match_data_created = FALSE;
BOOL literal = FALSE;
pcre2_match_data *internal_match_data = NULL;
BOOL escaped_literal = FALSE;
BOOL overflowed = FALSE;
BOOL use_existing_match;
BOOL replacement_only;
#ifdef SUPPORT_UNICODE
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
#endif
PCRE2_UCHAR temp[6];
PCRE2_SPTR ptr;
@ -248,23 +254,54 @@ lengthleft = buff_length = *blength;
*blength = PCRE2_UNSET;
ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
/* Partial matching is not valid. This must come after setting *blength to
/* Partial matching is not valid. This must come after setting *blength to
PCRE2_UNSET, so as not to imply an offset in the replacement. */
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
return PCRE2_ERROR_BADOPTION;
/* If no match data block is provided, create one. */
/* Check for using a match that has already happened. Note that the subject
pointer in the match data may be NULL after a no-match. */
use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
/* If starting from an existing match, there must be an externally provided
match data block. We create an internal match_data block in two cases: (a) an
external one is not supplied (and we are not starting from an existing match);
(b) an existing match is to be used for the first substitution. In the latter
case, we copy the existing match into the internal block. This ensures that no
changes are made to the existing match data block. */
if (match_data == NULL)
{
pcre2_general_context *gcontext;
if (use_existing_match) return PCRE2_ERROR_NULL;
gcontext = (mcontext == NULL)?
(pcre2_general_context *)code :
(pcre2_general_context *)mcontext;
match_data = internal_match_data =
pcre2_match_data_create_from_pattern(code, gcontext);
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
}
else if (use_existing_match)
{
pcre2_general_context *gcontext = (mcontext == NULL)?
(pcre2_general_context *)code :
(pcre2_general_context *)mcontext;
match_data = pcre2_match_data_create_from_pattern(code, gcontext);
if (match_data == NULL) return PCRE2_ERROR_NOMEMORY;
match_data_created = TRUE;
int pairs = (code->top_bracket + 1 < match_data->oveccount)?
code->top_bracket + 1 : match_data->oveccount;
internal_match_data = pcre2_match_data_create(match_data->oveccount,
gcontext);
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
+ 2*pairs*sizeof(PCRE2_SIZE));
match_data = internal_match_data;
}
/* Remember ovector details */
ovector = pcre2_get_ovector_pointer(match_data);
ovector_count = pcre2_get_ovector_count(match_data);
@ -286,7 +323,7 @@ repend = replacement + rlength;
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
rc = PRIV(valid_utf)(replacement, rlength, &(match_data->rightchar));
rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
if (rc != 0)
{
match_data->leftchar = 0;
@ -300,7 +337,7 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
suboptions = options & SUBSTITUTE_OPTIONS;
options &= ~SUBSTITUTE_OPTIONS;
/* Copy up to the start offset */
/* Error if the start match offset is greater than the length of the subject. */
if (start_offset > length)
{
@ -308,9 +345,13 @@ if (start_offset > length)
rc = PCRE2_ERROR_BADOFFSET;
goto EXIT;
}
CHECKMEMCPY(subject, start_offset);
/* Loop for global substituting. */
/* Copy up to the start offset, unless only the replacement is required. */
if (!replacement_only) CHECKMEMCPY(subject, start_offset);
/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
match is taken from the match_data that was passed in. */
subs = 0;
do
@ -318,7 +359,12 @@ do
PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
uint32_t ptrstackptr = 0;
rc = pcre2_match(code, subject, length, start_offset, options|goptions,
if (use_existing_match)
{
rc = match_data->rc;
use_existing_match = FALSE;
}
else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
match_data, mcontext);
#ifdef SUPPORT_UNICODE
@ -364,44 +410,44 @@ do
#endif
}
/* Copy what we have advanced past, reset the special global options, and
continue to the next match. */
/* Copy what we have advanced past (unless not required), reset the special
global options, and continue to the next match. */
fraglength = start_offset - save_start;
CHECKMEMCPY(subject + save_start, fraglength);
if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
goptions = 0;
continue;
}
/* Handle a successful match. Matches that use \K to end before they start
or start before the current point in the subject are not supported. */
if (ovector[1] < ovector[0] || ovector[0] < start_offset)
{
rc = PCRE2_ERROR_BADSUBSPATTERN;
goto EXIT;
}
/* Check for the same match as previous. This is legitimate after matching an
/* Check for the same match as previous. This is legitimate after matching an
empty string that starts after the initial match offset. We have tried again
at the match point in case the pattern is one like /(?<=\G.)/ which can never
match at its starting point, so running the match achieves the bumpalong. If
we do get the same (null) match at the original match point, it isn't such a
pattern, so we now do the empty string magic. In all other cases, a repeat
match should never occur. */
if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
{
if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
{
goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
ovecsave[2] = start_offset;
continue; /* Back to the top of the loop */
{
if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
{
goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
ovecsave[2] = start_offset;
continue; /* Back to the top of the loop */
}
rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
goto EXIT;
}
goto EXIT;
}
/* Count substitutions with a paranoid check for integer overflow; surely no
real call to this function would ever hit this! */
@ -412,21 +458,30 @@ do
}
subs++;
/* Copy the text leading up to the match, and remember where the insert
begins and how many ovector pairs are set. */
/* Copy the text leading up to the match (unless not required), and remember
where the insert begins and how many ovector pairs are set. */
if (rc == 0) rc = ovector_count;
fraglength = ovector[0] - start_offset;
CHECKMEMCPY(subject + start_offset, fraglength);
if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
scb.output_offsets[0] = buff_offset;
scb.oveccount = rc;
/* Process the replacement string. Literal mode is set by \Q, but only in
extended mode when backslashes are being interpreted. In extended mode we
must handle nested substrings that are to be reprocessed. */
/* Process the replacement string. If the entire replacement is literal, just
copy it with length check. */
ptr = replacement;
for (;;)
if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
{
CHECKMEMCPY(ptr, rlength);
}
/* Within a non-literal replacement, which must be scanned character by
character, local literal mode can be set by \Q, but only in extended mode
when backslashes are being interpreted. In extended mode we must handle
nested substrings that are to be reprocessed. */
else for (;;)
{
uint32_t ch;
unsigned int chlen;
@ -443,11 +498,11 @@ do
/* Handle the next character */
if (literal)
if (escaped_literal)
{
if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
{
literal = FALSE;
escaped_literal = FALSE;
ptr += 2;
continue;
}
@ -704,7 +759,7 @@ do
if (forcecase != 0)
{
#ifdef SUPPORT_UNICODE
if (utf)
if (utf || ucp)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&
@ -784,7 +839,7 @@ do
continue;
case ESC_Q:
literal = TRUE;
escaped_literal = TRUE;
continue;
case 0: /* Data character */
@ -806,7 +861,7 @@ do
if (forcecase != 0)
{
#ifdef SUPPORT_UNICODE
if (utf)
if (utf || ucp)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&
@ -835,53 +890,59 @@ do
} /* End handling a literal code unit */
} /* End of loop for scanning the replacement. */
/* The replacement has been copied to the output, or its size has been
remembered. Do the callout if there is one and we have done an actual
/* The replacement has been copied to the output, or its size has been
remembered. Do the callout if there is one and we have done an actual
replacement. */
if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
{
scb.subscount = subs;
scb.subscount = subs;
scb.output_offsets[1] = buff_offset;
rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
/* A non-zero return means cancel this substitution. Instead, copy the
/* A non-zero return means cancel this substitution. Instead, copy the
matched string fragment. */
if (rc != 0)
{
PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
PCRE2_SIZE oldlength = ovector[1] - ovector[0];
buff_offset -= newlength;
lengthleft += newlength;
CHECKMEMCPY(subject + ovector[0], oldlength);
if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
/* A negative return means do not do any more. */
if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
}
}
}
/* Save the details of this match. See above for how this data is used. If we
matched an empty string, do the magic for global matches. Finally, update the
start offset to point to the rest of the subject string. */
ovecsave[0] = ovector[0];
ovecsave[1] = ovector[1];
matched an empty string, do the magic for global matches. Update the start
offset to point to the rest of the subject string. If we re-used an existing
match for the first match, switch to the internal match data block. */
ovecsave[0] = ovector[0];
ovecsave[1] = ovector[1];
ovecsave[2] = start_offset;
goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
start_offset = ovector[1];
} while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
/* Copy the rest of the subject. */
/* Copy the rest of the subject unless not required, and terminate the output
with a binary zero. */
if (!replacement_only)
{
fraglength = length - start_offset;
CHECKMEMCPY(subject + start_offset, fraglength);
}
fraglength = length - start_offset;
CHECKMEMCPY(subject + start_offset, fraglength);
temp[0] = 0;
CHECKMEMCPY(temp , 1);
CHECKMEMCPY(temp, 1);
/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
and matching has carried on after a full buffer, in order to compute the length
@ -903,7 +964,7 @@ else
}
EXIT:
if (match_data_created) pcre2_match_data_free(match_data);
if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
else match_data->rc = rc;
return rc;

View file

@ -265,6 +265,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Chakma0 STR_C STR_h STR_a STR_k STR_m STR_a "\0"
#define STRING_Cham0 STR_C STR_h STR_a STR_m "\0"
#define STRING_Cherokee0 STR_C STR_h STR_e STR_r STR_o STR_k STR_e STR_e "\0"
#define STRING_Chorasmian0 STR_C STR_h STR_o STR_r STR_a STR_s STR_m STR_i STR_a STR_n "\0"
#define STRING_Cn0 STR_C STR_n "\0"
#define STRING_Co0 STR_C STR_o "\0"
#define STRING_Common0 STR_C STR_o STR_m STR_m STR_o STR_n "\0"
@ -275,6 +276,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Cyrillic0 STR_C STR_y STR_r STR_i STR_l STR_l STR_i STR_c "\0"
#define STRING_Deseret0 STR_D STR_e STR_s STR_e STR_r STR_e STR_t "\0"
#define STRING_Devanagari0 STR_D STR_e STR_v STR_a STR_n STR_a STR_g STR_a STR_r STR_i "\0"
#define STRING_Dives_Akuru0 STR_D STR_i STR_v STR_e STR_s STR_UNDERSCORE STR_A STR_k STR_u STR_r STR_u "\0"
#define STRING_Dogra0 STR_D STR_o STR_g STR_r STR_a "\0"
#define STRING_Duployan0 STR_D STR_u STR_p STR_l STR_o STR_y STR_a STR_n "\0"
#define STRING_Egyptian_Hieroglyphs0 STR_E STR_g STR_y STR_p STR_t STR_i STR_a STR_n STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
@ -306,6 +308,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Katakana0 STR_K STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0"
#define STRING_Kayah_Li0 STR_K STR_a STR_y STR_a STR_h STR_UNDERSCORE STR_L STR_i "\0"
#define STRING_Kharoshthi0 STR_K STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0"
#define STRING_Khitan_Small_Script0 STR_K STR_h STR_i STR_t STR_a STR_n STR_UNDERSCORE STR_S STR_m STR_a STR_l STR_l STR_UNDERSCORE STR_S STR_c STR_r STR_i STR_p STR_t "\0"
#define STRING_Khmer0 STR_K STR_h STR_m STR_e STR_r "\0"
#define STRING_Khojki0 STR_K STR_h STR_o STR_j STR_k STR_i "\0"
#define STRING_Khudawadi0 STR_K STR_h STR_u STR_d STR_a STR_w STR_a STR_d STR_i "\0"
@ -429,6 +432,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Xsp0 STR_X STR_s STR_p "\0"
#define STRING_Xuc0 STR_X STR_u STR_c "\0"
#define STRING_Xwd0 STR_X STR_w STR_d "\0"
#define STRING_Yezidi0 STR_Y STR_e STR_z STR_i STR_d STR_i "\0"
#define STRING_Yi0 STR_Y STR_i "\0"
#define STRING_Z0 STR_Z "\0"
#define STRING_Zanabazar_Square0 STR_Z STR_a STR_n STR_a STR_b STR_a STR_z STR_a STR_r STR_UNDERSCORE STR_S STR_q STR_u STR_a STR_r STR_e "\0"
@ -464,6 +468,7 @@ const char PRIV(utt_names)[] =
STRING_Chakma0
STRING_Cham0
STRING_Cherokee0
STRING_Chorasmian0
STRING_Cn0
STRING_Co0
STRING_Common0
@ -474,6 +479,7 @@ const char PRIV(utt_names)[] =
STRING_Cyrillic0
STRING_Deseret0
STRING_Devanagari0
STRING_Dives_Akuru0
STRING_Dogra0
STRING_Duployan0
STRING_Egyptian_Hieroglyphs0
@ -505,6 +511,7 @@ const char PRIV(utt_names)[] =
STRING_Katakana0
STRING_Kayah_Li0
STRING_Kharoshthi0
STRING_Khitan_Small_Script0
STRING_Khmer0
STRING_Khojki0
STRING_Khudawadi0
@ -628,6 +635,7 @@ const char PRIV(utt_names)[] =
STRING_Xsp0
STRING_Xuc0
STRING_Xwd0
STRING_Yezidi0
STRING_Yi0
STRING_Z0
STRING_Zanabazar_Square0
@ -663,176 +671,180 @@ const ucp_type_table PRIV(utt)[] = {
{ 203, PT_SC, ucp_Chakma },
{ 210, PT_SC, ucp_Cham },
{ 215, PT_SC, ucp_Cherokee },
{ 224, PT_PC, ucp_Cn },
{ 227, PT_PC, ucp_Co },
{ 230, PT_SC, ucp_Common },
{ 237, PT_SC, ucp_Coptic },
{ 244, PT_PC, ucp_Cs },
{ 247, PT_SC, ucp_Cuneiform },
{ 257, PT_SC, ucp_Cypriot },
{ 265, PT_SC, ucp_Cyrillic },
{ 274, PT_SC, ucp_Deseret },
{ 282, PT_SC, ucp_Devanagari },
{ 293, PT_SC, ucp_Dogra },
{ 299, PT_SC, ucp_Duployan },
{ 308, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 329, PT_SC, ucp_Elbasan },
{ 337, PT_SC, ucp_Elymaic },
{ 345, PT_SC, ucp_Ethiopic },
{ 354, PT_SC, ucp_Georgian },
{ 363, PT_SC, ucp_Glagolitic },
{ 374, PT_SC, ucp_Gothic },
{ 381, PT_SC, ucp_Grantha },
{ 389, PT_SC, ucp_Greek },
{ 395, PT_SC, ucp_Gujarati },
{ 404, PT_SC, ucp_Gunjala_Gondi },
{ 418, PT_SC, ucp_Gurmukhi },
{ 427, PT_SC, ucp_Han },
{ 431, PT_SC, ucp_Hangul },
{ 438, PT_SC, ucp_Hanifi_Rohingya },
{ 454, PT_SC, ucp_Hanunoo },
{ 462, PT_SC, ucp_Hatran },
{ 469, PT_SC, ucp_Hebrew },
{ 476, PT_SC, ucp_Hiragana },
{ 485, PT_SC, ucp_Imperial_Aramaic },
{ 502, PT_SC, ucp_Inherited },
{ 512, PT_SC, ucp_Inscriptional_Pahlavi },
{ 534, PT_SC, ucp_Inscriptional_Parthian },
{ 557, PT_SC, ucp_Javanese },
{ 566, PT_SC, ucp_Kaithi },
{ 573, PT_SC, ucp_Kannada },
{ 581, PT_SC, ucp_Katakana },
{ 590, PT_SC, ucp_Kayah_Li },
{ 599, PT_SC, ucp_Kharoshthi },
{ 610, PT_SC, ucp_Khmer },
{ 616, PT_SC, ucp_Khojki },
{ 623, PT_SC, ucp_Khudawadi },
{ 633, PT_GC, ucp_L },
{ 635, PT_LAMP, 0 },
{ 638, PT_SC, ucp_Lao },
{ 642, PT_SC, ucp_Latin },
{ 648, PT_SC, ucp_Lepcha },
{ 655, PT_SC, ucp_Limbu },
{ 661, PT_SC, ucp_Linear_A },
{ 670, PT_SC, ucp_Linear_B },
{ 679, PT_SC, ucp_Lisu },
{ 684, PT_PC, ucp_Ll },
{ 687, PT_PC, ucp_Lm },
{ 690, PT_PC, ucp_Lo },
{ 693, PT_PC, ucp_Lt },
{ 696, PT_PC, ucp_Lu },
{ 699, PT_SC, ucp_Lycian },
{ 706, PT_SC, ucp_Lydian },
{ 713, PT_GC, ucp_M },
{ 715, PT_SC, ucp_Mahajani },
{ 724, PT_SC, ucp_Makasar },
{ 732, PT_SC, ucp_Malayalam },
{ 742, PT_SC, ucp_Mandaic },
{ 750, PT_SC, ucp_Manichaean },
{ 761, PT_SC, ucp_Marchen },
{ 769, PT_SC, ucp_Masaram_Gondi },
{ 783, PT_PC, ucp_Mc },
{ 786, PT_PC, ucp_Me },
{ 789, PT_SC, ucp_Medefaidrin },
{ 801, PT_SC, ucp_Meetei_Mayek },
{ 814, PT_SC, ucp_Mende_Kikakui },
{ 828, PT_SC, ucp_Meroitic_Cursive },
{ 845, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 866, PT_SC, ucp_Miao },
{ 871, PT_PC, ucp_Mn },
{ 874, PT_SC, ucp_Modi },
{ 879, PT_SC, ucp_Mongolian },
{ 889, PT_SC, ucp_Mro },
{ 893, PT_SC, ucp_Multani },
{ 901, PT_SC, ucp_Myanmar },
{ 909, PT_GC, ucp_N },
{ 911, PT_SC, ucp_Nabataean },
{ 921, PT_SC, ucp_Nandinagari },
{ 933, PT_PC, ucp_Nd },
{ 936, PT_SC, ucp_New_Tai_Lue },
{ 948, PT_SC, ucp_Newa },
{ 953, PT_SC, ucp_Nko },
{ 957, PT_PC, ucp_Nl },
{ 960, PT_PC, ucp_No },
{ 963, PT_SC, ucp_Nushu },
{ 969, PT_SC, ucp_Nyiakeng_Puachue_Hmong },
{ 992, PT_SC, ucp_Ogham },
{ 998, PT_SC, ucp_Ol_Chiki },
{ 1007, PT_SC, ucp_Old_Hungarian },
{ 1021, PT_SC, ucp_Old_Italic },
{ 1032, PT_SC, ucp_Old_North_Arabian },
{ 1050, PT_SC, ucp_Old_Permic },
{ 1061, PT_SC, ucp_Old_Persian },
{ 1073, PT_SC, ucp_Old_Sogdian },
{ 1085, PT_SC, ucp_Old_South_Arabian },
{ 1103, PT_SC, ucp_Old_Turkic },
{ 1114, PT_SC, ucp_Oriya },
{ 1120, PT_SC, ucp_Osage },
{ 1126, PT_SC, ucp_Osmanya },
{ 1134, PT_GC, ucp_P },
{ 1136, PT_SC, ucp_Pahawh_Hmong },
{ 1149, PT_SC, ucp_Palmyrene },
{ 1159, PT_SC, ucp_Pau_Cin_Hau },
{ 1171, PT_PC, ucp_Pc },
{ 1174, PT_PC, ucp_Pd },
{ 1177, PT_PC, ucp_Pe },
{ 1180, PT_PC, ucp_Pf },
{ 1183, PT_SC, ucp_Phags_Pa },
{ 1192, PT_SC, ucp_Phoenician },
{ 1203, PT_PC, ucp_Pi },
{ 1206, PT_PC, ucp_Po },
{ 1209, PT_PC, ucp_Ps },
{ 1212, PT_SC, ucp_Psalter_Pahlavi },
{ 1228, PT_SC, ucp_Rejang },
{ 1235, PT_SC, ucp_Runic },
{ 1241, PT_GC, ucp_S },
{ 1243, PT_SC, ucp_Samaritan },
{ 1253, PT_SC, ucp_Saurashtra },
{ 1264, PT_PC, ucp_Sc },
{ 1267, PT_SC, ucp_Sharada },
{ 1275, PT_SC, ucp_Shavian },
{ 1283, PT_SC, ucp_Siddham },
{ 1291, PT_SC, ucp_SignWriting },
{ 1303, PT_SC, ucp_Sinhala },
{ 1311, PT_PC, ucp_Sk },
{ 1314, PT_PC, ucp_Sm },
{ 1317, PT_PC, ucp_So },
{ 1320, PT_SC, ucp_Sogdian },
{ 1328, PT_SC, ucp_Sora_Sompeng },
{ 1341, PT_SC, ucp_Soyombo },
{ 1349, PT_SC, ucp_Sundanese },
{ 1359, PT_SC, ucp_Syloti_Nagri },
{ 1372, PT_SC, ucp_Syriac },
{ 1379, PT_SC, ucp_Tagalog },
{ 1387, PT_SC, ucp_Tagbanwa },
{ 1396, PT_SC, ucp_Tai_Le },
{ 1403, PT_SC, ucp_Tai_Tham },
{ 1412, PT_SC, ucp_Tai_Viet },
{ 1421, PT_SC, ucp_Takri },
{ 1427, PT_SC, ucp_Tamil },
{ 1433, PT_SC, ucp_Tangut },
{ 1440, PT_SC, ucp_Telugu },
{ 1447, PT_SC, ucp_Thaana },
{ 1454, PT_SC, ucp_Thai },
{ 1459, PT_SC, ucp_Tibetan },
{ 1467, PT_SC, ucp_Tifinagh },
{ 1476, PT_SC, ucp_Tirhuta },
{ 1484, PT_SC, ucp_Ugaritic },
{ 1493, PT_SC, ucp_Unknown },
{ 1501, PT_SC, ucp_Vai },
{ 1505, PT_SC, ucp_Wancho },
{ 1512, PT_SC, ucp_Warang_Citi },
{ 1524, PT_ALNUM, 0 },
{ 1528, PT_PXSPACE, 0 },
{ 1532, PT_SPACE, 0 },
{ 1536, PT_UCNC, 0 },
{ 1540, PT_WORD, 0 },
{ 1544, PT_SC, ucp_Yi },
{ 1547, PT_GC, ucp_Z },
{ 1549, PT_SC, ucp_Zanabazar_Square },
{ 1566, PT_PC, ucp_Zl },
{ 1569, PT_PC, ucp_Zp },
{ 1572, PT_PC, ucp_Zs }
{ 224, PT_SC, ucp_Chorasmian },
{ 235, PT_PC, ucp_Cn },
{ 238, PT_PC, ucp_Co },
{ 241, PT_SC, ucp_Common },
{ 248, PT_SC, ucp_Coptic },
{ 255, PT_PC, ucp_Cs },
{ 258, PT_SC, ucp_Cuneiform },
{ 268, PT_SC, ucp_Cypriot },
{ 276, PT_SC, ucp_Cyrillic },
{ 285, PT_SC, ucp_Deseret },
{ 293, PT_SC, ucp_Devanagari },
{ 304, PT_SC, ucp_Dives_Akuru },
{ 316, PT_SC, ucp_Dogra },
{ 322, PT_SC, ucp_Duployan },
{ 331, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 352, PT_SC, ucp_Elbasan },
{ 360, PT_SC, ucp_Elymaic },
{ 368, PT_SC, ucp_Ethiopic },
{ 377, PT_SC, ucp_Georgian },
{ 386, PT_SC, ucp_Glagolitic },
{ 397, PT_SC, ucp_Gothic },
{ 404, PT_SC, ucp_Grantha },
{ 412, PT_SC, ucp_Greek },
{ 418, PT_SC, ucp_Gujarati },
{ 427, PT_SC, ucp_Gunjala_Gondi },
{ 441, PT_SC, ucp_Gurmukhi },
{ 450, PT_SC, ucp_Han },
{ 454, PT_SC, ucp_Hangul },
{ 461, PT_SC, ucp_Hanifi_Rohingya },
{ 477, PT_SC, ucp_Hanunoo },
{ 485, PT_SC, ucp_Hatran },
{ 492, PT_SC, ucp_Hebrew },
{ 499, PT_SC, ucp_Hiragana },
{ 508, PT_SC, ucp_Imperial_Aramaic },
{ 525, PT_SC, ucp_Inherited },
{ 535, PT_SC, ucp_Inscriptional_Pahlavi },
{ 557, PT_SC, ucp_Inscriptional_Parthian },
{ 580, PT_SC, ucp_Javanese },
{ 589, PT_SC, ucp_Kaithi },
{ 596, PT_SC, ucp_Kannada },
{ 604, PT_SC, ucp_Katakana },
{ 613, PT_SC, ucp_Kayah_Li },
{ 622, PT_SC, ucp_Kharoshthi },
{ 633, PT_SC, ucp_Khitan_Small_Script },
{ 653, PT_SC, ucp_Khmer },
{ 659, PT_SC, ucp_Khojki },
{ 666, PT_SC, ucp_Khudawadi },
{ 676, PT_GC, ucp_L },
{ 678, PT_LAMP, 0 },
{ 681, PT_SC, ucp_Lao },
{ 685, PT_SC, ucp_Latin },
{ 691, PT_SC, ucp_Lepcha },
{ 698, PT_SC, ucp_Limbu },
{ 704, PT_SC, ucp_Linear_A },
{ 713, PT_SC, ucp_Linear_B },
{ 722, PT_SC, ucp_Lisu },
{ 727, PT_PC, ucp_Ll },
{ 730, PT_PC, ucp_Lm },
{ 733, PT_PC, ucp_Lo },
{ 736, PT_PC, ucp_Lt },
{ 739, PT_PC, ucp_Lu },
{ 742, PT_SC, ucp_Lycian },
{ 749, PT_SC, ucp_Lydian },
{ 756, PT_GC, ucp_M },
{ 758, PT_SC, ucp_Mahajani },
{ 767, PT_SC, ucp_Makasar },
{ 775, PT_SC, ucp_Malayalam },
{ 785, PT_SC, ucp_Mandaic },
{ 793, PT_SC, ucp_Manichaean },
{ 804, PT_SC, ucp_Marchen },
{ 812, PT_SC, ucp_Masaram_Gondi },
{ 826, PT_PC, ucp_Mc },
{ 829, PT_PC, ucp_Me },
{ 832, PT_SC, ucp_Medefaidrin },
{ 844, PT_SC, ucp_Meetei_Mayek },
{ 857, PT_SC, ucp_Mende_Kikakui },
{ 871, PT_SC, ucp_Meroitic_Cursive },
{ 888, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 909, PT_SC, ucp_Miao },
{ 914, PT_PC, ucp_Mn },
{ 917, PT_SC, ucp_Modi },
{ 922, PT_SC, ucp_Mongolian },
{ 932, PT_SC, ucp_Mro },
{ 936, PT_SC, ucp_Multani },
{ 944, PT_SC, ucp_Myanmar },
{ 952, PT_GC, ucp_N },
{ 954, PT_SC, ucp_Nabataean },
{ 964, PT_SC, ucp_Nandinagari },
{ 976, PT_PC, ucp_Nd },
{ 979, PT_SC, ucp_New_Tai_Lue },
{ 991, PT_SC, ucp_Newa },
{ 996, PT_SC, ucp_Nko },
{ 1000, PT_PC, ucp_Nl },
{ 1003, PT_PC, ucp_No },
{ 1006, PT_SC, ucp_Nushu },
{ 1012, PT_SC, ucp_Nyiakeng_Puachue_Hmong },
{ 1035, PT_SC, ucp_Ogham },
{ 1041, PT_SC, ucp_Ol_Chiki },
{ 1050, PT_SC, ucp_Old_Hungarian },
{ 1064, PT_SC, ucp_Old_Italic },
{ 1075, PT_SC, ucp_Old_North_Arabian },
{ 1093, PT_SC, ucp_Old_Permic },
{ 1104, PT_SC, ucp_Old_Persian },
{ 1116, PT_SC, ucp_Old_Sogdian },
{ 1128, PT_SC, ucp_Old_South_Arabian },
{ 1146, PT_SC, ucp_Old_Turkic },
{ 1157, PT_SC, ucp_Oriya },
{ 1163, PT_SC, ucp_Osage },
{ 1169, PT_SC, ucp_Osmanya },
{ 1177, PT_GC, ucp_P },
{ 1179, PT_SC, ucp_Pahawh_Hmong },
{ 1192, PT_SC, ucp_Palmyrene },
{ 1202, PT_SC, ucp_Pau_Cin_Hau },
{ 1214, PT_PC, ucp_Pc },
{ 1217, PT_PC, ucp_Pd },
{ 1220, PT_PC, ucp_Pe },
{ 1223, PT_PC, ucp_Pf },
{ 1226, PT_SC, ucp_Phags_Pa },
{ 1235, PT_SC, ucp_Phoenician },
{ 1246, PT_PC, ucp_Pi },
{ 1249, PT_PC, ucp_Po },
{ 1252, PT_PC, ucp_Ps },
{ 1255, PT_SC, ucp_Psalter_Pahlavi },
{ 1271, PT_SC, ucp_Rejang },
{ 1278, PT_SC, ucp_Runic },
{ 1284, PT_GC, ucp_S },
{ 1286, PT_SC, ucp_Samaritan },
{ 1296, PT_SC, ucp_Saurashtra },
{ 1307, PT_PC, ucp_Sc },
{ 1310, PT_SC, ucp_Sharada },
{ 1318, PT_SC, ucp_Shavian },
{ 1326, PT_SC, ucp_Siddham },
{ 1334, PT_SC, ucp_SignWriting },
{ 1346, PT_SC, ucp_Sinhala },
{ 1354, PT_PC, ucp_Sk },
{ 1357, PT_PC, ucp_Sm },
{ 1360, PT_PC, ucp_So },
{ 1363, PT_SC, ucp_Sogdian },
{ 1371, PT_SC, ucp_Sora_Sompeng },
{ 1384, PT_SC, ucp_Soyombo },
{ 1392, PT_SC, ucp_Sundanese },
{ 1402, PT_SC, ucp_Syloti_Nagri },
{ 1415, PT_SC, ucp_Syriac },
{ 1422, PT_SC, ucp_Tagalog },
{ 1430, PT_SC, ucp_Tagbanwa },
{ 1439, PT_SC, ucp_Tai_Le },
{ 1446, PT_SC, ucp_Tai_Tham },
{ 1455, PT_SC, ucp_Tai_Viet },
{ 1464, PT_SC, ucp_Takri },
{ 1470, PT_SC, ucp_Tamil },
{ 1476, PT_SC, ucp_Tangut },
{ 1483, PT_SC, ucp_Telugu },
{ 1490, PT_SC, ucp_Thaana },
{ 1497, PT_SC, ucp_Thai },
{ 1502, PT_SC, ucp_Tibetan },
{ 1510, PT_SC, ucp_Tifinagh },
{ 1519, PT_SC, ucp_Tirhuta },
{ 1527, PT_SC, ucp_Ugaritic },
{ 1536, PT_SC, ucp_Unknown },
{ 1544, PT_SC, ucp_Vai },
{ 1548, PT_SC, ucp_Wancho },
{ 1555, PT_SC, ucp_Warang_Citi },
{ 1567, PT_ALNUM, 0 },
{ 1571, PT_PXSPACE, 0 },
{ 1575, PT_SPACE, 0 },
{ 1579, PT_UCNC, 0 },
{ 1583, PT_WORD, 0 },
{ 1587, PT_SC, ucp_Yezidi },
{ 1594, PT_SC, ucp_Yi },
{ 1597, PT_GC, ucp_Z },
{ 1599, PT_SC, ucp_Zanabazar_Square },
{ 1616, PT_PC, ucp_Zl },
{ 1619, PT_PC, ucp_Zp },
{ 1622, PT_PC, ucp_Zs }
};
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

File diff suppressed because it is too large Load diff

View file

@ -286,7 +286,12 @@ enum {
ucp_Elymaic,
ucp_Nandinagari,
ucp_Nyiakeng_Puachue_Hmong,
ucp_Wancho
ucp_Wancho,
/* New for Unicode 13.0.0 */
ucp_Chorasmian,
ucp_Dives_Akuru,
ucp_Khitan_Small_Script,
ucp_Yezidi
};
#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2017 University of Cambridge
New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -347,7 +347,7 @@ for (p = string; length > 0; p++)
length--;
if ((*p & 0xfc00) != 0xdc00)
{
*erroroffset = p - string;
*erroroffset = p - string - 1;
return PCRE2_ERROR_UTF16_ERR2;
}
}