pcre2: import version 10.35

Files removed as described in b418e36f2
This commit is contained in:
David Adam 2020-05-10 22:27:46 +08:00
parent 3252d0fd03
commit 73ecf1576b
21 changed files with 3039 additions and 2598 deletions

170
pcre2/CMakeLists.txt vendored
View file

@ -1,6 +1,5 @@
# CMakeLists.txt # CMakeLists.txt
# #
#
# This file enables PCRE2 to be built with the CMake configuration and build # This file enables PCRE2 to be built with the CMake configuration and build
# tool. Download CMake in source or binary form from http://www.cmake.org/ # tool. Download CMake in source or binary form from http://www.cmake.org/
# Converted to support PCRE2 from the original PCRE file, August 2014. # Converted to support PCRE2 from the original PCRE file, August 2014.
@ -85,6 +84,14 @@
# 2018-11-14 PH removed unnecessary checks for stdint.h and inttypes.h # 2018-11-14 PH removed unnecessary checks for stdint.h and inttypes.h
# 2018-11-16 PH added PCRE2GREP_SUPPORT_CALLOUT_FORK support and tidied # 2018-11-16 PH added PCRE2GREP_SUPPORT_CALLOUT_FORK support and tidied
# 2019-02-16 PH hacked to avoid CMP0026 policy issue (see comments below) # 2019-02-16 PH hacked to avoid CMP0026 policy issue (see comments below)
# 2020-03-16 PH renamed dftables as pcre2_dftables (as elsewhere)
# 2020-03-24 PH changed CMAKE_MODULE_PATH definition to add, not replace
# 2020-04-08 Carlo added function check for secure_getenv, fixed strerror
# 2020-04-16 enh added check for __attribute__((uninitialized))
# 2020-04-25 PH applied patches from Uwe Korn to support pkg-config and
# library versioning.
# 2020-04-25 Carlo added function check for mkostemp used in ProtExecAllocator
# 2020-04-28 PH added function check for memfd_create based on Carlo's patch
PROJECT(PCRE2 C) PROJECT(PCRE2 C)
@ -95,14 +102,26 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0)
# GET_TARGET_PROPERTY. This should no longer be required. # GET_TARGET_PROPERTY. This should no longer be required.
# CMAKE_POLICY(SET CMP0026 OLD) # CMAKE_POLICY(SET CMP0026 OLD)
SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake # For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
# on the command line.
# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR}/src") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR}/src")
# external packages
FIND_PACKAGE( BZip2 )
FIND_PACKAGE( ZLIB )
FIND_PACKAGE( Readline )
FIND_PACKAGE( Editline )
# Configuration checks # Configuration checks
INCLUDE(CheckIncludeFile) INCLUDE(CheckCSourceCompiles)
INCLUDE(CheckFunctionExists) INCLUDE(CheckFunctionExists)
INCLUDE(CheckSymbolExists)
INCLUDE(CheckIncludeFile)
INCLUDE(CheckTypeSize) INCLUDE(CheckTypeSize)
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H) CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
@ -113,9 +132,19 @@ CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H) CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H) CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY) CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY)
CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE) CHECK_FUNCTION_EXISTS(memfd_create HAVE_MEMFD_CREATE)
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR) CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE)
CHECK_FUNCTION_EXISTS(secure_getenv HAVE_SECURE_GETENV)
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR)
# Check whether the C compiler accepts __attribute__((uninitialized)) on a
# local array. -Werror is appended to CMAKE_REQUIRED_FLAGS for this one check
# only (presumably so that a compiler which merely warns about an unknown
# attribute fails the test - TODO confirm). The previous flags are saved first
# and restored afterwards so that later checks are unaffected.
set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
# The outcome is stored in HAVE_ATTRIBUTE_UNINITIALIZED.
CHECK_C_SOURCE_COMPILES(
"int main() { char buf[128] __attribute__((uninitialized)); (void)buf; return 0; }"
HAVE_ATTRIBUTE_UNINITIALIZED
)
set(CMAKE_REQUIRED_FLAGS ${ORIG_CMAKE_REQUIRED_FLAGS})
# User-configurable options # User-configurable options
# #
@ -171,8 +200,12 @@ SET(PCRE2_HEAP_MATCH_RECURSE OFF CACHE BOOL
SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
"Enable support for Just-in-time compiling.") "Enable support for Just-in-time compiling.")
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
"Enable SELinux compatible execmem allocator in JIT (experimental).") SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL
"Enable SELinux compatible execmem allocator in JIT (experimental).")
ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
SET(PCRE2_SUPPORT_JIT_SEALLOC IGNORE)
ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
SET(PCRE2GREP_SUPPORT_JIT ON CACHE BOOL SET(PCRE2GREP_SUPPORT_JIT ON CACHE BOOL
"Enable use of Just-in-time compiling in pcre2grep.") "Enable use of Just-in-time compiling in pcre2grep.")
@ -298,7 +331,19 @@ IF(PCRE2_SUPPORT_JIT)
ENDIF(PCRE2_SUPPORT_JIT) ENDIF(PCRE2_SUPPORT_JIT)
IF(PCRE2_SUPPORT_JIT_SEALLOC) IF(PCRE2_SUPPORT_JIT_SEALLOC)
SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1) SET(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
CHECK_SYMBOL_EXISTS(mkostemp stdlib.h REQUIRED)
UNSET(CMAKE_REQUIRED_DEFINITIONS)
IF(${REQUIRED})
IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
ADD_DEFINITIONS(-D_GNU_SOURCE)
SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1)
ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
MESSAGE(FATAL_ERROR "Your configuration is not supported")
ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
ELSE(${REQUIRED})
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF)
ENDIF(${REQUIRED})
ENDIF(PCRE2_SUPPORT_JIT_SEALLOC) ENDIF(PCRE2_SUPPORT_JIT_SEALLOC)
IF(PCRE2GREP_SUPPORT_JIT) IF(PCRE2GREP_SUPPORT_JIT)
@ -394,12 +439,13 @@ file(STRINGS ${PROJECT_SOURCE_DIR}/configure.ac
LIMIT_COUNT 50 # Read only the first 50 lines of the file LIMIT_COUNT 50 # Read only the first 50 lines of the file
) )
set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date") set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date"
"libpcre2_posix_version" "libpcre2_8_version" "libpcre2_16_version" "libpcre2_32_version")
foreach(configure_line ${configure_lines}) foreach(configure_line ${configure_lines})
foreach(_substitution_variable ${SEARCHED_VARIABLES}) foreach(_substitution_variable ${SEARCHED_VARIABLES})
string(TOUPPER ${_substitution_variable} _substitution_variable_upper) string(TOUPPER ${_substitution_variable} _substitution_variable_upper)
if (NOT ${_substitution_variable_upper}) if (NOT ${_substitution_variable_upper})
string(REGEX MATCH "m4_define\\(${_substitution_variable}, \\[(.*)\\]" MACTHED_STRING ${configure_line}) string(REGEX MATCH "m4_define\\(${_substitution_variable}, *\\[(.*)\\]" MACTHED_STRING ${configure_line})
if (CMAKE_MATCH_1) if (CMAKE_MATCH_1)
set(${_substitution_variable_upper} ${CMAKE_MATCH_1}) set(${_substitution_variable_upper} ${CMAKE_MATCH_1})
endif() endif()
@ -407,21 +453,74 @@ foreach(configure_line ${configure_lines})
endforeach() endforeach()
endforeach() endforeach()
# PARSE_LIB_VERSION(<prefix>)
#
# Converts the libtool interface version held in <prefix>_VERSION, written as
# "current:revision:age", into the variables used for library versioning:
#
#   <prefix>_VERSION_CURRENT              - first field
#   <prefix>_VERSION_REVISION             - second field
#   <prefix>_VERSION_AGE                  - third field
#   <prefix>_SOVERSION                    - current - age
#   <prefix>_MACHO_COMPATIBILITY_VERSION  - current + 1
#   <prefix>_MACHO_CURRENT_VERSION        - (current + 1).revision
#   <prefix>_VERSION (overwritten)        - soversion.age.revision
#
# Example: "2:3:0" gives SOVERSION 2, VERSION 2.0.3, Mach-O compatibility
# version 3 and Mach-O current version 3.3.
#
# This is a macro, not a function, so every result is set in the caller's
# scope.
macro(PARSE_LIB_VERSION VARIABLE_PREFIX)
  # Fail with a readable message instead of an obscure string(REPLACE) error
  # when the version was not found.
  if(NOT DEFINED ${VARIABLE_PREFIX}_VERSION)
    message(FATAL_ERROR "PARSE_LIB_VERSION: ${VARIABLE_PREFIX}_VERSION is not set")
  endif()

  # Split "current:revision:age" into a three-element list.
  string(REPLACE ":" ";" ${VARIABLE_PREFIX}_VERSION_LIST ${${VARIABLE_PREFIX}_VERSION})
  list(GET ${VARIABLE_PREFIX}_VERSION_LIST 0 ${VARIABLE_PREFIX}_VERSION_CURRENT)
  list(GET ${VARIABLE_PREFIX}_VERSION_LIST 1 ${VARIABLE_PREFIX}_VERSION_REVISION)
  list(GET ${VARIABLE_PREFIX}_VERSION_LIST 2 ${VARIABLE_PREFIX}_VERSION_AGE)

  math(EXPR ${VARIABLE_PREFIX}_SOVERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} - ${${VARIABLE_PREFIX}_VERSION_AGE}")
  math(EXPR ${VARIABLE_PREFIX}_MACHO_COMPATIBILITY_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
  math(EXPR ${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")

  # The value must be a plain dotted number such as "11.0"; any extra
  # character inside the quotes would become part of the version string.
  set(${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION}.${${VARIABLE_PREFIX}_VERSION_REVISION}")

  # Replace the raw libtool triple with the dotted library version.
  set(${VARIABLE_PREFIX}_VERSION "${${VARIABLE_PREFIX}_SOVERSION}.${${VARIABLE_PREFIX}_VERSION_AGE}.${${VARIABLE_PREFIX}_VERSION_REVISION}")
endmacro()
# Derive the VERSION, SOVERSION and Mach-O version variables for each library
# from the "current:revision:age" values read from configure.ac above.
PARSE_LIB_VERSION(LIBPCRE2_POSIX)
PARSE_LIB_VERSION(LIBPCRE2_8)
PARSE_LIB_VERSION(LIBPCRE2_16)
PARSE_LIB_VERSION(LIBPCRE2_32)
CONFIGURE_FILE(src/pcre2.h.in CONFIGURE_FILE(src/pcre2.h.in
${PROJECT_BINARY_DIR}/pcre2.h ${PROJECT_BINARY_DIR}/pcre2.h
@ONLY) @ONLY)
# What about pcre2-config and libpcre2.pc? # Generate pkg-config files
# Substitution variables for the *.pc.in and pcre2-config.in templates.
# exec_prefix, libdir and includedir are deliberately escaped so that the
# generated files contain literal ${prefix}-style references rather than
# paths expanded by CMake.
SET(PACKAGE_VERSION "${PCRE2_MAJOR}.${PCRE2_MINOR}")
SET(prefix ${CMAKE_INSTALL_PREFIX})
SET(exec_prefix "\${prefix}")
SET(libdir "\${exec_prefix}/lib")
SET(includedir "\${prefix}/include")

# One .pc file is generated per library that is actually built; each generated
# file is appended to pkg_config_files for installation. The enable_pcre2_<N>
# yes/no values are set for use by the templates.
IF(PCRE2_BUILD_PCRE2_8)
  # The pcre2-posix library is only built together with the 8-bit library, so
  # its pkg-config file is only generated (and installed) in that case.
  CONFIGURE_FILE(libpcre2-posix.pc.in libpcre2-posix.pc @ONLY)
  LIST(APPEND pkg_config_files "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-posix.pc")
  CONFIGURE_FILE(libpcre2-8.pc.in libpcre2-8.pc @ONLY)
  LIST(APPEND pkg_config_files "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-8.pc")
  SET(enable_pcre2_8 "yes")
ELSE()
  SET(enable_pcre2_8 "no")
ENDIF()

IF(PCRE2_BUILD_PCRE2_16)
  CONFIGURE_FILE(libpcre2-16.pc.in libpcre2-16.pc @ONLY)
  LIST(APPEND pkg_config_files "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-16.pc")
  SET(enable_pcre2_16 "yes")
ELSE()
  SET(enable_pcre2_16 "no")
ENDIF()

IF(PCRE2_BUILD_PCRE2_32)
  CONFIGURE_FILE(libpcre2-32.pc.in libpcre2-32.pc @ONLY)
  LIST(APPEND pkg_config_files "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-32.pc")
  SET(enable_pcre2_32 "yes")
ELSE()
  SET(enable_pcre2_32 "no")
ENDIF()

# pcre2-config is generated last so that all enable_pcre2_<N> values are set.
CONFIGURE_FILE(pcre2-config.in pcre2-config @ONLY)
# Character table generation # Character table generation
OPTION(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF) OPTION(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF)
IF(PCRE2_REBUILD_CHARTABLES) IF(PCRE2_REBUILD_CHARTABLES)
ADD_EXECUTABLE(dftables src/dftables.c) ADD_EXECUTABLE(pcre2_dftables src/pcre2_dftables.c)
ADD_CUSTOM_COMMAND( ADD_CUSTOM_COMMAND(
COMMENT "Generating character tables (pcre2_chartables.c) for current locale" COMMENT "Generating character tables (pcre2_chartables.c) for current locale"
DEPENDS dftables DEPENDS pcre2_dftables
COMMAND dftables COMMAND pcre2_dftables
ARGS ${PROJECT_BINARY_DIR}/pcre2_chartables.c ARGS ${PROJECT_BINARY_DIR}/pcre2_chartables.c
OUTPUT ${PROJECT_BINARY_DIR}/pcre2_chartables.c OUTPUT ${PROJECT_BINARY_DIR}/pcre2_chartables.c
) )
@ -446,6 +545,7 @@ SET(PCRE2_SOURCES
src/pcre2_error.c src/pcre2_error.c
src/pcre2_extuni.c src/pcre2_extuni.c
src/pcre2_find_bracket.c src/pcre2_find_bracket.c
src/pcre2_jit_compile.c
src/pcre2_maketables.c src/pcre2_maketables.c
src/pcre2_match.c src/pcre2_match.c
src/pcre2_match_data.c src/pcre2_match_data.c
@ -464,6 +564,9 @@ SET(PCRE2_SOURCES
src/pcre2_xclass.c src/pcre2_xclass.c
) )
SET(PCRE2POSIX_HEADERS src/pcre2posix.h)
SET(PCRE2POSIX_SOURCES src/pcre2posix.c)
IF(MINGW AND NOT PCRE2_STATIC) IF(MINGW AND NOT PCRE2_STATIC)
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc) IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
@ -531,12 +634,20 @@ SET(targets)
IF(PCRE2_BUILD_PCRE2_8) IF(PCRE2_BUILD_PCRE2_8)
ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h) ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET_PROPERTY(TARGET pcre2-8 SET_TARGET_PROPERTIES(pcre2-8 PROPERTIES
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8) COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
VERSION ${LIBPCRE2_8_VERSION}
SOVERSION ${LIBPCRE2_8_SOVERSION})
SET(targets ${targets} pcre2-8) SET(targets ${targets} pcre2-8)
ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES}) ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
SET_PROPERTY(TARGET pcre2-posix SET_TARGET_PROPERTIES(pcre2-posix PROPERTIES
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8) COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
VERSION ${LIBPCRE2_POSIX_VERSION}
SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
SET(targets ${targets} pcre2-posix) SET(targets ${targets} pcre2-posix)
TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8) TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8)
@ -554,8 +665,12 @@ ENDIF(PCRE2_BUILD_PCRE2_8)
IF(PCRE2_BUILD_PCRE2_16) IF(PCRE2_BUILD_PCRE2_16)
ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h) ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET_PROPERTY(TARGET pcre2-16 SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16) COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
# The 16-bit library uses its own LIBPCRE2_16 Mach-O versions, consistent with
# the LIBPCRE2_16 VERSION and SOVERSION that follow.
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_16_MACHO_COMPATIBILITY_VERSION}"
MACHO_CURRENT_VERSION "${LIBPCRE2_16_MACHO_CURRENT_VERSION}"
VERSION ${LIBPCRE2_16_VERSION}
SOVERSION ${LIBPCRE2_16_SOVERSION})
SET(targets ${targets} pcre2-16) SET(targets ${targets} pcre2-16)
IF(MINGW AND NOT PCRE2_STATIC) IF(MINGW AND NOT PCRE2_STATIC)
@ -572,8 +687,12 @@ ENDIF(PCRE2_BUILD_PCRE2_16)
IF(PCRE2_BUILD_PCRE2_32) IF(PCRE2_BUILD_PCRE2_32)
ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h) ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET_PROPERTY(TARGET pcre2-32 SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32) COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
VERSION ${LIBPCRE2_32_VERSION}
SOVERSION ${LIBPCRE2_32_SOVERSION})
SET(targets ${targets} pcre2-32) SET(targets ${targets} pcre2-32)
IF(MINGW AND NOT PCRE2_STATIC) IF(MINGW AND NOT PCRE2_STATIC)
@ -746,6 +865,11 @@ INSTALL(TARGETS ${targets}
RUNTIME DESTINATION bin RUNTIME DESTINATION bin
LIBRARY DESTINATION lib LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib) ARCHIVE DESTINATION lib)
# Install every generated pkg-config file.
INSTALL(FILES ${pkg_config_files} DESTINATION lib/pkgconfig)
# pcre2-config must be installed executable. The PROGRAMS form applies 0755
# (owner rwx, group/world r-x) by default, so no explicit PERMISSIONS list is
# required.
INSTALL(PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/pcre2-config"
  DESTINATION bin)
INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include) INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include)

6
pcre2/LICENCE vendored
View file

@ -26,7 +26,7 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service, University of Cambridge Computing Service,
Cambridge, England. Cambridge, England.
Copyright (c) 1997-2019 University of Cambridge Copyright (c) 1997-2020 University of Cambridge
All rights reserved. All rights reserved.
@ -37,7 +37,7 @@ Written by: Zoltan Herczeg
Email local part: hzmester Email local part: hzmester
Email domain: freemail.hu Email domain: freemail.hu
Copyright(c) 2010-2019 Zoltan Herczeg Copyright(c) 2010-2020 Zoltan Herczeg
All rights reserved. All rights reserved.
@ -48,7 +48,7 @@ Written by: Zoltan Herczeg
Email local part: hzmester Email local part: hzmester
Email domain: freemail.hu Email domain: freemail.hu
Copyright(c) 2009-2019 Zoltan Herczeg Copyright(c) 2009-2020 Zoltan Herczeg
All rights reserved. All rights reserved.

View file

@ -1,5 +1,6 @@
/* config.h for CMake builds */ /* config.h for CMake builds */
#cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1
#cmakedefine HAVE_DIRENT_H 1 #cmakedefine HAVE_DIRENT_H 1
#cmakedefine HAVE_INTTYPES_H 1 #cmakedefine HAVE_INTTYPES_H 1
#cmakedefine HAVE_STDINT_H 1 #cmakedefine HAVE_STDINT_H 1
@ -10,7 +11,10 @@
#cmakedefine HAVE_WINDOWS_H 1 #cmakedefine HAVE_WINDOWS_H 1
#cmakedefine HAVE_BCOPY 1 #cmakedefine HAVE_BCOPY 1
#cmakedefine HAVE_MEMFD_CREATE 1
#cmakedefine HAVE_MEMMOVE 1 #cmakedefine HAVE_MEMMOVE 1
#cmakedefine HAVE_SECURE_GETENV 1
#cmakedefine HAVE_STRERROR 1
#cmakedefine PCRE2_STATIC 1 #cmakedefine PCRE2_STATIC 1

12
pcre2/configure.ac vendored
View file

@ -1,7 +1,13 @@
m4_define(pcre2_major, [10]) m4_define(pcre2_major, [10])
m4_define(pcre2_minor, [34]) m4_define(pcre2_minor, [35])
m4_define(pcre2_prerelease, []) m4_define(pcre2_prerelease, [])
m4_define(pcre2_date, [2019-11-21]) m4_define(pcre2_date, [2020-05-09])
# Libtool shared library interface versions (current:revision:age)
m4_define(libpcre2_8_version, [10:0:10])
m4_define(libpcre2_16_version, [10:0:10])
m4_define(libpcre2_32_version, [10:0:10])
m4_define(libpcre2_posix_version, [2:3:0])
# NOTE: The CMakeLists.txt file searches for the above variables in the first # NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved. # 50 lines of this file. Please update that if the variables above are moved.

View file

@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, second API, to be /* This is the public header file for the PCRE library, second API, to be
#included by applications that call PCRE2 functions. #included by applications that call PCRE2 functions.
Copyright (c) 2016-2019 University of Cambridge Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -181,6 +181,9 @@ pcre2_jit_match() ignores the latter since it bypasses all sanity checks). */
#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */ #define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */
#define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */ #define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */
#define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u #define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u
#define PCRE2_SUBSTITUTE_LITERAL 0x00008000u /* pcre2_substitute() only */
#define PCRE2_SUBSTITUTE_MATCHED 0x00010000u /* pcre2_substitute() only */
#define PCRE2_SUBSTITUTE_REPLACEMENT_ONLY 0x00020000u /* pcre2_substitute() only */
/* Options for pcre2_pattern_convert(). */ /* Options for pcre2_pattern_convert(). */
@ -445,6 +448,7 @@ released, the numbers must not be changed. */
#define PCRE2_CONFIG_HEAPLIMIT 12 #define PCRE2_CONFIG_HEAPLIMIT 12
#define PCRE2_CONFIG_NEVER_BACKSLASH_C 13 #define PCRE2_CONFIG_NEVER_BACKSLASH_C 13
#define PCRE2_CONFIG_COMPILED_WIDTHS 14 #define PCRE2_CONFIG_COMPILED_WIDTHS 14
#define PCRE2_CONFIG_TABLES_LENGTH 15
/* Types for code units in patterns and subject strings. */ /* Types for code units in patterns and subject strings. */

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
Arguments: Arguments:
code points to start of expression code points to start of expression
utf TRUE if in UTF mode utf TRUE if in UTF mode
ucp TRUE if in UCP mode
fcc points to the case-flipping table fcc points to the case-flipping table
list points to output list list points to output list
list[0] will be filled with the opcode list[0] will be filled with the opcode
@ -304,7 +305,7 @@ Returns: points to the start of the next opcode if *code is accepted
*/ */
static PCRE2_SPTR static PCRE2_SPTR
get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc, get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
uint32_t *list) uint32_t *list)
{ {
PCRE2_UCHAR c = *code; PCRE2_UCHAR c = *code;
@ -316,7 +317,8 @@ uint32_t chr;
uint32_t *clist_dest; uint32_t *clist_dest;
const uint32_t *clist_src; const uint32_t *clist_src;
#else #else
(void)utf; /* Suppress "unused parameter" compiler warning */ (void)utf; /* Suppress "unused parameter" compiler warnings */
(void)ucp;
#endif #endif
list[0] = c; list[0] = c;
@ -396,7 +398,7 @@ switch(c)
list[2] = chr; list[2] = chr;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (chr < 128 || (chr < 256 && !utf)) if (chr < 128 || (chr < 256 && !utf && !ucp))
list[3] = fcc[chr]; list[3] = fcc[chr];
else else
list[3] = UCD_OTHERCASE(chr); list[3] = UCD_OTHERCASE(chr);
@ -503,6 +505,7 @@ which case the base cannot be possessified.
Arguments: Arguments:
code points to the byte code code points to the byte code
utf TRUE in UTF mode utf TRUE in UTF mode
ucp TRUE in UCP mode
cb compile data block cb compile data block
base_list the data list of the base opcode base_list the data list of the base opcode
base_end the end of the base opcode base_end the end of the base opcode
@ -512,7 +515,7 @@ Returns: TRUE if the auto-possessification is possible
*/ */
static BOOL static BOOL
compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb, compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
{ {
PCRE2_UCHAR c; PCRE2_UCHAR c;
@ -651,7 +654,7 @@ for(;;)
while (*next_code == OP_ALT) while (*next_code == OP_ALT)
{ {
if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit)) if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
return FALSE; return FALSE;
code = next_code + 1 + LINK_SIZE; code = next_code + 1 + LINK_SIZE;
next_code += GET(next_code, 1); next_code += GET(next_code, 1);
@ -672,7 +675,8 @@ for(;;)
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
next_code += 1 + LINK_SIZE; next_code += 1 + LINK_SIZE;
if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit)) if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
rec_limit))
return FALSE; return FALSE;
code += PRIV(OP_lengths)[c]; code += PRIV(OP_lengths)[c];
@ -688,7 +692,7 @@ for(;;)
/* We now have the next appropriate opcode to compare with the base. Check /* We now have the next appropriate opcode to compare with the base. Check
for a supported opcode, and load its properties. */ for a supported opcode, and load its properties. */
code = get_chr_property_list(code, utf, cb->fcc, list); code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */ if (code == NULL) return FALSE; /* Unsupported */
/* If either opcode is a small character list, set pointers for comparing /* If either opcode is a small character list, set pointers for comparing
@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.
Arguments: Arguments:
code points to start of the byte code code points to start of the byte code
utf TRUE in UTF mode
cb compile data block cb compile data block
Returns: 0 for success Returns: 0 for success
@ -1108,13 +1111,15 @@ Returns: 0 for success
*/ */
int int
PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb) PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
{ {
PCRE2_UCHAR c; PCRE2_UCHAR c;
PCRE2_SPTR end; PCRE2_SPTR end;
PCRE2_UCHAR *repeat_opcode; PCRE2_UCHAR *repeat_opcode;
uint32_t list[8]; uint32_t list[8];
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
for (;;) for (;;)
{ {
@ -1126,10 +1131,11 @@ for (;;)
{ {
c -= get_repeat_base(c) - OP_STAR; c -= get_repeat_base(c) - OP_STAR;
end = (c <= OP_MINUPTO) ? end = (c <= OP_MINUPTO) ?
get_chr_property_list(code, utf, cb->fcc, list) : NULL; get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit)) if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
&rec_limit))
{ {
switch(c) switch(c)
{ {
@ -1181,11 +1187,11 @@ for (;;)
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
{ {
/* end must not be NULL. */ /* end must not be NULL. */
end = get_chr_property_list(code, utf, cb->fcc, list); end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
list[1] = (c & 1) == 0; list[1] = (c & 1) == 0;
if (compare_opcodes(end, utf, cb, list, end, &rec_limit)) if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
{ {
switch (c) switch (c)
{ {

View file

@ -2,17 +2,21 @@
* Perl-Compatible Regular Expressions * * Perl-Compatible Regular Expressions *
*************************************************/ *************************************************/
/* This file was automatically written by the dftables auxiliary /* This file was automatically written by the pcre2_dftables auxiliary
program. It contains character tables that are used when no external program. It contains character tables that are used when no external
tables are passed to PCRE2 by the application that calls it. The tables tables are passed to PCRE2 by the application that calls it. The tables
are used only for characters whose code values are less than 256. */ are used only for characters whose code values are less than 256. */
/*The dftables program (which is distributed with PCRE2) can be used to /* This set of tables was written in the C locale. */
build alternative versions of this file. This is necessary if you are
/* The pcre2_dftables program (which is distributed with PCRE2) can be used
to build alternative versions of this file. This is necessary if you are
running in an EBCDIC environment, or if you want to default to a different running in an EBCDIC environment, or if you want to default to a different
encoding, for example ISO-8859-1. When dftables is run, it creates these encoding, for example ISO-8859-1. When pcre2_dftables is run, it creates
tables in the current locale. This happens automatically if PCRE2 is these tables in the "C" locale by default. This happens automatically if
configured with --enable-rebuild-chartables. */ PCRE2 is configured with --enable-rebuild-chartables. However, you can run
pcre2_dftables manually with the -L option to build tables using the LC_ALL
locale. */
/* The following #include is present because without it gcc 4.x may remove /* The following #include is present because without it gcc 4.x may remove
the array definition from the final binary if PCRE2 is built into a static the array definition from the final binary if PCRE2 is built into a static
@ -102,54 +106,54 @@ const uint8_t PRIV(default_tables)[] = {
/* This table contains bit maps for various character classes. Each map is 32 /* This table contains bit maps for various character classes. Each map is 32
bytes long and the bits run from the least significant end of each byte. The bytes long and the bits run from the least significant end of each byte. The
classes that have their own maps are: space, xdigit, digit, upper, lower, word, classes that have their own maps are: space, xdigit, digit, upper, lower, word,
graph print, punct, and cntrl. Other classes are built from combinations. */ graph, print, punct, and cntrl. Other classes are built from combinations. */
0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, /* space */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* xdigit */
0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* digit */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* upper */
0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00, 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* lower */
0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07, 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* word */
0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07, 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, /* graph */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, /* print */
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, /* punct */
0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78, 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, /* cntrl */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -1195,13 +1195,14 @@ if (code == NULL) return NULL;
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (newcode == NULL) return NULL; if (newcode == NULL) return NULL;
memcpy(newcode, code, code->blocksize); memcpy(newcode, code, code->blocksize);
newcode->executable_jit = NULL;
/* If the code is one that has been deserialized, increment the reference count /* If the code is one that has been deserialized, increment the reference count
in the decoded tables. */ in the decoded tables. */
if ((code->flags & PCRE2_DEREF_TABLES) != 0) if ((code->flags & PCRE2_DEREF_TABLES) != 0)
{ {
ref_count = (PCRE2_SIZE *)(code->tables + tables_length); ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
(*ref_count)++; (*ref_count)++;
} }
@ -1229,16 +1230,17 @@ if (code == NULL) return NULL;
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (newcode == NULL) return NULL; if (newcode == NULL) return NULL;
memcpy(newcode, code, code->blocksize); memcpy(newcode, code, code->blocksize);
newcode->executable_jit = NULL;
newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE), newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
code->memctl.memory_data); code->memctl.memory_data);
if (newtables == NULL) if (newtables == NULL)
{ {
code->memctl.free((void *)newcode, code->memctl.memory_data); code->memctl.free((void *)newcode, code->memctl.memory_data);
return NULL; return NULL;
} }
memcpy(newtables, code->tables, tables_length); memcpy(newtables, code->tables, TABLES_LENGTH);
ref_count = (PCRE2_SIZE *)(newtables + tables_length); ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
*ref_count = 1; *ref_count = 1;
newcode->tables = newtables; newcode->tables = newtables;
@ -1259,6 +1261,8 @@ PCRE2_SIZE* ref_count;
if (code != NULL) if (code != NULL)
{ {
if (code->executable_jit != NULL)
PRIV(jit_free)(code->executable_jit, &code->memctl);
if ((code->flags & PCRE2_DEREF_TABLES) != 0) if ((code->flags & PCRE2_DEREF_TABLES) != 0)
{ {
@ -1266,7 +1270,7 @@ if (code != NULL)
be freed when there are no more references to them. The *ref_count should be freed when there are no more references to them. The *ref_count should
always be > 0. */ always be > 0. */
ref_count = (PCRE2_SIZE *)(code->tables + tables_length); ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
if (*ref_count > 0) if (*ref_count > 0)
{ {
(*ref_count)--; (*ref_count)--;
@ -3649,7 +3653,7 @@ while (ptr < ptrend)
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
/* If ( is not followed by ? it is either a capture or a special verb or an /* If ( is not followed by ? it is either a capture or a special verb or an
alpha assertion. */ alpha assertion or a positive non-atomic lookahead. */
if (*ptr != CHAR_QUESTION_MARK) if (*ptr != CHAR_QUESTION_MARK)
{ {
@ -3681,10 +3685,10 @@ while (ptr < ptrend)
break; break;
/* Handle "alpha assertions" such as (*pla:...). Most of these are /* Handle "alpha assertions" such as (*pla:...). Most of these are
synonyms for the historical symbolic assertions, but the script run ones synonyms for the historical symbolic assertions, but the script run and
are new. They are distinguished by starting with a lower case letter. non-atomic lookaround ones are new. They are distinguished by starting
Checking both ends of the alphabet makes this work in all character with a lower case letter. Checking both ends of the alphabet makes this
codes. */ work in all character codes. */
else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0) else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
{ {
@ -3743,9 +3747,7 @@ while (ptr < ptrend)
goto POSITIVE_LOOK_AHEAD; goto POSITIVE_LOOK_AHEAD;
case META_LOOKAHEAD_NA: case META_LOOKAHEAD_NA:
*parsed_pattern++ = meta; goto POSITIVE_NONATOMIC_LOOK_AHEAD;
ptr++;
goto POST_ASSERTION;
case META_LOOKAHEADNOT: case META_LOOKAHEADNOT:
goto NEGATIVE_LOOK_AHEAD; goto NEGATIVE_LOOK_AHEAD;
@ -4434,6 +4436,12 @@ while (ptr < ptrend)
ptr++; ptr++;
goto POST_ASSERTION; goto POST_ASSERTION;
case CHAR_ASTERISK:
POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
*parsed_pattern++ = META_LOOKAHEAD_NA;
ptr++;
goto POST_ASSERTION;
case CHAR_EXCLAMATION_MARK: case CHAR_EXCLAMATION_MARK:
NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */ NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
*parsed_pattern++ = META_LOOKAHEADNOT; *parsed_pattern++ = META_LOOKAHEADNOT;
@ -4443,20 +4451,23 @@ while (ptr < ptrend)
/* ---- Lookbehind assertions ---- */ /* ---- Lookbehind assertions ---- */
/* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
start of the name of a capturing group. */ is the start of the name of a capturing group. */
case CHAR_LESS_THAN_SIGN: case CHAR_LESS_THAN_SIGN:
if (ptrend - ptr <= 1 || if (ptrend - ptr <= 1 ||
(ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK)) (ptr[1] != CHAR_EQUALS_SIGN &&
ptr[1] != CHAR_EXCLAMATION_MARK &&
ptr[1] != CHAR_ASTERISK))
{ {
terminator = CHAR_GREATER_THAN_SIGN; terminator = CHAR_GREATER_THAN_SIGN;
goto DEFINE_NAME; goto DEFINE_NAME;
} }
*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
META_LOOKBEHIND : META_LOOKBEHINDNOT; META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */ POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
*has_lookbehind = TRUE; *has_lookbehind = TRUE;
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
PUTOFFSET(offset, parsed_pattern); PUTOFFSET(offset, parsed_pattern);
@ -4629,8 +4640,6 @@ while (ptr < ptrend)
*parsed_pattern++ = META_KET; *parsed_pattern++ = META_KET;
} }
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
else top_nest--; else top_nest--;
} }
@ -4895,7 +4904,7 @@ range. */
if ((options & PCRE2_CASELESS) != 0) if ((options & PCRE2_CASELESS) != 0)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if ((options & PCRE2_UTF) != 0) if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
{ {
int rc; int rc;
uint32_t oc, od; uint32_t oc, od;
@ -5310,7 +5319,8 @@ dynamically as we process the pattern. */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0; BOOL utf = (options & PCRE2_UTF) != 0;
#else /* No UTF support */ BOOL ucp = (options & PCRE2_UCP) != 0;
#else /* No Unicode support */
BOOL utf = FALSE; BOOL utf = FALSE;
#endif #endif
@ -5555,12 +5565,12 @@ for (;; pptr++)
zerofirstcu = firstcu; zerofirstcu = firstcu;
zerofirstcuflags = firstcuflags; zerofirstcuflags = firstcuflags;
/* For caseless UTF mode, check whether this character has more than /* For caseless UTF or UCP mode, check whether this character has more
one other case. If so, generate a special OP_NOTPROP item instead of than one other case. If so, generate a special OP_NOTPROP item instead of
OP_NOTI. */ OP_NOTI. */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_CASELESS) != 0 && if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
(d = UCD_CASESET(c)) != 0) (d = UCD_CASESET(c)) != 0)
{ {
*code++ = OP_NOTPROP; *code++ = OP_NOTPROP;
@ -5593,7 +5603,7 @@ for (;; pptr++)
uint32_t d; uint32_t d;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && c > 127) d = UCD_OTHERCASE(c); else if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
#endif #endif
{ {
#if PCRE2_CODE_UNIT_WIDTH != 8 #if PCRE2_CODE_UNIT_WIDTH != 8
@ -6667,23 +6677,11 @@ for (;; pptr++)
} }
/* For a back reference, update the back reference map and the /* For a back reference, update the back reference map and the
maximum back reference. Then, for each group, we must check to maximum back reference. */
see if it is recursive, that is, it is inside the group that it
references. A flag is set so that the group can be made atomic.
*/
cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
if (groupnumber > cb->top_backref) if (groupnumber > cb->top_backref)
cb->top_backref = groupnumber; cb->top_backref = groupnumber;
for (oc = cb->open_caps; oc != NULL; oc = oc->next)
{
if (oc->number == groupnumber)
{
oc->flag = TRUE;
break;
}
}
} }
} }
@ -7077,15 +7075,18 @@ for (;; pptr++)
previous[GET(previous, 1)] != OP_ALT) previous[GET(previous, 1)] != OP_ALT)
goto END_REPEAT; goto END_REPEAT;
/* There is no sense in actually repeating assertions. The only /* Perl allows all assertions to be quantified, and when they contain
potential use of repetition is in cases when the assertion is optional. capturing parentheses and/or are optional there are potential uses for
Therefore, if the minimum is greater than zero, just ignore the repeat. this feature. PCRE2 used to force the maximum quantifier to 1 on the
If the maximum is not zero or one, set it to 1. */ invalid grounds that further repetition was never useful. This was
always a bit pointless, since an assertion could be wrapped with a
repeated group to achieve the effect. General repetition is now
permitted, but if the maximum is unlimited it is set to one more than
the minimum. */
if (op_previous < OP_ONCE) /* Assertion */ if (op_previous < OP_ONCE) /* Assertion */
{ {
if (repeat_min > 0) goto END_REPEAT; if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
if (repeat_max > 1) repeat_max = 1;
} }
/* The case of a zero minimum is special because of the need to stick /* The case of a zero minimum is special because of the need to stick
@ -7678,19 +7679,6 @@ for (;; pptr++)
cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1; cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
if (meta_arg > cb->top_backref) cb->top_backref = meta_arg; if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
/* Check to see if this back reference is recursive, that it, it
is inside the group that it references. A flag is set so that the
group can be made atomic. */
for (oc = cb->open_caps; oc != NULL; oc = oc->next)
{
if (oc->number == meta_arg)
{
oc->flag = TRUE;
break;
}
}
break; break;
@ -7836,11 +7824,12 @@ for (;; pptr++)
NORMAL_CHAR_SET: /* Character is already in meta */ NORMAL_CHAR_SET: /* Character is already in meta */
matched_char = TRUE; matched_char = TRUE;
/* For caseless UTF mode, check whether this character has more than one /* For caseless UTF or UCP mode, check whether this character has more than
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
*/
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_CASELESS) != 0) if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
{ {
uint32_t caseset = UCD_CASESET(meta); uint32_t caseset = UCD_CASESET(meta);
if (caseset != 0) if (caseset != 0)
@ -8049,7 +8038,6 @@ if (*code == OP_CBRA)
capnumber = GET2(code, 1 + LINK_SIZE); capnumber = GET2(code, 1 + LINK_SIZE);
capitem.number = capnumber; capitem.number = capnumber;
capitem.next = cb->open_caps; capitem.next = cb->open_caps;
capitem.flag = FALSE;
capitem.assert_depth = cb->assert_depth; capitem.assert_depth = cb->assert_depth;
cb->open_caps = &capitem; cb->open_caps = &capitem;
} }
@ -8178,26 +8166,9 @@ for (;;)
PUT(code, 1, (int)(code - start_bracket)); PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE; code += 1 + LINK_SIZE;
/* If it was a capturing subpattern, check to see if it contained any /* If it was a capturing subpattern, remove the block from the chain. */
recursive back references. If so, we must wrap it in atomic brackets. In
any event, remove the block from the chain. */
if (capnumber > 0) if (capnumber > 0) cb->open_caps = cb->open_caps->next;
{
if (cb->open_caps->flag)
{
(void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
CU2BYTES(code - start_bracket));
*start_bracket = OP_ONCE;
code += 1 + LINK_SIZE;
PUT(start_bracket, 1, (int)(code - start_bracket));
*code = OP_KET;
PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE;
length += 2 + 2*LINK_SIZE;
}
cb->open_caps = cb->open_caps->next;
}
/* Set values to pass back */ /* Set values to pass back */
@ -8832,9 +8803,10 @@ memset(slot + IMM2_SIZE + length, 0,
/* This function is called to skip parts of the parsed pattern when finding the /* This function is called to skip parts of the parsed pattern when finding the
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
the end of the branch, it is called to skip over an internal lookaround, and it the end of the branch, it is called to skip over an internal lookaround or
is also called to skip to the end of a class, during which it will never (DEFINE) group, and it is also called to skip to the end of a class, during
encounter nested groups (but there's no need to have special code for that). which it will never encounter nested groups (but there's no need to have
special code for that).
When called to find the end of a branch or group, pptr must point to the first When called to find the end of a branch or group, pptr must point to the first
meta code inside the branch, not the branch-starting code. In other cases it meta code inside the branch, not the branch-starting code. In other cases it
@ -9312,14 +9284,21 @@ for (;; pptr++)
itemlength = grouplength; itemlength = grouplength;
break; break;
/* Check nested groups - advance past the initial data for each type and /* A (DEFINE) group is never obeyed inline and so it does not contribute to
then seek a fixed length with get_grouplength(). */ the length of this branch. Skip from the following item to the next
unpaired ket. */
case META_COND_DEFINE:
pptr = parsed_skip(pptr + 1, PSKIP_KET);
break;
/* Check other nested groups - advance past the initial data for each type
and then seek a fixed length with get_grouplength(). */
case META_COND_NAME: case META_COND_NAME:
case META_COND_NUMBER: case META_COND_NUMBER:
case META_COND_RNAME: case META_COND_RNAME:
case META_COND_RNUMBER: case META_COND_RNUMBER:
case META_COND_DEFINE:
pptr += 2 + SIZEOFFSET; pptr += 2 + SIZEOFFSET;
goto CHECK_GROUP; goto CHECK_GROUP;
@ -9576,6 +9555,10 @@ for (; *pptr != META_END; pptr++)
break; break;
case META_COND_DEFINE: case META_COND_DEFINE:
pptr += SIZEOFFSET;
nestlevel++;
break;
case META_COND_NAME: case META_COND_NAME:
case META_COND_NUMBER: case META_COND_NUMBER:
case META_COND_RNAME: case META_COND_RNAME:
@ -9656,6 +9639,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
{ {
BOOL utf; /* Set TRUE for UTF mode */ BOOL utf; /* Set TRUE for UTF mode */
BOOL ucp; /* Set TRUE for UCP mode */
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
pcre2_real_code *re = NULL; /* What we will return */ pcre2_real_code *re = NULL; /* What we will return */
@ -9943,8 +9927,8 @@ if (utf)
/* Check UCP lockout. */ /* Check UCP lockout. */
if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == ucp = (cb.external_options & PCRE2_UCP) != 0;
(PCRE2_UCP|PCRE2_NEVER_UCP)) if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
{ {
errorcode = ERR75; errorcode = ERR75;
goto HAD_EARLY_ERROR; goto HAD_EARLY_ERROR;
@ -10154,6 +10138,7 @@ write to the last 8 bytes of the structure before setting the fields. */
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8); memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
re->memctl = ccontext->memctl; re->memctl = ccontext->memctl;
re->tables = tables; re->tables = tables;
re->executable_jit = NULL;
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t)); memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
re->blocksize = re_blocksize; re->blocksize = re_blocksize;
re->magic_number = MAGIC_NUMBER; re->magic_number = MAGIC_NUMBER;
@ -10319,7 +10304,7 @@ function call. */
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
{ {
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
} }
/* Failed to compile, or error while post-processing. */ /* Failed to compile, or error while post-processing. */
@ -10367,21 +10352,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((firstcuflags & REQ_CASELESS) != 0) if ((firstcuflags & REQ_CASELESS) != 0)
{ {
if (firstcu < 128 || (!utf && firstcu < 255)) if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
{ {
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
} }
/* The first code unit is > 128 in UTF mode, or > 255 otherwise. In /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
8-bit UTF mode, codepoints in the range 128-255 are introductory code In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
points and cannot have another case. In 16-bit and 32-bit modes, we can points and cannot have another case, but if UCP is set they may do. */
check wide characters when UTF (and therefore UCP) is supported. */
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
else if (firstcu <= MAX_UTF_CODE_POINT && #if PCRE2_CODE_UNIT_WIDTH == 8
else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#else
else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(firstcu) != firstcu) UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS; re->flags |= PCRE2_FIRSTCASELESS;
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
@ -10430,14 +10419,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((reqcuflags & REQ_CASELESS) != 0) if ((reqcuflags & REQ_CASELESS) != 0)
{ {
if (reqcu < 128 || (!utf && reqcu < 255)) if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
{ {
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
} }
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) #if PCRE2_CODE_UNIT_WIDTH == 8
re->flags |= PCRE2_LASTCASELESS; else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#else
else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
} }

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2017 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -43,7 +43,8 @@ POSSIBILITY OF SUCH DAMAGE.
#endif #endif
/* Save the configured link size, which is in bytes. In 16-bit and 32-bit modes /* Save the configured link size, which is in bytes. In 16-bit and 32-bit modes
its value gets changed by pcre2_internal.h to be in code units. */ its value gets changed by pcre2_intmodedep.h (included by pcre2_internal.h) to
be in code units. */
static int configured_link_size = LINK_SIZE; static int configured_link_size = LINK_SIZE;
@ -94,6 +95,7 @@ if (where == NULL) /* Requests a length */
case PCRE2_CONFIG_NEWLINE: case PCRE2_CONFIG_NEWLINE:
case PCRE2_CONFIG_PARENSLIMIT: case PCRE2_CONFIG_PARENSLIMIT:
case PCRE2_CONFIG_STACKRECURSE: /* Obsolete */ case PCRE2_CONFIG_STACKRECURSE: /* Obsolete */
case PCRE2_CONFIG_TABLES_LENGTH:
case PCRE2_CONFIG_UNICODE: case PCRE2_CONFIG_UNICODE:
return sizeof(uint32_t); return sizeof(uint32_t);
@ -191,6 +193,10 @@ switch (what)
*((uint32_t *)where) = 0; *((uint32_t *)where) = 0;
break; break;
case PCRE2_CONFIG_TABLES_LENGTH:
*((uint32_t *)where) = TABLES_LENGTH;
break;
case PCRE2_CONFIG_UNICODE_VERSION: case PCRE2_CONFIG_UNICODE_VERSION:
{ {
#if defined SUPPORT_UNICODE #if defined SUPPORT_UNICODE

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0; BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
#else #else
BOOL utf = FALSE; BOOL utf = FALSE;
#endif #endif
@ -2190,7 +2191,7 @@ for (;;)
if (clen == 0) break; if (clen == 0) break;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf_or_ucp)
{ {
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{ {
@ -2204,7 +2205,7 @@ for (;;)
} }
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* Not UTF mode */ /* Not UTF or UCP mode */
{ {
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
{ ADD_NEW(state_offset + 2, 0); } { ADD_NEW(state_offset + 2, 0); }
@ -2339,7 +2340,7 @@ for (;;)
{ {
uint32_t otherd; uint32_t otherd;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2374,7 +2375,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2417,7 +2418,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2458,7 +2459,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2491,7 +2492,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -2531,7 +2532,7 @@ for (;;)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && d >= 128) if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d); otherd = UCD_OTHERCASE(d);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{ {
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
if (utf && first_cu > 127) #if PCRE2_CODE_UNIT_WIDTH == 8
if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#else
if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
else else
@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0)
{ {
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); #if PCRE2_CODE_UNIT_WIDTH == 8
if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#else
if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -76,6 +76,17 @@ typedef int BOOL;
#include <valgrind/memcheck.h> #include <valgrind/memcheck.h>
#endif #endif
/* -ftrivial-auto-var-init supports initializing all local variables
to avoid some classes of bug, but this can cause an unacceptable slowdown
for large on-stack arrays in hot functions. This macro lets us annotate
such arrays. */
#ifdef HAVE_ATTRIBUTE_UNINITIALIZED
#define PCRE2_KEEP_UNINITIALIZED __attribute__((uninitialized))
#else
#define PCRE2_KEEP_UNINITIALIZED
#endif
/* Older versions of MSVC lack snprintf(). This define allows for /* Older versions of MSVC lack snprintf(). This define allows for
warning/error-free compilation and testing with MSVC compilers back to at least warning/error-free compilation and testing with MSVC compilers back to at least
MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */ MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
@ -579,7 +590,7 @@ total length of the tables. */
#define fcc_offset 256 /* Flip case */ #define fcc_offset 256 /* Flip case */
#define cbits_offset 512 /* Character classes */ #define cbits_offset 512 /* Character classes */
#define ctypes_offset (cbits_offset + cbit_length) /* Character types */ #define ctypes_offset (cbits_offset + cbit_length) /* Character types */
#define tables_length (ctypes_offset + 256) #define TABLES_LENGTH (ctypes_offset + 256)
/* -------------------- Character and string names ------------------------ */ /* -------------------- Character and string names ------------------------ */
@ -1759,13 +1770,11 @@ typedef struct pcre2_memctl {
/* Structure for building a chain of open capturing subpatterns during /* Structure for building a chain of open capturing subpatterns during
compiling, so that instructions to close them can be compiled when (*ACCEPT) is compiling, so that instructions to close them can be compiled when (*ACCEPT) is
encountered. This is also used to identify subpatterns that contain recursive encountered. */
back references to themselves, so that they can be made atomic. */
typedef struct open_capitem { typedef struct open_capitem {
struct open_capitem *next; /* Chain link */ struct open_capitem *next; /* Chain link */
uint16_t number; /* Capture number */ uint16_t number; /* Capture number */
uint16_t flag; /* Set TRUE if recursive back ref */
uint16_t assert_depth; /* Assertion depth when opened */ uint16_t assert_depth; /* Assertion depth when opened */
} open_capitem; } open_capitem;
@ -1954,7 +1963,7 @@ is available. */
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_) #define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_) #define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, extern int _pcre2_auto_possessify(PCRE2_UCHAR *,
const compile_block *); const compile_block *);
extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *, extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
int *, uint32_t, uint32_t, BOOL, compile_block *); int *, uint32_t, uint32_t, BOOL, compile_block *);

View file

@ -618,6 +618,7 @@ here.) */
typedef struct pcre2_real_code { typedef struct pcre2_real_code {
pcre2_memctl memctl; /* Memory control fields */ pcre2_memctl memctl; /* Memory control fields */
const uint8_t *tables; /* The character tables */ const uint8_t *tables; /* The character tables */
void *executable_jit; /* Pointer to JIT code */
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */ uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */ CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */
uint32_t magic_number; /* Paranoid and endianness check */ uint32_t magic_number; /* Paranoid and endianness check */

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -41,10 +41,11 @@ POSSIBILITY OF SUCH DAMAGE.
/* This module contains the external function pcre2_maketables(), which builds /* This module contains the external function pcre2_maketables(), which builds
character tables for PCRE2 in the current locale. The file is compiled on its character tables for PCRE2 in the current locale. The file is compiled on its
own as part of the PCRE2 library. However, it is also included in the own as part of the PCRE2 library. It is also included in the compilation of
compilation of dftables.c, in which case the macro DFTABLES is defined. */ pcre2_dftables.c as a freestanding program, in which case the macro
PCRE2_DFTABLES is defined. */
#ifndef DFTABLES #ifndef PCRE2_DFTABLES /* Compiling the library */
# ifdef HAVE_CONFIG_H # ifdef HAVE_CONFIG_H
# include "config.h" # include "config.h"
# endif # endif
@ -61,28 +62,29 @@ compilation of dftables.c, in which case the macro DFTABLES is defined. */
a pointer to them. They are built using the ctype functions, and consequently a pointer to them. They are built using the ctype functions, and consequently
their contents will depend upon the current locale setting. When compiled as their contents will depend upon the current locale setting. When compiled as
part of the library, the store is obtained via a general context malloc, if part of the library, the store is obtained via a general context malloc, if
supplied, but when DFTABLES is defined (when compiling the dftables auxiliary supplied, but when PCRE2_DFTABLES is defined (when compiling the pcre2_dftables
program) malloc() is used, and the function has a different name so as not to freestanding auxiliary program) malloc() is used, and the function has a
clash with the prototype in pcre2.h. different name so as not to clash with the prototype in pcre2.h.
Arguments: none when DFTABLES is defined Arguments: none when PCRE2_DFTABLES is defined
else a PCRE2 general context or NULL else a PCRE2 general context or NULL
Returns: pointer to the contiguous block of data Returns: pointer to the contiguous block of data
else NULL if memory allocation failed
*/ */
#ifdef DFTABLES /* Included in freestanding dftables.c program */ #ifdef PCRE2_DFTABLES /* Included in freestanding pcre2_dftables program */
static const uint8_t *maketables(void) static const uint8_t *maketables(void)
{ {
uint8_t *yield = (uint8_t *)malloc(tables_length); uint8_t *yield = (uint8_t *)malloc(TABLES_LENGTH);
#else /* Not DFTABLES, compiling the library */ #else /* Not PCRE2_DFTABLES, that is, compiling the library */
PCRE2_EXP_DEFN const uint8_t * PCRE2_CALL_CONVENTION PCRE2_EXP_DEFN const uint8_t * PCRE2_CALL_CONVENTION
pcre2_maketables(pcre2_general_context *gcontext) pcre2_maketables(pcre2_general_context *gcontext)
{ {
uint8_t *yield = (uint8_t *)((gcontext != NULL)? uint8_t *yield = (uint8_t *)((gcontext != NULL)?
gcontext->memctl.malloc(tables_length, gcontext->memctl.memory_data) : gcontext->memctl.malloc(TABLES_LENGTH, gcontext->memctl.memory_data) :
malloc(tables_length)); malloc(TABLES_LENGTH));
#endif /* DFTABLES */ #endif /* PCRE2_DFTABLES */
int i; int i;
uint8_t *p; uint8_t *p;
@ -103,8 +105,8 @@ exclusive ones - in some locales things may be different.
Note that the table for "space" includes everything "isspace" gives, including Note that the table for "space" includes everything "isspace" gives, including
VT in the default locale. This makes it work for the POSIX class [:space:]. VT in the default locale. This makes it work for the POSIX class [:space:].
From release 8.34 is is also correct for Perl space, because Perl added VT at From PCRE1 release 8.34 and for all PCRE2 releases it is also correct for Perl
release 5.18. space, because Perl added VT at release 5.18.
Note also that it is possible for a character to be alnum or alpha without Note also that it is possible for a character to be alnum or alpha without
being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the
@ -114,24 +116,24 @@ test for alnum specially. */
memset(p, 0, cbit_length); memset(p, 0, cbit_length);
for (i = 0; i < 256; i++) for (i = 0; i < 256; i++)
{ {
if (isdigit(i)) p[cbit_digit + i/8] |= 1u << (i&7); if (isdigit(i)) p[cbit_digit + i/8] |= 1u << (i&7);
if (isupper(i)) p[cbit_upper + i/8] |= 1u << (i&7); if (isupper(i)) p[cbit_upper + i/8] |= 1u << (i&7);
if (islower(i)) p[cbit_lower + i/8] |= 1u << (i&7); if (islower(i)) p[cbit_lower + i/8] |= 1u << (i&7);
if (isalnum(i)) p[cbit_word + i/8] |= 1u << (i&7); if (isalnum(i)) p[cbit_word + i/8] |= 1u << (i&7);
if (i == '_') p[cbit_word + i/8] |= 1u << (i&7); if (i == '_') p[cbit_word + i/8] |= 1u << (i&7);
if (isspace(i)) p[cbit_space + i/8] |= 1u << (i&7); if (isspace(i)) p[cbit_space + i/8] |= 1u << (i&7);
if (isxdigit(i))p[cbit_xdigit + i/8] |= 1u << (i&7); if (isxdigit(i)) p[cbit_xdigit + i/8] |= 1u << (i&7);
if (isgraph(i)) p[cbit_graph + i/8] |= 1u << (i&7); if (isgraph(i)) p[cbit_graph + i/8] |= 1u << (i&7);
if (isprint(i)) p[cbit_print + i/8] |= 1u << (i&7); if (isprint(i)) p[cbit_print + i/8] |= 1u << (i&7);
if (ispunct(i)) p[cbit_punct + i/8] |= 1u << (i&7); if (ispunct(i)) p[cbit_punct + i/8] |= 1u << (i&7);
if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1u << (i&7); if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1u << (i&7);
} }
p += cbit_length; p += cbit_length;
/* Finally, the character type table. In this, we used to exclude VT from the /* Finally, the character type table. In this, we used to exclude VT from the
white space chars, because Perl didn't recognize it as such for \s and for white space chars, because Perl didn't recognize it as such for \s and for
comments within regexes. However, Perl changed at release 5.18, so PCRE changed comments within regexes. However, Perl changed at release 5.18, so PCRE1
at release 8.34. */ changed at release 8.34 and it's always been this way for PCRE2. */
for (i = 0; i < 256; i++) for (i = 0; i < 256; i++)
{ {
@ -147,7 +149,7 @@ for (i = 0; i < 256; i++)
return yield; return yield;
} }
#ifndef DFTABLES #ifndef PCRE2_DFTABLES /* Compiling the library */
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables) pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables)
{ {

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2015-2019 University of Cambridge New API code Copyright (c) 2015-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -381,8 +381,12 @@ length = Fovector[offset+1] - Fovector[offset];
if (caseless) if (caseless)
{ {
#if defined SUPPORT_UNICODE #if defined SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UTF) != 0) BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
if (utf || (mb->poptions & PCRE2_UCP) != 0)
{ {
PCRE2_SPTR endptr = p + length;
/* Match characters up to the end of the reference. NOTE: the number of /* Match characters up to the end of the reference. NOTE: the number of
code units matched may differ, because in UTF-8 there are some characters code units matched may differ, because in UTF-8 there are some characters
whose upper and lower case codes have different numbers of bytes. For whose upper and lower case codes have different numbers of bytes. For
@ -390,16 +394,25 @@ if (caseless)
bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
sequence of two of the latter. It is important, therefore, to check the sequence of two of the latter. It is important, therefore, to check the
length along the reference, not along the subject (earlier code did this length along the reference, not along the subject (earlier code did this
wrong). */ wrong). UCP without UTF uses Unicode properties but not UTF encoding. */
PCRE2_SPTR endptr = p + length;
while (p < endptr) while (p < endptr)
{ {
uint32_t c, d; uint32_t c, d;
const ucd_record *ur; const ucd_record *ur;
if (eptr >= mb->end_subject) return 1; /* Partial match */ if (eptr >= mb->end_subject) return 1; /* Partial match */
GETCHARINC(c, eptr);
GETCHARINC(d, p); if (utf)
{
GETCHARINC(c, eptr);
GETCHARINC(d, p);
}
else
{
c = *eptr++;
d = *p++;
}
ur = GET_UCD(d); ur = GET_UCD(d);
if (c != d && c != (uint32_t)((int)d + ur->other_case)) if (c != d && c != (uint32_t)((int)d + ur->other_case))
{ {
@ -415,7 +428,7 @@ if (caseless)
else else
#endif #endif
/* Not in UTF mode */ /* Not in UTF or UCP mode */
{ {
for (; length > 0; length--) for (; length > 0; length--)
{ {
@ -432,7 +445,8 @@ if (caseless)
} }
/* In the caseful case, we can just compare the code units, whether or not we /* In the caseful case, we can just compare the code units, whether or not we
are in UTF mode. When partial matching, we have to do this unit-by-unit. */ are in UTF and/or UCP mode. When partial matching, we have to do this unit by
unit. */
else else
{ {
@ -574,8 +588,8 @@ match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
heapframe *F; /* Current frame pointer */ heapframe *F; /* Current frame pointer */
heapframe *N = NULL; /* Temporary frame pointers */ heapframe *N = NULL; /* Temporary frame pointers */
heapframe *P = NULL; heapframe *P = NULL;
heapframe *assert_accept_frame; /* For passing back the frame with captures */ heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
/* Local variables that do not need to be preserved over calls to RRMATCH(). */ /* Local variables that do not need to be preserved over calls to RRMATCH(). */
@ -598,12 +612,13 @@ BOOL condition; /* Used in conditional groups */
BOOL cur_is_word; /* Used in "word" tests */ BOOL cur_is_word; /* Used in "word" tests */
BOOL prev_is_word; /* Used in "word" tests */ BOOL prev_is_word; /* Used in "word" tests */
/* UTF flag */ /* UTF and UCP flags */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0; BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
#else #else
BOOL utf = FALSE; BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
#endif #endif
/* This is the length of the last part of a backtracking frame that must be /* This is the length of the last part of a backtracking frame that must be
@ -928,6 +943,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
} }
else else
#endif #endif
/* Not UTF mode */ /* Not UTF mode */
{ {
if (mb->end_subject - Feptr < 1) if (mb->end_subject - Feptr < 1)
@ -987,10 +1003,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
} }
} }
/* If UCP is set without UTF we must do the same as above, but with one
character per code unit. */
else if (ucp)
{
uint32_t cc = UCHAR21(Feptr);
fc = Fecode[1];
if (fc < 128)
{
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
}
else
{
if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
Feptr++;
Fecode += 2;
}
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* Not UTF mode; use the table for characters < 256. */ /* Not UTF or UCP mode; use the table for characters < 256. */
{ {
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@ -1010,6 +1046,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL(); SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH); RRETURN(MATCH_NOMATCH);
} }
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf)
{ {
@ -1026,15 +1063,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (ch > 127) if (ch > 127)
ch = UCD_OTHERCASE(ch); ch = UCD_OTHERCASE(ch);
else else
ch = TABLE_GET(ch, mb->fcc, ch); ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH); if (ch == fc) RRETURN(MATCH_NOMATCH);
} }
} }
/* UCP without UTF is as above, but with one character per code unit. */
else if (ucp)
{
uint32_t ch;
fc = UCHAR21INC(Feptr);
ch = Fecode[1];
Fecode += 2;
if (ch == fc)
{
RRETURN(MATCH_NOMATCH); /* Caseful match */
}
else if (Fop == OP_NOTI) /* If caseless */
{
if (ch > 127)
ch = UCD_OTHERCASE(ch);
else
ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH);
}
}
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* Neither UTF nor UCP is set */
{ {
uint32_t ch = Fecode[1]; uint32_t ch = Fecode[1];
fc = *Feptr++; fc = UCHAR21INC(Feptr);
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
RRETURN(MATCH_NOMATCH); RRETURN(MATCH_NOMATCH);
Fecode += 2; Fecode += 2;
@ -1244,7 +1308,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* When not in UTF mode, load a single-code-unit character. Then proceed as /* When not in UTF mode, load a single-code-unit character. Then proceed as
above. */ above, using Unicode casing if either UTF or UCP is set. */
Lc = *Fecode++; Lc = *Fecode++;
@ -1253,11 +1317,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_STARI) if (Fop >= OP_STARI)
{ {
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
/* Lc must be < 128 in UTF-8 mode. */ #ifdef SUPPORT_UNICODE
if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
/* Lc will be < 128 in UTF-8 mode. */
Loc = mb->fcc[Lc]; Loc = mb->fcc[Lc];
#else /* 16-bit & 32-bit */ #else /* 16-bit & 32-bit */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
Loc = TABLE_GET(Lc, mb->fcc, Lc); Loc = TABLE_GET(Lc, mb->fcc, Lc);
@ -1490,7 +1558,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_NOTSTARI) /* Caseless */ if (Fop >= OP_NOTSTARI) /* Caseless */
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && Lc > 127) if ((utf || ucp) && Lc > 127)
Loc = UCD_OTHERCASE(Lc); Loc = UCD_OTHERCASE(Lc);
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -6045,7 +6113,6 @@ BOOL firstline;
BOOL has_first_cu = FALSE; BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE; BOOL has_req_cu = FALSE;
BOOL startline; BOOL startline;
BOOL utf;
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE; BOOL memchr_not_found_first_cu = FALSE;
@ -6069,13 +6136,19 @@ PCRE2_SPTR match_partial;
BOOL use_jit; BOOL use_jit;
#endif #endif
/* This flag is needed even when Unicode is not supported for convenience
(it is used by the IS_NEWLINE macro). */
BOOL utf = FALSE;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL ucp = FALSE;
BOOL allow_invalid; BOOL allow_invalid;
uint32_t fragment_options = 0; uint32_t fragment_options = 0;
#ifdef SUPPORT_JIT #ifdef SUPPORT_JIT
BOOL jit_checked_utf = FALSE; BOOL jit_checked_utf = FALSE;
#endif #endif
#endif #endif /* SUPPORT_UNICODE */
PCRE2_SIZE frame_size; PCRE2_SIZE frame_size;
@ -6091,7 +6164,8 @@ proves to be too small, it is replaced by a larger one on the heap. To get a
vector of the size required that is aligned for pointers, allocate it as a vector of the size required that is aligned for pointers, allocate it as a
vector of pointers. */ vector of pointers. */
PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]; PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]
PCRE2_KEEP_UNINITIALIZED;
mb->stack_frames = (heapframe *)stack_frames_vector; mb->stack_frames = (heapframe *)stack_frames_vector;
/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
@ -6147,12 +6221,13 @@ use_jit = (re->executable_jit != NULL &&
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
#endif #endif
/* Initialize UTF parameters. */ /* Initialize UTF/UCP parameters. */
utf = (re->overall_options & PCRE2_UTF) != 0;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
utf = (re->overall_options & PCRE2_UTF) != 0;
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
#endif ucp = (re->overall_options & PCRE2_UCP) != 0;
#endif /* SUPPORT_UNICODE */
/* Convert the partial matching flags into an integer. */ /* Convert the partial matching flags into an integer. */
@ -6589,9 +6664,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{ {
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); #if PCRE2_CODE_UNIT_WIDTH == 8
if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
#else
if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
else else
@ -6607,9 +6686,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0)
{ {
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 #ifdef SUPPORT_UNICODE
if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); #if PCRE2_CODE_UNIT_WIDTH == 8
if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
#else
if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
#endif #endif
#endif /* SUPPORT_UNICODE */
} }
} }
@ -6756,15 +6839,16 @@ for(;;)
#endif #endif
} }
/* If we can't find the required code unit, having reached the true end /* If we can't find the required first code unit, having reached the
of the subject, break the bumpalong loop, to force a match failure, true end of the subject, break the bumpalong loop, to force a match
except when doing partial matching, when we let the next cycle run at failure, except when doing partial matching, when we let the next cycle
the end of the subject. To see why, consider the pattern /(?<=abc)def/, run at the end of the subject. To see why, consider the pattern
which partially matches "abc", even though the string does not contain /(?<=abc)def/, which partially matches "abc", even though the string
the starting character "d". If we have not reached the true end of the does not contain the starting character "d". If we have not reached the
subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
we also let the cycle run, because the matching string is legitimately temporarily modified) we also let the cycle run, because the matching
allowed to start with the first code unit of a newline. */ string is legitimately allowed to start with the first code unit of a
newline. */
if (mb->partial == 0 && start_match >= mb->end_subject) if (mb->partial == 0 && start_match >= mb->end_subject)
{ {

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2018 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -90,7 +90,7 @@ if (codes == NULL || serialized_bytes == NULL || serialized_size == NULL)
if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA; if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA;
/* Compute total size. */ /* Compute total size. */
total_size = sizeof(pcre2_serialized_data) + tables_length; total_size = sizeof(pcre2_serialized_data) + TABLES_LENGTH;
tables = NULL; tables = NULL;
for (i = 0; i < number_of_codes; i++) for (i = 0; i < number_of_codes; i++)
@ -121,8 +121,8 @@ data->number_of_codes = number_of_codes;
/* Copy all compiled code data. */ /* Copy all compiled code data. */
dst_bytes = bytes + sizeof(pcre2_serialized_data); dst_bytes = bytes + sizeof(pcre2_serialized_data);
memcpy(dst_bytes, tables, tables_length); memcpy(dst_bytes, tables, TABLES_LENGTH);
dst_bytes += tables_length; dst_bytes += TABLES_LENGTH;
for (i = 0; i < number_of_codes; i++) for (i = 0; i < number_of_codes; i++)
{ {
@ -142,7 +142,9 @@ for (i = 0; i < number_of_codes; i++)
(void)memset(dst_bytes + offsetof(pcre2_real_code, memctl), 0, (void)memset(dst_bytes + offsetof(pcre2_real_code, memctl), 0,
sizeof(pcre2_memctl)); sizeof(pcre2_memctl));
(void)memset(dst_bytes + offsetof(pcre2_real_code, tables), 0, (void)memset(dst_bytes + offsetof(pcre2_real_code, tables), 0,
sizeof(void *)); sizeof(void *));
(void)memset(dst_bytes + offsetof(pcre2_real_code, executable_jit), 0,
sizeof(void *));
dst_bytes += re->blocksize; dst_bytes += re->blocksize;
} }
@ -187,12 +189,12 @@ src_bytes = bytes + sizeof(pcre2_serialized_data);
/* Decode tables. The reference count for the tables is stored immediately /* Decode tables. The reference count for the tables is stored immediately
following them. */ following them. */
tables = memctl->malloc(tables_length + sizeof(PCRE2_SIZE), memctl->memory_data); tables = memctl->malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), memctl->memory_data);
if (tables == NULL) return PCRE2_ERROR_NOMEMORY; if (tables == NULL) return PCRE2_ERROR_NOMEMORY;
memcpy(tables, src_bytes, tables_length); memcpy(tables, src_bytes, TABLES_LENGTH);
*(PCRE2_SIZE *)(tables + tables_length) = number_of_codes; *(PCRE2_SIZE *)(tables + TABLES_LENGTH) = number_of_codes;
src_bytes += tables_length; src_bytes += TABLES_LENGTH;
/* Decode the byte stream. We must not try to read the size from the compiled /* Decode the byte stream. We must not try to read the size from the compiled
code block in the stream, because it might be unaligned, which causes errors on code block in the stream, because it might be unaligned, which causes errors on
@ -238,6 +240,7 @@ for (i = 0; i < number_of_codes; i++)
/* At the moment only one table is supported. */ /* At the moment only one table is supported. */
dst_re->tables = tables; dst_re->tables = tables;
dst_re->executable_jit = NULL;
dst_re->flags |= PCRE2_DEREF_TABLES; dst_re->flags |= PCRE2_DEREF_TABLES;
codes[i] = dst_re; codes[i] = dst_re;

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -58,7 +58,7 @@ collecting data (e.g. minimum matching length). */
/* Returns from set_start_bits() */ /* Returns from set_start_bits() */
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
/************************************************* /*************************************************
@ -772,15 +772,19 @@ Arguments:
p points to the first code unit of the character p points to the first code unit of the character
caseless TRUE if caseless caseless TRUE if caseless
utf TRUE for UTF mode utf TRUE for UTF mode
ucp TRUE for UCP mode
Returns: pointer after the character Returns: pointer after the character
*/ */
static PCRE2_SPTR static PCRE2_SPTR
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf) set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
BOOL ucp)
{ {
uint32_t c = *p++; /* First code unit */ uint32_t c = *p++; /* First code unit */
(void)utf; /* Stop compiler warning when UTF not supported */
(void)utf; /* Stop compiler warnings when UTF not supported */
(void)ucp;
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
0xff. */ 0xff. */
@ -810,22 +814,26 @@ if (utf)
if (caseless) if (caseless)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf || ucp)
{ {
c = UCD_OTHERCASE(c);
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
PCRE2_UCHAR buff[6]; if (utf)
c = UCD_OTHERCASE(c); {
(void)PRIV(ord2utf)(c, buff); PCRE2_UCHAR buff[6];
SET_BIT(buff[0]); (void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
else if (c < 256) SET_BIT(c);
#else /* 16-bit or 32-bit mode */ #else /* 16-bit or 32-bit mode */
c = UCD_OTHERCASE(c);
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif #endif
} }
else else
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
/* Not UTF */ /* Not UTF or UCP */
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]); if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
} }
@ -924,19 +932,26 @@ The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
must continue at the outer level to find at least one mandatory code unit. At must continue at the outer level to find at least one mandatory code unit. At
the outermost level, this function fails unless the result is SSB_DONE. the outermost level, this function fails unless the result is SSB_DONE.
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
issues.
Arguments: Arguments:
re points to the compiled regex block re points to the compiled regex block
code points to an expression code points to an expression
utf TRUE if in UTF mode utf TRUE if in UTF mode
ucp TRUE if in UCP mode
depthptr pointer to recurse depth
Returns: SSB_FAIL => Failed to find any starting code units Returns: SSB_FAIL => Failed to find any starting code units
SSB_DONE => Found mandatory starting code units SSB_DONE => Found mandatory starting code units
SSB_CONTINUE => Found optional starting code units SSB_CONTINUE => Found optional starting code units
SSB_UNKNOWN => Hit an unrecognized opcode SSB_UNKNOWN => Hit an unrecognized opcode
SSB_TOODEEP => Recursion is too deep
*/ */
static int static int
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf) set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
int *depthptr)
{ {
uint32_t c; uint32_t c;
int yield = SSB_DONE; int yield = SSB_DONE;
@ -947,6 +962,9 @@ int table_limit = utf? 16:32;
int table_limit = 32; int table_limit = 32;
#endif #endif
*depthptr += 1;
if (*depthptr > 1000) return SSB_TOODEEP;
do do
{ {
BOOL try_next = TRUE; BOOL try_next = TRUE;
@ -1103,13 +1121,17 @@ do
case OP_SCRIPT_RUN: case OP_SCRIPT_RUN:
case OP_ASSERT: case OP_ASSERT:
case OP_ASSERT_NA: case OP_ASSERT_NA:
rc = set_start_bits(re, tcode, utf); rc = set_start_bits(re, tcode, utf, ucp, depthptr);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; if (rc == SSB_DONE)
if (rc == SSB_DONE) try_next = FALSE; else {
try_next = FALSE;
}
else if (rc == SSB_CONTINUE)
{ {
do tcode += GET(tcode, 1); while (*tcode == OP_ALT); do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE; tcode += 1 + LINK_SIZE;
} }
else return rc; /* FAIL, UNKNOWN, or TOODEEP */
break; break;
/* If we hit ALT or KET, it means we haven't found anything mandatory in /* If we hit ALT or KET, it means we haven't found anything mandatory in
@ -1155,8 +1177,8 @@ do
case OP_BRAZERO: case OP_BRAZERO:
case OP_BRAMINZERO: case OP_BRAMINZERO:
case OP_BRAPOSZERO: case OP_BRAPOSZERO:
rc = set_start_bits(re, ++tcode, utf); rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
do tcode += GET(tcode,1); while (*tcode == OP_ALT); do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE; tcode += 1 + LINK_SIZE;
break; break;
@ -1177,7 +1199,7 @@ do
case OP_QUERY: case OP_QUERY:
case OP_MINQUERY: case OP_MINQUERY:
case OP_POSQUERY: case OP_POSQUERY:
tcode = set_table_bit(re, tcode + 1, FALSE, utf); tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
break; break;
case OP_STARI: case OP_STARI:
@ -1186,7 +1208,7 @@ do
case OP_QUERYI: case OP_QUERYI:
case OP_MINQUERYI: case OP_MINQUERYI:
case OP_POSQUERYI: case OP_POSQUERYI:
tcode = set_table_bit(re, tcode + 1, TRUE, utf); tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
break; break;
/* Single-char upto sets the bit and tries the next */ /* Single-char upto sets the bit and tries the next */
@ -1194,13 +1216,13 @@ do
case OP_UPTO: case OP_UPTO:
case OP_MINUPTO: case OP_MINUPTO:
case OP_POSUPTO: case OP_POSUPTO:
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf); tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
break; break;
case OP_UPTOI: case OP_UPTOI:
case OP_MINUPTOI: case OP_MINUPTOI:
case OP_POSUPTOI: case OP_POSUPTOI:
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf); tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
break; break;
/* At least one single char sets the bit and stops */ /* At least one single char sets the bit and stops */
@ -1212,7 +1234,7 @@ do
case OP_PLUS: case OP_PLUS:
case OP_MINPLUS: case OP_MINPLUS:
case OP_POSPLUS: case OP_POSPLUS:
(void)set_table_bit(re, tcode + 1, FALSE, utf); (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
try_next = FALSE; try_next = FALSE;
break; break;
@ -1223,7 +1245,7 @@ do
case OP_PLUSI: case OP_PLUSI:
case OP_MINPLUSI: case OP_MINPLUSI:
case OP_POSPLUSI: case OP_POSPLUSI:
(void)set_table_bit(re, tcode + 1, TRUE, utf); (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
try_next = FALSE; try_next = FALSE;
break; break;
@ -1652,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
int count = 0; int count = 0;
PCRE2_UCHAR *code; PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0; BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
/* Find start of compiled code */ /* Find start of compiled code */
@ -1664,7 +1687,8 @@ code units. */
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{ {
int rc = set_start_bits(re, code, utf); int depth = 0;
int rc = set_start_bits(re, code, utf, ucp, &depth);
if (rc == SSB_UNKNOWN) return 1; if (rc == SSB_UNKNOWN) return 1;
/* If a list of starting code units was set up, scan the list to see if only /* If a list of starting code units was set up, scan the list to see if only
@ -1712,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
} }
/* c contains the code unit value, in the range 0-255. In 8-bit UTF /* c contains the code unit value, in the range 0-255. In 8-bit UTF
mode, only values < 128 can be used. */ mode, only values < 128 can be used. In all the other cases, c is a
character value. */
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
if (c > 127) goto DONE; if (utf && c > 127) goto DONE;
#endif #endif
if (a < 0) a = c; /* First one found */ if (a < 0) a = c; /* First one found, save in a */
else if (b < 0) /* Second one found */ else if (b < 0) /* Second one found */
{ {
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c); int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8 if (utf || ucp)
if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ {
#else /* 16-bit or 32-bit */ if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ if (c > 127) d = UCD_OTHERCASE(c);
if (utf && c > 127) d = UCD_OTHERCASE(c); }
#endif /* Code width */
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
if (d != a) goto DONE; /* Not other case of a */ if (d != a) goto DONE; /* Not the other case of a */
b = c; b = c; /* Save second in b */
} }
else goto DONE; /* More than two characters found */ else goto DONE; /* More than two characters found */
} }

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -49,8 +49,9 @@ POSSIBILITY OF SUCH DAMAGE.
#define SUBSTITUTE_OPTIONS \ #define SUBSTITUTE_OPTIONS \
(PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \ (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_UNKNOWN_UNSET| \ PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
PCRE2_SUBSTITUTE_UNSET_EMPTY) PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
@ -194,6 +195,7 @@ overflow, either give an error immediately, or keep on, accumulating the
length. */ length. */
#define CHECKMEMCPY(from,length) \ #define CHECKMEMCPY(from,length) \
{ \
if (!overflowed && lengthleft < length) \ if (!overflowed && lengthleft < length) \
{ \ { \
if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \ if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
@ -209,7 +211,8 @@ length. */
memcpy(buffer + buff_offset, from, CU2BYTES(length)); \ memcpy(buffer + buff_offset, from, CU2BYTES(length)); \
buff_offset += length; \ buff_offset += length; \
lengthleft -= length; \ lengthleft -= length; \
} } \
}
/* Here's the function */ /* Here's the function */
@ -226,11 +229,14 @@ int forcecasereset = 0;
uint32_t ovector_count; uint32_t ovector_count;
uint32_t goptions = 0; uint32_t goptions = 0;
uint32_t suboptions; uint32_t suboptions;
BOOL match_data_created = FALSE; pcre2_match_data *internal_match_data = NULL;
BOOL literal = FALSE; BOOL escaped_literal = FALSE;
BOOL overflowed = FALSE; BOOL overflowed = FALSE;
BOOL use_existing_match;
BOOL replacement_only;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
BOOL utf = (code->overall_options & PCRE2_UTF) != 0; BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
#endif #endif
PCRE2_UCHAR temp[6]; PCRE2_UCHAR temp[6];
PCRE2_SPTR ptr; PCRE2_SPTR ptr;
@ -248,23 +254,54 @@ lengthleft = buff_length = *blength;
*blength = PCRE2_UNSET; *blength = PCRE2_UNSET;
ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
/* Partial matching is not valid. This must come after setting *blength to /* Partial matching is not valid. This must come after setting *blength to
PCRE2_UNSET, so as not to imply an offset in the replacement. */ PCRE2_UNSET, so as not to imply an offset in the replacement. */
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
return PCRE2_ERROR_BADOPTION; return PCRE2_ERROR_BADOPTION;
/* If no match data block is provided, create one. */ /* Check for using a match that has already happened. Note that the subject
pointer in the match data may be NULL after a no-match. */
use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
/* If starting from an existing match, there must be an externally provided
match data block. We create an internal match_data block in two cases: (a) an
external one is not supplied (and we are not starting from an existing match);
(b) an existing match is to be used for the first substitution. In the latter
case, we copy the existing match into the internal block. This ensures that no
changes are made to the existing match data block. */
if (match_data == NULL) if (match_data == NULL)
{
pcre2_general_context *gcontext;
if (use_existing_match) return PCRE2_ERROR_NULL;
gcontext = (mcontext == NULL)?
(pcre2_general_context *)code :
(pcre2_general_context *)mcontext;
match_data = internal_match_data =
pcre2_match_data_create_from_pattern(code, gcontext);
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
}
else if (use_existing_match)
{ {
pcre2_general_context *gcontext = (mcontext == NULL)? pcre2_general_context *gcontext = (mcontext == NULL)?
(pcre2_general_context *)code : (pcre2_general_context *)code :
(pcre2_general_context *)mcontext; (pcre2_general_context *)mcontext;
match_data = pcre2_match_data_create_from_pattern(code, gcontext); int pairs = (code->top_bracket + 1 < match_data->oveccount)?
if (match_data == NULL) return PCRE2_ERROR_NOMEMORY; code->top_bracket + 1 : match_data->oveccount;
match_data_created = TRUE; internal_match_data = pcre2_match_data_create(match_data->oveccount,
gcontext);
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
+ 2*pairs*sizeof(PCRE2_SIZE));
match_data = internal_match_data;
} }
/* Remember ovector details */
ovector = pcre2_get_ovector_pointer(match_data); ovector = pcre2_get_ovector_pointer(match_data);
ovector_count = pcre2_get_ovector_count(match_data); ovector_count = pcre2_get_ovector_count(match_data);
@ -286,7 +323,7 @@ repend = replacement + rlength;
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{ {
rc = PRIV(valid_utf)(replacement, rlength, &(match_data->rightchar)); rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
if (rc != 0) if (rc != 0)
{ {
match_data->leftchar = 0; match_data->leftchar = 0;
@ -300,7 +337,7 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
suboptions = options & SUBSTITUTE_OPTIONS; suboptions = options & SUBSTITUTE_OPTIONS;
options &= ~SUBSTITUTE_OPTIONS; options &= ~SUBSTITUTE_OPTIONS;
/* Copy up to the start offset */ /* Error if the start match offset is greater than the length of the subject. */
if (start_offset > length) if (start_offset > length)
{ {
@ -308,9 +345,13 @@ if (start_offset > length)
rc = PCRE2_ERROR_BADOFFSET; rc = PCRE2_ERROR_BADOFFSET;
goto EXIT; goto EXIT;
} }
CHECKMEMCPY(subject, start_offset);
/* Loop for global substituting. */ /* Copy up to the start offset, unless only the replacement is required. */
if (!replacement_only) CHECKMEMCPY(subject, start_offset);
/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
match is taken from the match_data that was passed in. */
subs = 0; subs = 0;
do do
@ -318,7 +359,12 @@ do
PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
uint32_t ptrstackptr = 0; uint32_t ptrstackptr = 0;
rc = pcre2_match(code, subject, length, start_offset, options|goptions, if (use_existing_match)
{
rc = match_data->rc;
use_existing_match = FALSE;
}
else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
match_data, mcontext); match_data, mcontext);
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
@ -364,44 +410,44 @@ do
#endif #endif
} }
/* Copy what we have advanced past, reset the special global options, and /* Copy what we have advanced past (unless not required), reset the special
continue to the next match. */ global options, and continue to the next match. */
fraglength = start_offset - save_start; fraglength = start_offset - save_start;
CHECKMEMCPY(subject + save_start, fraglength); if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
goptions = 0; goptions = 0;
continue; continue;
} }
/* Handle a successful match. Matches that use \K to end before they start /* Handle a successful match. Matches that use \K to end before they start
or start before the current point in the subject are not supported. */ or start before the current point in the subject are not supported. */
if (ovector[1] < ovector[0] || ovector[0] < start_offset) if (ovector[1] < ovector[0] || ovector[0] < start_offset)
{ {
rc = PCRE2_ERROR_BADSUBSPATTERN; rc = PCRE2_ERROR_BADSUBSPATTERN;
goto EXIT; goto EXIT;
} }
/* Check for the same match as previous. This is legitimate after matching an /* Check for the same match as previous. This is legitimate after matching an
empty string that starts after the initial match offset. We have tried again empty string that starts after the initial match offset. We have tried again
at the match point in case the pattern is one like /(?<=\G.)/ which can never at the match point in case the pattern is one like /(?<=\G.)/ which can never
match at its starting point, so running the match achieves the bumpalong. If match at its starting point, so running the match achieves the bumpalong. If
we do get the same (null) match at the original match point, it isn't such a we do get the same (null) match at the original match point, it isn't such a
pattern, so we now do the empty string magic. In all other cases, a repeat pattern, so we now do the empty string magic. In all other cases, a repeat
match should never occur. */ match should never occur. */
if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
{ {
if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
{ {
goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
ovecsave[2] = start_offset; ovecsave[2] = start_offset;
continue; /* Back to the top of the loop */ continue; /* Back to the top of the loop */
} }
rc = PCRE2_ERROR_INTERNAL_DUPMATCH; rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
goto EXIT; goto EXIT;
} }
/* Count substitutions with a paranoid check for integer overflow; surely no /* Count substitutions with a paranoid check for integer overflow; surely no
real call to this function would ever hit this! */ real call to this function would ever hit this! */
@ -412,21 +458,30 @@ do
} }
subs++; subs++;
/* Copy the text leading up to the match, and remember where the insert /* Copy the text leading up to the match (unless not required), and remember
begins and how many ovector pairs are set. */ where the insert begins and how many ovector pairs are set. */
if (rc == 0) rc = ovector_count; if (rc == 0) rc = ovector_count;
fraglength = ovector[0] - start_offset; fraglength = ovector[0] - start_offset;
CHECKMEMCPY(subject + start_offset, fraglength); if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
scb.output_offsets[0] = buff_offset; scb.output_offsets[0] = buff_offset;
scb.oveccount = rc; scb.oveccount = rc;
/* Process the replacement string. Literal mode is set by \Q, but only in /* Process the replacement string. If the entire replacement is literal, just
extended mode when backslashes are being interpreted. In extended mode we copy it with length check. */
must handle nested substrings that are to be reprocessed. */
ptr = replacement; ptr = replacement;
for (;;) if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
{
CHECKMEMCPY(ptr, rlength);
}
/* Within a non-literal replacement, which must be scanned character by
character, local literal mode can be set by \Q, but only in extended mode
when backslashes are being interpreted. In extended mode we must handle
nested substrings that are to be reprocessed. */
else for (;;)
{ {
uint32_t ch; uint32_t ch;
unsigned int chlen; unsigned int chlen;
@ -443,11 +498,11 @@ do
/* Handle the next character */ /* Handle the next character */
if (literal) if (escaped_literal)
{ {
if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
{ {
literal = FALSE; escaped_literal = FALSE;
ptr += 2; ptr += 2;
continue; continue;
} }
@ -704,7 +759,7 @@ do
if (forcecase != 0) if (forcecase != 0)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf || ucp)
{ {
uint32_t type = UCD_CHARTYPE(ch); uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L && if (PRIV(ucp_gentype)[type] == ucp_L &&
@ -784,7 +839,7 @@ do
continue; continue;
case ESC_Q: case ESC_Q:
literal = TRUE; escaped_literal = TRUE;
continue; continue;
case 0: /* Data character */ case 0: /* Data character */
@ -806,7 +861,7 @@ do
if (forcecase != 0) if (forcecase != 0)
{ {
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf || ucp)
{ {
uint32_t type = UCD_CHARTYPE(ch); uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L && if (PRIV(ucp_gentype)[type] == ucp_L &&
@ -835,53 +890,59 @@ do
} /* End handling a literal code unit */ } /* End handling a literal code unit */
} /* End of loop for scanning the replacement. */ } /* End of loop for scanning the replacement. */
/* The replacement has been copied to the output, or its size has been /* The replacement has been copied to the output, or its size has been
remembered. Do the callout if there is one and we have done an actual remembered. Do the callout if there is one and we have done an actual
replacement. */ replacement. */
if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
{ {
scb.subscount = subs; scb.subscount = subs;
scb.output_offsets[1] = buff_offset; scb.output_offsets[1] = buff_offset;
rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
/* A non-zero return means cancel this substitution. Instead, copy the /* A non-zero return means cancel this substitution. Instead, copy the
matched string fragment. */ matched string fragment. */
if (rc != 0) if (rc != 0)
{ {
PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
PCRE2_SIZE oldlength = ovector[1] - ovector[0]; PCRE2_SIZE oldlength = ovector[1] - ovector[0];
buff_offset -= newlength; buff_offset -= newlength;
lengthleft += newlength; lengthleft += newlength;
CHECKMEMCPY(subject + ovector[0], oldlength); if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
/* A negative return means do not do any more. */ /* A negative return means do not do any more. */
if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
} }
} }
/* Save the details of this match. See above for how this data is used. If we /* Save the details of this match. See above for how this data is used. If we
matched an empty string, do the magic for global matches. Finally, update the matched an empty string, do the magic for global matches. Update the start
start offset to point to the rest of the subject string. */ offset to point to the rest of the subject string. If we re-used an existing
match for the first match, switch to the internal match data block. */
ovecsave[0] = ovector[0];
ovecsave[1] = ovector[1]; ovecsave[0] = ovector[0];
ovecsave[1] = ovector[1];
ovecsave[2] = start_offset; ovecsave[2] = start_offset;
goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
start_offset = ovector[1]; start_offset = ovector[1];
} while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
/* Copy the rest of the subject. */ /* Copy the rest of the subject unless not required, and terminate the output
with a binary zero. */
if (!replacement_only)
{
fraglength = length - start_offset;
CHECKMEMCPY(subject + start_offset, fraglength);
}
fraglength = length - start_offset;
CHECKMEMCPY(subject + start_offset, fraglength);
temp[0] = 0; temp[0] = 0;
CHECKMEMCPY(temp , 1); CHECKMEMCPY(temp, 1);
/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
and matching has carried on after a full buffer, in order to compute the length and matching has carried on after a full buffer, in order to compute the length
@ -903,7 +964,7 @@ else
} }
EXIT: EXIT:
if (match_data_created) pcre2_match_data_free(match_data); if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
else match_data->rc = rc; else match_data->rc = rc;
return rc; return rc;

View file

@ -265,6 +265,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Chakma0 STR_C STR_h STR_a STR_k STR_m STR_a "\0" #define STRING_Chakma0 STR_C STR_h STR_a STR_k STR_m STR_a "\0"
#define STRING_Cham0 STR_C STR_h STR_a STR_m "\0" #define STRING_Cham0 STR_C STR_h STR_a STR_m "\0"
#define STRING_Cherokee0 STR_C STR_h STR_e STR_r STR_o STR_k STR_e STR_e "\0" #define STRING_Cherokee0 STR_C STR_h STR_e STR_r STR_o STR_k STR_e STR_e "\0"
#define STRING_Chorasmian0 STR_C STR_h STR_o STR_r STR_a STR_s STR_m STR_i STR_a STR_n "\0"
#define STRING_Cn0 STR_C STR_n "\0" #define STRING_Cn0 STR_C STR_n "\0"
#define STRING_Co0 STR_C STR_o "\0" #define STRING_Co0 STR_C STR_o "\0"
#define STRING_Common0 STR_C STR_o STR_m STR_m STR_o STR_n "\0" #define STRING_Common0 STR_C STR_o STR_m STR_m STR_o STR_n "\0"
@ -275,6 +276,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Cyrillic0 STR_C STR_y STR_r STR_i STR_l STR_l STR_i STR_c "\0" #define STRING_Cyrillic0 STR_C STR_y STR_r STR_i STR_l STR_l STR_i STR_c "\0"
#define STRING_Deseret0 STR_D STR_e STR_s STR_e STR_r STR_e STR_t "\0" #define STRING_Deseret0 STR_D STR_e STR_s STR_e STR_r STR_e STR_t "\0"
#define STRING_Devanagari0 STR_D STR_e STR_v STR_a STR_n STR_a STR_g STR_a STR_r STR_i "\0" #define STRING_Devanagari0 STR_D STR_e STR_v STR_a STR_n STR_a STR_g STR_a STR_r STR_i "\0"
#define STRING_Dives_Akuru0 STR_D STR_i STR_v STR_e STR_s STR_UNDERSCORE STR_A STR_k STR_u STR_r STR_u "\0"
#define STRING_Dogra0 STR_D STR_o STR_g STR_r STR_a "\0" #define STRING_Dogra0 STR_D STR_o STR_g STR_r STR_a "\0"
#define STRING_Duployan0 STR_D STR_u STR_p STR_l STR_o STR_y STR_a STR_n "\0" #define STRING_Duployan0 STR_D STR_u STR_p STR_l STR_o STR_y STR_a STR_n "\0"
#define STRING_Egyptian_Hieroglyphs0 STR_E STR_g STR_y STR_p STR_t STR_i STR_a STR_n STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0" #define STRING_Egyptian_Hieroglyphs0 STR_E STR_g STR_y STR_p STR_t STR_i STR_a STR_n STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
@ -306,6 +308,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Katakana0 STR_K STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0" #define STRING_Katakana0 STR_K STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0"
#define STRING_Kayah_Li0 STR_K STR_a STR_y STR_a STR_h STR_UNDERSCORE STR_L STR_i "\0" #define STRING_Kayah_Li0 STR_K STR_a STR_y STR_a STR_h STR_UNDERSCORE STR_L STR_i "\0"
#define STRING_Kharoshthi0 STR_K STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0" #define STRING_Kharoshthi0 STR_K STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0"
#define STRING_Khitan_Small_Script0 STR_K STR_h STR_i STR_t STR_a STR_n STR_UNDERSCORE STR_S STR_m STR_a STR_l STR_l STR_UNDERSCORE STR_S STR_c STR_r STR_i STR_p STR_t "\0"
#define STRING_Khmer0 STR_K STR_h STR_m STR_e STR_r "\0" #define STRING_Khmer0 STR_K STR_h STR_m STR_e STR_r "\0"
#define STRING_Khojki0 STR_K STR_h STR_o STR_j STR_k STR_i "\0" #define STRING_Khojki0 STR_K STR_h STR_o STR_j STR_k STR_i "\0"
#define STRING_Khudawadi0 STR_K STR_h STR_u STR_d STR_a STR_w STR_a STR_d STR_i "\0" #define STRING_Khudawadi0 STR_K STR_h STR_u STR_d STR_a STR_w STR_a STR_d STR_i "\0"
@ -429,6 +432,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Xsp0 STR_X STR_s STR_p "\0" #define STRING_Xsp0 STR_X STR_s STR_p "\0"
#define STRING_Xuc0 STR_X STR_u STR_c "\0" #define STRING_Xuc0 STR_X STR_u STR_c "\0"
#define STRING_Xwd0 STR_X STR_w STR_d "\0" #define STRING_Xwd0 STR_X STR_w STR_d "\0"
#define STRING_Yezidi0 STR_Y STR_e STR_z STR_i STR_d STR_i "\0"
#define STRING_Yi0 STR_Y STR_i "\0" #define STRING_Yi0 STR_Y STR_i "\0"
#define STRING_Z0 STR_Z "\0" #define STRING_Z0 STR_Z "\0"
#define STRING_Zanabazar_Square0 STR_Z STR_a STR_n STR_a STR_b STR_a STR_z STR_a STR_r STR_UNDERSCORE STR_S STR_q STR_u STR_a STR_r STR_e "\0" #define STRING_Zanabazar_Square0 STR_Z STR_a STR_n STR_a STR_b STR_a STR_z STR_a STR_r STR_UNDERSCORE STR_S STR_q STR_u STR_a STR_r STR_e "\0"
@ -464,6 +468,7 @@ const char PRIV(utt_names)[] =
STRING_Chakma0 STRING_Chakma0
STRING_Cham0 STRING_Cham0
STRING_Cherokee0 STRING_Cherokee0
STRING_Chorasmian0
STRING_Cn0 STRING_Cn0
STRING_Co0 STRING_Co0
STRING_Common0 STRING_Common0
@ -474,6 +479,7 @@ const char PRIV(utt_names)[] =
STRING_Cyrillic0 STRING_Cyrillic0
STRING_Deseret0 STRING_Deseret0
STRING_Devanagari0 STRING_Devanagari0
STRING_Dives_Akuru0
STRING_Dogra0 STRING_Dogra0
STRING_Duployan0 STRING_Duployan0
STRING_Egyptian_Hieroglyphs0 STRING_Egyptian_Hieroglyphs0
@ -505,6 +511,7 @@ const char PRIV(utt_names)[] =
STRING_Katakana0 STRING_Katakana0
STRING_Kayah_Li0 STRING_Kayah_Li0
STRING_Kharoshthi0 STRING_Kharoshthi0
STRING_Khitan_Small_Script0
STRING_Khmer0 STRING_Khmer0
STRING_Khojki0 STRING_Khojki0
STRING_Khudawadi0 STRING_Khudawadi0
@ -628,6 +635,7 @@ const char PRIV(utt_names)[] =
STRING_Xsp0 STRING_Xsp0
STRING_Xuc0 STRING_Xuc0
STRING_Xwd0 STRING_Xwd0
STRING_Yezidi0
STRING_Yi0 STRING_Yi0
STRING_Z0 STRING_Z0
STRING_Zanabazar_Square0 STRING_Zanabazar_Square0
@ -663,176 +671,180 @@ const ucp_type_table PRIV(utt)[] = {
{ 203, PT_SC, ucp_Chakma }, { 203, PT_SC, ucp_Chakma },
{ 210, PT_SC, ucp_Cham }, { 210, PT_SC, ucp_Cham },
{ 215, PT_SC, ucp_Cherokee }, { 215, PT_SC, ucp_Cherokee },
{ 224, PT_PC, ucp_Cn }, { 224, PT_SC, ucp_Chorasmian },
{ 227, PT_PC, ucp_Co }, { 235, PT_PC, ucp_Cn },
{ 230, PT_SC, ucp_Common }, { 238, PT_PC, ucp_Co },
{ 237, PT_SC, ucp_Coptic }, { 241, PT_SC, ucp_Common },
{ 244, PT_PC, ucp_Cs }, { 248, PT_SC, ucp_Coptic },
{ 247, PT_SC, ucp_Cuneiform }, { 255, PT_PC, ucp_Cs },
{ 257, PT_SC, ucp_Cypriot }, { 258, PT_SC, ucp_Cuneiform },
{ 265, PT_SC, ucp_Cyrillic }, { 268, PT_SC, ucp_Cypriot },
{ 274, PT_SC, ucp_Deseret }, { 276, PT_SC, ucp_Cyrillic },
{ 282, PT_SC, ucp_Devanagari }, { 285, PT_SC, ucp_Deseret },
{ 293, PT_SC, ucp_Dogra }, { 293, PT_SC, ucp_Devanagari },
{ 299, PT_SC, ucp_Duployan }, { 304, PT_SC, ucp_Dives_Akuru },
{ 308, PT_SC, ucp_Egyptian_Hieroglyphs }, { 316, PT_SC, ucp_Dogra },
{ 329, PT_SC, ucp_Elbasan }, { 322, PT_SC, ucp_Duployan },
{ 337, PT_SC, ucp_Elymaic }, { 331, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 345, PT_SC, ucp_Ethiopic }, { 352, PT_SC, ucp_Elbasan },
{ 354, PT_SC, ucp_Georgian }, { 360, PT_SC, ucp_Elymaic },
{ 363, PT_SC, ucp_Glagolitic }, { 368, PT_SC, ucp_Ethiopic },
{ 374, PT_SC, ucp_Gothic }, { 377, PT_SC, ucp_Georgian },
{ 381, PT_SC, ucp_Grantha }, { 386, PT_SC, ucp_Glagolitic },
{ 389, PT_SC, ucp_Greek }, { 397, PT_SC, ucp_Gothic },
{ 395, PT_SC, ucp_Gujarati }, { 404, PT_SC, ucp_Grantha },
{ 404, PT_SC, ucp_Gunjala_Gondi }, { 412, PT_SC, ucp_Greek },
{ 418, PT_SC, ucp_Gurmukhi }, { 418, PT_SC, ucp_Gujarati },
{ 427, PT_SC, ucp_Han }, { 427, PT_SC, ucp_Gunjala_Gondi },
{ 431, PT_SC, ucp_Hangul }, { 441, PT_SC, ucp_Gurmukhi },
{ 438, PT_SC, ucp_Hanifi_Rohingya }, { 450, PT_SC, ucp_Han },
{ 454, PT_SC, ucp_Hanunoo }, { 454, PT_SC, ucp_Hangul },
{ 462, PT_SC, ucp_Hatran }, { 461, PT_SC, ucp_Hanifi_Rohingya },
{ 469, PT_SC, ucp_Hebrew }, { 477, PT_SC, ucp_Hanunoo },
{ 476, PT_SC, ucp_Hiragana }, { 485, PT_SC, ucp_Hatran },
{ 485, PT_SC, ucp_Imperial_Aramaic }, { 492, PT_SC, ucp_Hebrew },
{ 502, PT_SC, ucp_Inherited }, { 499, PT_SC, ucp_Hiragana },
{ 512, PT_SC, ucp_Inscriptional_Pahlavi }, { 508, PT_SC, ucp_Imperial_Aramaic },
{ 534, PT_SC, ucp_Inscriptional_Parthian }, { 525, PT_SC, ucp_Inherited },
{ 557, PT_SC, ucp_Javanese }, { 535, PT_SC, ucp_Inscriptional_Pahlavi },
{ 566, PT_SC, ucp_Kaithi }, { 557, PT_SC, ucp_Inscriptional_Parthian },
{ 573, PT_SC, ucp_Kannada }, { 580, PT_SC, ucp_Javanese },
{ 581, PT_SC, ucp_Katakana }, { 589, PT_SC, ucp_Kaithi },
{ 590, PT_SC, ucp_Kayah_Li }, { 596, PT_SC, ucp_Kannada },
{ 599, PT_SC, ucp_Kharoshthi }, { 604, PT_SC, ucp_Katakana },
{ 610, PT_SC, ucp_Khmer }, { 613, PT_SC, ucp_Kayah_Li },
{ 616, PT_SC, ucp_Khojki }, { 622, PT_SC, ucp_Kharoshthi },
{ 623, PT_SC, ucp_Khudawadi }, { 633, PT_SC, ucp_Khitan_Small_Script },
{ 633, PT_GC, ucp_L }, { 653, PT_SC, ucp_Khmer },
{ 635, PT_LAMP, 0 }, { 659, PT_SC, ucp_Khojki },
{ 638, PT_SC, ucp_Lao }, { 666, PT_SC, ucp_Khudawadi },
{ 642, PT_SC, ucp_Latin }, { 676, PT_GC, ucp_L },
{ 648, PT_SC, ucp_Lepcha }, { 678, PT_LAMP, 0 },
{ 655, PT_SC, ucp_Limbu }, { 681, PT_SC, ucp_Lao },
{ 661, PT_SC, ucp_Linear_A }, { 685, PT_SC, ucp_Latin },
{ 670, PT_SC, ucp_Linear_B }, { 691, PT_SC, ucp_Lepcha },
{ 679, PT_SC, ucp_Lisu }, { 698, PT_SC, ucp_Limbu },
{ 684, PT_PC, ucp_Ll }, { 704, PT_SC, ucp_Linear_A },
{ 687, PT_PC, ucp_Lm }, { 713, PT_SC, ucp_Linear_B },
{ 690, PT_PC, ucp_Lo }, { 722, PT_SC, ucp_Lisu },
{ 693, PT_PC, ucp_Lt }, { 727, PT_PC, ucp_Ll },
{ 696, PT_PC, ucp_Lu }, { 730, PT_PC, ucp_Lm },
{ 699, PT_SC, ucp_Lycian }, { 733, PT_PC, ucp_Lo },
{ 706, PT_SC, ucp_Lydian }, { 736, PT_PC, ucp_Lt },
{ 713, PT_GC, ucp_M }, { 739, PT_PC, ucp_Lu },
{ 715, PT_SC, ucp_Mahajani }, { 742, PT_SC, ucp_Lycian },
{ 724, PT_SC, ucp_Makasar }, { 749, PT_SC, ucp_Lydian },
{ 732, PT_SC, ucp_Malayalam }, { 756, PT_GC, ucp_M },
{ 742, PT_SC, ucp_Mandaic }, { 758, PT_SC, ucp_Mahajani },
{ 750, PT_SC, ucp_Manichaean }, { 767, PT_SC, ucp_Makasar },
{ 761, PT_SC, ucp_Marchen }, { 775, PT_SC, ucp_Malayalam },
{ 769, PT_SC, ucp_Masaram_Gondi }, { 785, PT_SC, ucp_Mandaic },
{ 783, PT_PC, ucp_Mc }, { 793, PT_SC, ucp_Manichaean },
{ 786, PT_PC, ucp_Me }, { 804, PT_SC, ucp_Marchen },
{ 789, PT_SC, ucp_Medefaidrin }, { 812, PT_SC, ucp_Masaram_Gondi },
{ 801, PT_SC, ucp_Meetei_Mayek }, { 826, PT_PC, ucp_Mc },
{ 814, PT_SC, ucp_Mende_Kikakui }, { 829, PT_PC, ucp_Me },
{ 828, PT_SC, ucp_Meroitic_Cursive }, { 832, PT_SC, ucp_Medefaidrin },
{ 845, PT_SC, ucp_Meroitic_Hieroglyphs }, { 844, PT_SC, ucp_Meetei_Mayek },
{ 866, PT_SC, ucp_Miao }, { 857, PT_SC, ucp_Mende_Kikakui },
{ 871, PT_PC, ucp_Mn }, { 871, PT_SC, ucp_Meroitic_Cursive },
{ 874, PT_SC, ucp_Modi }, { 888, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 879, PT_SC, ucp_Mongolian }, { 909, PT_SC, ucp_Miao },
{ 889, PT_SC, ucp_Mro }, { 914, PT_PC, ucp_Mn },
{ 893, PT_SC, ucp_Multani }, { 917, PT_SC, ucp_Modi },
{ 901, PT_SC, ucp_Myanmar }, { 922, PT_SC, ucp_Mongolian },
{ 909, PT_GC, ucp_N }, { 932, PT_SC, ucp_Mro },
{ 911, PT_SC, ucp_Nabataean }, { 936, PT_SC, ucp_Multani },
{ 921, PT_SC, ucp_Nandinagari }, { 944, PT_SC, ucp_Myanmar },
{ 933, PT_PC, ucp_Nd }, { 952, PT_GC, ucp_N },
{ 936, PT_SC, ucp_New_Tai_Lue }, { 954, PT_SC, ucp_Nabataean },
{ 948, PT_SC, ucp_Newa }, { 964, PT_SC, ucp_Nandinagari },
{ 953, PT_SC, ucp_Nko }, { 976, PT_PC, ucp_Nd },
{ 957, PT_PC, ucp_Nl }, { 979, PT_SC, ucp_New_Tai_Lue },
{ 960, PT_PC, ucp_No }, { 991, PT_SC, ucp_Newa },
{ 963, PT_SC, ucp_Nushu }, { 996, PT_SC, ucp_Nko },
{ 969, PT_SC, ucp_Nyiakeng_Puachue_Hmong }, { 1000, PT_PC, ucp_Nl },
{ 992, PT_SC, ucp_Ogham }, { 1003, PT_PC, ucp_No },
{ 998, PT_SC, ucp_Ol_Chiki }, { 1006, PT_SC, ucp_Nushu },
{ 1007, PT_SC, ucp_Old_Hungarian }, { 1012, PT_SC, ucp_Nyiakeng_Puachue_Hmong },
{ 1021, PT_SC, ucp_Old_Italic }, { 1035, PT_SC, ucp_Ogham },
{ 1032, PT_SC, ucp_Old_North_Arabian }, { 1041, PT_SC, ucp_Ol_Chiki },
{ 1050, PT_SC, ucp_Old_Permic }, { 1050, PT_SC, ucp_Old_Hungarian },
{ 1061, PT_SC, ucp_Old_Persian }, { 1064, PT_SC, ucp_Old_Italic },
{ 1073, PT_SC, ucp_Old_Sogdian }, { 1075, PT_SC, ucp_Old_North_Arabian },
{ 1085, PT_SC, ucp_Old_South_Arabian }, { 1093, PT_SC, ucp_Old_Permic },
{ 1103, PT_SC, ucp_Old_Turkic }, { 1104, PT_SC, ucp_Old_Persian },
{ 1114, PT_SC, ucp_Oriya }, { 1116, PT_SC, ucp_Old_Sogdian },
{ 1120, PT_SC, ucp_Osage }, { 1128, PT_SC, ucp_Old_South_Arabian },
{ 1126, PT_SC, ucp_Osmanya }, { 1146, PT_SC, ucp_Old_Turkic },
{ 1134, PT_GC, ucp_P }, { 1157, PT_SC, ucp_Oriya },
{ 1136, PT_SC, ucp_Pahawh_Hmong }, { 1163, PT_SC, ucp_Osage },
{ 1149, PT_SC, ucp_Palmyrene }, { 1169, PT_SC, ucp_Osmanya },
{ 1159, PT_SC, ucp_Pau_Cin_Hau }, { 1177, PT_GC, ucp_P },
{ 1171, PT_PC, ucp_Pc }, { 1179, PT_SC, ucp_Pahawh_Hmong },
{ 1174, PT_PC, ucp_Pd }, { 1192, PT_SC, ucp_Palmyrene },
{ 1177, PT_PC, ucp_Pe }, { 1202, PT_SC, ucp_Pau_Cin_Hau },
{ 1180, PT_PC, ucp_Pf }, { 1214, PT_PC, ucp_Pc },
{ 1183, PT_SC, ucp_Phags_Pa }, { 1217, PT_PC, ucp_Pd },
{ 1192, PT_SC, ucp_Phoenician }, { 1220, PT_PC, ucp_Pe },
{ 1203, PT_PC, ucp_Pi }, { 1223, PT_PC, ucp_Pf },
{ 1206, PT_PC, ucp_Po }, { 1226, PT_SC, ucp_Phags_Pa },
{ 1209, PT_PC, ucp_Ps }, { 1235, PT_SC, ucp_Phoenician },
{ 1212, PT_SC, ucp_Psalter_Pahlavi }, { 1246, PT_PC, ucp_Pi },
{ 1228, PT_SC, ucp_Rejang }, { 1249, PT_PC, ucp_Po },
{ 1235, PT_SC, ucp_Runic }, { 1252, PT_PC, ucp_Ps },
{ 1241, PT_GC, ucp_S }, { 1255, PT_SC, ucp_Psalter_Pahlavi },
{ 1243, PT_SC, ucp_Samaritan }, { 1271, PT_SC, ucp_Rejang },
{ 1253, PT_SC, ucp_Saurashtra }, { 1278, PT_SC, ucp_Runic },
{ 1264, PT_PC, ucp_Sc }, { 1284, PT_GC, ucp_S },
{ 1267, PT_SC, ucp_Sharada }, { 1286, PT_SC, ucp_Samaritan },
{ 1275, PT_SC, ucp_Shavian }, { 1296, PT_SC, ucp_Saurashtra },
{ 1283, PT_SC, ucp_Siddham }, { 1307, PT_PC, ucp_Sc },
{ 1291, PT_SC, ucp_SignWriting }, { 1310, PT_SC, ucp_Sharada },
{ 1303, PT_SC, ucp_Sinhala }, { 1318, PT_SC, ucp_Shavian },
{ 1311, PT_PC, ucp_Sk }, { 1326, PT_SC, ucp_Siddham },
{ 1314, PT_PC, ucp_Sm }, { 1334, PT_SC, ucp_SignWriting },
{ 1317, PT_PC, ucp_So }, { 1346, PT_SC, ucp_Sinhala },
{ 1320, PT_SC, ucp_Sogdian }, { 1354, PT_PC, ucp_Sk },
{ 1328, PT_SC, ucp_Sora_Sompeng }, { 1357, PT_PC, ucp_Sm },
{ 1341, PT_SC, ucp_Soyombo }, { 1360, PT_PC, ucp_So },
{ 1349, PT_SC, ucp_Sundanese }, { 1363, PT_SC, ucp_Sogdian },
{ 1359, PT_SC, ucp_Syloti_Nagri }, { 1371, PT_SC, ucp_Sora_Sompeng },
{ 1372, PT_SC, ucp_Syriac }, { 1384, PT_SC, ucp_Soyombo },
{ 1379, PT_SC, ucp_Tagalog }, { 1392, PT_SC, ucp_Sundanese },
{ 1387, PT_SC, ucp_Tagbanwa }, { 1402, PT_SC, ucp_Syloti_Nagri },
{ 1396, PT_SC, ucp_Tai_Le }, { 1415, PT_SC, ucp_Syriac },
{ 1403, PT_SC, ucp_Tai_Tham }, { 1422, PT_SC, ucp_Tagalog },
{ 1412, PT_SC, ucp_Tai_Viet }, { 1430, PT_SC, ucp_Tagbanwa },
{ 1421, PT_SC, ucp_Takri }, { 1439, PT_SC, ucp_Tai_Le },
{ 1427, PT_SC, ucp_Tamil }, { 1446, PT_SC, ucp_Tai_Tham },
{ 1433, PT_SC, ucp_Tangut }, { 1455, PT_SC, ucp_Tai_Viet },
{ 1440, PT_SC, ucp_Telugu }, { 1464, PT_SC, ucp_Takri },
{ 1447, PT_SC, ucp_Thaana }, { 1470, PT_SC, ucp_Tamil },
{ 1454, PT_SC, ucp_Thai }, { 1476, PT_SC, ucp_Tangut },
{ 1459, PT_SC, ucp_Tibetan }, { 1483, PT_SC, ucp_Telugu },
{ 1467, PT_SC, ucp_Tifinagh }, { 1490, PT_SC, ucp_Thaana },
{ 1476, PT_SC, ucp_Tirhuta }, { 1497, PT_SC, ucp_Thai },
{ 1484, PT_SC, ucp_Ugaritic }, { 1502, PT_SC, ucp_Tibetan },
{ 1493, PT_SC, ucp_Unknown }, { 1510, PT_SC, ucp_Tifinagh },
{ 1501, PT_SC, ucp_Vai }, { 1519, PT_SC, ucp_Tirhuta },
{ 1505, PT_SC, ucp_Wancho }, { 1527, PT_SC, ucp_Ugaritic },
{ 1512, PT_SC, ucp_Warang_Citi }, { 1536, PT_SC, ucp_Unknown },
{ 1524, PT_ALNUM, 0 }, { 1544, PT_SC, ucp_Vai },
{ 1528, PT_PXSPACE, 0 }, { 1548, PT_SC, ucp_Wancho },
{ 1532, PT_SPACE, 0 }, { 1555, PT_SC, ucp_Warang_Citi },
{ 1536, PT_UCNC, 0 }, { 1567, PT_ALNUM, 0 },
{ 1540, PT_WORD, 0 }, { 1571, PT_PXSPACE, 0 },
{ 1544, PT_SC, ucp_Yi }, { 1575, PT_SPACE, 0 },
{ 1547, PT_GC, ucp_Z }, { 1579, PT_UCNC, 0 },
{ 1549, PT_SC, ucp_Zanabazar_Square }, { 1583, PT_WORD, 0 },
{ 1566, PT_PC, ucp_Zl }, { 1587, PT_SC, ucp_Yezidi },
{ 1569, PT_PC, ucp_Zp }, { 1594, PT_SC, ucp_Yi },
{ 1572, PT_PC, ucp_Zs } { 1597, PT_GC, ucp_Z },
{ 1599, PT_SC, ucp_Zanabazar_Square },
{ 1616, PT_PC, ucp_Zl },
{ 1619, PT_PC, ucp_Zp },
{ 1622, PT_PC, ucp_Zs }
}; };
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

File diff suppressed because it is too large Load diff

View file

@ -286,7 +286,12 @@ enum {
ucp_Elymaic, ucp_Elymaic,
ucp_Nandinagari, ucp_Nandinagari,
ucp_Nyiakeng_Puachue_Hmong, ucp_Nyiakeng_Puachue_Hmong,
ucp_Wancho ucp_Wancho,
/* New for Unicode 13.0.0 */
ucp_Chorasmian,
ucp_Dives_Akuru,
ucp_Khitan_Small_Script,
ucp_Yezidi
}; };
#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */ #endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */

View file

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2017 University of Cambridge New API code Copyright (c) 2016-2020 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -347,7 +347,7 @@ for (p = string; length > 0; p++)
length--; length--;
if ((*p & 0xfc00) != 0xdc00) if ((*p & 0xfc00) != 0xdc00)
{ {
*erroroffset = p - string; *erroroffset = p - string - 1;
return PCRE2_ERROR_UTF16_ERR2; return PCRE2_ERROR_UTF16_ERR2;
} }
} }