about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Zoltan Herczeg <hzmester@freemail.hu> 2023-12-27 18:55:04 +0000
committer Zoltan Herczeg <hzmester@freemail.hu> 2023-12-27 19:59:38 +0000
commit 2bba84b2816046d7bcf1819e3712922345e4d7d3 (patch)
tree 3fdf32ab992ca2a8de5db3ba25da0e17a09de6f3
parent 542cb11242cfc9be9b6218965751bfbb13a8b6a2 (diff)
download pcre-2bba84b2816046d7bcf1819e3712922345e4d7d3.tar.gz
Optimize character category matching in JIT
-rw-r--r-- src/pcre2_jit_compile.c 92
-rw-r--r-- src/pcre2_jit_test.c 1
2 files changed, 60 insertions, 33 deletions
diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
index 021d4972..9b8bf615 100644
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@@ -6735,6 +6735,7 @@ JUMPTO(SLJIT_JUMP, mainloop);
#define UCPCAT_RANGE(start, end) (((1 << ((end) + 1)) - 1) - ((1 << (start)) - 1))
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
+#define UCPCAT_ALL ((1 << (ucp_Zs + 1)) - 1)
#endif
static void check_wordboundary(compiler_common *common, BOOL ucp)
@@ -7615,6 +7616,8 @@ BOOL utf = common->utf;
#ifdef SUPPORT_UNICODE
sljit_u32 unicode_status = 0;
+sljit_u32 category_list = 0;
+sljit_u32 items;
int typereg = TMP1;
const sljit_u32 *other_cases;
#endif /* SUPPORT_UNICODE */
@@ -7633,6 +7636,7 @@ if (cc[-1] & XCL_MAP)
while (*cc != XCL_END)
{
compares++;
+
if (*cc == XCL_SINGLE)
{
cc ++;
@@ -7659,6 +7663,7 @@ while (*cc != XCL_END)
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
+
if (*cc == PT_CLIST && cc[-1] == XCL_PROP)
{
other_cases = PRIV(ucd_caseless_sets) + cc[1];
@@ -7675,25 +7680,34 @@ while (*cc != XCL_END)
min = 0;
}
+ items = 0;
+
switch(*cc)
{
case PT_ANY:
/* Any either accepts everything or ignored. */
if (cc[-1] == XCL_PROP)
- {
- compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
- if (list == backtracks)
- add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
- return;
- }
+ items = UCPCAT_ALL;
break;
case PT_LAMP:
+ items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
+ break;
+
case PT_GC:
+ items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);
+ break;
+
case PT_PC:
+ items = UCPCAT(cc[1]);
+ break;
+
case PT_WORD:
+ items = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;
+ break;
+
case PT_ALNUM:
- unicode_status |= XCLASS_HAS_TYPE;
+ items = UCPCAT_L | UCPCAT_N;
break;
case PT_SCX:
@@ -7736,11 +7750,32 @@ while (*cc != XCL_END)
SLJIT_UNREACHABLE();
break;
}
+
+ if (items > 0)
+ {
+ if (cc[-1] == XCL_NOTPROP)
+ items ^= UCPCAT_ALL;
+ category_list |= items;
+ unicode_status |= XCLASS_HAS_TYPE;
+ compares--;
+ }
+
cc += 2;
}
#endif /* SUPPORT_UNICODE */
}
-SLJIT_ASSERT(compares > 0);
+SLJIT_ASSERT(compares > 0 || category_list > 0);
+
+#ifdef SUPPORT_UNICODE
+if (category_list == UCPCAT_ALL)
+ {
+ /* All characters are accepted, same as dotall. */
+ compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
+ if (list == backtracks)
+ add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
+ return;
+ }
+#endif /* SUPPORT_UNICODE */
/* We are not necessary in utf mode even in 8 bit mode. */
cc = ccbegin;
@@ -7841,6 +7876,9 @@ if (unicode_status & XCLASS_NEEDS_UCD)
ccbegin = cc;
+ if (category_list != 0)
+ compares++;
+
if (unicode_status & XCLASS_HAS_BIDICL)
{
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));
@@ -8045,8 +8083,16 @@ if (unicode_status & XCLASS_NEEDS_UCD)
if (unicode_status & XCLASS_SAVE_CHAR)
typereg = RETURN_ADDR;
- OP1(SLJIT_MOV_U8, typereg, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
- OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, typereg, 0);
+ OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
+ OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, TMP2, 0);
+
+ if (category_list > 0)
+ {
+ compares--;
+ invertcmp = (compares == 0 && list != backtracks);
+ OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, category_list);
+ add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
+ }
}
}
#endif /* SUPPORT_UNICODE */
@@ -8126,26 +8172,16 @@ while (*cc != XCL_END)
break;
case PT_LAMP:
- OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt));
- jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
- break;
-
case PT_GC:
- OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]));
- jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
- break;
-
case PT_PC:
- OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(cc[1]));
- jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
- break;
-
case PT_SC:
case PT_SCX:
case PT_BOOL:
case PT_BIDICL:
+ case PT_WORD:
+ case PT_ALNUM:
compares++;
- /* Do nothing. */
+ /* Already handled. */
break;
case PT_SPACE:
@@ -8165,16 +8201,6 @@ while (*cc != XCL_END)
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
break;
- case PT_WORD:
- OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N);
- jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
- break;
-
- case PT_ALNUM:
- OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_L | UCPCAT_N);
- jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
- break;
-
case PT_CLIST:
other_cases = PRIV(ucd_caseless_sets) + cc[1];
diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c
index 203057d6..94b9c3af 100644
--- a/src/pcre2_jit_test.c
+++ b/src/pcre2_jit_test.c
@@ -423,6 +423,7 @@ static struct regression_test_case regression_test_cases[] = {
{ CMUP, 0, 0, 0, "[^S]\\B", "\xe2\x80\x8a" },
{ MUP, 0, 0, 0 | F_NOMATCH, "[^[:print:]\\x{f6f6}]", "\xef\x9b\xb6" },
{ MUP, 0, 0, 0, "[[:xdigit:]\\x{6500}]#", "\xe6\x94\x80#" },
+ { MUP, 0, 0, 0, "[\\pC\\PC]#", "A#" },
/* Possible empty brackets. */
{ MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },