From de5df8348ed36ec6059fa6546b2b57d08029fa71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Thu, 20 Apr 2023 08:03:45 -0700 Subject: [PATCH] no partial match if trailing data is invalid utf Avoid returning a partial match if one was found but followed by invalid UTF, making the result consistent with JIT and unlike: PCRE2 version 10.34 2019-11-21 re> /.a/match_invalid_utf,allvector,jit data> b\xb1\=ph,ovector=1 No match 0: data> b\xb1\=ph,ovector=1,no_jit Partial match: b\x{b1} ** ovector[1] is not equal to the subject length: 1 != 2 0: 0 1 --- src/pcre2_match.c | 1 + testdata/testinput10 | 19 +++++++++++++++++++ testdata/testinput12 | 14 ++++++++++++++ testdata/testoutput10 | 32 ++++++++++++++++++++++++++++++++ testdata/testoutput12-16 | 22 ++++++++++++++++++++++ testdata/testoutput12-32 | 22 ++++++++++++++++++++++ 6 files changed, 110 insertions(+) diff --git a/src/pcre2_match.c b/src/pcre2_match.c index ea98af3c3..afe3036b2 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -7454,6 +7454,7 @@ if (utf && end_subject != true_end_subject && if (start_match >= true_end_subject) { rc = MATCH_NOMATCH; /* In case it was partial */ + match_partial = NULL; break; } diff --git a/testdata/testinput10 b/testdata/testinput10 index 53e37cbca..27321e374 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -506,6 +506,25 @@ \= Expect no match ab\x80cdef\=ph +/.a/match_invalid_utf + ab\=ph + ab\=ps + b\xf0\x91\x88b\=ph + b\xf0\x91\x88b\=ps + b\xf0\x91\x88\xb4a +\= Expect no match + b\x80\=ph + b\x80\=ps + b\xf0\x91\x88\=ph + b\xf0\x91\x88\=ps + +/.a$/match_invalid_utf + ab\=ph + ab\=ps +\= Expect no match + b\xf0\x91\x98\=ph + b\xf0\x91\x98\=ps + /ab$/match_invalid_utf ab\x80cdeab \= Expect no match diff --git a/testdata/testinput12 b/testdata/testinput12 index 9b4f8d343..7a85eb578 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -413,6 +413,20 @@ \= Expect no match ab\x{df00}cdef\=ph +/.a/match_invalid_utf + ab\=ph + ab\=ps +\= Expect no match + b\x{df00}\=ph + b\x{df00}\=ps + +/.a$/match_invalid_utf + ab\=ph + ab\=ps +\= Expect no match + b\x{df00}\=ph + b\x{df00}\=ps + /ab$/match_invalid_utf ab\x{df00}cdeab \= Expect no match diff --git a/testdata/testoutput10 b/testdata/testoutput10 index d40851061..1f4c876bb 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1646,6 +1646,38 @@ Partial match: ab ab\x80cdef\=ph No match +/.a/match_invalid_utf + ab\=ph +Partial match: b + ab\=ps +Partial match: b + b\xf0\x91\x88b\=ph +Partial match: b + b\xf0\x91\x88b\=ps +Partial match: b + b\xf0\x91\x88\xb4a + 0: \x{11234}a +\= Expect no match + b\x80\=ph +No match + b\x80\=ps +No match + b\xf0\x91\x88\=ph +No match + b\xf0\x91\x88\=ps +No match + +/.a$/match_invalid_utf + ab\=ph +Partial match: b + ab\=ps +Partial match: b +\= Expect no match + b\xf0\x91\x98\=ph +No match + b\xf0\x91\x98\=ps +No match + /ab$/match_invalid_utf ab\x80cdeab 0: ab diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 84c485817..98676324b 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1522,6 +1522,28 @@ Partial match: ab ab\x{df00}cdef\=ph No match +/.a/match_invalid_utf + ab\=ph +Partial match: b + ab\=ps +Partial match: b +\= Expect no match + b\x{df00}\=ph +No match + b\x{df00}\=ps +No match + +/.a$/match_invalid_utf + ab\=ph +Partial match: b + ab\=ps +Partial match: b +\= Expect no match + b\x{df00}\=ph +No match + b\x{df00}\=ps +No match + /ab$/match_invalid_utf ab\x{df00}cdeab 0: ab diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index 03b6e3940..3a20dd4bd 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1520,6 +1520,28 @@ Partial match: ab ab\x{df00}cdef\=ph No match +/.a/match_invalid_utf + ab\=ph +Partial match: b + ab\=ps +Partial match: b +\= Expect no match + b\x{df00}\=ph +No match + b\x{df00}\=ps +No match + +/.a$/match_invalid_utf + ab\=ph +Partial match: b + ab\=ps +Partial match: b +\= Expect no match + b\x{df00}\=ph +No match + b\x{df00}\=ps +No match + /ab$/match_invalid_utf ab\x{df00}cdeab 0: ab