@@ -0,0 +1 @@ | |||
Move `glob_to_regex` and `re_word_boundary` to `matrix-python-common`. |
@@ -15,8 +15,9 @@ | |||
from typing import List | |||
from matrix_common.regex import glob_to_regex | |||
from synapse.types import JsonDict | |||
from synapse.util import glob_to_regex | |||
from ._base import Config, ConfigError | |||
@@ -16,11 +16,12 @@ import logging | |||
import os | |||
from typing import List, Optional, Pattern | |||
from matrix_common.regex import glob_to_regex | |||
from OpenSSL import SSL, crypto | |||
from twisted.internet._sslverify import Certificate, trustRootFromCertificates | |||
from synapse.config._base import Config, ConfigError | |||
from synapse.util import glob_to_regex | |||
logger = logging.getLogger(__name__) | |||
@@ -28,6 +28,7 @@ from typing import ( | |||
Union, | |||
) | |||
from matrix_common.regex import glob_to_regex | |||
from prometheus_client import Counter, Gauge, Histogram | |||
from twisted.internet import defer | |||
@@ -66,7 +67,7 @@ from synapse.replication.http.federation import ( | |||
) | |||
from synapse.storage.databases.main.lock import Lock | |||
from synapse.types import JsonDict, get_domain_from_id | |||
from synapse.util import glob_to_regex, json_decoder, unwrapFirstError | |||
from synapse.util import json_decoder, unwrapFirstError | |||
from synapse.util.async_helpers import Linearizer, concurrently_execute | |||
from synapse.util.caches.response_cache import ResponseCache | |||
from synapse.util.stringutils import parse_server_name | |||
@@ -17,9 +17,10 @@ import logging | |||
import re | |||
from typing import Any, Dict, List, Optional, Pattern, Tuple, Union | |||
from matrix_common.regex import glob_to_regex, to_word_pattern | |||
from synapse.events import EventBase | |||
from synapse.types import JsonDict, UserID | |||
from synapse.util import glob_to_regex, re_word_boundary | |||
from synapse.util.caches.lrucache import LruCache | |||
logger = logging.getLogger(__name__) | |||
@@ -184,7 +185,7 @@ class PushRuleEvaluatorForEvent: | |||
r = regex_cache.get((display_name, False, True), None) | |||
if not r: | |||
r1 = re.escape(display_name) | |||
r1 = re_word_boundary(r1) | |||
r1 = to_word_pattern(r1) | |||
r = re.compile(r1, flags=re.IGNORECASE) | |||
regex_cache[(display_name, False, True)] = r | |||
@@ -213,7 +214,7 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool: | |||
try: | |||
r = regex_cache.get((glob, True, word_boundary), None) | |||
if not r: | |||
r = glob_to_regex(glob, word_boundary) | |||
r = glob_to_regex(glob, word_boundary=word_boundary) | |||
regex_cache[(glob, True, word_boundary)] = r | |||
return bool(r.search(value)) | |||
except re.error: | |||
@@ -87,6 +87,7 @@ REQUIREMENTS = [ | |||
# with the latest security patches. | |||
"cryptography>=3.4.7", | |||
"ijson>=3.1", | |||
"matrix-common==1.0.0", | |||
] | |||
CONDITIONAL_REQUIREMENTS = { | |||
@@ -14,9 +14,8 @@ | |||
import json | |||
import logging | |||
import re | |||
import typing | |||
from typing import Any, Callable, Dict, Generator, Optional, Pattern | |||
from typing import Any, Callable, Dict, Generator, Optional | |||
import attr | |||
from frozendict import frozendict | |||
@@ -35,9 +34,6 @@ if typing.TYPE_CHECKING: | |||
logger = logging.getLogger(__name__) | |||
_WILDCARD_RUN = re.compile(r"([\?\*]+)") | |||
def _reject_invalid_json(val: Any) -> None: | |||
"""Do not allow Infinity, -Infinity, or NaN values in JSON.""" | |||
raise ValueError("Invalid JSON value: '%s'" % val) | |||
@@ -185,56 +181,3 @@ def log_failure( | |||
if not consumeErrors: | |||
return failure | |||
return None | |||
def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern:
    """Compile a glob into a case-insensitive regular expression.

    Args:
        glob: the glob pattern to convert
        word_boundary: when True, the expression is wrapped by
            `re_word_boundary` so it may match at word boundaries anywhere in
            the string. When False, the expression is anchored to both the
            start and the end of the string.

    Returns:
        the compiled regex pattern
    """
    # Each run of consecutive wildcards is collapsed into a single quantifier
    # to avoid performance cliffs:
    #  - the glob `?**?**?` means the same as the glob `???*`
    #  - the glob `???*` means the same as the regex `.{3,}`
    pieces = []
    for piece in _WILDCARD_RUN.split(glob):
        if _WILDCARD_RUN.match(piece):
            # A wildcard run: every `?` demands one character, and any `*`
            # lifts the upper bound.
            required = piece.count("?")
            template = ".{%d,}" if "*" in piece else ".{%d}"
            pieces.append(template % required)
        else:
            # Literal text: escape it so it is matched verbatim.
            pieces.append(re.escape(piece))

    body = "".join(pieces)

    if word_boundary:
        expression = re_word_boundary(body)
    else:
        # \A anchors at start of string, \Z at end of string
        expression = r"\A" + body + r"\Z"

    return re.compile(expression, re.IGNORECASE)
def re_word_boundary(r: str) -> str:
    """Wrap a regex source string so that it only matches as a whole word.

    The expression is surrounded by groups that accept either an edge of the
    string or a single non-word character, so expressions which themselves
    begin or end with non-word characters still behave sensibly.

    Args:
        r: regex source text to wrap

    Returns:
        the wrapped regex source text (not compiled)
    """
    # \b is deliberately avoided as it chokes on unicode; \W seems to be an
    # acceptable stand-in for "not a word character".
    return rf"(^|\W){r!s}(\W|$)"
@@ -1,59 +0,0 @@ | |||
# Copyright 2021 The Matrix.org Foundation C.I.C. | |||
# | |||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||
# you may not use this file except in compliance with the License. | |||
# You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
from synapse.util import glob_to_regex | |||
from tests.unittest import TestCase | |||
class GlobToRegexTestCase(TestCase):
    """Exercises `glob_to_regex` on literal, wildcard and multi-wildcard globs."""

    def _check_matches(self, regex, cases):
        """Apply `regex.match` to each case and assert the expected outcome.

        Args:
            regex: the compiled pattern under test
            cases: (text, should_match, failure message) tuples, checked in
                the order given
        """
        for text, should_match, reason in cases:
            verify = self.assertTrue if should_match else self.assertFalse
            verify(regex.match(text), reason)

    def test_literal_match(self):
        """A glob without wildcards matches the whole string, ignoring case."""
        regex = glob_to_regex("foobaz")
        self._check_matches(
            regex,
            [
                ("FoobaZ", True, "patterns should match and be case-insensitive"),
                ("x foobaz", False, "pattern should not match at word boundaries"),
            ],
        )

    def test_wildcard_match(self):
        """`?` stands for exactly one character and `*` for any number."""
        regex = glob_to_regex("f?o*baz")
        self._check_matches(
            regex,
            [
                (
                    "FoobarbaZ",
                    True,
                    "* should match string and pattern should be case-insensitive",
                ),
                ("foobaz", True, "* should match 0 characters"),
                ("fooxaz", False, "the character after * must match"),
                ("fobbaz", False, "? should not match 0 characters"),
                ("fiiobaz", False, "? should not match 2 characters"),
            ],
        )

    def test_multi_wildcard(self):
        """Consecutive wildcards are merged into one quantifier and still match."""
        regex = glob_to_regex("**baz")
        self._check_matches(
            regex,
            [
                ("agsgsbaz", True, "** should match any string"),
                ("baz", True, "** should match the empty string"),
            ],
        )
        self.assertEqual(regex.pattern, r"\A.{0,}baz\Z")

        regex = glob_to_regex("*?baz")
        self._check_matches(
            regex,
            [
                ("agsgsbaz", True, "*? should match any string"),
                ("abaz", True, "*? should match a single char"),
                ("baz", False, "*? should not match the empty string"),
            ],
        )
        self.assertEqual(regex.pattern, r"\A.{1,}baz\Z")

        regex = glob_to_regex("a?*?*?baz")
        self._check_matches(
            regex,
            [
                ("a g baz", True, "?*?*? should match 3 chars"),
                ("a..baz", False, "?*?*? should not match 2 chars"),
                ("a.gg.baz", True, "?*?*? should match 4 chars"),
            ],
        )
        self.assertEqual(regex.pattern, r"\Aa.{3,}baz\Z")