# Source: searxng/searx/regexp.py
# Mirror of https://github.com/searxng/searxng.git (synced 2024-11-22 20:17:45 +01:00)
# Commit 3581e1b85f by Markus Heiser, 2024-06-21 18:49:34 +02:00:
#   [POC] increase efficiency of reg-expressions
#   Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
# File size: 260 lines, 9.1 KiB, Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations for efficient processing of regular expressions"""
from __future__ import annotations
from typing import Iterator
import abc
import re
import warnings
import json
class RegExprList(abc.ABC):
    """Abstract base class for efficient processing of lists of regular
    expressions.

    Inheriting classes have to implement :py:obj:`RegExprList.load_regexps`,
    which loads the list of regular expressions (from a configuration, for
    example).

    Intention: by concatenating the regular expressions of the list into one
    regular expression (one *named group* per list item), all patterns can be
    tried with just one search; it is not necessary to iterate over the
    individual expressions and perform n searches.

    Since every expression is embedded as one alternative of a larger pattern,
    constructs that depend on their position in the pattern (numbered
    back-references, global inline flags such as ``(?i)``) do not behave as
    they would in a stand-alone pattern.
    """

    # Prefix of the named groups the individual expressions are wrapped in;
    # the full group name is ``<RE_GRP_PREFIX>_<index within the chunk>``.
    RE_GRP_PREFIX = "RegExprList"

    @abc.abstractmethod
    def load_regexps(self) -> list[str | tuple[str, tuple]]:
        """Abstract method to load the list of regular expressions from a
        configuration.

        Returns a list whose items are either a regular expression (str) or a
        two-element tuple with the regular expression on its first position
        and a tuple of *n objects* related to this regular expression on its
        second position:

        .. code:: python

            [
                ( <regexpr_a>, (obj_a1, obj_a2, ..) ),
                ( <regexpr_b>, (obj_b1, obj_b2, ..) ),
                <regexpr_c>,
                ..
            ]

        If there is nothing related to the regular expression, the tuple is
        empty (n=0).  The **objects** must be of a simple data type (str, int,
        ..) so that they can be serialized (JSON).
        """

    def __init__(self, chunk_size: int = 1000):
        """Create an (initially empty) list; patterns are loaded lazily.

        :param chunk_size: upper bound for the length (in characters) of one
            concatenated pattern.  A single regular expression that is longer
            than this bound gets a chunk of its own.
        """
        self.chunk_size = chunk_size
        # cache of the compiled chunks, filled on first access of ``chunks``
        self._chunks = None
        # JSON source of the list, only set by ``from_json``
        self._data_json = None

    def _get_data(self) -> list:
        """Return the raw list of regular expressions, either decoded from the
        JSON string given to :py:obj:`RegExprList.from_json` or loaded by
        :py:obj:`RegExprList.load_regexps`."""
        if self._data_json is not None:
            return json.loads(self._data_json)
        return self.load_regexps()

    @property
    def JSON(self) -> str:
        """JSON representation of the regular expression list (see
        :py:obj:`RegExprList.load_regexps`).

        Serialize the :py:obj:`RegExprList` object into a JSON string.  Note
        that JSON has no tuples: tuples are serialized as arrays.
        """
        if self._data_json is not None:
            return self._data_json
        return json.dumps(self._get_data(), sort_keys=True)

    @classmethod
    def from_json(cls, json_str: str) -> "RegExprList":
        """Build a :py:obj:`RegExprList` object and load regular expressions
        from a JSON string (compare :py:obj:`RegExprList.JSON`)."""
        obj = cls()
        obj._data_json = json_str
        return obj

    @property
    def chunks(self) -> list[tuple[re.Pattern, list[tuple]]]:
        """A list of (concatenated) regular expressions, built on first access
        by :py:obj:`RegExprList.get_chunks` and cached afterwards."""
        if self._chunks is None:
            self._chunks = self.get_chunks()
        return self._chunks

    @staticmethod
    def _split_item(item) -> tuple[str, tuple]:
        """Normalize one list item to ``(regexp_str, objects_tuple)``."""
        # A plain string is a regexp without related objects.  Testing the
        # type (and not the length) is essential: a two-character regexp is a
        # sequence of length two as well.
        if isinstance(item, str):
            return item, ()
        try:
            re_str, objs = item
        except (TypeError, ValueError) as exc:
            raise ValueError(f"invalid regular expression item: {item!r}") from exc
        # After a JSON round trip the related objects arrive as a list.
        return re_str, tuple(objs)

    def _named_group(self, index: int, re_str: str) -> str:
        """Wrap ``re_str`` into the named group for position ``index`` of a
        chunk."""
        return f"(?P<{self.RE_GRP_PREFIX}_{index}>{re_str})"

    def get_chunks(self) -> list[tuple[re.Pattern, list[tuple]]]:
        """Returns a list of chunk items.  A chunk item is a two-element tuple
        with the concatenated :py:obj:`re.Pattern` on its first position and a
        list of tuples (aka grp_tuples) on its second position.

        The regular expressions are placed in *named groups*; the number in
        the group name is the index of the expression in the grp_tuples of
        **its chunk** (the numbering restarts with every chunk).

        .. code:: re

           (?P<RegExprList_0>foo)|(?P<RegExprList_1>bar)

        .. code:: python

           >>> grp_tuples[0]
           ('foo', obj_foo_1, obj_foo_2, ...)
           >>> grp_tuples[1]
           ('bar', obj_bar_1, obj_bar_2, ...)
        """
        chunks = []
        alternatives: list[str] = []
        grp_tuples: list[tuple] = []
        size = 0  # length of "|".join(alternatives), plus one separator

        for item in self._get_data():
            re_str, objs = self._split_item(item)
            grp_re = self._named_group(len(grp_tuples), re_str)

            # Close the current chunk when the next group would not fit, but
            # never emit an empty chunk: an empty pattern matches everything.
            if alternatives and size + len(grp_re) + 1 > self.chunk_size:
                chunks.append((re.compile("|".join(alternatives)), grp_tuples))
                alternatives, grp_tuples, size = [], [], 0
                grp_re = self._named_group(0, re_str)

            alternatives.append(grp_re)
            grp_tuples.append((re_str,) + objs)
            size += len(grp_re) + 1

        # Are there any leftovers from the for loop?
        if alternatives:
            chunks.append((re.compile("|".join(alternatives)), grp_tuples))
        return chunks

    def _related_objects(self, match: re.Match, grp_tuples: list[tuple]) -> tuple | None:
        """Return the grp_tuple of the list item that produced ``match`` or
        ``None`` (after a warning) if it can not be determined."""
        prefix = f"{self.RE_GRP_PREFIX}_"
        # Each list item is one top-level alternative wrapped in a named
        # group.  This outer group is the last one that gets closed, so its
        # name is what ``lastgroup`` reports, even with nested groups.
        grp_name = match.lastgroup
        index = None
        if grp_name is not None and grp_name.startswith(prefix):
            suffix = grp_name[len(prefix):]
            if suffix.isdecimal():
                index = int(suffix)
        if index is None or index >= len(grp_tuples):
            # This case should never occur unless there is something wrong
            # with the regular expressions.
            warnings.warn(f"ignoring group '{grp_name}' in regexpr match {match}: check your regular expressions!")
            return None
        return grp_tuples[index]

    def search(self, string: str) -> tuple[re.Match, tuple] | None:
        """Search for regular expressions in ``string``.  If none of the
        regular expressions matches, ``None`` is returned.  If there is a
        match, the first match (:py:obj:`re.Match`) of the first matching
        chunk is returned along with a tuple of objects related to the matched
        pattern (compare :py:obj:`RegExprList`):

        .. code:: python

           ( re.Match, ( <regexpr_str>, obj_1, obj_2, ..) )
        """
        for regexp, grp_tuples in self.chunks:
            m = regexp.search(string)
            if m is None:
                continue
            related = self._related_objects(m, grp_tuples)
            if related is not None:
                return (m, related)
        return None

    def finditer(self, string: str) -> Iterator[tuple[re.Match, tuple]]:
        """Return an iterator yielding all *"non-overlapping"* matches for the
        RE patterns in ``string``.  Similar to :py:obj:`RegExprList.search`,
        each match (:py:obj:`re.Match`) comes along with a tuple of objects
        related to the matched pattern:

        .. code:: python

           ( re.Match, ( <regexpr_str>, obj_1, obj_2, ..) )

        The matches are yielded chunk by chunk.  Since the list of regular
        expressions is concatenated and also broken up at the boundaries of
        the chunks, it is not possible to ensure *"non-overlapping"* over the
        entirety of all regular expressions in the list!  Nevertheless, there
        will be scenarios where this iterator makes sense, e.g. if the regular
        expressions do not overlap.

        .. caution::

           Use this method with care if the :py:obj:`regular expressions in
           the list <RegExprList.load_regexps>` *overlap*, otherwise you get
           unexpected results!
        """
        for regexp, grp_tuples in self.chunks:
            for m in regexp.finditer(string):
                related = self._related_objects(m, grp_tuples)
                if related is not None:
                    yield (m, related)
##########################################################################
## some tests of the POC above
import pdb
def test_POC():
    """Small demonstration of the proof of concept.

    Builds a throw-away :py:obj:`RegExprList` subclass from a fixed list of
    patterns and prints every match ``finditer`` reports for a sample string.
    """
    sample = "aa.www.academyirmbr.com"

    # hint: the order of the list counts!
    patterns = [
        (r'aa', ("double 'a' don't overlaps with any other regular expressions",)),
        (r'a', ("single 'a' overlaps with all other regular expressions",)),
        r'(.*\.)?academiapublishing\.org$',
        r'(.*\.)?academiaresearch\.org$',
        r'(.*\.)?academiascholarlyjournal\.org$',
        r'(.*\.)?academicjournalsinc\.com$',
        r'(.*\.)?academicjournalsonline\.co\.in$',
        r'(.*\.)?academicjournals\.org$',
        r'(.*\.)?academicoasis\.org$',
        r'(.*\.)?academic-publishing-house\.com$',
        r'(.*\.)?academicpub\.org$',
        r'(.*\.)?academicresearchjournals\.org$',
        r'(.*\.)?academicstar\.us$',
        r'(.*\.)?academicsworld\.org$',
        r'(.*\.)?academicwebpublishers\.org$',
        r'(.*\.)?academievoorcontinuverbeteren\.nl$',
        (r'(.*\.)?academyirmbr\.com$', ("XX", "YYYY", 7, 8.2)),
        r'(.*\.)?academyjournals\.net$',
        r'(.*\.)?academyofideas\.com$',
        r'(.*\.)?academypublish\.org$',
    ]

    class DemoList(RegExprList):
        """Serves the fixed pattern list defined above."""

        def load_regexps(self) -> list[tuple[str, tuple]] | list[str]:
            return patterns

    demo = DemoList()
    print(f"matches in '{sample}' ...")
    for match, related in demo.finditer(sample):
        regexp_str = related[0]
        matched_text = match.group(0)
        print(f" regexp: {regexp_str} // match: {matched_text} // objects related to regexp: {related}")
# Run the proof-of-concept demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_POC()