mirror of
https://github.com/searxng/searxng.git
synced 2024-11-21 19:51:44 +01:00
[POC] increase efficiency of reg-expressions
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
acf3f109b2
commit
3581e1b85f
259
searx/regexp.py
Normal file
259
searx/regexp.py
Normal file
@ -0,0 +1,259 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Implementations for efficient processing of regular expressions"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Iterator
|
||||
import abc
|
||||
import re
|
||||
import warnings
|
||||
import json
|
||||
|
||||
class RegExprList(abc.ABC):
    """Abstract base class for efficient processing of lists of regular
    expressions.  The inheriting classes have to implement the
    :py:obj:`RegExprList.load_regexps` method which is used to load the list
    of regular expressions from a configuration, for example.

    Intention: By concatenating the regular expressions from the list into one
    regular expression, all patterns can be tested with just one search and it
    is not necessary to iterate over the individual expressions and perform
    n-searches.

    The concatenated expression is split into *chunks* of roughly
    ``chunk_size`` characters, each chunk is compiled into one
    :py:obj:`re.Pattern`.

    .. note::

       Each regular expression is wrapped into a named group of the
       concatenated pattern.  Numbered back-references (``\\1``) inside a
       regular expression therefore do not refer to the group numbering of the
       stand-alone expression.
    """

    RE_GRP_PREFIX = "RegExprList"

    @abc.abstractmethod
    def load_regexps(self) -> list[tuple[str, tuple] | str]:
        """Abstract method to load the list of regular expressions from a
        configuration.  Returns a list of regular expressions (str) or a list
        of two-element tuples with a regular expression on its first position
        and a tuple of *n-objects* related to this regular expression on its
        second position (both forms can be mixed in one list):

        .. code:: python

           [
               ( <regexpr_a>, (obj_a1, obj_a2, ..) ),
               ( <regexpr_b>, (obj_b1, obj_b2, ..) ),
               <regexpr_c>,
               ..
           ]

        If there is nothing related to the regular expression, the tuple is
        empty (n=0).  The **objects** must be of a simple data type (str, int,
        ..) so that they can be serialized (JSON).
        """

    def __init__(self, chunk_size=1000):
        # Upper bound (in characters) used to decide when a new chunk of the
        # concatenated regular expression is started.
        self.chunk_size = chunk_size
        # Lazily built cache of compiled chunks, see ``chunks``.
        self._chunks = None
        # JSON string set by ``from_json``; when set it takes precedence over
        # ``load_regexps``.
        self._data_json = None

    def _get_data(self):
        """Return the raw list of regular expressions, either decoded from the
        JSON string given to :py:obj:`RegExprList.from_json` or loaded by
        :py:obj:`RegExprList.load_regexps`."""
        if self._data_json is not None:
            return json.loads(self._data_json)
        return self.load_regexps()

    @property
    def JSON(self):
        """JSON representation of the regular expression list (see
        :py:obj:`RegExprList.load_regexps`).

        Serialize the :py:obj:`RegExprList` object into a JSON string.
        """
        if self._data_json is not None:
            return self._data_json
        return json.dumps(self._get_data(), sort_keys=True)

    @classmethod
    def from_json(cls, json_str: str) -> "RegExprList":
        """Build a :py:obj:`RegExprList` object and load regular expressions
        from a JSON string (compare :py:obj:`RegExprList.JSON`)."""
        obj = cls()
        obj._data_json = json_str
        return obj

    @property
    def chunks(self) -> list[tuple[re.Pattern, list[tuple]]]:
        """A list of (concatenated) regular expressions, built on first access
        by :py:obj:`RegExprList.get_chunks` and cached afterwards."""
        if self._chunks is None:
            self._chunks = self.get_chunks()
        return self._chunks

    def get_chunks(self) -> list[tuple[re.Pattern, list[tuple]]]:
        """Returns a list of chunk items.  A chunk item is a two-element tuple
        with the concatenated :py:obj:`re.Pattern` on its first position and a
        list of tuples (aka grp_tuples) on its second position.

        The regular expressions are placed in *named groups*.  The number in
        the group name is the index of the related tuple in the grp_tuples of
        the **same** chunk:

        .. code:: re

           (?P<RegExprList_0>foo)|(?P<RegExprList_1>bar)

        .. code:: python

           >>> grp_tuples[0]
           ('foo', obj_foo_1, obj_foo_2, ...)
           >>> grp_tuples[1]
           ('bar', obj_bar_1, obj_bar_2, ...)

        A regular expression that is longer than ``chunk_size`` gets a chunk
        of its own.
        """
        chunks = []
        parts: list[str] = []
        grp_tuples: list[tuple] = []
        size = 0

        for item in self._get_data():
            # A plain string is a regular expression without related objects.
            # Testing the type (instead of the length) keeps two-character
            # expressions like "aa" from being unpacked as a (regex, objects)
            # pair.
            if isinstance(item, str):
                re_str, objs = item, ()
            else:
                re_str, objs = item

            # The group index is local to the chunk, it has to match the
            # position of the tuple in this chunk's grp_tuples.
            grp_re = f"(?P<{self.RE_GRP_PREFIX}_{len(grp_tuples)}>{re_str})"

            # +1 accounts for the '|' that joins the groups.  Never flush an
            # empty chunk: that would compile an empty pattern.
            if parts and size + len(grp_re) + 1 > self.chunk_size:
                chunks.append((re.compile("|".join(parts)), grp_tuples))
                parts, grp_tuples, size = [], [], 0
                grp_re = f"(?P<{self.RE_GRP_PREFIX}_0>{re_str})"

            parts.append(grp_re)
            size += len(grp_re) + 1
            # tuple(..): after a JSON round trip the objects arrive as a list
            grp_tuples.append((re_str,) + tuple(objs))

        # Are there any leftovers from the for loop?
        if parts:
            chunks.append((re.compile("|".join(parts)), grp_tuples))
        return chunks

    def _grp_tuple(self, m: re.Match, grp_tuples: list[tuple]) -> tuple | None:
        """Return the tuple from ``grp_tuples`` that belongs to the group which
        produced match ``m`` or ``None`` (plus a warning) if the group can't be
        determined."""
        prefix = f"{self.RE_GRP_PREFIX}_"
        # The wrapping named group encloses the whole regular expression and
        # is therefore the group that is closed last.
        grp_name = m.lastgroup or ""
        try:
            if not grp_name.startswith(prefix):
                raise ValueError(grp_name)
            return grp_tuples[int(grp_name[len(prefix):])]
        except (ValueError, IndexError):
            # This case should never occur unless there is something
            # wrong with the regular expressions.
            warnings.warn(f"ignoring group '{grp_name}' in regexpr match {m}: check your regular expressions!")
            return None

    def search(self, string: str) -> tuple[re.Match, tuple] | None:
        """Search for regular expressions in ``string``.  If none of the
        regular expressions matches, ``None`` is returned.  If there is a
        match, the first match (:py:obj:`re.Match`) is returned along with a
        tuple of objects related to the matched pattern (compare
        :py:obj:`RegExprList`):

        .. code:: python

           ( re.Match, ( <regexpr_str>, obj_1, obj_2, ..) )

        The chunks are searched one after the other, the match of the first
        chunk that matches is returned.
        """
        for regexp, grp_tuples in self.chunks:
            m = regexp.search(string)
            if m is None:
                continue
            tpl = self._grp_tuple(m, grp_tuples)
            if tpl is not None:
                return (m, tpl)
        return None

    def finditer(self, string: str) -> Iterator[tuple[re.Match, tuple]]:
        """Return an iterator yielding over all *"non-overlapping"* matches for
        the RE pattern in string.  Similar to :py:obj:`RegExprList.search` each
        match (:py:obj:`re.Match`) comes along with a tuple of objects related
        to the matched pattern:

        .. code:: python

           ( re.Match, ( <regexpr_str>, obj_1, obj_2, ..) )

        Since the list of regular expressions is concatenated and also broken
        up at the boundaries of the chunks, it is not possible to ensure
        *"non-overlapping"* over the entirety of all regular expressions in the
        list!  Nevertheless, there will be scenarios where this iterator makes
        sense, e.g. if the regular expressions do not overlap.

        .. caution::

           Use this method with care if the :py:obj:`regular expressions in
           the list <RegExprList.load_regexps>` *overlap*, otherwise you get
           unexpected results!
        """
        for regexp, grp_tuples in self.chunks:
            for m in regexp.finditer(string):
                tpl = self._grp_tuple(m, grp_tuples)
                if tpl is not None:
                    yield (m, tpl)
|
||||
|
||||
|
||||
##########################################################################
|
||||
## some tests of the POC above
|
||||
|
||||
|
||||
import pdb
|
||||
|
||||
def test_POC():
    """Small demo of the POC: print every match that
    :py:obj:`RegExprList.finditer` finds in a sample host name, together with
    the objects related to the matching regular expression."""

    # hint: the order of the list counts!
    patterns = [
        (r'aa', ("double 'a' don't overlaps with any other regular expressions",)),
        (r'a', ("single 'a' overlaps with all other regular expressions",)),
        r'(.*\.)?academiapublishing\.org$',
        r'(.*\.)?academiaresearch\.org$',
        r'(.*\.)?academiascholarlyjournal\.org$',
        r'(.*\.)?academicjournalsinc\.com$',
        r'(.*\.)?academicjournalsonline\.co\.in$',
        r'(.*\.)?academicjournals\.org$',
        r'(.*\.)?academicoasis\.org$',
        r'(.*\.)?academic-publishing-house\.com$',
        r'(.*\.)?academicpub\.org$',
        r'(.*\.)?academicresearchjournals\.org$',
        r'(.*\.)?academicstar\.us$',
        r'(.*\.)?academicsworld\.org$',
        r'(.*\.)?academicwebpublishers\.org$',
        r'(.*\.)?academievoorcontinuverbeteren\.nl$',
        (r'(.*\.)?academyirmbr\.com$', ("XX", "YYYY", 7, 8.2)),
        r'(.*\.)?academyjournals\.net$',
        r'(.*\.)?academyofideas\.com$',
        r'(.*\.)?academypublish\.org$',
    ]

    class TestCls(RegExprList):
        """Serves the in-memory pattern list defined above."""

        def load_regexps(self) -> list[tuple[str, tuple]] | list[str]:
            return patterns

    string = "aa.www.academyirmbr.com"
    regexps = TestCls()

    print(f"matches in '{string}' ...")
    for m, tpl in regexps.finditer(string):
        print(f" regexp: {tpl[0]} // match: {m.string[m.start():m.end()]} // objects related to regexp: {tpl}")
|
||||
|
||||
# Run the proof-of-concept demo only when this module is executed as a script.
if __name__ == "__main__":
    test_POC()
|
Loading…
Reference in New Issue
Block a user