[POC] increase efficiency of reg-expressions

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-21 19:51:44 +01:00 · 2024-06-18 16:52:49 +02:00 · 2024-06-18 16:52:49 +02:00 · 3581e1b85f
commit 3581e1b85f
parent acf3f109b2
1 changed files with 259 additions and 0 deletions
--- a/searx/regexp.py
+++ b/searx/regexp.py
@ -0,0 +1,259 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Implementations for efficient processing of regular expressions"""
+
+from __future__ import annotations
+from typing import Iterator
+import abc
+import re
+import warnings
+import json
+
+class RegExprList(abc.ABC):
+    """Abstract base class for efficient processing of lists of regular
+    expressions.  The inheriting classes have to implement the
+    :py:obj:`RegExprList.load_regexp` method which is used to load the list of
+    regular expressions from a configuration, for example.
+
+    Intention: By concatenating the regular expressions from the list into one
+    regular expression, all patterns can be performed with just one search and
+    it is not necessary to iterate over the individual expressions and perform
+    n-searches.
+
+    """
+
+    RE_GRP_PREFIX = "RegExprList"
+
+    @abc.abstractmethod
+    def load_regexps(self) -> list[tuple[str, tuple]]:
+        """Abstract method to load the list of regular expressions from a
+        configuration.  Returns a list of regular expressions (str) or a list of
+        two-digit tuples with a regular expression on its first position and
+        tuple of *n-objects* related to this regular expression on its second
+        position:
+
+        .. code:: python
+
+            [
+              ( <regexpr_a>, (obj_a1, obj_a2, ..) ),
+              ( <regexpr_b>, (obj_b1, obj_b2, ..) ),
+              ..
+            ]
+
+        If there is nothing related to the regular expression, the tuple is
+        empty (n=0).  The **objects** must be of a simple data type (str, int,
+        ..) so that they can be serialized (JSON).
+
+        """
+
+    def __init__(self, chunk_size = 1000):
+        self.chunk_size = chunk_size
+        self._chunks = None
+        self._data_json = None
+
+    def _get_data(self):
+        if self._data_json is not None:
+            return json.loads(self._data_json)
+        return self.load_regexps()
+
+    @property
+    def JSON(self):
+        """JSON representation of the regular expression list (see
+        :py:obj:`RegExprList.load_regexp`).
+
+        Serialize the :py:obj:`RegExprList` object into a JSON string.
+
+        """
+        if self._data_json is not None:
+            return self._data_json
+        return json.dumps(self._get_data(), sort_keys=True)
+
+    @classmethod
+    def from_json(cls, json_str: str) -> "RegExprList":
+        """Build a :py:obj:`RegExprList` object and load regular expressions from
+        a JSON string (compare :py:obj:`RegExprList.JSON`)."""
+        obj = cls()
+        obj._data_json = json_str
+        return obj
+
+    @property
+    def chunks(self) -> list[tuple[re.Pattern, list[tuple]]]:
+        """A list of (concatenated) regular expressions"""
+        if self._chunks is None:
+            self._chunks = self.get_chunks()
+        return self._chunks
+
+    def get_chunks(self) -> list[tuple[re.Pattern, list[tuple]]]:
+        """Returns a list chunks items.  A chunk item is a two-digit tuple with
+        the concatened :py:obj:`re.Pattern` on its first position and a list of
+        tuples (aka grp_tuples) on its second position.
+
+        The regular expressions are placed in *named groups* and the group for
+        the match can be determined using :py:obj:`re.Match.groupdict:`.
+
+        .. code: re
+
+           (?P<{_0}>foo)|(?P<_1>bar)
+
+        .. code: python
+
+           >>> grp_tuples[0]
+           ('foo', obj_foo_1, obj_foo_2, ...)
+           >>> grp_tuples[1]
+           ('bar', obj_bar_1, obj_bar_1, ...)
+
+        """
+        chunks = []
+        re_list = self._get_data()
+
+        chunk_re = ""
+        grp_tuples = []
+        c = -1
+
+
+        for pos in range(0, len(re_list)):
+            c += 1
+            objs_tpl = ()
+            if len(re_list[pos]) == 2:
+                re_str, objs_tpl = re_list[pos]
+            else:
+                re_str = re_list[pos]
+
+            grp_re = f"|(?P<{self.RE_GRP_PREFIX}_{c}>{re_str})"
+
+            if len(grp_re) + len(chunk_re) > self.chunk_size:
+                # remove the leading | from chunk_re
+                chunks.append((re.compile(chunk_re[1:]), grp_tuples))
+                chunk_re = ""
+                grp_tuples = []
+
+            chunk_re += grp_re
+            grp_tuples.append((re_str, ) + objs_tpl)
+
+        # Are there any leftovers from the for loop?
+        if chunk_re:
+            chunks.append((re.compile(chunk_re[1:]), grp_tuples))
+        return chunks
+
+
+    def search(self, string: str) -> tuple[re.Match, tuple] | None:
+        """Search for regular expressions in ``string``.  If none of the regular
+        expression matches, ``None`` is returned.  If there is a match, the
+        first match (:py:obj:`re.Match`) is returned along with a tuple of
+        objects related to the matched pattern (compare :py:obj:`RegExprList`):
+
+        .. code:: python
+
+           ( re.Match, ( <regexpr_str>, obj_1, obj_2, ..) )
+
+        """
+        pos = -1
+        for regexp, objs_tpl in self.chunks:
+            m = regexp.search(string)
+            if m:
+                prefix = f"{self.RE_GRP_PREFIX}_"
+                for grp_name, val in m.groupdict().items():
+                    if not grp_name.startswith(prefix):
+                        continue
+                    if val is None:
+                        continue
+                    try:
+                        pos = int(grp_name[len(prefix):])
+                        return (m, objs_tpl[pos])
+
+                    except ValueError:
+                        # This case should never occur unless there is something
+                        # wrong with the regular expressions.
+                        warnings.warn(f"ignoring group '{grp_name}' in regexpr match {m}: check your regular expressions!")
+                        m = None
+                        break
+        return None
+
+
+    def finditer(self, string: str) -> Iterator[tuple[re.Match, tuple]]:
+        """Return an iterator yielding over all *"non-overlapping"* matches for
+        the RE pattern in string.  Similar to :py:obj:`RegExpr.search` each
+        match (:py:obj:`re.Match`) comes along with a tuple of objects related
+        to the matched pattern:
+
+        .. code:: python
+
+           ( re.Match, ( <regexpr_str>, obj_1, obj_2, ..) )
+
+        Since the list of regular expressions is concatenated and also broken up
+        at the boundaries of the chunks, it is not possible to ensure
+        *"non-overlapping"* over the entirety of all regular expressions in the
+        list!  Nevertheless, there will be scenarios where this iterator makes
+        sense, e.g. if the regular expressions do not overlap.
+
+        .. caution:
+
+           Use this method with care if the :py:obj:`regular expressions in the
+           list <RegExprListload_regexps>` *overlap*, otherwise you get unexpected
+           results!
+
+        """
+
+        pos = -1
+        for regexp, objs_tpl in self.chunks:
+            for m in regexp.finditer(string):
+                if m is None:
+                    continue
+                prefix = f"{self.RE_GRP_PREFIX}_"
+                for grp_name, val in m.groupdict().items():
+                    if not grp_name.startswith(prefix):
+                        continue
+                    if val is None:
+                        continue
+                    try:
+                        pos = int(grp_name[len(prefix):])
+                        yield (m, objs_tpl[pos])
+
+                    except ValueError:
+                        # This case should never occur unless there is something
+                        # wrong with the regular expressions.
+                        warnings.warn(f"ignoring group '{grp_name}' in regexpr match {m}: check your regular expressions!")
+                        continue
+
+
+##########################################################################
+## some tests of the POC above
+
+
+import pdb
+
+def test_POC():
+    test_list = [
+        # hint: the order of the list counts!
+        (r'aa', ("double 'a' don't overlaps with any other regular expressions",)),
+        (r'a', ("single 'a' overlaps with all other regular expressions",)),
+        r'(.*\.)?academiapublishing\.org$',
+        r'(.*\.)?academiaresearch\.org$',
+        r'(.*\.)?academiascholarlyjournal\.org$',
+        r'(.*\.)?academicjournalsinc\.com$',
+        r'(.*\.)?academicjournalsonline\.co\.in$',
+        r'(.*\.)?academicjournals\.org$',
+        r'(.*\.)?academicoasis\.org$',
+        r'(.*\.)?academic-publishing-house\.com$',
+        r'(.*\.)?academicpub\.org$',
+        r'(.*\.)?academicresearchjournals\.org$',
+        r'(.*\.)?academicstar\.us$',
+        r'(.*\.)?academicsworld\.org$',
+        r'(.*\.)?academicwebpublishers\.org$',
+        r'(.*\.)?academievoorcontinuverbeteren\.nl$',
+        (r'(.*\.)?academyirmbr\.com$', ("XX", "YYYY", 7, 8.2)),
+        r'(.*\.)?academyjournals\.net$',
+        r'(.*\.)?academyofideas\.com$',
+        r'(.*\.)?academypublish\.org$'
+    ]
+
+    class TestCls(RegExprList):
+        def load_regexps(self) -> list[tuple[str, tuple]] | list[str]:
+            return test_list
+    mylist = TestCls()
+    string = "aa.www.academyirmbr.com"
+    print(f"matches in '{string}' ...")
+    for m, tpl in mylist.finditer(string):
+        print(f"  regexp: {tpl[0]} // match: {m.string[m.start():m.end()]} // objects related to regexp: {tpl}")
+
+# Run the POC demo only when this module is executed as a script.
+if __name__ == "__main__":
+    test_POC()