From 44392bd436252d7c2c38a62c759712f1766c9fff Mon Sep 17 00:00:00 2001
From: Markus Heiser <markus.heiser@darmarit.de>
Date: Wed, 4 Oct 2023 14:31:31 +0200
Subject: [PATCH] [mod] improve implementation of presearch engine

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
---
 searx/engines/presearch.py | 156 +++++++++++++++++++++++++++----------
 searx/settings.yml         |   3 +-
 2 files changed, 119 insertions(+), 40 deletions(-)

diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
index c41cf3b37..1e20465ed 100644
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@@ -1,6 +1,20 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
 """Presearch (general, images, videos, news)
+
+.. hint::
+
+   The results in the video category are most often links to pages that contain
+   a video, for instance many links from presearch's video category link
+   content from facebook (aka Meta) or Twitter (aka X).  Since these are not
+   real links to video streams SearXNG can't use the video template for this and
+   if SearXNG can't use this template, then the user doesn't want to see these
+   hits in the videos category.
+
+   TL;DR; by default presearch's video category is placed into categories::
+
+       categories: [general, web]
+
 """
 
 from urllib.parse import urlencode
@@ -19,12 +33,18 @@ paging = True
 time_range_support = True
 categories = ["general", "web"]  # general, images, videos, news
 
-search_type = "search"  # must be any of "search", "images", "videos", "news"
+search_type = "search"
+"""must be any of ``search``, ``images``, ``videos``, ``news``"""
 
 base_url = "https://presearch.com"
 safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
 
 
+def init(_):  # raises ValueError when ``search_type`` is not one of the four allowed values
+    if search_type not in ['search', 'images', 'videos', 'news']:
+        raise ValueError(f'presearch search_type: {search_type}')
+
+
 def _get_request_id(query, page, time_range, safesearch):
     args = {
         "q": query,
@@ -38,7 +58,7 @@ def _get_request_id(query, page, time_range, safesearch):
         'User-Agent': gen_useragent(),
         'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
     }
-    resp_text = get(url, headers=headers).text
+    resp_text = get(url, headers=headers).text  # type: ignore
 
     for line in resp_text.split("\n"):
         if "window.searchId = " in line:
@@ -47,11 +67,6 @@ def _get_request_id(query, page, time_range, safesearch):
     return None
 
 
-def _is_valid_img_src(url):
-    # in some cases, the image url is a base64 encoded string, which has to be skipped
-    return "https://" in url
-
-
 def request(query, params):
     request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
 
@@ -61,42 +76,105 @@ def request(query, params):
     return params
 
 
-def response(resp):
+def _strip_leading_strings(text):  # NOTE: despite the name, strips a *trailing* 'wikipedia' / 'google'
+    for x in ['wikipedia', 'google']:
+        if text.lower().endswith(x):  # case-insensitive suffix match
+            text = text[: -len(x)]
+    return text.strip()
+
+
+def parse_search_query(json_results):  # builds top-story hits, standard hits and an optional infobox
     results = []
 
-    json = resp.json()
-
-    json_results = []
-    if search_type == "search":
-        json_results = json['results'].get('standardResults', [])
-    else:
-        json_results = json.get(search_type, [])
-
-    for json_result in json_results:
+    for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
         result = {
-            'url': json_result['link'],
-            'title': json_result['title'],
-            'content': html_to_text(json_result.get('description', '')),
+            'url': item['link'],
+            'title': item['title'],
+            'img_src': item['image'],
+            'content': '',
+            'metadata': item.get('source'),
         }
-        if search_type == "images":
-            result['template'] = 'images.html'
-
-            if not _is_valid_img_src(json_result['image']):
-                continue
-
-            result['img_src'] = json_result['image']
-            if _is_valid_img_src(json_result['thumbnail']):
-                result['thumbnail'] = json_result['thumbnail']
-
-        elif search_type == "videos":
-            result['template'] = 'videos.html'
-
-            if _is_valid_img_src(json_result['image']):
-                result['thumbnail'] = json_result['image']
-
-            result['duration'] = json_result['duration']
-            result['length'] = json_result['duration']
-
         results.append(result)
 
+    for item in json_results.get('standardResults', []):
+        result = {
+            'url': item['link'],
+            'title': item['title'],
+            'content': html_to_text(item.get('description', '')),  # tolerate a missing description
+        }
+        results.append(result)
+
+    info = json_results.get('infoSection', {}).get('data')
+    if info:
+        attributes = []
+        for item in info.get('about', []):
+            label, _, value = html_to_text(item).partition(':')  # no ValueError when ':' is absent
+            value = _strip_leading_strings(value)
+            attributes.append({'label': label, 'value': value})
+        content = []
+        for item in [info['subtitle'], info['description']]:
+            item = _strip_leading_strings(html_to_text(item))
+            if item:  # skip parts that are empty after stripping
+                content.append(item)
+
+        results.append(
+            {
+                'infobox': info['title'],
+                'id': info['title'],
+                'img_src': info.get('image'),
+                'content': ' | '.join(content),
+                'attributes': attributes,
+            }
+        )
+    return results
+
+
+def response(resp):  # dispatches on the module-level ``search_type`` and returns a list of result dicts
+    results = []
+    json_resp = resp.json()
+
+    if search_type == 'search':
+        results = parse_search_query(json_resp['results'])
+
+    elif search_type == 'images':
+        for item in json_resp['images']:
+            results.append(
+                {
+                    'template': 'images.html',
+                    'title': item['title'],
+                    'url': item['link'],
+                    'img_src': item['image'],
+                    'thumbnail_src': item['thumbnail'],
+                }
+            )
+
+    elif search_type == 'videos':
+        # The results in the video category are most often links to pages that contain
+        # a video and not to a video stream --> SearXNG can't use the video template.
+
+        for item in json_resp['videos']:
+            metadata = [x for x in [item.get('description'), item.get('duration')] if x]  # drop empty parts
+            results.append(
+                {
+                    'title': item['title'],
+                    'url': item['link'],
+                    'content': '',
+                    'metadata': ' / '.join(metadata),
+                    'img_src': item.get('image'),
+                }
+            )
+
+    elif search_type == 'news':
+        for item in json_resp['news']:
+            metadata = [x for x in [item.get('source'), item.get('time')] if x]  # drop empty parts
+            results.append(
+                {
+                    'title': item['title'],
+                    'url': item['link'],
+                    'content': item['description'],
+                    'metadata': ' / '.join(metadata),
+                    'img_src': item.get('image'),
+                }
+            )
+
     return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 0edf01762..be420528f 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1295,6 +1295,7 @@ engines:
     search_type: search
     categories: [general, web]
     shortcut: ps
+    disabled: true
 
   - name: presearch images
     engine: presearch
@@ -1307,7 +1308,7 @@ engines:
   - name: presearch videos
     engine: presearch
     search_type: videos
-    categories: [videos, web]
+    categories: [general, web]
     timeout: 4.0
     shortcut: psvid
     disabled: true