From 44392bd436252d7c2c38a62c759712f1766c9fff Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 4 Oct 2023 14:31:31 +0200 Subject: [PATCH] [mod] improve implementation of presearch engine Signed-off-by: Markus Heiser --- searx/engines/presearch.py | 156 +++++++++++++++++++++++++++---------- searx/settings.yml | 3 +- 2 files changed, 119 insertions(+), 40 deletions(-) diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py index c41cf3b37..1e20465ed 100644 --- a/searx/engines/presearch.py +++ b/searx/engines/presearch.py @@ -1,6 +1,20 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint """Presearch (general, images, videos, news) + +.. hint:: + + The results in the video category are most often links to pages that contain + a video, for instance many links from presearch's video category link + content from facebook (aka Meta) or Twitter (aka X). Since these are not + real links to video streams SearXNG can't use the video template for this and + if SearXNG can't use this template, then the user doesn't want to see these + hits in the videos category. 
+ + TL;DR; by default presearch's video category is placed into categories:: + + categories: [general, web] + """ from urllib.parse import urlencode @@ -19,12 +33,18 @@ paging = True time_range_support = True categories = ["general", "web"] # general, images, videos, news -search_type = "search" # must be any of "search", "images", "videos", "news" +search_type = "search" +"""must be any of ``search``, ``images``, ``videos``, ``news``""" base_url = "https://presearch.com" safesearch_map = {0: 'false', 1: 'true', 2: 'true'} +def init(_): + if search_type not in ['search', 'images', 'videos', 'news']: + raise ValueError(f'presearch search_type: {search_type}') + + def _get_request_id(query, page, time_range, safesearch): args = { "q": query, @@ -38,7 +58,7 @@ def _get_request_id(query, page, time_range, safesearch): 'User-Agent': gen_useragent(), 'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}", } - resp_text = get(url, headers=headers).text + resp_text = get(url, headers=headers).text # type: ignore for line in resp_text.split("\n"): if "window.searchId = " in line: @@ -47,11 +67,6 @@ def _get_request_id(query, page, time_range, safesearch): return None -def _is_valid_img_src(url): - # in some cases, the image url is a base64 encoded string, which has to be skipped - return "https://" in url - - def request(query, params): request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"]) @@ -61,42 +76,105 @@ def request(query, params): return params -def response(resp): +def _strip_leading_strings(text): + for x in ['wikipedia', 'google']: + if text.lower().endswith(x): + text = text[: -len(x)] + return text.strip() + + +def parse_search_query(json_results): results = [] - json = resp.json() - - json_results = [] - if search_type == "search": - json_results = json['results'].get('standardResults', []) - else: - json_results = json.get(search_type, []) - - for json_result in json_results: + for item in 
json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []): result = { - 'url': json_result['link'], - 'title': json_result['title'], - 'content': html_to_text(json_result.get('description', '')), + 'url': item['link'], + 'title': item['title'], + 'img_src': item['image'], + 'content': '', + 'metadata': item.get('source'), } - if search_type == "images": - result['template'] = 'images.html' - - if not _is_valid_img_src(json_result['image']): - continue - - result['img_src'] = json_result['image'] - if _is_valid_img_src(json_result['thumbnail']): - result['thumbnail'] = json_result['thumbnail'] - - elif search_type == "videos": - result['template'] = 'videos.html' - - if _is_valid_img_src(json_result['image']): - result['thumbnail'] = json_result['image'] - - result['duration'] = json_result['duration'] - result['length'] = json_result['duration'] - results.append(result) + for item in json_results.get('standardResults', []): + result = { + 'url': item['link'], + 'title': item['title'], + 'content': html_to_text(item['description']), + } + results.append(result) + + info = json_results.get('infoSection', {}).get('data') + if info: + attributes = [] + for item in info.get('about', []): + label, value = html_to_text(item).split(':', 1) + value = _strip_leading_strings(value) + attributes.append({'label': label, 'value': value}) + content = [] + for item in [info['subtitle'], info['description']]: + item = _strip_leading_strings(html_to_text(item)) + if item: + content.append(item) + + results.append( + { + 'infobox': info['title'], + 'id': info['title'], + 'img_src': info.get('image'), + 'content': ' | '.join(content), + 'attributes': attributes, + } + ) + return results + + +def response(resp): + results = [] + json_resp = resp.json() + + if search_type == 'search': + results = parse_search_query(json_resp['results']) + + elif search_type == 'images': + for item in json_resp['images']: + results.append( + { + 'template': 'images.html', + 
'title': item['title'], + 'url': item['link'], + 'img_src': item['image'], + 'thumbnail_src': item['thumbnail'], + } + ) + + elif search_type == 'videos': + # The results in the video category are most often links to pages that contain + # a video and not to a video stream --> SearXNG can't use the video template. + + for item in json_resp['videos']: + metadata = [x for x in [item.get('description'), item.get('duration')] if x] + results.append( + { + 'title': item['title'], + 'url': item['link'], + 'content': '', + 'metadata': ' / '.join(metadata), + 'img_src': item.get('image'), + } + ) + + elif search_type == 'news': + for item in json_resp['news']: + metadata = [x for x in [item.get('source'), item.get('time')] if x] + results.append( + { + 'title': item['title'], + 'url': item['link'], + 'content': item['description'], + 'metadata': ' / '.join(metadata), + 'img_src': item.get('image'), + } + ) + return results diff --git a/searx/settings.yml b/searx/settings.yml index 0edf01762..be420528f 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1295,6 +1295,7 @@ engines: search_type: search categories: [general, web] shortcut: ps + disabled: true - name: presearch images engine: presearch @@ -1307,7 +1308,7 @@ engines: - name: presearch videos engine: presearch search_type: videos - categories: [videos, web] + categories: [general, web] timeout: 4.0 shortcut: psvid disabled: true