2023-09-14 13:31:54 +02:00
|
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
|
# lint: pylint
|
|
|
|
"""Presearch (general, images, videos, news)
|
2023-10-04 14:31:31 +02:00
|
|
|
|
|
|
|
.. hint::
|
|
|
|
|
|
|
|
The results in the video category are most often links to pages that contain
|
|
|
|
a video, for instance many links from presearch's video category link
|
|
|
|
content from Facebook (aka Meta) or Twitter (aka X). Since these are not
|
|
|
|
real links to video streams SearXNG can't use the video template for this and
|
|
|
|
if SearXNG can't use this template, then the user doesn't want to see these
|
|
|
|
hits in the videos category.
|
|
|
|
|
|
|
|
TL;DR; by default presearch's video category is placed into categories::
|
|
|
|
|
|
|
|
categories: [general, web]
|
|
|
|
|
2023-09-14 13:31:54 +02:00
|
|
|
"""
|
|
|
|
|
|
|
|
from urllib.parse import urlencode
|
|
|
|
from searx.network import get
|
|
|
|
from searx.utils import gen_useragent, html_to_text
|
|
|
|
|
|
|
|
# Descriptive metadata about the Presearch service and the result format.
about = {
    "website": "https://presearch.io",
    "wikidata_id": "Q7240905",
    "official_api_documentation": "https://docs.presearch.io/nodes/api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

paging = True
time_range_support = True
categories = ["general", "web"]  # general, images, videos, news

search_type = "search"
"""must be any of ``search``, ``images``, ``videos``, ``news``"""

# Root of every URL built by this engine.
base_url = "https://presearch.com"

# Maps the safesearch level (0, 1, 2) to the value of the ``use_safe_search``
# cookie; levels 1 and 2 both turn the filter on.
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
|
|
|
|
|
|
|
|
|
2023-10-04 14:31:31 +02:00
|
|
|
def init(_):
    """Check that the module-level ``search_type`` holds a supported value.

    The single positional argument is ignored.

    :raises ValueError: if ``search_type`` is none of ``search``, ``images``,
        ``videos`` or ``news``.
    """
    supported = ('search', 'images', 'videos', 'news')
    if search_type in supported:
        return
    raise ValueError(f'presearch search_type: {search_type}')
|
|
|
|
|
|
|
|
|
2023-09-14 13:31:54 +02:00
|
|
|
def _get_request_id(query, page, time_range, safesearch):
    """Load the presearch HTML page for a query and return its search ID.

    The page at ``{base_url}/{search_type}`` is fetched with the query, the
    page number and (if set) the time range as URL arguments.  The safesearch
    level is sent in the ``use_safe_search`` cookie.  The search ID is read
    from the first line of the response that contains
    ``window.searchId = ``.

    :param query: search terms
    :param page: page number of the result list
    :param time_range: time range filter; left out of the URL when falsy
    :param safesearch: safesearch level, must be a key of ``safesearch_map``
    :return: the search ID as a string or ``None`` if the page does not
        contain one
    """
    args = {
        "q": query,
        "page": page,
    }
    if time_range:
        args["time_range"] = time_range

    url = f"{base_url}/{search_type}?{urlencode(args)}"
    headers = {
        'User-Agent': gen_useragent(),
        'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
    }
    resp_text = get(url, headers=headers).text  # type: ignore

    marker = "window.searchId = "
    for line in resp_text.splitlines():
        # Take what follows the marker itself (not the first "= " of the
        # line) and strip terminator, quotes and whitespace explicitly instead
        # of assuming the last character of the line is the ";".
        _, found, tail = line.partition(marker)
        if found:
            return tail.strip().rstrip(";").replace('"', "").strip()

    return None
|
|
|
|
|
|
|
|
|
|
|
|
def request(query, params):
    """Fill *params* with the URL and headers of the JSON results request.

    A search ID is first obtained from ``_get_request_id`` using the page
    number, time range and safesearch level found in *params*.  Then the
    ``Accept`` header and the ``url`` of the results endpoint are set.

    :param query: search terms
    :param params: request parameters; modified in place
    :return: the same *params* object
    """
    search_id = _get_request_id(
        query,
        params["pageno"],
        params["time_range"],
        params["safesearch"],
    )

    params["headers"]["Accept"] = "application/json"
    results_url = f"{base_url}/results?id={search_id}"
    params["url"] = results_url

    return params
|
|
|
|
|
|
|
|
|
2023-10-04 14:31:31 +02:00
|
|
|
def _strip_leading_strings(text):
|
|
|
|
for x in ['wikipedia', 'google']:
|
|
|
|
if text.lower().endswith(x):
|
|
|
|
text = text[: -len(x)]
|
|
|
|
return text.strip()
|
2023-09-14 13:31:54 +02:00
|
|
|
|
|
|
|
|
2023-10-04 14:31:31 +02:00
|
|
|
def parse_search_query(json_results):
    """Turn the results object of a presearch web search into a result list.

    Three sections of *json_results* are read, in this order:

    - ``specialSections.topStoriesCompact.data``: one result with image and
      source per story
    - ``standardResults``: one result per item, the description is converted
      from HTML to text
    - ``infoSection.data``: a single infobox whose attributes come from the
      ``label: value`` strings in ``about``

    A section that is missing or ``null`` is skipped.

    :param json_results: dict with the sections described above
    :return: list of result dicts
    """
    special = json_results.get('specialSections') or {}
    top_stories = special.get('topStoriesCompact') or {}
    results = [
        {
            'url': item['link'],
            'title': item['title'],
            # a story without image is still a valid result
            'img_src': item.get('image'),
            'content': '',
            'metadata': item.get('source'),
        }
        for item in top_stories.get('data') or []
    ]

    results.extend(
        {
            'url': item['link'],
            'title': item['title'],
            'content': html_to_text(item['description']),
        }
        for item in json_results.get('standardResults') or []
    )

    info = (json_results.get('infoSection') or {}).get('data')
    if info:
        attributes = []
        for item in info.get('about') or []:
            label, sep, value = html_to_text(item).partition(':')
            if not sep:
                # Not of the form "label: value" --> there is no label to show
                # and one odd entry must not abort the whole response.
                continue
            attributes.append({'label': label, 'value': _strip_leading_strings(value)})

        content = []
        for item in (info.get('subtitle'), info.get('description')):
            if not item:
                # subtitle and description are both optional
                continue
            item = _strip_leading_strings(html_to_text(item))
            if item:
                content.append(item)

        results.append(
            {
                'infobox': info['title'],
                'id': info['title'],
                'img_src': info.get('image'),
                'content': ' | '.join(content),
                'attributes': attributes,
            }
        )
    return results
|
2023-09-14 13:31:54 +02:00
|
|
|
|
|
|
|
|
2023-10-04 14:31:31 +02:00
|
|
|
def response(resp):
    """Build the result list from the JSON response of the results endpoint.

    Which part of the JSON document is read depends on the module-level
    ``search_type``:

    - ``search``: ``results`` is handed over to ``parse_search_query``
    - ``images``: ``images``, results use the ``images.html`` template
    - ``videos``: ``videos``, results use the default template
    - ``news``: ``news``, results use the default template

    :param resp: response object providing a ``json()`` method
    :return: list of result dicts; empty for any other ``search_type``
    """
    json_resp = resp.json()

    if search_type == 'search':
        return parse_search_query(json_resp['results'])

    if search_type == 'images':
        return [
            {
                'template': 'images.html',
                'title': item['title'],
                'url': item['link'],
                'img_src': item['image'],
                'thumbnail_src': item['thumbnail'],
            }
            for item in json_resp['images']
        ]

    results = []

    if search_type == 'videos':
        # Most hits in the video category link to pages that embed a video and
        # not to a video stream --> SearXNG can't use the video template.
        for item in json_resp['videos']:
            extras = filter(None, (item.get('description'), item.get('duration')))
            entry = {
                'title': item['title'],
                'url': item['link'],
                'content': '',
                'metadata': ' / '.join(extras),
                'img_src': item.get('image'),
            }
            results.append(entry)
        return results

    if search_type == 'news':
        for item in json_resp['news']:
            extras = filter(None, (item.get('source'), item.get('time')))
            entry = {
                'title': item['title'],
                'url': item['link'],
                'content': item['description'],
                'metadata': ' / '.join(extras),
                'img_src': item.get('image'),
            }
            results.append(entry)

    return results
|