diff --git a/Dockerfile b/Dockerfile index 3894aa968..f251d06ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,6 +41,8 @@ RUN apk upgrade --no-cache \ openssl-dev \ tar \ git \ + protoc \ + protobuf-dev \ && apk add --no-cache \ ca-certificates \ su-exec \ @@ -53,6 +55,7 @@ RUN apk upgrade --no-cache \ uwsgi \ uwsgi-python3 \ brotli \ + protobuf \ && pip3 install --upgrade pip \ && pip3 install --no-cache -r requirements.txt \ && apk del build-dependencies \ diff --git a/dockerfiles/uwsgi.ini b/dockerfiles/uwsgi.ini index 398a440d9..818a99cc0 100644 --- a/dockerfiles/uwsgi.ini +++ b/dockerfiles/uwsgi.ini @@ -42,3 +42,6 @@ static-map = /static=/usr/local/searx/searx/static static-expires = /* 864000 static-gzip-all = True offload-threads = %k + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 diff --git a/requirements.txt b/requirements.txt index ecf8e0c62..776bbc20b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ pygments==2.1.3 python-dateutil==2.8.1 pyyaml==5.3.1 requests[socks]==2.25.1 +pycld3==0.20 diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 0d45f0b7c..f777e8595 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url from searx.results import ResultContainer from searx import logger from searx.plugins import plugins +from searx.search.models import EngineRef, SearchQuery from searx.search.processors import processors, initialize as initialize_processors +from searx.search.checker import initialize as initialize_checker logger = logger.getChild('search') @@ -45,68 +47,11 @@ else: sys.exit(1) -def initialize(settings_engines=None): +def initialize(settings_engines=None, enable_checker=False): settings_engines = settings_engines or settings['engines'] initialize_processors(settings_engines) - - -class EngineRef: - - __slots__ = 'name', 'category' - - def __init__(self, name: str, category: str): - self.name = 
name - self.category = category - - def __repr__(self): - return "EngineRef({!r}, {!r})".format(self.name, self.category) - - def __eq__(self, other): - return self.name == other.name and self.category == other.category - - -class SearchQuery: - """container for all the search parameters (query, language, etc...)""" - - __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ - 'timeout_limit', 'external_bang' - - def __init__(self, - query: str, - engineref_list: typing.List[EngineRef], - lang: str='all', - safesearch: int=0, - pageno: int=1, - time_range: typing.Optional[str]=None, - timeout_limit: typing.Optional[float]=None, - external_bang: typing.Optional[str]=None): - self.query = query - self.engineref_list = engineref_list - self.lang = lang - self.safesearch = safesearch - self.pageno = pageno - self.time_range = time_range - self.timeout_limit = timeout_limit - self.external_bang = external_bang - - @property - def categories(self): - return list(set(map(lambda engineref: engineref.category, self.engineref_list))) - - def __repr__(self): - return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ - format(self.query, self.engineref_list, self.lang, self.safesearch, - self.pageno, self.time_range, self.timeout_limit, self.external_bang) - - def __eq__(self, other): - return self.query == other.query\ - and self.engineref_list == other.engineref_list\ - and self.lang == other.lang\ - and self.safesearch == other.safesearch\ - and self.pageno == other.pageno\ - and self.time_range == other.time_range\ - and self.timeout_limit == other.timeout_limit\ - and self.external_bang == other.external_bang + if enable_checker: + initialize_checker() class Search: diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py new file mode 100644 index 000000000..85b9178df --- /dev/null +++ b/searx/search/checker/__init__.py @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from .impl import 
Checker +from .background import initialize, get_result diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py new file mode 100644 index 000000000..75b37e6c5 --- /dev/null +++ b/searx/search/checker/__main__.py @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import sys +import io +import os +import argparse +import logging + +import searx.search +import searx.search.checker +from searx.search import processors +from searx.engines import engine_shortcuts + + +# configure logging +root = logging.getLogger() +handler = logging.StreamHandler(sys.stdout) +for h in root.handlers: + root.removeHandler(h) +root.addHandler(handler) + +# color only for a valid terminal +if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: + RESET_SEQ = "\033[0m" + COLOR_SEQ = "\033[1;%dm" + BOLD_SEQ = "\033[1m" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8)) +else: + RESET_SEQ = "" + COLOR_SEQ = "" + BOLD_SEQ = "" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" + +# equivalent of 'python -u' (unbuffered stdout, stderr) +stdout = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True) +stderr = io.TextIOWrapper(open(sys.stderr.fileno(), 'wb', 0), write_through=True) + + +# iterator of processors +def iter_processor(engine_name_list): + if len(engine_name_list) > 0: + for name in engine_name_list: + name = engine_shortcuts.get(name, name) + processor = processors.get(name) + if processor is not None: + yield name, processor + else: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}') + else: + for name, processor in searx.search.processors.items(): + yield name, processor + + +# actual check & display +def run(engine_name_list, verbose): + searx.search.initialize() + for name, processor in iter_processor(engine_name_list): + stdout.write(f'{BOLD_SEQ}Engine 
{name:30}{RESET_SEQ}Checking\n') + if not sys.stdout.isatty(): + stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') + checker = searx.search.checker.Checker(processor) + checker.run() + if checker.test_results.succesfull: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n') + if verbose: + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') + else: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}') + if not verbose: + errors = [test_name + ': ' + error for test_name, error in checker.test_results] + stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n') + else: + stdout.write('\n') + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') + for test_name, logs in checker.test_results.logs.items(): + for log in logs: + stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n') + + +# call by setup.py +def main(): + parser = argparse.ArgumentParser(description='Check searx engines.') + parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', + help='engines name or shortcut list. 
Empty for all engines.') + parser.add_argument('--verbose', '-v', + action='store_true', dest='verbose', + help='Display details about the test results', + default=False) + args = parser.parse_args() + run(args.engine_name_list, args.verbose) + + +if __name__ == '__main__': + main() diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py new file mode 100644 index 000000000..e41bff5f5 --- /dev/null +++ b/searx/search/checker/background.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import random +import time +import threading +import os +import signal + +from searx import logger, settings, searx_debug +from searx.exceptions import SearxSettingsException +from searx.search.processors import processors +from searx.search.checker import Checker +from searx.shared import schedule, storage + + +CHECKER_RESULT = 'CHECKER_RESULT' +running = threading.Lock() + + +def _get_interval(every, error_msg): + if isinstance(every, int): + every = (every, every) + if not isinstance(every, (tuple, list))\ + or len(every) != 2\ + or not isinstance(every[0], int)\ + or not isinstance(every[1], int): + raise SearxSettingsException(error_msg, None) + return every + + +def _get_every(): + every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800)) + return _get_interval(every, 'checker.scheduling.every is not a int or list') + + +def get_result(): + serialized_result = storage.get_str(CHECKER_RESULT) + if serialized_result is not None: + return json.loads(serialized_result) + + +def _set_result(result, include_timestamp=True): + if include_timestamp: + result['timestamp'] = int(time.time() / 3600) * 3600 + storage.set_str(CHECKER_RESULT, json.dumps(result)) + + +def run(): + if not running.acquire(blocking=False): + return + try: + logger.info('Starting checker') + result = { + 'status': 'ok', + 'engines': {} + } + for name, processor in processors.items(): + logger.debug('Checking %s engine', name) + 
checker = Checker(processor) + checker.run() + if checker.test_results.succesfull: + result['engines'][name] = {'success': True} + else: + result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} + + _set_result(result) + logger.info('Check done') + except Exception: + _set_result({'status': 'error'}) + logger.exception('Error while running the checker') + finally: + running.release() + + +def _run_with_delay(): + every = _get_every() + delay = random.randint(0, every[1] - every[0]) + logger.debug('Start checker in %i seconds', delay) + time.sleep(delay) + run() + + +def _start_scheduling(): + every = _get_every() + if schedule(every[0], _run_with_delay): + run() + + +def _signal_handler(signum, frame): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # disabled by default + _set_result({'status': 'disabled'}) + + # special case when debug is activate + if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + logger.info('debug mode: checker is disabled') + return + + # check value of checker.scheduling.every now + scheduling = settings.get('checker', {}).get('scheduling', None) + if scheduling is None or not scheduling: + logger.info('Checker scheduler is disabled') + return + + # + _set_result({'status': 'unknown'}, include_timestamp=False) + + start_after = scheduling.get('start_after', (300, 1800)) + start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') + delay = random.randint(start_after[0], start_after[1]) + logger.info('Start checker in %i seconds', delay) + t = threading.Timer(delay, _start_scheduling) + t.daemon = True + t.start() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py new file mode 100644 index 000000000..71a941f73 --- /dev/null +++ b/searx/search/checker/impl.py 
@@ -0,0 +1,406 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import typing +import types +import functools +import itertools +from time import time +from urllib.parse import urlparse + +import re +import cld3 +import requests.exceptions + +from searx import poolrequests, logger +from searx.results import ResultContainer +from searx.search.models import SearchQuery, EngineRef +from searx.search.processors import EngineProcessor + + +logger = logger.getChild('searx.search.checker') + +HTML_TAGS = [ + 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', + 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', + 'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', + 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt', + 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input', + 'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet', + 'frame', 'frameset' +] + + +def get_check_no_html(): + rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS] + rep += ['</' + tag + '>' for tag in HTML_TAGS] + pattern = re.compile('|'.join(rep)) + + def f(text): + return pattern.search(text.lower()) is None + + return f + + +_check_no_html = get_check_no_html() + + +def _is_url(url): + try: + result = urlparse(url) + except ValueError: + return False + if result.scheme not in ('http', 'https'): + return False + return True + + +@functools.lru_cache(maxsize=8192) +def _is_url_image(image_url): + if not isinstance(image_url, str): + return False + + if image_url.startswith('//'): + image_url = 'https:' + image_url + + if image_url.startswith('data:'): + return image_url.startswith('data:image/') + + if not _is_url(image_url): + return False + + retry = 2 + + while retry > 0: + a = time()
+ try: + poolrequests.set_timeout_for_thread(10.0, time()) + r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-GPC': '1', + 'Cache-Control': 'max-age=0' + }) + if r.headers["content-type"].startswith('image/'): + return True + return False + except requests.exceptions.Timeout: + logger.error('Timeout for %s: %i', image_url, int(time() - a)) + retry -= 1 + except requests.exceptions.RequestException: + logger.exception('Exception for %s', image_url) + return False + + +def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]: + return { + 'query': search_query.query, + 'lang': search_query.lang, + 'pageno': search_query.pageno, + 'safesearch': search_query.safesearch, + 'time_range': search_query.time_range, + } + + +def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\ + -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]: + param1 = _search_query_to_dict(sq1) + param2 = _search_query_to_dict(sq2) + common = {} + diff = {} + for k, value1 in param1.items(): + value2 = param2[k] + if value1 == value2: + common[k] = value1 + else: + diff[k] = (value1, value2) + return (common, diff) + + +class TestResults: + + __slots__ = 'errors', 'logs', 'languages' + + def __init__(self): + self.errors: typing.Dict[str, typing.List[str]] = {} + self.logs: typing.Dict[str, typing.List[typing.Any]] = {} + self.languages: typing.Set[str] = set() + + def add_error(self, test, message, *args): + # message to self.errors + errors_for_test = self.errors.setdefault(test, []) + if message not in errors_for_test: + errors_for_test.append(message) + # 
(message, *args) to self.logs + logs_for_test = self.logs.setdefault(test, []) + if (message, *args) not in logs_for_test: + logs_for_test.append((message, *args)) + + def add_language(self, language): + self.languages.add(language) + + @property + def succesfull(self): + return len(self.errors) == 0 + + def __iter__(self): + for test_name, errors in self.errors.items(): + for error in sorted(errors): + yield (test_name, error) + + +class ResultContainerTests: + + __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results' + + def __init__(self, + test_results: TestResults, + test_name: str, + search_query: SearchQuery, + result_container: ResultContainer): + self.test_name = test_name + self.search_query = search_query + self.result_container = result_container + self.languages: typing.Set[str] = set() + self.test_results = test_results + self.stop_test = False + + @property + def result_urls(self): + results = self.result_container.get_ordered_results() + return [result['url'] for result in results] + + def _record_error(self, message: str, *args) -> None: + sq = _search_query_to_dict(self.search_query) + sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()]) + self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') + + def _add_language(self, text: str) -> typing.Optional[str]: + r = cld3.get_language(str(text)) # pylint: disable=E1101 + if r is not None and r.probability >= 0.98 and r.is_reliable: + self.languages.add(r.language) + self.test_results.add_language(r.language) + return None + + def _check_result(self, result): + if not _check_no_html(result.get('title', '')): + self._record_error('HTML in title', repr(result.get('title', ''))) + if not _check_no_html(result.get('content', '')): + self._record_error('HTML in content', repr(result.get('content', ''))) + + self._add_language(result.get('title', '')) + self._add_language(result.get('content', '')) + + template = 
result.get('template', 'default.html') + if template == 'default.html': + return + if template == 'code.html': + return + if template == 'torrent.html': + return + if template == 'map.html': + return + if template == 'images.html': + thumbnail_src = result.get('thumbnail_src') + if thumbnail_src is not None: + if not _is_url_image(thumbnail_src): + self._record_error('thumbnail_src URL is invalid', thumbnail_src) + elif not _is_url_image(result.get('img_src')): + self._record_error('img_src URL is invalid', result.get('img_src')) + if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): + self._record_error('thumbnail URL is invalid', result.get('img_src')) + + def _check_results(self, results: list): + for result in results: + self._check_result(result) + + def _check_answers(self, answers): + for answer in answers: + if not _check_no_html(answer): + self._record_error('HTML in answer', answer) + + def _check_infoboxes(self, infoboxes): + for infobox in infoboxes: + if not _check_no_html(infobox.get('content', '')): + self._record_error('HTML in infobox content', infobox.get('content', '')) + self._add_language(infobox.get('content', '')) + for attribute in infobox.get('attributes', {}): + if not _check_no_html(attribute.get('value', '')): + self._record_error('HTML in infobox attribute value', attribute.get('value', '')) + + def check_basic(self): + if len(self.result_container.unresponsive_engines) > 0: + for message in self.result_container.unresponsive_engines: + self._record_error(message[1] + ' ' + (message[2] or '')) + self.stop_test = True + return + + results = self.result_container.get_ordered_results() + if len(results) > 0: + self._check_results(results) + + if len(self.result_container.answers) > 0: + self._check_answers(self.result_container.answers) + + if len(self.result_container.infoboxes) > 0: + self._check_infoboxes(self.result_container.infoboxes) + + def has_infobox(self): + """Check the ResultContainer has at least one 
infobox""" + if len(self.result_container.infoboxes) == 0: + self._record_error('No infobox') + + def has_answer(self): + """Check the ResultContainer has at least one answer""" + if len(self.result_container.answers) == 0: + self._record_error('No answer') + + def has_language(self, lang): + """Check at least one title or content of the results is written in the `lang`. + + Detected using pycld3, may be not accurate""" + if lang not in self.languages: + self._record_error(lang + ' not found') + + def not_empty(self): + """Check the ResultContainer has at least one answer or infobox or result""" + result_types = set() + results = self.result_container.get_ordered_results() + if len(results) > 0: + result_types.add('results') + + if len(self.result_container.answers) > 0: + result_types.add('answers') + + if len(self.result_container.infoboxes) > 0: + result_types.add('infoboxes') + + if len(result_types) == 0: + self._record_error('No result') + + def one_title_contains(self, title: str): + """Check one of the title contains `title` (case insensitive comparaison)""" + title = title.lower() + for result in self.result_container.get_ordered_results(): + if title in result['title'].lower(): + return + self._record_error(('{!r} not found in the title'.format(title))) + + +class CheckerTests: + + __slots__ = 'test_results', 'test_name', 'result_container_tests_list' + + def __init__(self, + test_results: TestResults, + test_name: str, + result_container_tests_list: typing.List[ResultContainerTests]): + self.test_results = test_results + self.test_name = test_name + self.result_container_tests_list = result_container_tests_list + + def unique_results(self): + """Check the results of each ResultContain is unique""" + urls_list = [rct.result_urls for rct in self.result_container_tests_list] + if len(urls_list[0]) > 0: + # results on the first page + for i, urls_i in enumerate(urls_list): + for j, urls_j in enumerate(urls_list): + if i < j and urls_i == urls_j: + common, 
diff = _search_query_diff(self.result_container_tests_list[i].search_query, + self.result_container_tests_list[j].search_query) + common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()]) + diff1_str = ', ' .join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()]) + diff2_str = ', ' .join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()]) + self.test_results.add_error(self.test_name, + 'results are identitical for {} and {} ({})' + .format(diff1_str, diff2_str, common_str)) + + +class Checker: + + __slots__ = 'processor', 'tests', 'test_results' + + def __init__(self, processor: EngineProcessor): + self.processor = processor + self.tests = self.processor.get_tests() + self.test_results = TestResults() + + @property + def engineref_list(self): + engine_name = self.processor.engine_name + engine_category = self.processor.engine.categories[0] + return [EngineRef(engine_name, engine_category)] + + @staticmethod + def search_query_matrix_iterator(engineref_list, matrix): + p = [] + for name, values in matrix.items(): + if isinstance(values, (tuple, list)): + l = [(name, value) for value in values] + else: + l = [(name, values)] + p.append(l) + + for kwargs in itertools.product(*p): + kwargs = {k: v for k, v in kwargs} + query = kwargs['query'] + params = dict(kwargs) + del params['query'] + yield SearchQuery(query, engineref_list, **params) + + def call_test(self, obj, test_description): + if isinstance(test_description, (tuple, list)): + method, args = test_description[0], test_description[1:] + else: + method = test_description + args = () + if isinstance(method, str) and hasattr(obj, method): + getattr(obj, method)(*args) + elif isinstance(method, types.FunctionType): + method(*args) + else: + self.test_results.add_error(obj.test_name, + 'method {!r} ({}) not found for {}' + .format(method, method.__class__.__name__, obj.__class__.__name__)) + + def call_tests(self, obj, test_descriptions): + for test_description in 
test_descriptions: + self.call_test(obj, test_description) + + def search(self, search_query: SearchQuery) -> ResultContainer: + result_container = ResultContainer() + engineref_category = search_query.engineref_list[0].category + params = self.processor.get_params(search_query, engineref_category) + if params is not None: + self.processor.search(search_query.query, params, result_container, time(), 5) + return result_container + + def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests: + result_container = self.search(search_query) + result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container) + result_container_check.check_basic() + return result_container_check + + def run_test(self, test_name): + test_parameters = self.tests[test_name] + search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix'])) + rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list] + stop_test = False + if 'result_container' in test_parameters: + for rct in rct_list: + stop_test = stop_test or rct.stop_test + if not rct.stop_test: + self.call_tests(rct, test_parameters['result_container']) + if not stop_test: + if 'test' in test_parameters: + checker_tests = CheckerTests(self.test_results, test_name, rct_list) + self.call_tests(checker_tests, test_parameters['test']) + + def run(self): + for test_name in self.tests: + self.run_test(test_name) diff --git a/searx/search/models.py b/searx/search/models.py new file mode 100644 index 000000000..80ceaa223 --- /dev/null +++ b/searx/search/models.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import typing + + +class EngineRef: + + __slots__ = 'name', 'category' + + def __init__(self, name: str, category: str): + self.name = name + self.category = category + + def __repr__(self): + return "EngineRef({!r}, {!r})".format(self.name, 
self.category) + + def __eq__(self, other): + return self.name == other.name and self.category == other.category + + def __hash__(self): + return hash((self.name, self.category)) + + +class SearchQuery: + """container for all the search parameters (query, language, etc...)""" + + __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ + 'timeout_limit', 'external_bang' + + def __init__(self, + query: str, + engineref_list: typing.List[EngineRef], + lang: str='all', + safesearch: int=0, + pageno: int=1, + time_range: typing.Optional[str]=None, + timeout_limit: typing.Optional[float]=None, + external_bang: typing.Optional[str]=None): + self.query = query + self.engineref_list = engineref_list + self.lang = lang + self.safesearch = safesearch + self.pageno = pageno + self.time_range = time_range + self.timeout_limit = timeout_limit + self.external_bang = external_bang + + @property + def categories(self): + return list(set(map(lambda engineref: engineref.category, self.engineref_list))) + + def __repr__(self): + return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ + format(self.query, self.engineref_list, self.lang, self.safesearch, + self.pageno, self.time_range, self.timeout_limit, self.external_bang) + + def __eq__(self, other): + return self.query == other.query\ + and self.engineref_list == other.engineref_list\ + and self.lang == other.lang\ + and self.safesearch == other.safesearch\ + and self.pageno == other.pageno\ + and self.time_range == other.time_range\ + and self.timeout_limit == other.timeout_limit\ + and self.external_bang == other.external_bang + + def __hash__(self): + return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, + self.timeout_limit, self.external_bang)) diff --git a/searx/search/processors/abstract.py b/searx/search/processors/abstract.py index cf3fd7236..eb8d296ec 100644 --- a/searx/search/processors/abstract.py +++ 
b/searx/search/processors/abstract.py @@ -37,3 +37,15 @@ class EngineProcessor: @abstractmethod def search(self, query, params, result_container, start_time, timeout_limit): pass + + def get_tests(self): + tests = getattr(self.engine, 'tests', None) + if tests is None: + tests = getattr(self.engine, 'additional_tests', {}) + tests.update(self.get_default_tests()) + return tests + else: + return tests + + def get_default_tests(self): + return {} diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index b62f8059e..0ceb0adf2 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -179,15 +179,15 @@ class OnlineProcessor(EngineProcessor): requests_exception = True elif (issubclass(e.__class__, SearxEngineCaptchaException)): result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required') - logger.exception('engine {0} : CAPTCHA') + logger.exception('engine {0} : CAPTCHA'.format(self.engine_name)) suspended_time = e.suspended_time # pylint: disable=no-member elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)): result_container.add_unresponsive_engine(self.engine_name, 'too many requests') - logger.exception('engine {0} : Too many requests') + logger.exception('engine {0} : Too many requests'.format(self.engine_name)) suspended_time = e.suspended_time # pylint: disable=no-member elif (issubclass(e.__class__, SearxEngineAccessDeniedException)): result_container.add_unresponsive_engine(self.engine_name, 'blocked') - logger.exception('engine {0} : Searx is blocked') + logger.exception('engine {0} : Searx is blocked'.format(self.engine_name)) suspended_time = e.suspended_time # pylint: disable=no-member else: result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash') @@ -211,3 +211,49 @@ class OnlineProcessor(EngineProcessor): # reset the suspend variables self.engine.continuous_errors = 0 self.engine.suspend_end_time = 0 + + def get_default_tests(self): + tests 
= {} + + tests['simple'] = { + 'matrix': {'query': ('life', 'computer')}, + 'result_container': ['not_empty'], + } + + if getattr(self.engine, 'paging', False): + tests['paging'] = { + 'matrix': {'query': 'time', + 'pageno': (1, 2, 3)}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + if 'general' in self.engine.categories: + # avoid documentation about HTML tags (