From 0fa81fc782feb104bbd6616d87a6f441aad7d5bd Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Thu, 23 May 2024 23:21:58 +0000 Subject: [PATCH] [enh] add re-usable func to filter text --- searx/engines/brave.py | 7 ++----- searx/engines/duckduckgo.py | 17 +++++++---------- searx/engines/qwant.py | 7 +++---- searx/engines/vimeo.py | 9 +++++---- searx/engines/youtube_noapi.py | 6 ++++-- searx/utils.py | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 53 insertions(+), 25 deletions(-) diff --git a/searx/engines/brave.py b/searx/engines/brave.py index 04c2931f9..c5780a02c 100644 --- a/searx/engines/brave.py +++ b/searx/engines/brave.py @@ -132,6 +132,7 @@ from lxml import html from searx import locales from searx.utils import ( extract_text, + extr, eval_xpath, eval_xpath_list, eval_xpath_getindex, @@ -252,11 +253,7 @@ def response(resp): if brave_category in ('search', 'goggles'): return _parse_search(resp) - datastr = "" - for line in resp.text.split("\n"): - if "const data = " in line: - datastr = line.replace("const data = ", "").strip()[:-1] - break + datastr = extr(resp.text, "const data = ", ";\n").strip() json_data = js_variable_to_python(datastr) json_resp = json_data[1]['data']['body']['response'] diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index b874ca2f8..fced014c1 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -392,7 +392,9 @@ def fetch_traits(engine_traits: EngineTraits): SearXNG's locale. 
""" - # pylint: disable=too-many-branches, too-many-statements + # pylint: disable=too-many-branches, too-many-statements, disable=import-outside-toplevel + from searx.utils import extr, js_variable_to_python + # fetch regions engine_traits.all_locale = 'wt-wt' @@ -403,11 +405,9 @@ def fetch_traits(engine_traits: EngineTraits): if not resp.ok: # type: ignore print("ERROR: response from DuckDuckGo is not OK.") - pos = resp.text.find('regions:{') + 8 # type: ignore - js_code = resp.text[pos:] # type: ignore - pos = js_code.find('}') + 1 - regions = json.loads(js_code[:pos]) + js_code = extr(resp.text, 'regions:', ',snippetLengths') + regions = json.loads(js_code) for eng_tag, name in regions.items(): if eng_tag == 'wt-wt': @@ -439,12 +439,9 @@ def fetch_traits(engine_traits: EngineTraits): engine_traits.custom['lang_region'] = {} - pos = resp.text.find('languages:{') + 10 # type: ignore - js_code = resp.text[pos:] # type: ignore - pos = js_code.find('}') + 1 - js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') - languages = json.loads(js_code) + js_code = extr(resp.text, 'languages:', ',regions') + languages = js_variable_to_python(js_code) for eng_lang, name in languages.items(): if eng_lang == 'wt_WT': diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index facd47bb9..989fe1445 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -312,13 +312,12 @@ def fetch_traits(engine_traits: EngineTraits): # pylint: disable=import-outside-toplevel from searx import network from searx.locales import region_tag + from searx.utils import extr resp = network.get(about['website']) - text = resp.text - text = text[text.find('INITIAL_PROPS') :] - text = text[text.find('{') : text.find('')] + json_string = extr(resp.text, 'INITIAL_PROPS = ', '') - q_initial_props = loads(text) + q_initial_props = loads(json_string) q_locales = q_initial_props.get('locales') eng_tag_list = set() diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py 
index 2449345e6..d46468d8d 100644 --- a/searx/engines/vimeo.py +++ b/searx/engines/vimeo.py @@ -7,6 +7,8 @@ from urllib.parse import urlencode from json import loads from dateutil import parser +from searx.utils import extr + # about about = { "website": 'https://vimeo.com/', @@ -23,7 +25,7 @@ paging = True # search-url base_url = 'https://vimeo.com/' -search_url = base_url + '/search/page:{pageno}?{query}' +search_url = base_url + 'search/page:{pageno}?{query}' # do search-request @@ -36,9 +38,8 @@ def request(query, params): # get response from search-request def response(resp): results = [] - data_start_pos = resp.text.find('{"filtered"') - data_end_pos = resp.text.find(';\n', data_start_pos + 1) - data = loads(resp.text[data_start_pos:data_end_pos]) + + data = loads(extr(resp.text, 'var data = ', ';\n')) # parse results for result in data['filtered']['data']: diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index b015dff8d..c2136c3ca 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -7,6 +7,8 @@ from functools import reduce from json import loads, dumps from urllib.parse import quote_plus +from searx.utils import extr + # about about = { "website": 'https://www.youtube.com/', @@ -109,8 +111,8 @@ def parse_next_page_response(response_text): def parse_first_page_response(response_text): results = [] - results_data = response_text[response_text.find('ytInitialData') :] - results_data = results_data[results_data.find('{') : results_data.find(';')] + results_data = extr(response_text, 'ytInitialData = ', ';') + results_json = loads(results_data) if results_data else {} sections = ( results_json.get('contents', {}) diff --git a/searx/utils.py b/searx/utils.py index f50618ea2..58ff72bb9 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -2,6 +2,9 @@ """Utility functions for the engines """ + +from __future__ import annotations + import re import importlib import importlib.util @@ -371,6 +374,35 @@ def 
convert_str_to_int(number_str: str) -> int: return 0 +def extr(txt: str, begin: str, end: str, default: str = ""): + """Extract the string between ``begin`` and ``end`` from ``txt`` + + :param txt: String to search in + :param begin: First string to be searched for + :param end: Second string to be searched for after ``begin`` + :param default: Default value if one of ``begin`` or ``end`` is not + found. Defaults to an empty string. + :return: The string between the two search-strings ``begin`` and ``end``. + If at least one of ``begin`` or ``end`` is not found, the value of + ``default`` is returned. + + Examples: + >>> extr("abcde", "a", "e") + 'bcd' + >>> extr("abcde", "a", "z", default="nothing") + 'nothing' + + """ + + # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129 + + try: + first = txt.index(begin) + len(begin) + return txt[first : txt.index(end, first)] + except ValueError: + return default + + def int_or_zero(num: Union[List[str], str]) -> int: """Convert num to int or 0. num can be either a str or a list. If num is a list, the first element is converted to int (or return 0 if the list is empty).