From 488ace1da9f2d9328316abad78b354219a57be90 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Thu, 25 Nov 2021 19:38:14 +0100
Subject: [PATCH 1/2] [fix] google engine - suggestion

BTW: google no longer offers *spelling suggestions*

Closes: https://github.com/searxng/searxng/issues/442

Signed-off-by: Markus Heiser
---
 searx/engines/google.py        | 11 +----------
 searx/engines/google_videos.py |  4 ----
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/searx/engines/google.py b/searx/engines/google.py
index 4e6fa6190..578dec60c 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -138,12 +138,7 @@ content_xpath = './/div[@class="IsZvec"]'
 
 # Suggestions are links placed in a *card-section*, we extract only the text
 # from the links not the links itself.
-suggestion_xpath = '//div[contains(@class, "card-section")]//a'
-
-# Since google does *auto-correction* on the first query these are not really
-# *spelling suggestions*, we use them anyway.
-spelling_suggestion_xpath = '//div[@class="med"]/p/a'
-
+suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
 
 def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
     """Composing various language properties for the google engines.
@@ -322,7 +317,6 @@ def response(resp):
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
-
     # results --> answer
     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
     if answer_list:
@@ -379,9 +373,6 @@ def response(resp):
             # append suggestion
             results.append({'suggestion': extract_text(suggestion)})
 
-    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
-        results.append({'correction': extract_text(correction)})
-
     # return results
     return results
 
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 9403ef4f7..86dc1867d 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -37,7 +37,6 @@ from searx.engines.google import (
     href_xpath,
     content_xpath,
     suggestion_xpath,
-    spelling_suggestion_xpath,
     detect_google_sorry,
 )
 
@@ -186,7 +185,4 @@ def response(resp):
             # append suggestion
             results.append({'suggestion': extract_text(suggestion)})
 
-    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
-        results.append({'correction': extract_text(correction)})
-
     return results

From 1ce09df9aa4d08d2125dca8f83906c5954048d0a Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Fri, 26 Nov 2021 01:14:17 +0100
Subject: [PATCH 2/2] [fix] google video engine - rework of the HTML parser

The google video response has been changed slightly; a rework of the
parser was needed.

Signed-off-by: Markus Heiser
---
 searx/engines/google_videos.py | 53 ++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 86dc1867d..abf046f4c 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -31,11 +31,8 @@ from searx.engines.google import (
     get_lang_info,
     time_range_dict,
     filter_mapping,
-    results_xpath,
     g_section_with_header,
     title_xpath,
-    href_xpath,
-    content_xpath,
     suggestion_xpath,
     detect_google_sorry,
 )
@@ -73,11 +70,27 @@ def _re(regexpr):
     RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
     return RE_CACHE[regexpr]
 
+
+def scrap_out_thumbs_src(dom):
+    ret_val = {}
+    thumb_name = 'dimg_'
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
+        _script = script.text
+        # "dimg_35":"https://i.ytimg.c....",
+        _dimurl = _re("s='([^']*)").findall( _script)
+        for k,v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)' ).findall(_script):
+            v = v.replace(r'\u003d','=')
+            v = v.replace(r'\u0026','&')
+            ret_val[k] = v
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val
+
+
 def scrap_out_thumbs(dom):
     """Scrap out thumbnail data from
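
For context, the new scrap_out_thumbs_src() helper pulls thumbnail URLs out of
the inline "google.ldi={...}" script that Google embeds in the video results
page. Below is a minimal standalone sketch of that extraction, not part of the
patch; the sample_script value is invented for illustration, and only the
regular expression and the \u003d/\u0026 un-escaping mirror the patch.

# Standalone sketch (not part of the patch): how the thumbnail extraction in
# scrap_out_thumbs_src() works.  The sample_script value is invented; real
# pages embed a much larger "google.ldi={...}" object.
import re

sample_script = (
    'google.ldi={"dimg_35":"https://i.ytimg.com/vi/abc/default.jpg'
    '\\u003dxyz\\u0026t\\u003d1"};'
)

thumbs = {}
# same pattern as the patch: capture the element id (dimg_<N>) and the URL
for key, url in re.findall(r'(dimg_[0-9]*)":"(http[^"]*)', sample_script):
    # the inline script escapes '=' and '&'; undo that escaping
    url = url.replace(r'\u003d', '=').replace(r'\u0026', '&')
    thumbs[key] = url

print(thumbs)
# -> {'dimg_35': 'https://i.ytimg.com/vi/abc/default.jpg=xyz&t=1'}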