From 6bfb101e6e85e116cef1489e93f2bf761ade283e Mon Sep 17 00:00:00 2001 From: Austin-Olacsi <138650713+Austin-Olacsi@users.noreply.github.com> Date: Tue, 4 Jun 2024 02:21:28 -0600 Subject: [PATCH] initial xpath work (WIP) --- searx/engines/yandex.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index b3363ef2b..9c02c8eb1 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -7,6 +7,7 @@ from urllib.parse import urlencode, urlparse, parse_qs from lxml import html from html import unescape from searx import logger +from searx import utils from searx.exceptions import SearxEngineCaptchaException from datetime import datetime @@ -31,6 +32,10 @@ base_url_videos = 'https://yandex.com/video/search' url_extension = 'tmpl_version=releases%2Ffrontend%2Fvideo%2Fv1.1168.0%238d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63' +url_xpath = "//a[@class='b-serp-item__title-link']/@href" +title_xpath = "//h3[@class='b-serp-item__title']/a[@class='b-serp-item__title-link']/span" +content_xpath = "//div[@class='b-serp-item__content']//div[@class='b-serp-item__text']" + images_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"preview__isWallpaper","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2},{"block":"advanced-search-block","params":{},"version":2}],"metadata":{"bundles":{"lb":"AS?(E(.*?)', text) - contents = re.findall(r'"b-serp-item__text">(.*?)', text) + results = [] + for result_dom in results_dom: + urls = result_dom.xpath(url_xpath) + titles = result_dom.xpath(title_xpath) + contents = result_dom.xpath(content_xpath) - results = [ - { - "url": url, - "title": title, - "content": content, - } - for url, title, content in zip(urls, titles, contents) - ] + title_texts = [title.xpath("normalize-space(.)") for title in titles] + content_texts = [content.xpath("normalize-space(.)") for content in contents] + + for url, title_text, content_text in zip(urls, title_texts, content_texts): + results.append({ + "url": url, + "title": title_text, + "content": content_text, + }) return results