1
0
mirror of https://github.com/searxng/searxng.git synced 2024-09-21 04:20:42 +02:00

initial xpath work (WIP)

This commit is contained in:
Austin-Olacsi 2024-06-04 02:21:28 -06:00 committed by GitHub
parent 21b0926c50
commit 6bfb101e6e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -7,6 +7,7 @@ from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html from lxml import html
from html import unescape from html import unescape
from searx import logger from searx import logger
from searx import utils
from searx.exceptions import SearxEngineCaptchaException from searx.exceptions import SearxEngineCaptchaException
from datetime import datetime from datetime import datetime
@ -31,6 +32,10 @@ base_url_videos = 'https://yandex.com/video/search'
url_extension = 'tmpl_version=releases%2Ffrontend%2Fvideo%2Fv1.1168.0%238d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63' url_extension = 'tmpl_version=releases%2Ffrontend%2Fvideo%2Fv1.1168.0%238d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63'
url_xpath = "//a[@class='b-serp-item__title-link']/@href"
title_xpath = "//h3[@class='b-serp-item__title']/a[@class='b-serp-item__title-link']/span"
content_xpath = "//div[@class='b-serp-item__content']//div[@class='b-serp-item__text']"
images_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"preview__isWallpaper","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2},{"block":"advanced-search-block","params":{},"version":2}],"metadata":{"bundles":{"lb":"AS?(E<X120"},"assets":{"las":"justifier-height=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;"},"extraContent":{"names":["i-react-ajax-adapter"]}}}' images_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"preview__isWallpaper","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2},{"block":"advanced-search-block","params":{},"version":2}],"metadata":{"bundles":{"lb":"AS?(E<X120"},"assets":{"las":"justifier-height=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;"},"extraContent":{"names":["i-react-ajax-adapter"]}}}'
videos_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"vital-incut","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2}],"metadata":{"bundles":{"lb":"^G]!q<X120"},"assets":{"las":"react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}' videos_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"vital-incut","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2}],"metadata":{"bundles":{"lb":"^G]!q<X120"},"assets":{"las":"react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}'
@ -103,27 +108,29 @@ def get_youtube_iframe_src(url):
if video_id: if video_id:
return 'https://www.youtube-nocookie.com/embed/' + video_id[0] return 'https://www.youtube-nocookie.com/embed/' + video_id[0]
def response(resp): def response(resp):
if yandex_category == 'web': if yandex_category == 'web':
if (resp.url).path.startswith('/showcaptcha'): if (resp.url).path.startswith('/showcaptcha'):
raise SearxEngineCaptchaException() raise SearxEngineCaptchaException()
html_data = html.fromstring(resp.text) dom = html.fromstring(resp.text)
text = html.tostring(html_data, encoding='unicode') results_dom = dom.xpath('//li[contains(@class, "serp-item")]')
urls = re.findall(r'title-link" href="(.*?)" target="_blank"', text) results = []
titles = re.findall(r'tabindex="4"><span>(.*?)</span></a></h3>', text) for result_dom in results_dom:
contents = re.findall(r'"b-serp-item__text">(.*?)</div>', text) urls = result_dom.xpath(url_xpath)
titles = result_dom.xpath(title_xpath)
contents = result_dom.xpath(content_xpath)
results = [ title_texts = [title.xpath("normalize-space(.)") for title in titles]
{ content_texts = [content.xpath("normalize-space(.)") for content in contents]
for url, title_text, content_text in zip(urls, title_texts, content_texts):
results.append({
"url": url, "url": url,
"title": title, "title": title_text,
"content": content, "content": content_text,
} })
for url, title, content in zip(urls, titles, contents)
]
return results return results