Mirror of https://github.com/searxng/searxng.git, synced 2024-11-10 15:10:10 +01:00

Compare commits

2 Commits

Author          SHA1         Message                                Date
Austin-Olacsi   73de563bac   parse image results without re (WIP)   2024-06-04 02:25:49 -06:00
Austin-Olacsi   6bfb101e6e   initial xpath work (WIP)               2024-06-04 02:21:28 -06:00


@@ -5,8 +5,10 @@
 import re
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
+from searx.utils import humanize_bytes
 from html import unescape
 from searx import logger
+from searx import utils
 from searx.exceptions import SearxEngineCaptchaException
 from datetime import datetime
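
The newly imported humanize_bytes helper is used further down to render image file sizes. A minimal sketch of the intended usage; the exact output format is an assumption about searx.utils.humanize_bytes (roughly an IEC-style size string):

    from searx.utils import humanize_bytes

    # convert raw byte counts into reader-friendly strings; the exact
    # formatting ('1.95 KiB' vs '1.95 kB') is an assumption here
    print(humanize_bytes(2000))             # e.g. '1.95 KiB'
    print(humanize_bytes(5 * 1024 * 1024))  # e.g. '5.00 MiB'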
@@ -31,6 +33,10 @@ base_url_videos = 'https://yandex.com/video/search'
 url_extension = 'tmpl_version=releases%2Ffrontend%2Fvideo%2Fv1.1168.0%238d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63'
+url_xpath = "//a[@class='b-serp-item__title-link']/@href"
+title_xpath = "//h3[@class='b-serp-item__title']/a[@class='b-serp-item__title-link']/span"
+content_xpath = "//div[@class='b-serp-item__content']//div[@class='b-serp-item__text']"
 images_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"preview__isWallpaper","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2},{"block":"advanced-search-block","params":{},"version":2}],"metadata":{"bundles":{"lb":"AS?(E<X120"},"assets":{"las":"justifier-height=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;"},"extraContent":{"names":["i-react-ajax-adapter"]}}}'
 videos_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"vital-incut","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2}],"metadata":{"bundles":{"lb":"^G]!q<X120"},"assets":{"las":"react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}'
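
The three new XPath constants select, per result item, the link href, the title span, and the snippet text block. A sketch of how they resolve with lxml against a simplified stand-in for Yandex's SERP markup (the HTML below is illustrative, not a captured response):

    from lxml import html

    # simplified stand-in for one Yandex web result
    fragment = html.fromstring(
        "<ul><li class='serp-item'>"
        "<h3 class='b-serp-item__title'>"
        "<a class='b-serp-item__title-link' href='https://example.org'>"
        "<span>Example title</span></a></h3>"
        "<div class='b-serp-item__content'>"
        "<div class='b-serp-item__text'>Example snippet.</div></div>"
        "</li></ul>"
    )

    print(fragment.xpath("//a[@class='b-serp-item__title-link']/@href"))
    # ['https://example.org']
    print(fragment.xpath("//h3[@class='b-serp-item__title']"
                         "/a[@class='b-serp-item__title-link']/span")[0].text)
    # 'Example title'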
@@ -103,27 +109,29 @@ def get_youtube_iframe_src(url):
     if video_id:
         return 'https://www.youtube-nocookie.com/embed/' + video_id[0]


 def response(resp):
     if yandex_category == 'web':
         if (resp.url).path.startswith('/showcaptcha'):
             raise SearxEngineCaptchaException()

-        html_data = html.fromstring(resp.text)
-        text = html.tostring(html_data, encoding='unicode')
-
-        urls = re.findall(r'title-link" href="(.*?)" target="_blank"', text)
-        titles = re.findall(r'tabindex="4"><span>(.*?)</span></a></h3>', text)
-        contents = re.findall(r'"b-serp-item__text">(.*?)</div>', text)
-
-        results = [
-            {
-                "url": url,
-                "title": title,
-                "content": content,
-            }
-            for url, title, content in zip(urls, titles, contents)
-        ]
+        dom = html.fromstring(resp.text)
+        results_dom = dom.xpath('//li[contains(@class, "serp-item")]')
+
+        results = []
+        for result_dom in results_dom:
+            urls = result_dom.xpath(url_xpath)
+            titles = result_dom.xpath(title_xpath)
+            contents = result_dom.xpath(content_xpath)
+
+            title_texts = [title.xpath("normalize-space(.)") for title in titles]
+            content_texts = [content.xpath("normalize-space(.)") for content in contents]
+
+            for url, title_text, content_text in zip(urls, title_texts, content_texts):
+                results.append({
+                    "url": url,
+                    "title": title_text,
+                    "content": content_text,
+                })

         return results
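
The rewrite trades brittle regexes over serialized HTML for per-item XPath plus normalize-space(), which joins nested text nodes and collapses whitespace. A self-contained sketch of that behavior:

    from lxml import html

    span = html.fromstring("<span>  An   <b>inner</b>\n  title </span>")
    # normalize-space(.) concatenates child text nodes and collapses
    # runs of whitespace into single spaces, trimming the ends
    print(span.xpath("normalize-space(.)"))  # 'An inner title'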
@@ -132,21 +140,49 @@ def response(resp):
             raise SearxEngineCaptchaException()

         html_data = html.fromstring(resp.text)
-        text = unescape(html.tostring(html_data, encoding='unicode'))
-
-        urls = re.findall(r'"img_href":"(.*?)"', text)
-        titles = re.findall(r'"alt":"(.*?)"', text)
-
-        results = [
-            {
-                "title": title,
-                "url": url,
-                "img_src": url,
-                "thumbnail_src": url,
-                "template": "images.html",
-            }
-            for url, title in zip(urls, titles)
-        ]
+        html_sample = unescape(html.tostring(html_data, encoding='unicode'))
+
+        start_tag = 'data-state="'
+        end_tag = '"advRsyaSearchColumn":null}}"'
+
+        start_pos = html_sample.find(start_tag)
+        start_pos += len(start_tag)
+        end_pos = html_sample.find(end_tag, start_pos)
+        end_pos += len(end_tag) - 1
+
+        content_between_tags = html_sample[start_pos:end_pos]
+
+        json_resp = utils.js_variable_to_python(content_between_tags)
+
+        # save to a file
+        #with open('/path/to/yandexdump.txt', 'w') as f:
+        #sys.stdout = f
+        #print(json_resp)
+
+        results = []
+        for item_id, item_data in json_resp['initialState']['serpList']['items']['entities'].items():
+            title = item_data['snippet']['title']
+            source = item_data['snippet']['url']
+            thumb = item_data['image']
+            # fullsize_image = item_data['origUrl']
+            fullsize_image = item_data['viewerData']['dups'][0]['url']
+            # height = item_data['height']
+            # width = item_data['width']
+            height = item_data['viewerData']['dups'][0]['h']
+            width = item_data['viewerData']['dups'][0]['w']
+            filesize = item_data['viewerData']['dups'][0]['fileSizeInBytes']
+            humanized_filesize = humanize_bytes(filesize)
+
+            results.append({
+                "title": title,
+                "url": source,
+                "img_src": fullsize_image,
+                "filesize": humanized_filesize,
+                "thumbnail_src": thumb,
+                "template": "images.html",
+                "resolution": f'{width} x {height}'
+            })

         return results
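
The images branch now carves the serialized data-state attribute out of the page text by string position and hands it to searx.utils.js_variable_to_python. A reduced sketch of the same find/slice/parse pattern on a toy payload, using json.loads as a stand-in parser (the markup and field names below are illustrative, not the real Yandex schema):

    import json

    # toy stand-in for the unescaped page text; real pages embed a much
    # larger object in the data-state attribute
    html_sample = ('<div data-state="{"items": '
                   '[{"url": "https://example.org/a.jpg"}]}"></div>')

    start_tag = 'data-state="'
    end_tag = ']}"'

    start_pos = html_sample.find(start_tag) + len(start_tag)
    end_pos = html_sample.find(end_tag, start_pos) + len(end_tag) - 1

    state = json.loads(html_sample[start_pos:end_pos])
    print(state['items'][0]['url'])  # https://example.org/a.jpg

As in the commit, end_pos keeps everything up to the end tag's closing braces but drops the trailing quote, so the slice is exactly the serialized object.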