1
0
mirror of https://github.com/searxng/searxng.git synced 2024-11-10 15:10:10 +01:00

Parse image results without the `re` module (WIP)

This commit is contained in:
Austin-Olacsi 2024-06-04 02:25:49 -06:00 committed by GitHub
parent 6bfb101e6e
commit 73de563bac
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -5,6 +5,7 @@
import re import re
from urllib.parse import urlencode, urlparse, parse_qs from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html from lxml import html
from searx.utils import humanize_bytes
from html import unescape from html import unescape
from searx import logger from searx import logger
from searx import utils from searx import utils
@@ -139,21 +140,49 @@ def response(resp):
raise SearxEngineCaptchaException() raise SearxEngineCaptchaException()
html_data = html.fromstring(resp.text) html_data = html.fromstring(resp.text)
text = unescape(html.tostring(html_data, encoding='unicode')) html_sample = unescape(html.tostring(html_data, encoding='unicode'))
urls = re.findall(r'"img_href":"(.*?)"', text) start_tag = 'data-state="'
titles = re.findall(r'"alt":"(.*?)"', text) end_tag = '"advRsyaSearchColumn":null}}"'
results = [ start_pos = html_sample.find(start_tag)
{ start_pos += len(start_tag)
end_pos = html_sample.find(end_tag, start_pos)
end_pos += len(end_tag) - 1
content_between_tags = html_sample[start_pos:end_pos]
json_resp = utils.js_variable_to_python(content_between_tags)
# save to a file
#with open('/path/to/yandexdump.txt', 'w') as f:
#sys.stdout = f
#print(json_resp)
results = []
for item_id, item_data in json_resp['initialState']['serpList']['items']['entities'].items():
title = item_data['snippet']['title']
source = item_data['snippet']['url']
thumb = item_data['image']
# fullsize_image = item_data['origUrl']
fullsize_image = item_data['viewerData']['dups'][0]['url']
# height = item_data['height']
# width = item_data['width']
height = item_data['viewerData']['dups'][0]['h']
width = item_data['viewerData']['dups'][0]['w']
filesize = item_data['viewerData']['dups'][0]['fileSizeInBytes']
humanized_filesize = humanize_bytes(filesize)
results.append({
"title": title, "title": title,
"url": url, "url": source,
"img_src": url, "img_src": fullsize_image,
"thumbnail_src": url, "filesize": humanized_filesize,
"thumbnail_src": thumb,
"template": "images.html", "template": "images.html",
} "resolution": f'{width} x {height}'
for url, title in zip(urls, titles) })
]
return results return results