1
0
mirror of https://github.com/searxng/searxng.git synced 2024-09-21 04:20:42 +02:00

Try to fix image parsing that sometimes breaks

This commit is contained in:
Austin-Olacsi 2024-06-05 17:40:51 -06:00 committed by GitHub
parent 73de563bac
commit c04133e2e2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -3,6 +3,7 @@
"""Yandex (Web, images, videos)"""
import re
import sys
from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html
from searx.utils import humanize_bytes
@@ -142,23 +143,25 @@ def response(resp):
html_data = html.fromstring(resp.text)
html_sample = unescape(html.tostring(html_data, encoding='unicode'))
start_tag = 'data-state="'
end_tag = '"advRsyaSearchColumn":null}}"'
start_tag = '{"location":"/images/search/'
end_tag = 'advRsyaSearchColumn":null}}'
start_pos = html_sample.find(start_tag)
start_pos += len(start_tag)
start_index = html_sample.find(start_tag)
start_index = start_index if start_index != -1 else -1
end_pos = html_sample.find(end_tag, start_pos)
end_pos += len(end_tag) - 1
end_index = html_sample.find(end_tag, start_index)
end_index = end_index + len(end_tag) if end_index != -1 else -1
content_between_tags = html_sample[start_index:end_index] if start_index != -1 and end_index != -1 else None
# # save to a file
# with open('/path/to/output.txt', 'w') as f:
# sys.stdout = f
# print(selected_text)
content_between_tags = html_sample[start_pos:end_pos]
json_resp = utils.js_variable_to_python(content_between_tags)
# save to a file
#with open('/path/to/yandexdump.txt', 'w') as f:
#sys.stdout = f
#print(json_resp)
results = []
for item_id, item_data in json_resp['initialState']['serpList']['items']['entities'].items():