2024-09-21 04:20:42 +02:00
1 changed file with 25 additions and 61 deletions
--- a/searx/engines/yandex.py
+++ b/searx/engines/yandex.py
@@ -5,10 +5,8 @@
 import re
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
-from searx.utils import humanize_bytes
 from html import unescape
 from searx import logger
-from searx import utils
 from searx.exceptions import SearxEngineCaptchaException
 from datetime import datetime

@@ -33,10 +31,6 @@ base_url_videos = 'https://yandex.com/video/search'

 url_extension = 'tmpl_version=releases%2Ffrontend%2Fvideo%2Fv1.1168.0%238d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63'

-url_xpath = "//a[@class='b-serp-item__title-link']/@href"
-title_xpath = "//h3[@class='b-serp-item__title']/a[@class='b-serp-item__title-link']/span"
-content_xpath = "//div[@class='b-serp-item__content']//div[@class='b-serp-item__text']"
-
 images_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"preview__isWallpaper","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2},{"block":"advanced-search-block","params":{},"version":2}],"metadata":{"bundles":{"lb":"AS?(E<X120"},"assets":{"las":"justifier-height=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;"},"extraContent":{"names":["i-react-ajax-adapter"]}}}'

 videos_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"vital-incut","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2}],"metadata":{"bundles":{"lb":"^G]!q<X120"},"assets":{"las":"react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}'
@@ -109,29 +103,27 @@ def get_youtube_iframe_src(url):
        if video_id:
            return 'https://www.youtube-nocookie.com/embed/' + video_id[0]

+
 def response(resp):
    if yandex_category == 'web':
        if (resp.url).path.startswith('/showcaptcha'):
            raise SearxEngineCaptchaException()

-        dom = html.fromstring(resp.text)
-        results_dom = dom.xpath('//li[contains(@class, "serp-item")]')
+        html_data = html.fromstring(resp.text)
+        text = html.tostring(html_data, encoding='unicode')

-        results = []
-        for result_dom in results_dom:
-            urls = result_dom.xpath(url_xpath)
-            titles = result_dom.xpath(title_xpath)
-            contents = result_dom.xpath(content_xpath)
+        urls = re.findall(r'title-link" href="(.*?)" target="_blank"', text)
+        titles = re.findall(r'tabindex="4"><span>(.*?)</span></a></h3>', text)
+        contents = re.findall(r'"b-serp-item__text">(.*?)</div>', text)

-            title_texts = [title.xpath("normalize-space(.)") for title in titles]
-            content_texts = [content.xpath("normalize-space(.)") for content in contents]
-
-            for url, title_text, content_text in zip(urls, title_texts, content_texts):
-                results.append({
+        results = [
+            {
                "url": url,
-                    "title": title_text,
-                    "content": content_text,
-                })
+                "title": title,
+                "content": content,
+            }
+            for url, title, content in zip(urls, titles, contents)
+        ]

        return results

@@ -140,49 +132,21 @@ def response(resp):
            raise SearxEngineCaptchaException()

        html_data = html.fromstring(resp.text)
-        html_sample = unescape(html.tostring(html_data, encoding='unicode'))
+        text = unescape(html.tostring(html_data, encoding='unicode'))

-        start_tag = 'data-state="'
-        end_tag = '"advRsyaSearchColumn":null}}"'
+        urls = re.findall(r'"img_href":"(.*?)"', text)
+        titles = re.findall(r'"alt":"(.*?)"', text)

-        start_pos = html_sample.find(start_tag)
-        start_pos += len(start_tag)
-
-        end_pos = html_sample.find(end_tag, start_pos)
-        end_pos += len(end_tag) - 1
-
-        content_between_tags = html_sample[start_pos:end_pos]
-
-        json_resp = utils.js_variable_to_python(content_between_tags)
-
-        # save to a file
-        #with open('/path/to/yandexdump.txt', 'w') as f:
-        #sys.stdout = f
-        #print(json_resp)
-
-        results = []
-        for item_id, item_data in json_resp['initialState']['serpList']['items']['entities'].items():
-            title = item_data['snippet']['title']
-            source = item_data['snippet']['url']
-            thumb = item_data['image']
-#            fullsize_image = item_data['origUrl']
-            fullsize_image = item_data['viewerData']['dups'][0]['url']
-#            height = item_data['height']
-#            width = item_data['width']
-            height = item_data['viewerData']['dups'][0]['h']
-            width = item_data['viewerData']['dups'][0]['w']
-            filesize = item_data['viewerData']['dups'][0]['fileSizeInBytes']
-            humanized_filesize = humanize_bytes(filesize)
-
-            results.append({
+        results = [
+            {
                "title": title,
-                "url": source,
-                "img_src": fullsize_image,
-                "filesize": humanized_filesize,
-                "thumbnail_src": thumb,
+                "url": url,
+                "img_src": url,
+                "thumbnail_src": url,
                "template": "images.html",
-                "resolution": f'{width} x {height}'
-            })
+            }
+            for url, title in zip(urls, titles)
+        ]

        return results