From 3f86f9280d408643a53460946d5840d9e1a049dd Mon Sep 17 00:00:00 2001
From: Hackurei <138650713+Hackurei@users.noreply.github.com>
Date: Fri, 9 Feb 2024 17:38:24 -0700
Subject: [PATCH] draft: yandex engine (web, images, videos)

---
 searx/engines/yandex.py | 200 ++++++++++++++++++++++++++++++++++++++++
 searx/settings.yml      |  18 ++++
 2 files changed, 218 insertions(+)
 create mode 100644 searx/engines/yandex.py

diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
new file mode 100644
index 000000000..b3363ef2b
--- /dev/null
+++ b/searx/engines/yandex.py
@@ -0,0 +1,200 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Yandex (Web, images, videos)"""
+
+import re
+from urllib.parse import urlencode, urlparse, parse_qs
+from lxml import html
+from html import unescape
+from searx import logger
+from searx.exceptions import SearxEngineCaptchaException
+from datetime import datetime
+
+# about
+about = {
+    "website": 'https://yandex.com/',
+    "wikidata_id": 'Q5281',
+    "official_api_documentation": "?",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'HTML',
+}
+
+# engine dependent config
+categories = ['general', 'images', 'videos']
+paging = True
+
+# search-url
+base_url_web = 'https://yandex.com/search/site/'
+base_url_images = 'https://yandex.com/images/search'
+base_url_videos = 'https://yandex.com/video/search'
+
+url_extension = 'tmpl_version=releases%2Ffrontend%2Fvideo%2Fv1.1168.0%238d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63'
+
+images_request_block = '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"i-global__params:ajax","params":{},"version":2},{"block":"search2:ajax","params":{},"version":2},{"block":"preview__isWallpaper","params":{},"version":2},{"block":"content_type_search","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"cookies_ajax","params":{},"version":2},{"block":"advanced-search-block","params":{},"version":2}],"metadata":{"bundles":{"lb":"AS?(E
+
+# category is selected per engine instance in settings.yml: 'web', 'images' or 'videos'
+yandex_category = 'web'
+
+
+def request(query, params):
+    # Yandex carries the search terms in the "text" query parameter for all three categories
+    query_params_web = {
+        "text": query,
+    }
+
+    query_params_images = {
+        "text": query,
+    }
+
+    query_params_videos = {
+        "text": query,
+    }
+
+    if params["pageno"] > 1:
+        query_params_web.update({"p": params["pageno"] - 1, "lr": "21180"})
+        query_params_images.update({"p": params["pageno"] - 1})
+        query_params_videos.update({"p": params["pageno"] - 1})
+
+    # mimic a regular browser request to avoid being flagged
+    params['method'] = 'GET'
+    params['headers']['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
+    params['headers']['accept-encoding'] = 'gzip'
+    params['headers']['accept-language'] = 'en-US,en;q=0.5'
+    params['headers']['dnt'] = '1'
+    params['headers']['referer'] = 'https://yandex.com/images/search'
+    params['headers']['connection'] = 'keep-alive'
+    params['headers']['upgrade-insecure-requests'] = '1'
+    params['headers']['sec-fetch-dest'] = 'document'
+    params['headers']['sec-fetch-mode'] = 'navigate'
+    params['headers']['sec-fetch-site'] = 'cross-site'
+    params["cookies"] = {'cookie': "yp=1716337604.sp.family%3A1#1685406411.szm.1:1920x1080:1920x999"}
+
+    if yandex_category == 'web':
+        params['url'] = f"{base_url_web}?{urlencode(query_params_web)}"
+    elif yandex_category == 'images':
+        params['url'] = f"{base_url_images}?{url_extension}&{urlencode(query_params_images)}"
+    elif yandex_category == 'videos':
+        params['url'] = f"{base_url_videos}?{url_extension}&{urlencode(query_params_videos)}"
+
+    return params
+
+
+def get_youtube_iframe_src(url):
+    parsed_url = urlparse(url)
+
+    # Check for http://www.youtube.com/v/videoid format
+    if (
+        parsed_url.netloc.endswith('youtube.com')
+        and parsed_url.path.startswith('/v/')
+        and len(parsed_url.path.split('/')) == 3
+    ):
+        video_id = parsed_url.path.split('/')[-1]
+        return 'https://www.youtube-nocookie.com/embed/' + video_id
+
+    # Check for http://www.youtube.com/watch?v=videoid format
+    elif parsed_url.netloc.endswith('youtube.com') and parsed_url.path == '/watch' and parsed_url.query:
+        video_id = parse_qs(parsed_url.query).get('v', [])
+        if video_id:
+            return 'https://www.youtube-nocookie.com/embed/' + video_id[0]
+
+
+def response(resp):
+    if yandex_category == 'web':
+        if resp.url.path.startswith('/showcaptcha'):
+            raise SearxEngineCaptchaException()
+
+        html_data = html.fromstring(resp.text)
+        text = html.tostring(html_data, encoding='unicode')
+
+        urls = re.findall(r'title-link" href="(.*?)" target="_blank"', text)
+        titles = re.findall(r'tabindex="4">(.*?)</a>', text)
+        contents = re.findall(r'"b-serp-item__text">(.*?)</div>', text)
+
+        results = [
+            {
+                "url": url,
+                "title": title,
+                "content": content,
+            }
+            for url, title, content in zip(urls, titles, contents)
+        ]
+
+        return results
+
+    elif yandex_category == 'images':
+        if resp.url.path.startswith('/showcaptcha'):
+            raise SearxEngineCaptchaException()
+
+        html_data = html.fromstring(resp.text)
+        text = unescape(html.tostring(html_data, encoding='unicode'))
+
+        urls = re.findall(r'"img_href":"(.*?)"', text)
+        titles = re.findall(r'"alt":"(.*?)"', text)
+
+        results = [
+            {
+                "title": title,
+                "url": url,
+                "img_src": url,
+                "thumbnail_src": url,
+                "template": "images.html",
+            }
+            for url, title in zip(urls, titles)
+        ]
+
+        return results
+
+    elif yandex_category == 'videos':
+        if resp.url.path.startswith('/showcaptcha'):
+            raise SearxEngineCaptchaException()
+
+        html_data = html.fromstring(resp.text)
+        text = unescape(html.tostring(html_data, encoding='unicode'))
+
+        video_urls = re.findall(r'ner" href="(.*?)"', text)
+        valid_urls = [url for url in video_urls if url.startswith('http')]
+        thumbnail_urls = re.findall(r'"image":"//(.*?)"', text)
+        https_thumbnail_urls = ['https://' + url for url in thumbnail_urls]
+        durations = re.findall(r'"durationText":"(.*?)"', text)
+        titles = re.findall(r'"clear_title":"(.*?)"', text)
+        descriptions = re.findall(r'"clear_description":"(.*?)"', text)
+        authors = re.findall(r'"clipChannel":{"name":"(.*?)",', text)
+        raw_dates = re.findall(r'"time":"(.*?)"', text)
+
+        embedded_urls = [get_youtube_iframe_src(url) for url in valid_urls]
+
+        results = []
+        for url, title, description, author, raw_date, duration, thumbnail, embedded_url in zip(
+            valid_urls,
+            titles[::2],
+            descriptions,
+            authors,
+            raw_dates[::2],
+            durations,
+            https_thumbnail_urls[::2],
+            embedded_urls,
+        ):
+            # only the date part is needed; parsing it directly avoids the
+            # local-time round trip through timestamp()/utcfromtimestamp()
+            published_date = datetime.strptime(raw_date.split("T")[0], "%Y-%m-%d")
+
+            results.append(
+                {
+                    "url": url,
+                    "title": title,
+                    "content": description,
+                    "author": author,
+                    "publishedDate": published_date,
+                    "length": duration,
+                    "thumbnail": thumbnail,
+                    "iframe_src": embedded_url,
+                    "template": "videos.html",
+                }
+            )
+
+        return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 92bb0ff26..2b0a1c8b6 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -2022,6 +2022,24 @@ engines:
     base_url: https://yacy.searchlab.eu
     shortcut: yai
     disabled: true
+
+  - name: yandex
+    engine: yandex
+    yandex_category: web
+    shortcut: yd
+    disabled: true
+
+  - name: yandex images
+    engine: yandex
+    yandex_category: images
+    shortcut: ydi
+    disabled: true
+
+  - name: yandex videos
+    engine: yandex
+    yandex_category: videos
+    shortcut: ydv
+    disabled: true
 
   - name: rumble
     engine: rumble