From cbf1e9097929cf851d31bfd17e87bec7d1e51422 Mon Sep 17 00:00:00 2001 From: Austin-Olacsi <138650713+Austin-Olacsi@users.noreply.github.com> Date: Sat, 14 Sep 2024 16:28:35 -0600 Subject: [PATCH] add get_embeded_stream_url to searx.utils --- searx/engines/brave.py | 15 ++-------- searx/engines/duckduckgo_extra.py | 3 +- searx/engines/google_videos.py | 2 ++ searx/engines/qwant.py | 2 ++ searx/utils.py | 48 ++++++++++++++++++++++++++++++- 5 files changed, 56 insertions(+), 14 deletions(-) diff --git a/searx/engines/brave.py b/searx/engines/brave.py index 6f7e342e7..648aee562 100644 --- a/searx/engines/brave.py +++ b/searx/engines/brave.py @@ -123,7 +123,6 @@ from typing import Any, TYPE_CHECKING from urllib.parse import ( urlencode, urlparse, - parse_qs, ) from dateutil import parser @@ -137,6 +136,7 @@ from searx.utils import ( eval_xpath_list, eval_xpath_getindex, js_variable_to_python, + get_embeded_stream_url, ) from searx.enginelib.traits import EngineTraits @@ -311,7 +311,7 @@ def _parse_search(resp): # In my tests a video tag in the WEB search was most often not a # video, except the ones from youtube .. - iframe_src = _get_iframe_src(url) + iframe_src = get_embeded_stream_url(url) if iframe_src: item['iframe_src'] = iframe_src item['template'] = 'videos.html' @@ -328,15 +328,6 @@ def _parse_search(resp): return result_list -def _get_iframe_src(url): - parsed_url = urlparse(url) - if parsed_url.path == '/watch' and parsed_url.query: - video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore - if video_id: - return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore - return None - - def _parse_news(json_resp): result_list = [] @@ -392,7 +383,7 @@ def _parse_videos(json_resp): if result['thumbnail'] is not None: item['thumbnail'] = result['thumbnail']['src'] - iframe_src = _get_iframe_src(url) + iframe_src = get_embeded_stream_url(url) if iframe_src: item['iframe_src'] = iframe_src diff --git a/searx/engines/duckduckgo_extra.py b/searx/engines/duckduckgo_extra.py index 83ca38c26..b30574d6c 100644 --- a/searx/engines/duckduckgo_extra.py +++ b/searx/engines/duckduckgo_extra.py @@ -7,6 +7,7 @@ DuckDuckGo Extra (images, videos, news) from datetime import datetime from typing import TYPE_CHECKING from urllib.parse import urlencode +from searx.utils import get_embeded_stream_url from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import from searx.engines.duckduckgo import ( @@ -108,7 +109,7 @@ def _video_result(result): 'title': result['title'], 'content': result['description'], 'thumbnail': result['images'].get('small') or result['images'].get('medium'), - 'iframe_src': result['embed_url'], + 'iframe_src': get_embeded_stream_url(result['content']), 'source': result['provider'], 'length': result['duration'], 'metadata': result.get('uploader'), diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 4a032ef0f..c8fc934af 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -34,6 +34,7 @@ from searx.engines.google import ( detect_google_sorry, ) from searx.enginelib.traits import EngineTraits +from searx.utils import get_embeded_stream_url if TYPE_CHECKING: import logging @@ -125,6 +126,7 @@ def response(resp): 'content': content, 'author': pub_info, 'thumbnail': thumbnail, + 'iframe_src': get_embeded_stream_url(url), 'template': 'videos.html', } ) diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index c30018d85..7ad6cf58a 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -61,6 +61,7 @@ from searx.utils import ( eval_xpath, eval_xpath_list, extract_text, + get_embeded_stream_url, ) traits: EngineTraits @@ -303,6 +304,7 @@ def parse_web_api(resp): 'title': title, 'url': res_url, 'content': content, + 'iframe_src': get_embeded_stream_url(res_url), 'publishedDate': pub_date, 'thumbnail': thumbnail, 'template': 'videos.html', diff --git a/searx/utils.py b/searx/utils.py index 407d44cd0..c0c6261f9 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -17,7 +17,7 @@ from os.path import splitext, join from random import choice from html.parser import HTMLParser from html import escape -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin, urlparse, parse_qs, urlencode from markdown_it import MarkdownIt from lxml import html @@ -615,6 +615,52 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore return _FASTTEXT_MODEL +def get_embeded_stream_url(url): + """ + Converts a standard video URL into its embed format. Supported services include Youtube, + Facebook, Instagram, TikTok, and Dailymotion. + """ + parsed_url = urlparse(url) + iframe_src = None + + # YouTube + if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query: + video_id = parse_qs(parsed_url.query).get('v', []) + if video_id: + iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0] + + # Facebook + elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']: + encoded_href = urlencode({'href': url}) + iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href + + # Instagram + elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'): + if parsed_url.path.endswith('/'): + iframe_src = url + 'embed' + else: + iframe_src = url + '/embed' + + # TikTok + elif ( + parsed_url.netloc in ['www.tiktok.com', 'tiktok.com'] + and parsed_url.path.startswith('/@') + and '/video/' in parsed_url.path + ): + path_parts = parsed_url.path.split('/video/') + video_id = path_parts[1] + iframe_src = 'https://www.tiktok.com/embed/' + video_id + + # Dailymotion + elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'): + path_parts = parsed_url.path.split('/') + if len(path_parts) == 3: + video_id = path_parts[2] + iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id + + return iframe_src + + def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]: """Detect the language of the ``text`` parameter.