From 560a14e77bcf804ab55ae1cc8c93ce0b2289cf2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9on=20Tiek=C3=B6tter?= Date: Sun, 6 Feb 2022 17:50:00 +0100 Subject: [PATCH 1/5] [fix] wikidata info box images Wikidata info box images are now loaded from upload.wikimedia.org instead of commons.wikimedia.org to prevent redirects Co-authored-by: Markus Heiser --- searx/engines/wikidata.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index e5d3f55c0..ce500cda6 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -18,6 +18,7 @@ from searx.engines.wikipedia import ( # pylint: disable=unused-import _fetch_supported_languages, supported_languages_url, ) +from hashlib import md5 # about about = { @@ -185,6 +186,36 @@ def response(resp): return results +_IMG_SRC_DEFAULT_URL_PREFIX = "https://commons.wikimedia.org/wiki/Special:FilePath/" +_IMG_SRC_NEW_URL_PREFIX = "https://upload.wikimedia.org/wikipedia/commons/" + + +def get_thumbnail(img_src): + """Get Thumbnail image from wikimedia commons + + Images from commons.wikimedia.org are (HTTP) redirected to + upload.wikimedia.org. The redirected URL can be calculated by this + function.
+ + - https://stackoverflow.com/a/33691240 + + """ + logger.debug('get_thumbnail(): %s', img_src) + if not img_src is None and _IMG_SRC_DEFAULT_URL_PREFIX in img_src.split()[0]: + img_src_name = ( + img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "") + .split("?", 1)[0] + .replace("%20", "_") + .replace("%28", "(") + .replace("%29", ")") + ) + img_src_name_md5 = md5(img_src_name.encode("utf-8")).hexdigest() + img_src = _IMG_SRC_NEW_URL_PREFIX + img_src_name_md5[0] + "/" + img_src_name_md5[0:2] + "/" + img_src_name + logger.debug('get_thumbnail() redirected: %s', img_src) + + return img_src + + def get_results(attribute_result, attributes, language): results = [] infobox_title = attribute_result.get('itemLabel') @@ -221,7 +252,7 @@ def get_results(attribute_result, attributes, language): # replace the current image only the priority is lower # (the infobox contain only one image). if attribute.priority < img_src_priority: - img_src = value + img_src = get_thumbnail(value) img_src_priority = attribute.priority elif attribute_type == WDGeoAttribute: # geocoordinate link From a50f32bcfcfce1de94c4eadb646c35489617fba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9on=20Tiek=C3=B6tter?= Date: Sun, 6 Feb 2022 23:25:50 +0100 Subject: [PATCH 2/5] wikidata: load thumbnail instead of full image --- searx/engines/wikidata.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index ce500cda6..18dac3efb 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -187,7 +187,7 @@ def response(resp): _IMG_SRC_DEFAULT_URL_PREFIX = "https://commons.wikimedia.org/wiki/Special:FilePath/" -_IMG_SRC_NEW_URL_PREFIX = "https://upload.wikimedia.org/wikipedia/commons/" +_IMG_SRC_NEW_URL_PREFIX = "https://upload.wikimedia.org/wikipedia/commons/thumb/" def get_thumbnail(img_src): @@ -209,8 +209,21 @@ def get_thumbnail(img_src): .replace("%28", "(") .replace("%29", ")") ) + img_src_size = 
img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[1] + img_src_size = img_src_size[img_src_size.index("=") + 1 : img_src_size.index("&")] img_src_name_md5 = md5(img_src_name.encode("utf-8")).hexdigest() - img_src = _IMG_SRC_NEW_URL_PREFIX + img_src_name_md5[0] + "/" + img_src_name_md5[0:2] + "/" + img_src_name + img_src = ( + _IMG_SRC_NEW_URL_PREFIX + + img_src_name_md5[0] + + "/" + + img_src_name_md5[0:2] + + "/" + + img_src_name + + "/" + + img_src_size + + "px-" + + img_src_name + ) logger.debug('get_thumbnail() redirected: %s', img_src) return img_src From a13c5d70c7b3773afcb2c6bfe684f212b3a70707 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 6 Feb 2022 23:35:55 +0100 Subject: [PATCH 3/5] [fix] wikidata engine: select image with higher (not lower) priority Signed-off-by: Markus Heiser --- searx/engines/wikidata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 18dac3efb..c6a551e9c 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -238,7 +238,7 @@ def get_results(attribute_result, attributes, language): infobox_attributes = [] infobox_content = attribute_result.get('itemDescription', []) img_src = None - img_src_priority = 100 + img_src_priority = 0 for attribute in attributes: value = attribute.get_str(attribute_result, language) @@ -264,7 +264,7 @@ def get_results(attribute_result, attributes, language): # this attribute is an image. # replace the current image only the priority is lower # (the infobox contain only one image). 
- if attribute.priority < img_src_priority: + if attribute.priority > img_src_priority: img_src = get_thumbnail(value) img_src_priority = attribute.priority elif attribute_type == WDGeoAttribute: From 1c151ae92bfe51faa89af523194dca631a7c9378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9on=20Tiek=C3=B6tter?= Date: Mon, 7 Feb 2022 00:19:25 +0100 Subject: [PATCH 4/5] [fix] wikidata: URL decoding and file extension handling Add '.png' to the second img_src_name if it has the extension '.svg'. Use urllib.parse.unquote for URL decoding. --- searx/engines/wikidata.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index c6a551e9c..a89f79bf9 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -4,7 +4,7 @@ """ -from urllib.parse import urlencode +from urllib.parse import urlencode, unquote from json import loads from dateutil.parser import isoparse @@ -202,13 +202,13 @@ def get_thumbnail(img_src): """ logger.debug('get_thumbnail(): %s', img_src) if not img_src is None and _IMG_SRC_DEFAULT_URL_PREFIX in img_src.split()[0]: - img_src_name = ( - img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "") - .split("?", 1)[0] - .replace("%20", "_") - .replace("%28", "(") - .replace("%29", ")") - ) + img_src_name = unquote(img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[0].replace("%20", "_")) + img_src_name_first = img_src_name + img_src_name_second = img_src_name + + if ".svg" in img_src_name.split()[0]: + img_src_name_second = img_src_name + ".png" + img_src_size = img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[1] img_src_size = img_src_size[img_src_size.index("=") + 1 : img_src_size.index("&")] img_src_name_md5 = md5(img_src_name.encode("utf-8")).hexdigest() @@ -218,11 +218,11 @@ def get_thumbnail(img_src): + "/" + img_src_name_md5[0:2] + "/" - + img_src_name + + img_src_name_first + "/" + img_src_size + "px-" - + img_src_name + + 
img_src_name_second ) logger.debug('get_thumbnail() redirected: %s', img_src) From a967e5959012814210bebd48c7e0f75ab6f865ef Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 7 Feb 2022 10:15:32 +0100 Subject: [PATCH 5/5] [pylint] searx/engines/wikidata.py (no functional change) Signed-off-by: Markus Heiser --- searx/engines/wikidata.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index a89f79bf9..592a51ec8 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Wikidata """ - Wikidata -""" - +# pylint: disable=missing-class-docstring +from hashlib import md5 from urllib.parse import urlencode, unquote from json import loads @@ -18,7 +19,6 @@ from searx.engines.wikipedia import ( # pylint: disable=unused-import _fetch_supported_languages, supported_languages_url, ) -from hashlib import md5 # about about = { @@ -230,6 +230,7 @@ def get_thumbnail(img_src): def get_results(attribute_result, attributes, language): + # pylint: disable=too-many-branches results = [] infobox_title = attribute_result.get('itemLabel') infobox_id = attribute_result['item'] @@ -322,6 +323,7 @@ def get_query(query, language): def get_attributes(language): + # pylint: disable=too-many-statements attributes = [] def add_value(name): @@ -462,7 +464,7 @@ def get_attributes(language): class WDAttribute: - + # pylint: disable=no-self-use __slots__ = ('name',) def __init__(self, name): @@ -483,7 +485,7 @@ class WDAttribute: def get_group_by(self): return "" - def get_str(self, result, language): + def get_str(self, result, language): # pylint: disable=unused-argument return result.get(self.name + 's') def __repr__(self): @@ -624,6 +626,7 @@ class WDImageAttribute(WDURLAttribute): class WDDateAttribute(WDAttribute): + # pylint: disable=no-self-use def get_select(self): return '?{name} ?{name}timePrecision 
?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name) @@ -644,7 +647,7 @@ class WDDateAttribute(WDAttribute): def get_group_by(self): return self.get_select() - def format_8(self, value, locale): + def format_8(self, value, locale): # pylint: disable=unused-argument # precision: less than a year return value @@ -717,7 +720,7 @@ class WDDateAttribute(WDAttribute): else: value = t[0] return format_method(value, language) - except Exception: + except Exception: # pylint: disable=broad-except return value return value @@ -731,7 +734,7 @@ def debug_explain_wikidata_query(query, method='GET'): return http_response.content -def init(engine_settings=None): +def init(engine_settings=None): # pylint: disable=unused-argument # WIKIDATA_PROPERTIES : add unit symbols WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS)