From 2179079a9173b33b81e1084fc1e8e181c19ef8e9 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 2 Aug 2019 13:37:13 +0200 Subject: [PATCH] [fix] fix flickr_noapi decoding (#1655) Characters that were not ASCII were incorrectly decoded. Add a helper function: searx.utils.ecma_unescape (a Python implementation of the JavaScript unescape function). --- searx/engines/flickr_noapi.py | 12 ++++++------ searx/utils.py | 19 +++++++++++++++++++ tests/unit/test_utils.py | 7 +++++++ 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index eeee413ec..198ac2cff 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -16,7 +16,8 @@ from json import loads from time import time import re from searx.engines import logger -from searx.url_utils import urlencode, unquote +from searx.url_utils import urlencode +from searx.utils import ecma_unescape, html_to_text logger = logger.getChild('flickr-noapi') @@ -75,11 +76,10 @@ def response(resp): for index in legend: photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])] - author = unquote(photo.get('realname', '')) - source = unquote(photo.get('username', '')) + ' @ Flickr' - title = unquote(photo.get('title', '')) - content = unquote(photo.get('description', '')) - + author = ecma_unescape(photo.get('realname', '')) + source = ecma_unescape(photo.get('username', '')) + ' @ Flickr' + title = ecma_unescape(photo.get('title', '')) + content = html_to_text(ecma_unescape(photo.get('description', ''))) img_src = None # From the biggest to the lowest format for image_size in image_sizes: diff --git a/searx/utils.py b/searx/utils.py index b7e914557..d88bc9897 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import csv import hashlib import hmac @@ -44,6 +45,9 @@ logger = logger.getChild('utils') blocked_tags = ('script', 'style') +ecma_unescape4_re = 
re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE) +ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE) + useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__)) + "/data/useragents.json", 'r', encoding='utf-8').read()) @@ -415,3 +419,18 @@ def to_string(obj): return obj.__str__() if hasattr(obj, '__repr__'): return obj.__repr__() + + +def ecma_unescape(s): + """ + python implementation of the unescape javascript function + + https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string + https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape + """ + # s = unicode(s) + # "%u5409" becomes "吉" + s = ecma_unescape4_re.sub(lambda e: unichr(int(e.group(1), 16)), s) + # "%20" becomes " ", "%F3" becomes "ó" + s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s) + return s diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index fbaed2bd1..b09b9d414 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -90,6 +90,13 @@ class TestUtils(SearxTestCase): self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL') self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL') + def test_ecma_unscape(self): + self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') + self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), + u'text using %xx: ó') + self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), + u'text using %u: 吉, 世界') + class TestHTMLTextExtractor(SearxTestCase):