searxng/searx/utils.py

# -*- coding: utf-8 -*-
import os
import sys
import re
import json

from imp import load_source
from numbers import Number
from os.path import splitext, join
from io import open
from random import choice
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse, unquote

from lxml import html
from lxml.etree import XPath, _ElementStringResult, _ElementUnicodeResult
from babel.core import get_global


from searx import settings
from searx.version import VERSION_STRING
from searx.languages import language_codes
from searx import logger


# child logger named 'utils' of the logger imported from the searx package
logger = logger.getChild('utils')

# tag names whose text content HTMLTextExtractor skips
blocked_tags = ('script',
                'style')

# escape sequences decoded by ecma_unescape:
# "%uXXXX" (four hex digits) and "%XX" (two hex digits)
ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)

# User-agent data (keys 'ua', 'os' and 'versions' are read by gen_useragent),
# loaded from data/useragents.json next to this module.  The context manager
# closes the file handle as soon as the JSON document has been parsed.
with open(join(os.path.dirname(os.path.realpath(__file__)), 'data', 'useragents.json'),
          'r', encoding='utf-8') as _useragents_file:
    useragents = json.load(_useragents_file)

# XPath expression string -> compiled XPath object (filled by get_xpath)
xpath_cache = dict()
# str(lang_list) -> {language: first matching locale} (filled by _get_lang_to_lc_dict)
lang_to_lc_cache = dict()


def searx_useragent():
    """Return the User-Agent string that identifies this searx instance.

    The result has the form ``searx/<VERSION_STRING> <suffix>``; the suffix is
    the ``useragent_suffix`` entry of the ``outgoing`` settings, or an empty
    string when that entry is missing.
    """
    suffix = settings['outgoing'].get('useragent_suffix', '')
    template = 'searx/{searx_version} {suffix}'
    return template.format(searx_version=VERSION_STRING, suffix=suffix)


def gen_useragent(os=None):
    """Build a user-agent string from the ``useragents`` data.

    :param os: value substituted for ``{os}`` in the template; when falsy a
        random entry of ``useragents['os']`` is used instead.
    :returns: the ``useragents['ua']`` template filled with the chosen os and
        a random entry of ``useragents['versions']``.
    """
    template = useragents['ua']
    # keep the draw order: os first (only when needed), then version
    platform = os or choice(useragents['os'])
    version = choice(useragents['versions'])
    return str(template.format(os=platform, version=version))


class HTMLTextExtractorException(Exception):
    """Raised by HTMLTextExtractor when an end tag does not match the innermost open tag."""


class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects the text of a document.

    Text found while the innermost open tag is listed in ``blocked_tags`` is
    skipped.  Open tags are tracked on a stack; an end tag that does not match
    the innermost open tag raises HTMLTextExtractorException.
    """

    # Void elements have no end tag.  Pushing them on the stack would make the
    # next real end tag look mismatched and abort the extraction.
    _void_tags = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []  # collected text fragments, in document order
        self.tags = []    # stack of currently open (non-void) tag names

    def handle_starttag(self, tag, attrs):
        """Remember ``tag`` as open, unless it is a void element."""
        if tag in self._void_tags:
            return
        self.tags.append(tag)

    def handle_endtag(self, tag):
        """Close the innermost open tag.

        End tags seen while nothing is open, and end tags of void elements
        (e.g. produced for ``<br/>``), are ignored.

        :raises HTMLTextExtractorException: if ``tag`` is not the innermost open tag.
        """
        if not self.tags or tag in self._void_tags:
            return

        if tag != self.tags[-1]:
            raise HTMLTextExtractorException()

        self.tags.pop()

    def is_valid_tag(self):
        """Return True when text at the current position should be kept."""
        return not self.tags or self.tags[-1] not in blocked_tags

    def handle_data(self, d):
        """Store the text ``d`` unless it is inside a blocked tag."""
        if not self.is_valid_tag():
            return
        self.result.append(d)

    def _append_reference(self, reference):
        """Store the character(s) a reference such as ``&amp;`` or ``&#x41;`` stands for.

        References that cannot be resolved are stored literally.
        """
        # imported here because the module-level name ``html`` is bound to lxml.html
        from html import unescape
        if not self.is_valid_tag():
            return
        self.result.append(unescape(reference))

    # With HTMLParser's default ``convert_charrefs=True`` the parser resolves
    # references itself and passes the result to handle_data; the two handlers
    # below only run when that option is disabled or when called directly.

    def handle_charref(self, number):
        """Handle a numeric reference; ``number`` is e.g. ``'65'`` or ``'x41'``."""
        self._append_reference('&#' + number + ';')

    def handle_entityref(self, name):
        """Handle a named reference; ``name`` is e.g. ``'amp'``."""
        self._append_reference('&' + name + ';')

    def get_text(self):
        """Return the collected text, stripped of leading/trailing whitespace."""
        return ''.join(self.result).strip()


def html_to_text(html):
    """Return the text content of the HTML string ``html``.

    Runs of whitespace (including newlines) are collapsed to single spaces
    before parsing.  When HTMLTextExtractor rejects the markup, the problem is
    logged at debug level and the text collected up to that point is returned.
    """
    # str.split() without arguments splits on every kind of whitespace,
    # newlines included
    html = ' '.join(html.split())
    s = HTMLTextExtractor()
    try:
        s.feed(html)
        # feed() keeps trailing text that contains an unterminated "&"
        # (e.g. "AT&T") buffered; close() forces it through handle_data
        s.close()
    except HTMLTextExtractorException:
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
    return s.get_text()


def extract_text(xpath_results):
    '''
    Extract text from the result of an XPath evaluation.

    - list: the text of every item is extracted recursively, concatenated
      and stripped
    - lxml string result: returned as a plain string
    - anything else is treated as an element: its text is serialized without
      the tail and whitespace runs are collapsed to single spaces
    '''
    kind = type(xpath_results)

    if kind == list:
        pieces = [extract_text(item) for item in xpath_results]
        return ''.join(pieces).strip()

    if kind in [_ElementStringResult, _ElementUnicodeResult]:
        return ''.join(xpath_results)

    raw = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
    words = raw.strip().replace('\n', ' ').split()
    return ' '.join(words)


def extract_url(xpath_results, search_url):
    """Extract a URL from ``xpath_results`` and make it absolute.

    :param xpath_results: value accepted by extract_text
    :param search_url: URL of the search request, used as base for relative URLs
    :returns: the URL after normalize_url
    :raises Exception: when ``xpath_results`` is an empty list
    """
    if xpath_results == []:
        raise Exception('Empty url resultset')

    url = extract_text(xpath_results)

    if url.startswith('//'):
        # protocol-relative URL (//example.com/): reuse the scheme of the search URL
        scheme = urlparse(search_url).scheme or 'http'
        url = '{0}:{1}'.format(scheme, url)
    elif url.startswith('/'):
        # path relative to the search engine host
        url = urljoin(search_url, url)

    # any other URL that still lacks a scheme is resolved against the search URL
    if '://' not in url:
        url = urljoin(search_url, url)

    return normalize_url(url)


def normalize_url(url):
    """Normalize an absolute URL.

    - a URL without a path gets a trailing ``/`` appended
    - ``search.yahoo.com`` redirect URLs (path starting with ``/r`` and
      containing ``/**``) are replaced by the unquoted target that follows
      the ``/**`` marker

    :raises Exception: when the URL has no network location
    """
    parsed_url = urlparse(url)

    if not parsed_url.netloc:
        raise Exception('Cannot parse url')

    # add a / at the end of the url if there is no path
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com' and parsed_url.path.startswith('/r'):
        path = parsed_url.path
        mark = path.find('/**')
        if mark != -1:
            # unquote() already returns str; there is nothing to decode
            return unquote(path[mark + 3:])

    return url


def dict_subset(d, properties):
    """Return a new dict holding the items of ``d`` whose key is in ``properties``.

    Keys listed in ``properties`` but absent from ``d`` are ignored.
    """
    return {key: d[key] for key in properties if key in d}


def list_get(a_list, index, default=None):
    """Return ``a_list[index]``, or ``default`` when ``len(a_list)`` is not greater than ``index``."""
    return a_list[index] if len(a_list) > index else default


# unit suffix -> (base, exponent); the size is multiplied ``exponent`` times
# by ``base``.
# NOTE(review): by the usual convention TB/GB/MB/KB are powers of 1000 and
# TiB/GiB/MiB/KiB powers of 1024 -- the table looks swapped.  It is kept as
# is because callers may rely on the current results; confirm before changing.
_TORRENT_SIZE_UNITS = {
    'TB': (1024, 4),
    'GB': (1024, 3),
    'MB': (1024, 2),
    'KB': (1024, 1),
    'TiB': (1000, 4),
    'GiB': (1000, 3),
    'MiB': (1000, 2),
    'KiB': (1000, 1),
}


def get_torrent_size(filesize, filesize_multiplier):
    """Convert a size and its unit suffix into a number of bytes.

    :param filesize: the size, anything accepted by ``float()``
    :param filesize_multiplier: unit suffix such as ``'MB'`` or ``'GiB'``
    :returns: the size in bytes as ``int``; the size as ``float`` when the
        suffix is unknown; ``None`` when the size cannot be converted
    """
    try:
        size = float(filesize)

        unit = None
        if isinstance(filesize_multiplier, str):
            unit = _TORRENT_SIZE_UNITS.get(filesize_multiplier)
        if unit is None:
            return size

        base, exponent = unit
        for _ in range(exponent):
            size *= base
        # int() raises for inf (OverflowError) and nan (ValueError)
        return int(size)
    except (TypeError, ValueError, OverflowError):
        # only conversion failures are swallowed; KeyboardInterrupt and
        # SystemExit propagate
        return None


def convert_str_to_int(number_str):
    """Return ``number_str`` as an int, or 0 when it is not a plain number.

    Only strings made of digits are converted; signs, whitespace, decimal
    points and the empty string yield 0.
    """
    if not number_str.isdigit():
        return 0
    try:
        return int(number_str)
    except ValueError:
        # str.isdigit() also accepts characters such as '²' that int() rejects
        return 0


def int_or_zero(num):
    """Convert ``num`` with convert_str_to_int.

    When ``num`` is a list its first item is converted; an empty list gives 0.
    """
    if isinstance(num, list):
        if not num:
            return 0
        num = num[0]
    return convert_str_to_int(num)


def is_valid_lang(lang):
    """Look ``lang`` up in ``language_codes``.

    A two-character value is compared with the first two characters of each
    entry's item 0; any other value is compared (case-insensitively) with
    items 1 and 3 of each entry.

    :param lang: language as ``str`` or ``bytes``
    :returns: ``(True, entry[0][:2], entry[3].lower())`` for the first matching
        entry, ``False`` when nothing matches
    """
    if isinstance(lang, bytes):
        lang = lang.decode()

    # the length is checked before lower-casing, as lower() may change it
    is_abbr = (len(lang) == 2)
    lang = lang.lower()

    for entry in language_codes:
        if is_abbr:
            matched = entry[0][:2] == lang
        else:
            matched = entry[1].lower() == lang or entry[3].lower() == lang
        if matched:
            return (True, entry[0][:2], entry[3].lower())

    return False


def _get_lang_to_lc_dict(lang_list):
    """Return a dict mapping each language to the first locale of ``lang_list`` using it.

    The language is the part of the locale before the first ``-``.  Results
    are cached in ``lang_to_lc_cache`` under ``str(lang_list)``.
    """
    cache_key = str(lang_list)
    mapping = lang_to_lc_cache.get(cache_key, None)
    if mapping is not None:
        return mapping

    mapping = dict()
    for locale in lang_list:
        language = locale.split('-')[0]
        # first locale wins
        if language not in mapping:
            mapping[language] = locale

    lang_to_lc_cache[cache_key] = mapping
    return mapping


# auxiliary function to match lang_code in lang_list
def _match_language(lang_code, lang_list=[], custom_aliases={}):
    # replace language code with a custom alias if necessary
    if lang_code in custom_aliases:
        lang_code = custom_aliases[lang_code]

    if lang_code in lang_list:
        return lang_code

    # try to get the most likely country for this language
    subtags = get_global('likely_subtags').get(lang_code)
    if subtags:
        subtag_parts = subtags.split('_')
        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
        if new_code in custom_aliases:
            new_code = custom_aliases[new_code]
        if new_code in lang_list:
            return new_code

    # try to get the any supported country for this language
    return _get_lang_to_lc_dict(lang_list).get(lang_code, None)


def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
    """Return the item of ``lang_list`` that best matches ``locale_code``.

    Tried in order: the locale itself, the locale with babel's territory
    alias, babel's language alias, and the bare language.  ``fallback`` is
    returned when none of them matches.
    """
    # try to get language from given locale_code
    found = _match_language(locale_code, lang_list, custom_aliases)
    if found:
        return found

    parts = locale_code.split('-')
    lang_code = parts[0]

    # try to get language using an equivalent country code
    if len(parts) > 1:
        country_alias = get_global('territory_aliases').get(parts[-1])
        if country_alias:
            found = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
            if found:
                return found

    # try to get language using an equivalent language code
    alias = get_global('language_aliases').get(lang_code)
    if alias:
        found = _match_language(alias, lang_list, custom_aliases)
        if found:
            return found

    # the locale was already a bare language: nothing left to try
    if lang_code == locale_code:
        return fallback

    # try to get language from given language without giving the country
    return _match_language(lang_code, lang_list, custom_aliases) or fallback


def load_module(filename, module_dir):
    """Load the source file ``filename`` from ``module_dir`` as a module.

    The module name is the file name without extension; a module already
    registered under that name in ``sys.modules`` is dropped first.  The
    loaded module gets a ``name`` attribute holding the module name.
    """
    modname, _extension = splitext(filename)
    sys.modules.pop(modname, None)
    module = load_source(modname, join(module_dir, filename))
    module.name = modname
    return module


def to_string(obj):
    """Return ``obj`` as a string.

    Strings are returned unchanged and numbers go through ``str()``; other
    objects are converted with their ``__str__`` method, or ``__repr__`` when
    they have no ``__str__``.
    """
    if isinstance(obj, str):
        return obj
    if isinstance(obj, Number):
        return str(obj)
    for method_name in ('__str__', '__repr__'):
        if hasattr(obj, method_name):
            return getattr(obj, method_name)()
    return None


def ecma_unescape(s):
    """
    Python implementation of the JavaScript ``unescape`` function.

    ``%uXXXX`` sequences are decoded first ("%u5409" becomes "吉"), then
    ``%XX`` sequences ("%20" becomes " ", "%F3" becomes "ó").

    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
    """
    def hex_to_char(match):
        # the captured group holds the hexadecimal code point
        return chr(int(match.group(1), 16))

    four_digits_decoded = ecma_unescape4_re.sub(hex_to_char, s)
    return ecma_unescape2_re.sub(hex_to_char, four_digits_decoded)


def get_engine_from_settings(name):
    """Return the entry of ``settings['engines']`` whose ``name`` equals ``name``.

    An empty dict is returned when the settings have no ``engines`` key or no
    entry matches; entries without a ``name`` key are ignored.
    """
    if 'engines' in settings:
        for engine in settings['engines']:
            if 'name' in engine and name == engine['name']:
                return engine

    return {}


def get_xpath(xpath_str):
    """Return the compiled XPath object for ``xpath_str``.

    Each expression is compiled once and then served from ``xpath_cache``.
    """
    compiled = xpath_cache.get(xpath_str, None)
    if compiled is not None:
        return compiled

    compiled = XPath(xpath_str)
    xpath_cache[xpath_str] = compiled
    return compiled


def eval_xpath(element, xpath_str):
    """Evaluate the XPath expression ``xpath_str`` on ``element`` and return the result.

    The compiled expression is obtained from get_xpath.
    """
    return get_xpath(xpath_str)(element)
[fix] fix flickr_noapi decoding (#1655) Characters that were not ASCII were incorrectly decoded. Add an helper function: searx.utils.ecma_unescape (Python implementation of unescape Javascript function). 2019-08-02 13:37:13 +02:00			`# -- coding: utf-8 --`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`import os`
			`import sys`
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00			`import re`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`import json`
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00
[mod] move load_module function to utils 2016-11-19 17:51:19 +01:00			`from imp import load_source`
[fix] convert json engine result attributes to string - closes #1006 2017-12-01 20:45:24 +01:00			`from numbers import Number`
[mod] move load_module function to utils 2016-11-19 17:51:19 +01:00			`from os.path import splitext, join`
[mod] add searx/webutils.py contains utility functions and classes used only by webapp.py 2020-09-19 18:25:24 +02:00			`from io import open`
add multi theming support 2014-04-25 01:46:40 +02:00			`from random import choice`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`from html.parser import HTMLParser`
[mod] move extract_text, extract_url to searx.utils 2020-10-02 18:13:56 +02:00			`from urllib.parse import urljoin, urlparse, unquote`

			`from lxml import html`
			`from lxml.etree import XPath, _ElementStringResult, _ElementUnicodeResult`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`from babel.core import get_global`
add multi theming support 2014-04-25 01:46:40 +02:00
[mod] move extract_text, extract_url to searx.utils 2020-10-02 18:13:56 +02:00
refactor engine's search language handling Add match_language function in utils to match any user given language code with a list of engine's supported languages. Also add language_aliases dict on each engine to translate standard language codes into the custom codes used by the engine. 2018-03-01 05:30:48 +01:00			`from searx import settings`
[enh] make version of searx readable 2014-11-18 11:37:42 +01:00			`from searx.version import VERSION_STRING`
[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00			`from searx.languages import language_codes`
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00			`from searx import logger`
[enh] make version of searx readable 2014-11-18 11:37:42 +01:00
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00
			`logger = logger.getChild('utils')`
[fix] highlighting only html 2014-01-10 23:38:08 +01:00
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`blocked_tags = ('script',`
			`'style')`

[fix] fix flickr_noapi decoding (#1655) Characters that were not ASCII were incorrectly decoded. Add an helper function: searx.utils.ecma_unescape (Python implementation of unescape Javascript function). 2019-08-02 13:37:13 +02:00			`ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)`
			`ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)`

[mod] fetch firefox versions in a standalone script 2017-05-28 15:46:45 +02:00			`useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))`
			`+ "/data/useragents.json", 'r', encoding='utf-8').read())`
[enh] own useragent handling init 2014-01-12 20:13:14 +01:00
[mod] speed optimization compile XPath only once avoid redundant call to urlparse get_locale(webapp.py): avoid useless call to request.accept_languages.best_match 2019-11-15 09:31:37 +01:00			`xpath_cache = dict()`
[mod] use cache in _match_language function to speed up searx start time significantly 2019-07-18 21:32:17 +02:00			`lang_to_lc_cache = dict()`

fix: robot fw, entry points, some flake8, package searx egg 2014-01-19 22:59:01 +01:00
add faroo engine support 2014-10-17 12:34:51 +02:00			`def searx_useragent():`
Flake8 and Twitter corrections Lots of Flake8 corrections Maybe we should change the rule to allow lines of 120 chars. It seems more usable. Big twitter correction : now it outputs the words in right order... 2014-12-29 21:31:04 +01:00			`return 'searx/{searx_version} {suffix}'.format(`
			`searx_version=VERSION_STRING,`
[mod] change settings file structure according to #314 2015-08-02 19:38:27 +02:00			`suffix=settings['outgoing'].get('useragent_suffix', ''))`
[fix] pep8 part II. 2014-10-19 12:41:04 +02:00

fetch_firefox_version.py : compatible with Python 3 and minor fixes. 2018-08-05 10:55:42 +02:00			`def gen_useragent(os=None):`
			`return str(useragents['ua'].format(os=os or choice(useragents['os']), version=choice(useragents['versions'])))`
[mod] fetch firefox versions in a standalone script 2017-05-28 15:46:45 +02:00

[fix] searx.utils.HTMLTextExtractor: invalid HTML don't raise an Exception Close #2188 2020-09-11 10:23:56 +02:00			`class HTMLTextExtractorException(Exception):`
			`pass`


[enh] utils.py added 2013-11-08 23:44:26 +01:00			`class HTMLTextExtractor(HTMLParser):`
Fix quantity of blank lines after code object. 2016-07-10 16:44:27 +02:00
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`def __init__(self):`
			`HTMLParser.__init__(self)`
fix: robot fw, entry points, some flake8, package searx egg 2014-01-19 22:59:01 +01:00			`self.result = []`
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`self.tags = []`

			`def handle_starttag(self, tag, attrs):`
			`self.tags.append(tag)`

			`def handle_endtag(self, tag):`
[fix] handle single closing element in HTMLTextExtractor 2015-01-22 17:43:45 +01:00			`if not self.tags:`
			`return`

[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`if tag != self.tags[-1]:`
[fix] searx.utils.HTMLTextExtractor: invalid HTML don't raise an Exception Close #2188 2020-09-11 10:23:56 +02:00			`raise HTMLTextExtractorException()`
[fix] handle single closing element in HTMLTextExtractor 2015-01-22 17:43:45 +01:00
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`self.tags.pop()`

			`def is_valid_tag(self):`
			`return not self.tags or self.tags[-1] not in blocked_tags`
[enh] utils.py added 2013-11-08 23:44:26 +01:00
			`def handle_data(self, d):`
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`if not self.is_valid_tag():`
			`return`
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`self.result.append(d)`

			`def handle_charref(self, number):`
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`if not self.is_valid_tag():`
			`return`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`if number[0] in ('x', 'X'):`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`codepoint = int(number[1:], 16)`
			`else:`
			`codepoint = int(number)`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`self.result.append(chr(codepoint))`
[enh] utils.py added 2013-11-08 23:44:26 +01:00
			`def handle_entityref(self, name):`
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`if not self.is_valid_tag():`
			`return`
[fix] pep8 part II. 2014-10-19 12:41:04 +02:00			`# codepoint = htmlentitydefs.name2codepoint[name]`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`# self.result.append(chr(codepoint))`
[fix] html escape 2013-11-18 16:47:20 +01:00			`self.result.append(name)`
[enh] utils.py added 2013-11-08 23:44:26 +01:00
			`def get_text(self):`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`return ''.join(self.result).strip()`
[enh] utils.py added 2013-11-08 23:44:26 +01:00
fix: robot fw, entry points, some flake8, package searx egg 2014-01-19 22:59:01 +01:00
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`def html_to_text(html):`
Replace every bunch of whitespaces with only one space in HTML text 2015-01-30 21:00:49 +01:00			`html = html.replace('\n', ' ')`
			`html = ' '.join(html.split())`
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`s = HTMLTextExtractor()`
[fix] searx.utils.HTMLTextExtractor: invalid HTML don't raise an Exception Close #2188 2020-09-11 10:23:56 +02:00			`try:`
			`s.feed(html)`
			`except HTMLTextExtractorException:`
			`logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)`
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`return s.get_text()`
[enh] csv output support 2013-11-15 18:55:18 +01:00

[mod] move extract_text, extract_url to searx.utils 2020-10-02 18:13:56 +02:00			`def extract_text(xpath_results):`
			`'''`
			`if xpath_results is list, extract the text from each result and concat the list`
			`if xpath_results is a xml element, extract all the text node from it`
			`( text_content() method from lxml )`
			`if xpath_results is a string element, then it's already done`
			`'''`
			`if type(xpath_results) == list:`
			`# it's list of result : concat everything using recursive call`
			`result = ''`
			`for e in xpath_results:`
			`result = result + extract_text(e)`
			`return result.strip()`
			`elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:`
			`# it's a string`
			`return ''.join(xpath_results)`
			`else:`
			`# it's a element`
			`text = html.tostring(`
			`xpath_results, encoding='unicode', method='text', with_tail=False`
			`)`
			`text = text.strip().replace('\n', ' ')`
			`return ' '.join(text.split())`


			`def extract_url(xpath_results, search_url):`
			`if xpath_results == []:`
			`raise Exception('Empty url resultset')`
			`url = extract_text(xpath_results)`

			`if url.startswith('//'):`
			`# add http or https to this kind of url //example.com/`
			`parsed_search_url = urlparse(search_url)`
			`url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)`
			`elif url.startswith('/'):`
			`# fix relative url to the search engine`
			`url = urljoin(search_url, url)`

			`# fix relative urls that fall through the crack`
			`if '://' not in url:`
			`url = urljoin(search_url, url)`

			`# normalize url`
			`url = normalize_url(url)`

			`return url`


			`def normalize_url(url):`
			`parsed_url = urlparse(url)`

			`# add a / at this end of the url if there is no path`
			`if not parsed_url.netloc:`
			`raise Exception('Cannot parse url')`
			`if not parsed_url.path:`
			`url += '/'`

			`# FIXME : hack for yahoo`
			`if parsed_url.hostname == 'search.yahoo.com'\`
			`and parsed_url.path.startswith('/r'):`
			`p = parsed_url.path`
			`mark = p.find('/**')`
			`if mark != -1:`
			`return unquote(p[mark + 3:]).decode()`

			`return url`


[enh] image-proxy : handle ETag and date related headers, add hash to URL 2015-01-17 21:54:40 +01:00			`def dict_subset(d, properties):`
			`result = {}`
			`for k in properties:`
			`if k in d:`
			`result[k] = d[k]`
			`return result`
[mod] pretty url separation 2015-01-29 19:44:52 +01:00

[fix] bing_news based on RSS output format 2015-06-04 18:30:08 +02:00			`# get element in list or default value`
			`def list_get(a_list, index, default=None):`
			`if len(a_list) > index:`
			`return a_list[index]`
			`else:`
			`return default`
add digbt engine Unfortunately, it is quite slow so it is disabled. Furthermore, the display of number of files is wrong on digbt.org, so it is not displayed on searx. 2016-08-13 14:55:47 +02:00

			`def get_torrent_size(filesize, filesize_multiplier):`
			`try:`
			`filesize = float(filesize)`

			`if filesize_multiplier == 'TB':`
			`filesize = int(filesize * 1024 * 1024 * 1024 * 1024)`
			`elif filesize_multiplier == 'GB':`
			`filesize = int(filesize * 1024 * 1024 * 1024)`
			`elif filesize_multiplier == 'MB':`
			`filesize = int(filesize * 1024 * 1024)`
			`elif filesize_multiplier == 'KB':`
			`filesize = int(filesize * 1024)`
fix kickass torrents engine 2016-10-11 19:31:42 +02:00			`elif filesize_multiplier == 'TiB':`
			`filesize = int(filesize * 1000 * 1000 * 1000 * 1000)`
			`elif filesize_multiplier == 'GiB':`
			`filesize = int(filesize * 1000 * 1000 * 1000)`
			`elif filesize_multiplier == 'MiB':`
			`filesize = int(filesize * 1000 * 1000)`
			`elif filesize_multiplier == 'KiB':`
			`filesize = int(filesize * 1000)`
add digbt engine Unfortunately, it is quite slow so it is disabled. Furthermore, the display of number of files is wrong on digbt.org, so it is not displayed on searx. 2016-08-13 14:55:47 +02:00			`except:`
			`filesize = None`

			`return filesize`
[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00

fix kickass torrents engine 2016-10-11 19:31:42 +02:00			`def convert_str_to_int(number_str):`
			`if number_str.isdigit():`
			`return int(number_str)`
			`else:`
			`return 0`


[mod] int_or_zero refactored to searx_utils 2017-09-04 20:05:04 +02:00			`# convert a variable to integer or return 0 if it's not a number`
			`def int_or_zero(num):`
			`if isinstance(num, list):`
			`if len(num) < 1:`
			`return 0`
			`num = num[0]`
			`return convert_str_to_int(num)`


[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00			`def is_valid_lang(lang):`
Drop Python 2 (5/n): searx.utils.is_valid_lang, input parameter is a str instead of bytes Fix bug in translated.py and dictzone.py 2020-09-08 16:08:37 +02:00			`if isinstance(lang, bytes):`
			`lang = lang.decode()`
[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00			`is_abbr = (len(lang) == 2)`
Drop Python 2 (5/n): searx.utils.is_valid_lang, input parameter is a str instead of bytes Fix bug in translated.py and dictzone.py 2020-09-08 16:08:37 +02:00			`lang = lang.lower()`
[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00			`if is_abbr:`
			`for l in language_codes:`
[fix] convert bytes type to string in language detection (fixes dictzone) 2019-10-16 14:52:57 +02:00			`if l[0][:2] == lang:`
[fix] is_valid_lang fixed for new languages.py + dictzone engine encoding 2017-06-15 10:51:09 +02:00			`return (True, l[0][:2], l[3].lower())`
[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00			`return False`
			`else:`
			`for l in language_codes:`
[fix] convert bytes type to string in language detection (fixes dictzone) 2019-10-16 14:52:57 +02:00			`if l[1].lower() == lang or l[3].lower() == lang:`
[fix] is_valid_lang fixed for new languages.py + dictzone engine encoding 2017-06-15 10:51:09 +02:00			`return (True, l[0][:2], l[3].lower())`
[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00			`return False`
[mod] move load_module function to utils 2016-11-19 17:51:19 +01:00

[mod] use cache in _match_language function to speed up searx start time significantly 2019-07-18 21:32:17 +02:00			`def _get_lang_to_lc_dict(lang_list):`
			`key = str(lang_list)`
			`value = lang_to_lc_cache.get(key, None)`
			`if value is None:`
			`value = dict()`
			`for lc in lang_list:`
			`value.setdefault(lc.split('-')[0], lc)`
			`lang_to_lc_cache[key] = value`
			`return value`


refactor engine's search language handling Add match_language function in utils to match any user given language code with a list of engine's supported languages. Also add language_aliases dict on each engine to translate standard language codes into the custom codes used by the engine. 2018-03-01 05:30:48 +01:00			`# auxiliary function to match lang_code in lang_list`
			`def _match_language(lang_code, lang_list=[], custom_aliases={}):`
			`# replace language code with a custom alias if necessary`
			`if lang_code in custom_aliases:`
			`lang_code = custom_aliases[lang_code]`

			`if lang_code in lang_list:`
			`return lang_code`

			`# try to get the most likely country for this language`
			`subtags = get_global('likely_subtags').get(lang_code)`
			`if subtags:`
			`subtag_parts = subtags.split('_')`
			`new_code = subtag_parts[0] + '-' + subtag_parts[-1]`
			`if new_code in custom_aliases:`
			`new_code = custom_aliases[new_code]`
			`if new_code in lang_list:`
			`return new_code`

			`# try to get the any supported country for this language`
[mod] use cache in _match_language function to speed up searx start time significantly 2019-07-18 21:32:17 +02:00			`return _get_lang_to_lc_dict(lang_list).get(lang_code, None)`
refactor engine's search language handling Add match_language function in utils to match any user given language code with a list of engine's supported languages. Also add language_aliases dict on each engine to translate standard language codes into the custom codes used by the engine. 2018-03-01 05:30:48 +01:00

			`# get the language code from lang_list that best matches locale_code`
			`def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):`
			`# try to get language from given locale_code`
			`language = _match_language(locale_code, lang_list, custom_aliases)`
			`if language:`
			`return language`

			`locale_parts = locale_code.split('-')`
			`lang_code = locale_parts[0]`

			`# try to get language using an equivalent country code`
			`if len(locale_parts) > 1:`
			`country_alias = get_global('territory_aliases').get(locale_parts[-1])`
			`if country_alias:`
			`language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)`
			`if language:`
			`return language`

			`# try to get language using an equivalent language code`
			`alias = get_global('language_aliases').get(lang_code)`
			`if alias:`
			`language = _match_language(alias, lang_list, custom_aliases)`
			`if language:`
			`return language`

			`if lang_code != locale_code:`
			`# try to get language from given language without giving the country`
			`language = _match_language(lang_code, lang_list, custom_aliases)`

			`return language or fallback`


[mod] move load_module function to utils 2016-11-19 17:51:19 +01:00			`def load_module(filename, module_dir):`
			`modname = splitext(filename)[0]`
			`if modname in sys.modules:`
			`del sys.modules[modname]`
			`filepath = join(module_dir, filename)`
			`module = load_source(modname, filepath)`
			`module.name = modname`
			`return module`
fix hmac python3 compatibility 2017-07-20 15:44:02 +02:00

[fix] convert json engine result attributes to string - closes #1006 2017-12-01 20:45:24 +01:00			`def to_string(obj):`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`if isinstance(obj, str):`
[fix] convert json engine result attributes to string - closes #1006 2017-12-01 20:45:24 +01:00			`return obj`
			`if isinstance(obj, Number):`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`return str(obj)`
[fix] convert json engine result attributes to string - closes #1006 2017-12-01 20:45:24 +01:00			`if hasattr(obj, '__str__'):`
			`return obj.__str__()`
			`if hasattr(obj, '__repr__'):`
			`return obj.__repr__()`
[fix] fix flickr_noapi decoding (#1655) Characters that were not ASCII were incorrectly decoded. Add an helper function: searx.utils.ecma_unescape (Python implementation of unescape Javascript function). 2019-08-02 13:37:13 +02:00

			`def ecma_unescape(s):`
			`"""`
			`python implementation of the unescape javascript function`

			`https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string`
			`https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape`
			`"""`
			`# s = unicode(s)`
			`# "%u5409" becomes "吉"`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`s = ecma_unescape4_re.sub(lambda e: chr(int(e.group(1), 16)), s)`
[fix] fix flickr_noapi decoding (#1655) Characters that were not ASCII were incorrectly decoded. Add an helper function: searx.utils.ecma_unescape (Python implementation of unescape Javascript function). 2019-08-02 13:37:13 +02:00			`# "%20" becomes " ", "%F3" becomes "ó"`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`s = ecma_unescape2_re.sub(lambda e: chr(int(e.group(1), 16)), s)`
[fix] fix flickr_noapi decoding (#1655) Characters that were not ASCII were incorrectly decoded. Add an helper function: searx.utils.ecma_unescape (Python implementation of unescape Javascript function). 2019-08-02 13:37:13 +02:00			`return s`
add initial support for offline engines && command engine 2019-09-23 17:14:32 +02:00

			`def get_engine_from_settings(name):`
			`"""Return engine configuration from settings.yml of a given engine name"""`

			`if 'engines' not in settings:`
			`return {}`

fix pep 8 check 2019-09-30 14:27:13 +02:00			`for engine in settings['engines']:`
add initial support for offline engines && command engine 2019-09-23 17:14:32 +02:00			`if 'name' not in engine:`
			`continue`
			`if name == engine['name']:`
			`return engine`

			`return {}`
[mod] speed optimization compile XPath only once avoid redundant call to urlparse get_locale(webapp.py): avoid useless call to request.accept_languages.best_match 2019-11-15 09:31:37 +01:00

			`def get_xpath(xpath_str):`
			`result = xpath_cache.get(xpath_str, None)`
			`if result is None:`
			`result = XPath(xpath_str)`
			`xpath_cache[xpath_str] = result`
			`return result`


			`def eval_xpath(element, xpath_str):`
			`xpath = get_xpath(xpath_str)`
			`return xpath(element)`