# searxng/searx/utils.py

import cStringIO
import csv
import os
import re

from babel.dates import format_date
from codecs import getincrementalencoder
from HTMLParser import HTMLParser
from random import choice

from searx.version import VERSION_STRING
from searx.languages import language_codes
from searx import settings
from searx import logger


# Child logger for this module, derived from the package-level searx logger.
logger = logger.getChild('utils')

# Firefox version numbers that gen_useragent() chooses from.
ua_versions = ('40.0',
               '41.0',
               '42.0',
               '43.0',
               '44.0',
               '45.0',
               '46.0',
               '47.0')

# Operating-system tokens that gen_useragent() chooses from.
ua_os = ('Windows NT 6.3; WOW64',
         'X11; Linux x86_64',
         'X11; Linux x86')

# User-Agent template; {os} and {version} are filled in by gen_useragent().
ua = "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}"

# Tags whose text content HTMLTextExtractor leaves out of its result.
blocked_tags = ('script',
                'style')


def gen_useragent():
    """Build a Firefox-style User-Agent string.

    The operating-system token and the version number are each drawn at
    random from ``ua_os`` and ``ua_versions`` and substituted into the
    ``ua`` template.
    """
    # TODO
    picked_os = choice(ua_os)
    picked_version = choice(ua_versions)
    return ua.format(os=picked_os, version=picked_version)


def searx_useragent():
    """Return the User-Agent string identifying searx itself.

    The string has the form ``searx/<version> <suffix>``, where the suffix
    is read from ``settings['outgoing']['useragent_suffix']`` and defaults
    to an empty string when that key is absent.
    """
    outgoing_settings = settings['outgoing']
    suffix = outgoing_settings.get('useragent_suffix', '')
    template = 'searx/{searx_version} {suffix}'
    return template.format(searx_version=VERSION_STRING, suffix=suffix)


def highlight_content(content, query):
    """Wrap occurrences of the query in ``content`` with highlight spans.

    :param content: text to highlight; falsy values yield ``None``.
    :param query: search query, either a UTF-8 encoded byte string or
        already-decoded text.
    :returns: ``None`` for falsy content; the content unchanged when it
        contains ``<`` (treated as HTML) or when the query provides nothing
        to match; otherwise the content with every case-insensitive match
        wrapped in ``<span class="highlight">...</span>``.

    If the whole query occurs in the content it is highlighted as one unit.
    Otherwise each whitespace-separated chunk is highlighted on its own;
    single-character chunks only match when surrounded by non-word
    characters, and those surrounding characters end up inside the span.
    """
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # Only byte strings need decoding; decoded text is used as it is.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    # An empty query would produce the pattern "()", which matches at every
    # position and scatters empty spans through the content.
    if not query:
        return content

    if content.lower().find(query.lower()) > -1:
        query_regex = u'({0})'.format(re.escape(query))
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(u'{0}'.format(re.escape(chunk)))
        # A whitespace-only query has no chunks; same empty-pattern hazard.
        if not regex_parts:
            return content
        query_regex = u'({0})'.format(u'|'.join(regex_parts))

    return re.sub(query_regex, '<span class="highlight">\\1</span>',
                  content, flags=re.I | re.U)


class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects the text of a document.

    Text fragments are accumulated in ``result``; ``tags`` is the stack of
    currently open tags. Text met while the innermost open tag is listed in
    ``blocked_tags`` is dropped.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # collected text fragments, in document order
        self.result = []
        # stack of the tags that are currently open
        self.tags = []

    def handle_starttag(self, tag, attrs):
        """Push the opened tag onto the stack."""
        self.tags.append(tag)

    def handle_endtag(self, tag):
        """Pop the closed tag; raise when it is not the innermost open tag.

        A closing tag met while no tag is open is ignored.
        """
        if self.tags:
            if self.tags[-1] != tag:
                raise Exception("invalid html")
            self.tags.pop()

    def is_valid_tag(self):
        """Tell whether text at the current position should be kept."""
        if self.tags:
            return self.tags[-1] not in blocked_tags
        return True

    def handle_data(self, d):
        """Keep a text fragment unless it sits inside a blocked tag."""
        if self.is_valid_tag():
            self.result.append(d)

    def handle_charref(self, number):
        """Keep the character named by a decimal or hex character reference."""
        if self.is_valid_tag():
            is_hex = number[0] in (u'x', u'X')
            codepoint = int(number[1:], 16) if is_hex else int(number)
            self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        """Keep the bare entity name (it is not translated to a character)."""
        if self.is_valid_tag():
            # codepoint = htmlentitydefs.name2codepoint[name]
            # self.result.append(unichr(codepoint))
            self.result.append(name)

    def get_text(self):
        """Return the collected fragments joined and stripped."""
        text = u''.join(self.result)
        return text.strip()


def html_to_text(html):
    """Return the text content of an HTML fragment.

    Newlines and runs of whitespace are first collapsed to single spaces;
    the markup is then fed through ``HTMLTextExtractor`` and the collected
    text is returned.
    """
    collapsed = ' '.join(html.replace('\n', ' ').split())
    extractor = HTMLTextExtractor()
    extractor.feed(collapsed)
    return extractor.get_text()


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Each row is first written as UTF-8 into an in-memory queue by a regular
    csv.writer, then decoded, re-encoded into the target encoding and
    written to "f".
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Set up the intermediate queue, the csv writer and the encoder.

        f is the stream that receives the encoded output, dialect and kwds
        are passed on to csv.writer, encoding names the output encoding.
        """
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # incremental encoder for the target encoding
        self.encoder = getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write a single row to the target stream.

        str and unicode columns are encoded to UTF-8 and stripped of
        surrounding whitespace; columns of any other type are handed to the
        csv writer unchanged.
        """
        unicode_row = []
        for col in row:
            if type(col) == str or type(col) == unicode:
                unicode_row.append(col.encode('utf-8').strip())
            else:
                unicode_row.append(col)
        self.writer.writerow(unicode_row)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of rows using writerow()."""
        for row in rows:
            self.writerow(row)


def get_themes(root):
    """Return the static path, the templates path and the theme names.

    Both paths are built below ``root``; the theme names are the entries of
    the ``themes`` directory inside the static path.
    """
    static_path, templates_path = (
        os.path.join(root, sub_dir) for sub_dir in ('static', 'templates')
    )
    themes_dir = os.path.join(static_path, 'themes')
    return static_path, templates_path, os.listdir(themes_dir)


def get_static_files(base_path):
    """Return the set of files found below ``<base_path>/static``.

    Every entry is a path relative to the static directory.
    """
    static_root = os.path.join(base_path, 'static')
    # +1 also drops the path separator that follows the root
    prefix_length = len(static_root) + 1
    return {
        os.path.join(directory[prefix_length:], filename)
        for directory, _, files in os.walk(static_root)
        for filename in files
    }


def get_result_templates(base_path):
    """Return the set of result template files below ``<base_path>/templates``.

    Only files located directly in a directory whose path ends with
    ``result_templates`` are included; every entry is a path relative to
    the templates directory.
    """
    templates_root = os.path.join(base_path, 'templates')
    # +1 also drops the path separator that follows the root
    prefix_length = len(templates_root) + 1
    found = set()
    for directory, _, files in os.walk(templates_root):
        if not directory.endswith('result_templates'):
            continue
        relative_dir = directory[prefix_length:]
        found.update(os.path.join(relative_dir, name) for name in files)
    return found


def format_date_by_locale(date, locale_string):
    """Format ``date`` for the given locale.

    :param date: a date or datetime object.
    :param locale_string: locale identifier handed to babel; the value
        ``'all'`` selects ``settings['ui']['default_locale']``, or
        ``'en_US'`` when that setting is empty.
    :returns: the ISO date (``YYYY-MM-DD``) for years up to and including
        1900, otherwise the date as formatted by babel for the locale. When
        babel fails for the locale, a plain year-month-day string is
        returned instead.
    """
    # strftime works only on dates after 1900
    if date.year <= 1900:
        return date.isoformat().split('T')[0]

    if locale_string == 'all':
        locale_string = settings['ui']['default_locale'] or 'en_US'

    # to avoid crashing if locale is not supported by babel
    try:
        return format_date(date, locale=locale_string)
    except Exception:
        # 'yyyy' is the calendar year; 'YYYY' would be the week-based year,
        # which differs from the calendar year around New Year.
        return format_date(date, "yyyy-MM-dd")


def dict_subset(d, properties):
    """Return a new dict holding only the keys of ``d`` named in ``properties``.

    Names in ``properties`` that are not keys of ``d`` are skipped.
    """
    return {key: d[key] for key in properties if key in d}


def prettify_url(url, max_length=74):
    """Shorten a long URL by replacing its middle part with ``[...]``.

    :param url: the URL to shorten.
    :param max_length: URLs that are at most this long are returned as is.
    :returns: ``url`` unchanged, or its first and last
        ``max_length // 2 + 1`` characters joined by ``[...]``. The
        shortened form is therefore slightly longer than ``max_length``.
    """
    if len(url) <= max_length:
        return url
    # Floor division keeps chunk_len an integer (needed for slicing) no
    # matter which division semantics are in effect.
    chunk_len = max_length // 2 + 1
    return u'{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:])


def list_get(a_list, index, default=None):
    """Return ``a_list[index]``, or ``default`` when the index is out of range.

    Negative indexes count from the end of the list; one that reaches
    before the start of the list also yields ``default``.
    """
    try:
        return a_list[index]
    except IndexError:
        return default


def get_torrent_size(filesize, filesize_multiplier):
    """Convert a size and its unit into a number of bytes.

    :param filesize: the size, as a number or a numeric string.
    :param filesize_multiplier: the unit; ``'KB'``, ``'MB'``, ``'GB'`` and
        ``'TB'`` are recognised and treated as powers of 1024.
    :returns: the size in bytes as an int for a recognised unit, the size
        as a float for any other unit, or ``None`` when the size cannot be
        converted to a number.
    """
    # power of 1024 for each recognised unit
    exponents = {'KB': 1, 'MB': 2, 'GB': 3, 'TB': 4}

    try:
        filesize = float(filesize)
    except (TypeError, ValueError):
        return None

    exponent = exponents.get(filesize_multiplier)
    if exponent is None:
        return filesize

    try:
        return int(filesize * 1024 ** exponent)
    except (ValueError, OverflowError):
        # int() rejects nan (ValueError) and infinities (OverflowError)
        return None


def is_valid_lang(lang):
    """Look ``lang`` up in ``language_codes``.

    A two-character value is compared with the first two characters of each
    entry's first field (the language code); any other value is compared,
    case-insensitively, with the entry's second field (presumably the
    language name -- confirm against searx.languages).

    Returns ``(True, <two-letter code>, <lowercased second field>)`` for
    the first matching entry, or ``False`` when nothing matches.
    """
    wanted = lang.lower()
    by_abbreviation = len(lang) == 2
    for entry in language_codes:
        if by_abbreviation:
            candidate = entry[0][:2]
        else:
            candidate = entry[1].lower()
        if candidate == wanted:
            return (True, entry[0][:2], entry[1].lower())
    return False
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00			`import cStringIO`
			`import csv`
			`import os`
			`import re`

[fix][mod] wikidata date handling refactor - fixes #387 2015-09-07 22:39:33 +02:00			`from babel.dates import format_date`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`from codecs import getincrementalencoder`
add multi theming support 2014-04-25 01:46:40 +02:00			`from HTMLParser import HTMLParser`
			`from random import choice`

[enh] make version of searx readable 2014-11-18 11:37:42 +01:00			`from searx.version import VERSION_STRING`
[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00			`from searx.languages import language_codes`
[enh] make version of searx readable 2014-11-18 11:37:42 +01:00			`from searx import settings`
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00			`from searx import logger`
[enh] make version of searx readable 2014-11-18 11:37:42 +01:00
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00
			`logger = logger.getChild('utils')`
[fix] highlighting only html 2014-01-10 23:38:08 +01:00
[enh] update useragent versions 2016-07-04 23:11:39 +02:00			`ua_versions = ('40.0',`
			`'41.0',`
			`'42.0',`
			`'43.0',`
			`'44.0',`
			`'45.0',`
			`'46.0',`
			`'47.0')`
[enh] user agent string update 2014-11-22 18:37:42 +01:00
[mod] useragent generation 2014-05-20 16:55:49 +02:00			`ua_os = ('Windows NT 6.3; WOW64',`
			`'X11; Linux x86_64',`
			`'X11; Linux x86')`
[mod] change settings file structure according to #314 2015-08-02 19:38:27 +02:00
[fix] user agent : the "rv:{version}" was missing (can be a issue with some engine, like flickr) 2015-05-02 12:35:57 +02:00			`ua = "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}"`
fix: robot fw, entry points, some flake8, package searx egg 2014-01-19 22:59:01 +01:00
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`blocked_tags = ('script',`
			`'style')`

[fix] pep8 2014-03-04 19:26:09 +01:00
[mod] function name 2014-01-18 21:53:59 +01:00			`def gen_useragent():`
[enh] own useragent handling init 2014-01-12 20:13:14 +01:00			`# TODO`
[enh] better useragent string generation 2014-03-04 14:20:37 +01:00			`return ua.format(os=choice(ua_os), version=choice(ua_versions))`
[enh] own useragent handling init 2014-01-12 20:13:14 +01:00
fix: robot fw, entry points, some flake8, package searx egg 2014-01-19 22:59:01 +01:00
add faroo engine support 2014-10-17 12:34:51 +02:00			`def searx_useragent():`
Flake8 and Twitter corrections Lots of Flake8 corrections Maybe we should change the rule to allow lines of 120 chars. It seems more usable. Big twitter correction : now it outputs the words in right order... 2014-12-29 21:31:04 +01:00			`return 'searx/{searx_version} {suffix}'.format(`
			`searx_version=VERSION_STRING,`
[mod] change settings file structure according to #314 2015-08-02 19:38:27 +02:00			`suffix=settings['outgoing'].get('useragent_suffix', ''))`
[fix] pep8 part II. 2014-10-19 12:41:04 +02:00

[fix] highlighting only html 2014-01-10 23:38:08 +01:00			`def highlight_content(content, query):`

			`if not content:`
			`return None`
			`# ignoring html contents`
			`# TODO better html content detection`
			`if content.find('<') != -1:`
			`return content`

			`query = query.decode('utf-8')`
			`if content.lower().find(query.lower()) > -1:`
			`query_regex = u'({0})'.format(re.escape(query))`
[fix] pep8 2014-05-16 16:51:23 +02:00			`content = re.sub(query_regex, '<span class="highlight">\\1</span>',`
			`content, flags=re.I \| re.U)`
[fix] highlighting only html 2014-01-10 23:38:08 +01:00			`else:`
			`regex_parts = []`
			`for chunk in query.split():`
			`if len(chunk) == 1:`
Fix anomalous backslash in string 2016-07-11 15:29:47 +02:00			`regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))`
[fix] highlighting only html 2014-01-10 23:38:08 +01:00			`else:`
			`regex_parts.append(u'{0}'.format(re.escape(chunk)))`
			`query_regex = u'({0})'.format('\|'.join(regex_parts))`
[fix] pep8 2014-05-16 16:51:23 +02:00			`content = re.sub(query_regex, '<span class="highlight">\\1</span>',`
			`content, flags=re.I \| re.U)`
[fix] highlighting only html 2014-01-10 23:38:08 +01:00
			`return content`
[enh] utils.py added 2013-11-08 23:44:26 +01:00
fix: robot fw, entry points, some flake8, package searx egg 2014-01-19 22:59:01 +01:00
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`class HTMLTextExtractor(HTMLParser):`
Fix quantity of blank lines after code object. 2016-07-10 16:44:27 +02:00
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`def __init__(self):`
			`HTMLParser.__init__(self)`
fix: robot fw, entry points, some flake8, package searx egg 2014-01-19 22:59:01 +01:00			`self.result = []`
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`self.tags = []`

			`def handle_starttag(self, tag, attrs):`
			`self.tags.append(tag)`

			`def handle_endtag(self, tag):`
[fix] handle single closing element in HTMLTextExtractor 2015-01-22 17:43:45 +01:00			`if not self.tags:`
			`return`

[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`if tag != self.tags[-1]:`
			`raise Exception("invalid html")`
[fix] handle single closing element in HTMLTextExtractor 2015-01-22 17:43:45 +01:00
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`self.tags.pop()`

			`def is_valid_tag(self):`
			`return not self.tags or self.tags[-1] not in blocked_tags`
[enh] utils.py added 2013-11-08 23:44:26 +01:00
			`def handle_data(self, d):`
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`if not self.is_valid_tag():`
			`return`
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`self.result.append(d)`

			`def handle_charref(self, number):`
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`if not self.is_valid_tag():`
			`return`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`if number[0] in (u'x', u'X'):`
			`codepoint = int(number[1:], 16)`
			`else:`
			`codepoint = int(number)`
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`self.result.append(unichr(codepoint))`

			`def handle_entityref(self, name):`
[fix] ignore scripts/styles in html_to_text 2015-01-01 14:13:56 +01:00			`if not self.is_valid_tag():`
			`return`
[fix] pep8 part II. 2014-10-19 12:41:04 +02:00			`# codepoint = htmlentitydefs.name2codepoint[name]`
			`# self.result.append(unichr(codepoint))`
[fix] html escape 2013-11-18 16:47:20 +01:00			`self.result.append(name)`
[enh] utils.py added 2013-11-08 23:44:26 +01:00
			`def get_text(self):`
A bit of utils unit tests 2015-01-27 20:03:33 +01:00			`return u''.join(self.result).strip()`
[enh] utils.py added 2013-11-08 23:44:26 +01:00
fix: robot fw, entry points, some flake8, package searx egg 2014-01-19 22:59:01 +01:00
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`def html_to_text(html):`
Replace every bunch of whitespaces with only one space in HTML text 2015-01-30 21:00:49 +01:00			`html = html.replace('\n', ' ')`
			`html = ' '.join(html.split())`
[enh] utils.py added 2013-11-08 23:44:26 +01:00			`s = HTMLTextExtractor()`
			`s.feed(html)`
			`return s.get_text()`
[enh] csv output support 2013-11-15 18:55:18 +01:00

			`class UnicodeWriter:`
			`"""`
			`A CSV writer which will write rows to CSV file "f",`
			`which is encoded in the given encoding.`
			`"""`

			`def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):`
			`# Redirect output to a queue`
			`self.queue = cStringIO.StringIO()`
			`self.writer = csv.writer(self.queue, dialect=dialect, **kwds)`
			`self.stream = f`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`self.encoder = getincrementalencoder(encoding)()`
[enh] csv output support 2013-11-15 18:55:18 +01:00
			`def writerow(self, row):`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`unicode_row = []`
			`for col in row:`
			`if type(col) == str or type(col) == unicode:`
			`unicode_row.append(col.encode('utf-8').strip())`
			`else:`
			`unicode_row.append(col)`
			`self.writer.writerow(unicode_row)`
[enh] csv output support 2013-11-15 18:55:18 +01:00			`# Fetch UTF-8 output from the queue ...`
			`data = self.queue.getvalue()`
			`data = data.decode("utf-8")`
			`# ... and reencode it into the target encoding`
			`data = self.encoder.encode(data)`
			`# write to the target stream`
			`self.stream.write(data)`
			`# empty queue`
			`self.queue.truncate(0)`

			`def writerows(self, rows):`
			`for row in rows:`
			`self.writerow(row)`
add multi theming support 2014-04-25 01:46:40 +02:00

			`def get_themes(root):`
			`"""Returns available themes list."""`

			`static_path = os.path.join(root, 'static')`
			`templates_path = os.path.join(root, 'templates')`

[enh] themes static content refactor 2015-01-01 17:48:12 +01:00			`themes = os.listdir(os.path.join(static_path, 'themes'))`
add multi theming support 2014-04-25 01:46:40 +02:00			`return static_path, templates_path, themes`
[enh] themes static content refactor 2015-01-01 17:48:12 +01:00

			`def get_static_files(base_path):`
[enh] better result template handling 2015-01-01 18:59:53 +01:00			`base_path = os.path.join(base_path, 'static')`
[enh] themes static content refactor 2015-01-01 17:48:12 +01:00			`static_files = set()`
[enh] better result template handling 2015-01-01 18:59:53 +01:00			`base_path_length = len(base_path) + 1`
			`for directory, _, files in os.walk(base_path):`
[enh] themes static content refactor 2015-01-01 17:48:12 +01:00			`for filename in files:`
			`f = os.path.join(directory[base_path_length:], filename)`
			`static_files.add(f)`
			`return static_files`
[enh] better result template handling 2015-01-01 18:59:53 +01:00

			`def get_result_templates(base_path):`
			`base_path = os.path.join(base_path, 'templates')`
			`result_templates = set()`
			`base_path_length = len(base_path) + 1`
			`for directory, _, files in os.walk(base_path):`
			`if directory.endswith('result_templates'):`
			`for filename in files:`
			`f = os.path.join(directory[base_path_length:], filename)`
			`result_templates.add(f)`
			`return result_templates`
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00

[fix][mod] wikidata date handling refactor - fixes #387 2015-09-07 22:39:33 +02:00			`def format_date_by_locale(date, locale_string):`
[enh] date formatting by locale 2015-01-11 13:26:40 +01:00			`# strftime works only on dates after 1900`
[fix][mod] wikidata date handling refactor - fixes #387 2015-09-07 22:39:33 +02:00
			`if date.year <= 1900:`
			`return date.isoformat().split('T')[0]`

			`if locale_string == 'all':`
			`locale_string = settings['ui']['default_locale'] or 'en_US'`

[fix] exception if locale doesn't have a date format occitan, for example 2016-06-04 06:02:53 +02:00			`# to avoid crashing if locale is not supported by babel`
			`try:`
			`formatted_date = format_date(date, locale=locale_string)`
			`except:`
			`formatted_date = format_date(date, "YYYY-MM-dd")`

			`return formatted_date`
[enh] image-proxy : handle ETag and date related headers, add hash to URL 2015-01-17 21:54:40 +01:00

			`def dict_subset(d, properties):`
			`result = {}`
			`for k in properties:`
			`if k in d:`
			`result[k] = d[k]`
			`return result`
[mod] pretty url separation 2015-01-29 19:44:52 +01:00

[enh] test utils.prettify_url 2015-09-07 19:22:01 +02:00			`def prettify_url(url, max_length=74):`
			`if len(url) > max_length:`
			`chunk_len = max_length / 2 + 1`
			`return u'{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:])`
[mod] pretty url separation 2015-01-29 19:44:52 +01:00			`else:`
			`return url`
[enh] default disabled engines - closes #109 2015-01-31 23:11:45 +01:00

[fix] bing_news based on RSS output format 2015-06-04 18:30:08 +02:00			`# get element in list or default value`
			`def list_get(a_list, index, default=None):`
			`if len(a_list) > index:`
			`return a_list[index]`
			`else:`
			`return default`
add digbt engine Unfortunately, it is quite slow so it is disabled. Furthermore, the display of number of files is wrong on digbt.org, so it is not displayed on searx. 2016-08-13 14:55:47 +02:00

			`def get_torrent_size(filesize, filesize_multiplier):`
			`try:`
			`filesize = float(filesize)`

			`if filesize_multiplier == 'TB':`
			`filesize = int(filesize * 1024 * 1024 * 1024 * 1024)`
			`elif filesize_multiplier == 'GB':`
			`filesize = int(filesize * 1024 * 1024 * 1024)`
			`elif filesize_multiplier == 'MB':`
			`filesize = int(filesize * 1024 * 1024)`
			`elif filesize_multiplier == 'KB':`
			`filesize = int(filesize * 1024)`
			`except:`
			`filesize = None`

			`return filesize`
[enh] is_valid_lang moved to utils 2016-09-06 16:43:48 +02:00

			`def is_valid_lang(lang):`
			`is_abbr = (len(lang) == 2)`
			`if is_abbr:`
			`for l in language_codes:`
			`if l[0][:2] == lang.lower():`
			`return (True, l[0][:2], l[1].lower())`
			`return False`
			`else:`
			`for l in language_codes:`
			`if l[1].lower() == lang.lower():`
			`return (True, l[0][:2], l[1].lower())`
			`return False`