From c9bab0e833725cae4a7ee1303cc4772c1b3ffb01 Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Sat, 13 Sep 2014 18:25:25 +0200
Subject: [PATCH 1/5] add comments to search.py

* add comments
* add licence-header
---
 searx/search.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/searx/search.py b/searx/search.py
index 19b286d4d..c9baf6119 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -1,3 +1,20 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber,
+'''
+
 import grequests
 from itertools import izip_longest, chain
 from datetime import datetime
@@ -9,46 +26,67 @@ from searx.engines import (
 from searx.languages import language_codes
 from searx.utils import gen_useragent
 
+
 number_of_searches = 0
 
 
+# get default reqest parameter
 def default_request_params():
     return {
         'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
 
 
+# create a callback wrapper for the search engine results
 def make_callback(engine_name, results, suggestions, callback, params):
+    # creating a callback wrapper for the search engine results
     def process_callback(response, **kwargs):
         cb_res = []
         response.search_params = params
+
+        # update stats with current page-load-time
         engines[engine_name].stats['page_load_time'] += \
             (datetime.now() - params['started']).total_seconds()
+
         try:
             search_results = callback(response)
         except Exception, e:
+            # increase errors stats
             engines[engine_name].stats['errors'] += 1
             results[engine_name] = cb_res
+
+            # print engine name and specific error message
             print '[E] Error with engine "{0}":\n\t{1}'.format(
                 engine_name, str(e))
             return
+
         for result in search_results:
             result['engine'] = engine_name
+
+            # if it is a suggestion, add it to list of suggestions
            if 'suggestion' in result:
                # TODO type checks
                suggestions.add(result['suggestion'])
                continue
+
+            # append result
             cb_res.append(result)
+
         results[engine_name] = cb_res
+
     return process_callback
 
 
+# score results and remove duplications
 def score_results(results):
+    # calculate scoring parameters
     flat_res = filter(
         None, chain.from_iterable(izip_longest(*results.values())))
     flat_len = len(flat_res)
     engines_len = len(results)
+
     results = []
+
+    # deduplication + scoring
     for i, res in enumerate(flat_res):
@@ -62,34 +100,54 @@ def score_results(results):
         res['engines'] = [res['engine']]
         weight = 1.0
+        # get weight of this engine if possible
         if hasattr(engines[res['engine']], 'weight'):
             weight = float(engines[res['engine']].weight)
+        # calculate score for that engine
         score = int((flat_len - i) / engines_len) * weight + 1
+
         duplicated = False
+        # check for duplicates
         for new_res in results:
+            # remove / from the end of the url if required
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
+
+            # check if that result is a duplicate
             if res['host'] == new_res['host'] and\
                unquote(p1) == unquote(p2) and\
                res['parsed_url'].query == new_res['parsed_url'].query and\
                res.get('template') == new_res.get('template'):
                 duplicated = new_res
                 break
+
+        # merge duplicates together
         if duplicated:
+            # using content with more text
             if res.get('content') > duplicated.get('content'):
                 duplicated['content'] = res['content']
+
+            # increase result-score
             duplicated['score'] += score
+
+            # add engine to list of result-engines
             duplicated['engines'].append(res['engine'])
+
+            # using https if possible
             if duplicated['parsed_url'].scheme == 'https':
                 continue
             elif res['parsed_url'].scheme == 'https':
                 duplicated['url'] = res['parsed_url'].geturl()
                 duplicated['parsed_url'] = res['parsed_url']
+
+        # if there is no duplicate found, append result
         else:
             res['score'] = score
             results.append(res)
+
+    # return results sorted by score
     return sorted(results, key=itemgetter('score'), reverse=True)
 
 
@@ -98,6 +156,7 @@ class Search(object):
     """Search information container"""
 
     def __init__(self, request):
+        # init vars
         super(Search, self).__init__()
         self.query = None
         self.engines = []
@@ -105,18 +164,23 @@ class Search(object):
         self.paging = False
         self.pageno = 1
         self.lang = 'all'
+
+        # set blocked engines
         if request.cookies.get('blocked_engines'):
             self.blocked_engines = request.cookies['blocked_engines'].split(',') # noqa
         else:
             self.blocked_engines = []
+
         self.results = []
         self.suggestions = []
         self.request_data = {}
 
+        # set specific language if set
         if request.cookies.get('language')\
            and request.cookies['language'] in (x[0] for x in language_codes):
             self.lang = request.cookies['language']
 
+        # set request method
         if request.method == 'POST':
             self.request_data = request.form
         else:
@@ -126,51 +190,72 @@ class Search(object):
         if not self.request_data.get('q'):
             raise Exception('noquery')
 
+        # set query
         self.query = self.request_data['q']
 
+        # set pagenumber
         pageno_param = self.request_data.get('pageno', '1')
         if not pageno_param.isdigit() or int(pageno_param) < 1:
             raise Exception('wrong pagenumber')
 
         self.pageno = int(pageno_param)
 
+        # parse query, if tags are set, which change the serch engine or search-language
         self.parse_query()
 
         self.categories = []
 
+        # if engines are calculated from query, set categories by using that informations
         if self.engines:
             self.categories = list(set(engine['category']
                                        for engine in self.engines))
+
+        # otherwise, using defined categories to calculate which engines should be used
         else:
+            # set used categories
             for pd_name, pd in self.request_data.items():
                 if pd_name.startswith('category_'):
                     category = pd_name[9:]
+                    # if category is not found in list, skip
                     if not category in categories:
                         continue
+
+                    # add category to list
                     self.categories.append(category)
+
+            # if no category is specified for this search, using user-defined default-configuration which (is stored in cookie)
             if not self.categories:
                 cookie_categories = request.cookies.get('categories', '')
                 cookie_categories = cookie_categories.split(',')
                 for ccateg in cookie_categories:
                     if ccateg in categories:
                         self.categories.append(ccateg)
+
+            # if still no category is specified, using general as default-category
             if not self.categories:
                 self.categories = ['general']
 
+        # using all engines for that search, which are declared under the specific categories
         for categ in self.categories:
             self.engines.extend({'category': categ, 'name': x.name}
                                 for x in categories[categ]
                                 if not x.name in self.blocked_engines)
 
+    # parse query, if tags are set, which change the serch engine or search-language
     def parse_query(self):
         query_parts = self.query.split()
         modified = False
+
+        # check if language-prefix is set
         if query_parts[0].startswith(':'):
             lang = query_parts[0][1:].lower()
 
+            # check if any language-code equal with declared language-codes
             for lc in language_codes:
                 lang_id, lang_name, country = map(str.lower, lc)
+
+                # if correct language-code is found, set it as new search-language
                 if lang == lang_id\
                    or lang_id.startswith(lang)\
                    or lang == lang_name\
@@ -179,56 +264,78 @@ class Search(object):
                     modified = True
                     break
 
+        # check if category/engine prefix is set
         elif query_parts[0].startswith('!'):
             prefix = query_parts[0][1:].replace('_', ' ')
 
+            # check if prefix equal with engine shortcut
             if prefix in engine_shortcuts\
                     and not engine_shortcuts[prefix] in self.blocked_engines:
                 modified = True
                 self.engines.append({'category': 'none',
                                      'name': engine_shortcuts[prefix]})
+
+            # check if prefix equal with engine name
             elif prefix in engines\
                     and not prefix in self.blocked_engines:
                 modified = True
                 self.engines.append({'category': 'none',
                                      'name': prefix})
+
+            # check if prefix equal with categorie name
             elif prefix in categories:
                 modified = True
+                # using all engines for that search, which are declared under that categorie name
                 self.engines.extend({'category': prefix, 'name': engine.name}
                                     for engine in categories[prefix]
                                     if not engine in self.blocked_engines)
+
+        # if language, category or engine were specificed in this query, search for more tags which does the same
         if modified:
             self.query = self.query.replace(query_parts[0], '', 1).strip()
             self.parse_query()
 
+    # do search-request
     def search(self, request):
         global number_of_searches
+
+        # init vars
         requests = []
         results = {}
         suggestions = set()
+
+        # increase number of active searches
         number_of_searches += 1
+
+        # set default useragent
         #user_agent = request.headers.get('User-Agent', '')
         user_agent = gen_useragent()
 
+        # start search-reqest for all selected engines
         for selected_engine in self.engines:
             if selected_engine['name'] not in engines:
                 continue
 
             engine = engines[selected_engine['name']]
 
+            # if paging is not supported, skip
             if self.pageno > 1 and not engine.paging:
                 continue
 
+            # if search-language is set and engine does not provide language-support, skip
             if self.lang != 'all' and not engine.language_support:
                 continue
 
+            # set default request parameters
             request_params = default_request_params()
             request_params['headers']['User-Agent'] = user_agent
             request_params['category'] = selected_engine['category']
             request_params['started'] = datetime.now()
             request_params['pageno'] = self.pageno
             request_params['language'] = self.lang
+
+            # update request parameters dependent on search-engine (contained in engines folder)
             request_params = engine.request(self.query.encode('utf-8'),
                                             request_params)
@@ -236,6 +343,7 @@ class Search(object):
                 # TODO add support of offline engines
                 pass
 
+            # create a callback wrapper for the search engine results
             callback = make_callback(
                 selected_engine['name'],
                 results,
@@ -244,6 +352,7 @@ class Search(object):
                 request_params
             )
 
+            # create dictionary which contain all informations about the request
             request_args = dict(
                 headers=request_params['headers'],
                 hooks=dict(response=callback),
@@ -251,6 +360,7 @@ class Search(object):
                 timeout=engine.timeout
             )
 
+            # specific type of request (GET or POST)
             if request_params['method'] == 'GET':
                 req = grequests.get
             else:
@@ -261,17 +371,25 @@ class Search(object):
             if not request_params['url']:
                 continue
 
+            # append request to list
             requests.append(req(request_params['url'], **request_args))
+
+        # send all search-request
         grequests.map(requests)
+
+        # update engine-specific stats
         for engine_name, engine_results in results.items():
             engines[engine_name].stats['search_count'] += 1
             engines[engine_name].stats['result_count'] += len(engine_results)
 
+        # score results and remove duplications
         results = score_results(results)
 
+        # update engine stats, using calculated score
         for result in results:
             for res_engine in result['engines']:
                 engines[result['engine']]\
                     .stats['score_count'] += result['score']
 
+        # return results and suggestions
         return results, suggestions

From 7da4b3dc82048169b7ca1720ef91cac567196e1f Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Sat, 13 Sep 2014 18:32:28 +0200
Subject: [PATCH 2/5] add comments to languages.py

* add comments
* add licence-header
---
 searx/languages.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/searx/languages.py b/searx/languages.py
index 8b12e5ffe..df5fabf74 100644
--- a/searx/languages.py
+++ b/searx/languages.py
@@ -1,3 +1,21 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber,
+'''
+
+# list of language codes
 language_codes = (
     ("ar_XA", "Arabic", "Arabia"),
     ("bg_BG", "Bulgarian", "Bulgaria"),

From bc30c4f4ade9a6d8322c1b46680f998ebf1cda0b Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Sat, 13 Sep 2014 18:39:03 +0200
Subject: [PATCH 3/5] add comments to __init__.py

* add comments
* add licence-header
---
 searx/__init__.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/searx/__init__.py b/searx/__init__.py
index 375a5414a..17da2f353 100644
--- a/searx/__init__.py
+++ b/searx/__init__.py
@@ -1,3 +1,20 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber,
+'''
+
 from os import environ
 from os.path import realpath, dirname, join, abspath
 try:
@@ -10,11 +27,14 @@ except:
 searx_dir = abspath(dirname(__file__))
 engine_dir = dirname(realpath(__file__))
 
+# if possible set path to settings using the enviroment variable SEARX_SETTINGS_PATH
 if 'SEARX_SETTINGS_PATH' in environ:
     settings_path = environ['SEARX_SETTINGS_PATH']
+# otherwise using default path
 else:
     settings_path = join(searx_dir, 'settings.yml')
 
+# load settings
 with open(settings_path) as settings_yaml:
     settings = load(settings_yaml)

From 22da73b8bb7a995e3bb3823ae9afbaca04690274 Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Sat, 13 Sep 2014 18:44:11 +0200
Subject: [PATCH 4/5] little update

update comments in search.py
---
 searx/search.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/searx/search.py b/searx/search.py
index c9baf6119..d2a9dd42f 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -251,7 +251,7 @@ class Search(object):
         if query_parts[0].startswith(':'):
             lang = query_parts[0][1:].lower()
 
-            # check if any language-code equal with declared language-codes
+            # check if any language-code is equal with declared language-codes
             for lc in language_codes:
                 lang_id, lang_name, country = map(str.lower, lc)
 
@@ -268,21 +268,21 @@ class Search(object):
         elif query_parts[0].startswith('!'):
             prefix = query_parts[0][1:].replace('_', ' ')
 
-            # check if prefix equal with engine shortcut
+            # check if prefix is equal with engine shortcut
             if prefix in engine_shortcuts\
                     and not engine_shortcuts[prefix] in self.blocked_engines:
                 modified = True
                 self.engines.append({'category': 'none',
                                      'name': engine_shortcuts[prefix]})
 
-            # check if prefix equal with engine name
+            # check if prefix is equal with engine name
             elif prefix in engines\
                     and not prefix in self.blocked_engines:
                 modified = True
                 self.engines.append({'category': 'none',
                                      'name': prefix})
 
-            # check if prefix equal with categorie name
+            # check if prefix is equal with categorie name
             elif prefix in categories:
                 modified = True
                 # using all engines for that search, which are declared under that categorie name
@@ -305,7 +305,7 @@ class Search(object):
         results = {}
         suggestions = set()
 
-        # increase number of active searches
+        # increase number of searches
         number_of_searches += 1
 
         # set default useragent

From 53dc92b0d75b5cd1ffcf92cdd81a60566385bbb3 Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Sat, 13 Sep 2014 18:47:28 +0200
Subject: [PATCH 5/5] update comments in autocomplete.py

* update comments
* add licence-header
---
 searx/autocomplete.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/searx/autocomplete.py b/searx/autocomplete.py
index a36dfaf54..183769af8 100644
--- a/searx/autocomplete.py
+++ b/searx/autocomplete.py
@@ -1,3 +1,21 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber,
+'''
+
+
 from lxml import etree
 from requests import get
 from json import loads
@@ -22,7 +40,7 @@ def dbpedia(query):
 
 
 def duckduckgo(query):
-    # wikipedia autocompleter
+    # duckduckgo autocompleter
     url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'
 
     resp = loads(get(url.format(urlencode(dict(q=query)))).text)