From 93c0c49e9aba719c8c8e6b171e0dd515a586d32b Mon Sep 17 00:00:00 2001 From: Noemi Vanyi Date: Sun, 17 Jul 2016 18:42:30 +0200 Subject: [PATCH] add time range search with yahoo --- searx/engines/__init__.py | 3 ++- searx/engines/yahoo.py | 33 +++++++++++++++++++++++++-------- searx/search.py | 9 +++++++-- searx/webapp.py | 1 + 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 099baa587..2c735a188 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -42,7 +42,8 @@ engine_default_args = {'paging': False, 'shortcut': '-', 'disabled': False, 'suspend_end_time': 0, - 'continuous_errors': 0} + 'continuous_errors': 0, + 'time_range_support': False} def load_module(filename): diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index b8b40e4aa..2334614cb 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -20,10 +20,12 @@ from searx.engines.xpath import extract_text, extract_url categories = ['general'] paging = True language_support = True +time_range_support = True # search-url base_url = 'https://search.yahoo.com/' search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}' +search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time' # specific xpath variables results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]" @@ -32,6 +34,9 @@ title_xpath = './/h3/a' content_xpath = './/div[@class="compText aAbs"]' suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a" +time_range_dict = {'day': ['1d', 'd'], + 'week': ['1w', 'w'], + 'month': ['1m', 'm']} # remove yahoo-specific tracking-url def parse_url(url_string): @@ -51,18 +56,30 @@ def parse_url(url_string): return unquote(url_string[start:end]) +def _get_url(query, offset, language, time_range): + if time_range: + return base_url + search_url_with_time.format(offset=offset, + query=urlencode({'p': query}), + lang=language, + age=time_range_dict[time_range][0], + btf=time_range_dict[time_range][1]) + return base_url + search_url.format(offset=offset, + query=urlencode({'p': query}), + lang=language) + + +def _get_language(params): + if params['language'] == 'all': + return 'en' + return params['language'].split('_')[0] + + # do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 + language = _get_language(params) - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('_')[0] - - params['url'] = base_url + search_url.format(offset=offset, - query=urlencode({'p': query}), - lang=language) + params['url'] = _get_url(query, offset, language, params['time_range']) # TODO required? params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ diff --git a/searx/search.py b/searx/search.py index a40801640..377d9ceda 100644 --- a/searx/search.py +++ b/searx/search.py @@ -138,6 +138,7 @@ class Search(object): self.paging = False self.pageno = 1 self.lang = 'all' + self.time_range = None # set blocked engines self.disabled_engines = request.preferences.engines.get_disabled() @@ -178,9 +179,9 @@ class Search(object): if len(query_obj.languages): self.lang = query_obj.languages[-1] - self.engines = query_obj.engines + self.time_range = self.request_data.get('time_range') - self.categories = [] + self.engines = query_obj.engines # if engines are calculated from query, # set categories by using that informations @@ -279,6 +280,9 @@ class Search(object): if self.lang != 'all' and not engine.language_support: continue + if self.time_range and not engine.time_range_support: + continue + # set default request parameters request_params = default_request_params() request_params['headers']['User-Agent'] = user_agent @@ -293,6 +297,7 @@ class Search(object): # 0 = None, 1 = Moderate, 2 = Strict request_params['safesearch'] = request.preferences.get_value('safesearch') + request_params['time_range'] = self.time_range # update request parameters dependent on # search-engine (contained in engines folder) diff --git a/searx/webapp.py b/searx/webapp.py index e9d27a0db..7ae826026 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -459,6 +459,7 @@ def index(): paging=search.paging, number_of_results=format_decimal(number_of_results), pageno=search.pageno, + time_range=search.time_range, base_url=get_base_url(), suggestions=search.result_container.suggestions, answers=search.result_container.answers,