From e6e4de8ba037f1356104289555bd8bd63fedbc9c Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 17:14:57 +0200 Subject: [PATCH] rewrite duckduckgo engine and add comments --- searx/engines/duckduckgo.py | 71 +++++++++++++++++++------------------ searx/settings.yml | 2 -- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 58cbc9872..eae79481d 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,24 +1,48 @@ +## DuckDuckGo (Web) +# +# @website https://duckduckgo.com/ +# @provide-api yes (https://duckduckgo.com/api), but not all results from search-site +# +# @using-api no +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content +# +# @todo rewrite to api +# @todo language support + from urllib import urlencode from lxml.html import fromstring from searx.utils import html_to_text -url = 'https://duckduckgo.com/html?{query}&s={offset}' +# engine dependent config +categories = ['general'] +paging = True locale = 'us-en' +# search-url +url = 'https://duckduckgo.com/html?{query}&s={offset}' +# specific xpath variables +result_xpath = '//div[@class="results_links results_links_deep web-result"]' # noqa +url_xpath = './/a[@class="large"]/@href' +title_xpath = './/a[@class="large"]//text()' +content_xpath = './/div[@class="snippet"]//text()' + + +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 30 - q = urlencode({'q': query, - 'l': locale}) - params['url'] = url.format(query=q, offset=offset) + + params['url'] = url.format( + query=urlencode({'q': query, 'l': locale}), + offset=offset) + return params +# get response from search-request def response(resp): - result_xpath = '//div[@class="results_links results_links_deep web-result"]' # noqa - url_xpath = './/a[@class="large"]/@href' - title_xpath = './/a[@class="large"]//text()' - content_xpath = './/div[@class="snippet"]//text()' results = [] doc = fromstring(resp.text) @@ -28,38 +52,17 @@ def response(resp): res_url = r.xpath(url_xpath)[-1] except: continue + if not res_url: continue + title = html_to_text(''.join(r.xpath(title_xpath))) content = html_to_text(''.join(r.xpath(content_xpath))) + + # append result results.append({'title': title, 'content': content, 'url': res_url}) + # return results return results - - -#from json import loads -#search_url = url + 'd.js?{query}&p=1&s={offset}' -# -#paging = True -# -# -#def request(query, params): -# offset = (params['pageno'] - 1) * 30 -# q = urlencode({'q': query, -# 'l': locale}) -# params['url'] = search_url.format(query=q, offset=offset) -# return params -# -# -#def response(resp): -# results = [] -# search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1] -# for r in search_res: -# if not r.get('t'): -# continue -# results.append({'title': r['t'], -# 'content': html_to_text(r['a']), -# 'url': r['u']}) -# return results diff --git a/searx/settings.yml b/searx/settings.yml index 6d398f871..5a9254070 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -37,7 +37,6 @@ engines: - name : deviantart engine : deviantart - categories : images shortcut : da timeout: 3.0 @@ -47,7 +46,6 @@ engines: - name : duckduckgo engine : duckduckgo - locale : en-us shortcut : ddg # down - website is under criminal investigation by the UK