From a46bbb40422564b5576b81c978fb734dbf45a9ce Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Tue, 2 Sep 2014 18:49:42 +0200
Subject: [PATCH] fix stackoverflow and add comments

---
Note: the line breaks and indentation in this copy were rebuilt from a
whitespace-collapsed rendering of the patch. The hunk line counts were
checked against the @@ headers and the diffstat. Runs of spaces inside a
line (for example any column alignment in the header comment) could not be
recovered and appear as single spaces, and the author e-mail address is
absent from this copy.

 searx/engines/stackoverflow.py | 48 +++++++++++++++++++++++++++-------
 searx/settings.yml             |  1 -
 2 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index e24b309c1..edbe74a70 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -1,30 +1,58 @@
+## Stackoverflow (It)
+#
+# @website https://stackoverflow.com/
+# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content
+
 from urlparse import urljoin
 from cgi import escape
 from urllib import urlencode
 from lxml import html
 
+# engine dependent config
 categories = ['it']
-
-url = 'http://stackoverflow.com/'
-search_url = url+'search?{query}&page={pageno}'
-result_xpath = './/div[@class="excerpt"]//text()'
-
 paging = True
 
+# search-url
+url = 'http://stackoverflow.com/'
+search_url = url+'search?{query}&page={pageno}'
 
+# specific xpath variables
+results_xpath = '//div[contains(@class,"question-summary")]'
+link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
+title_xpath = './/text()'
+content_xpath = './/div[@class="excerpt"]//text()'
+
+
+# do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       pageno=params['pageno'])
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
-    for result in dom.xpath('//div[@class="question-summary search-result"]'):
-        link = result.xpath('.//div[@class="result-link"]//a')[0]
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        link = result.xpath(link_xpath)[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(' '.join(link.xpath('.//text()')))
-        content = escape(' '.join(result.xpath(result_xpath)))
-        results.append({'url': href, 'title': title, 'content': content})
+        title = escape(' '.join(link.xpath(title_xpath)))
+        content = escape(' '.join(result.xpath(content_xpath)))
+
+        # append result
+        results.append({'url': href,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 00ea2c339..a08a15403 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -90,7 +90,6 @@ engines:
 
   - name : stackoverflow
     engine : stackoverflow
-    categories : it
     shortcut : st
 
   - name : startpage