From 343e555ee96ab8b40b5ed4dce287331685eff3fa Mon Sep 17 00:00:00 2001
From: Marc Abonce Seguin
Date: Sun, 8 Apr 2018 20:35:34 -0500
Subject: [PATCH 1/4] [fix] append http if no scheme is provided in xpath's extract_url

This solves a bug with Yahoo where some results don't specify a protocol.

---
 searx/engines/xpath.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index c8c56da44..50f98d935 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -53,7 +53,7 @@ def extract_url(xpath_results, search_url):
     if url.startswith('//'):
         # add http or https to this kind of url //example.com/
         parsed_search_url = urlparse(search_url)
-        url = u'{0}:{1}'.format(parsed_search_url.scheme, url)
+        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
     elif url.startswith('/'):
         # fix relative url to the search engine
         url = urljoin(search_url, url)
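
A note on the change above: extract_url() builds absolute URLs from whatever an
engine's XPath returns. The fixed branch is easy to exercise in isolation; the
following is a minimal standalone sketch (normalize_url is a hypothetical name
used for illustration, not part of searx):

    # Mirrors the two branches shown in the hunk above; the `or 'http'`
    # fallback is the fix this patch introduces.
    try:
        from urllib.parse import urlparse, urljoin  # Python 3
    except ImportError:
        from urlparse import urlparse, urljoin      # Python 2

    def normalize_url(url, search_url):
        if url.startswith('//'):
            # protocol-relative result: reuse the search URL's scheme,
            # falling back to 'http' when it carries none
            scheme = urlparse(search_url).scheme or 'http'
            return u'{0}:{1}'.format(scheme, url)
        elif url.startswith('/'):
            # relative result: resolve against the search engine's URL
            return urljoin(search_url, url)
        return url

    # Previously an empty scheme produced '://example.com/page':
    print(normalize_url('//example.com/page', 'search.yahoo.com/search'))
    # http://example.com/page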

From 835d1edd5834c3c8117dc4614cb0b0b4316d3153 Mon Sep 17 00:00:00 2001
From: Marc Abonce Seguin
Date: Sun, 8 Apr 2018 20:56:05 -0500
Subject: [PATCH 2/4] [fix] google news xpath

---
 searx/engines/google_news.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 41abf0a01..aadcb76df 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -68,8 +68,8 @@ def response(resp):
     for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
         try:
             r = {
-                'url': result.xpath('.//a[@class="l _PMs"]')[0].attrib.get("href"),
-                'title': ''.join(result.xpath('.//a[@class="l _PMs"]//text()')),
+                'url': result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"),
+                'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')),
                 'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
             }
         except:

From b12857a70dd947a804e667d864ba56055b528ee0 Mon Sep 17 00:00:00 2001
From: Marc Abonce Seguin
Date: Sun, 8 Apr 2018 21:17:00 -0500
Subject: [PATCH 3/4] [fix] make search requests on wikidata more accurate

---
 searx/engines/wikidata.py | 13 ++++++-------
 searx/settings.yml        |  1 +
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index 1fdbc9869..fe53609c1 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -27,7 +27,7 @@ result_count = 1
 # urls
 wikidata_host = 'https://www.wikidata.org'
 url_search = wikidata_host \
-    + '/wiki/Special:ItemDisambiguation?{query}'
+    + '/w/index.php?{query}'
 
 wikidata_api = wikidata_host + '/w/api.php'
 url_detail = wikidata_api\
@@ -40,7 +40,7 @@ url_map = 'https://www.openstreetmap.org/'\
 url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'
 
 # xpaths
-wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
+wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href'
 title_xpath = '//*[contains(@class,"wikibase-title-label")]'
 description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
 property_xpath = '//div[@id="{propertyid}"]'
@@ -57,22 +57,21 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
 
 
 def request(query, params):
-    language = match_language(params['language'], supported_languages).split('-')[0]
-
     params['url'] = url_search.format(
-        query=urlencode({'label': query, 'language': language}))
+        query=urlencode({'search': query}))
     return params
 
 
 def response(resp):
     results = []
     html = fromstring(resp.text)
-    wikidata_ids = html.xpath(wikidata_ids_xpath)
+    search_results = html.xpath(wikidata_ids_xpath)
     language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
 
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
-    for wikidata_id in wikidata_ids[:result_count]:
+    for search_result in search_results[:result_count]:
+        wikidata_id = search_result.split('/')[-1]
         url = url_detail.format(query=urlencode({'page': wikidata_id,
                                                  'uselang': language}))
         htmlresponse = get(url)
         jsonresponse = loads(htmlresponse.text)
diff --git a/searx/settings.yml b/searx/settings.yml
index 70750fc96..d72d01a54 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -174,6 +174,7 @@ engines:
   - name : wikidata
     engine : wikidata
     shortcut : wd
+    timeout : 3.0
     weight : 2
 
   - name : duckduckgo
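
The effect of the wikidata change is easiest to see with plain strings. A small
sketch follows (it assumes result hrefs look like /wiki/Q42, which is the shape
the new mw-search-results XPath yields; the query value is illustrative):

    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode        # Python 2

    # request() now runs a plain full-text search; no language parameter
    url_search = 'https://www.wikidata.org/w/index.php?{query}'
    print(url_search.format(query=urlencode({'search': 'Douglas Adams'})))
    # https://www.wikidata.org/w/index.php?search=Douglas+Adams

    # response() gets hrefs instead of titles from the new XPath, so the
    # entity ID is simply the last path segment of each hit
    search_result = '/wiki/Q42'
    wikidata_id = search_result.split('/')[-1]
    print(wikidata_id)  # Q42

Since response() still issues one follow-up detail request per result, the
patch also raises the engine's timeout to 3.0 seconds in settings.yml.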

From 96877862269f35aefc0b3ca7a7cb8812b1555dc4 Mon Sep 17 00:00:00 2001
From: Marc Abonce Seguin
Date: Sun, 8 Apr 2018 23:31:24 -0500
Subject: [PATCH 4/4] update unit tests for google news and wikidata

---
 tests/unit/engines/test_google_news.py | 4 ++--
 tests/unit/engines/test_wikidata.py    | 6 +-----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/tests/unit/engines/test_google_news.py b/tests/unit/engines/test_google_news.py
index fbc6d344d..a041a79b9 100644
--- a/tests/unit/engines/test_google_news.py
+++ b/tests/unit/engines/test_google_news.py
@@ -42,7 +42,7 @@ class TestGoogleNewsEngine(SearxTestCase):
-                        <a class="l _PMs" href="https://example.com/">Example title</a>
+                        <a class="l lLrAF" href="https://example.com/">Example title</a>
@@ -63,7 +63,7 @@ class TestGoogleNewsEngine(SearxTestCase):
-                        <a class="l _PMs" href="https://example2.com/">Example title 2</a>
+                        <a class="l lLrAF" href="https://example2.com/">Example title 2</a>
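
The fixture change above tracks the selector change from PATCH 2/4. As a quick
standalone check, the engine's XPaths can be run against a cut-down result
block (the HTML below is an illustrative stand-in, not Google's full markup):

    from lxml import html

    sample = """
    <div class="g">
      <a class="l lLrAF" href="https://example.com/">Example title</a>
      <div class="st">Example description</div>
    </div>
    """

    dom = html.fromstring(sample)
    # same expressions as in searx/engines/google_news.py after the patch
    for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
        print(result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"))
        print(''.join(result.xpath('.//a[@class="l lLrAF"]//text()')))
        print(''.join(result.xpath('.//div[@class="st"]//text()')))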
diff --git a/tests/unit/engines/test_wikidata.py b/tests/unit/engines/test_wikidata.py
index 1ad21768c..545ef9ed8 100644
--- a/tests/unit/engines/test_wikidata.py
+++ b/tests/unit/engines/test_wikidata.py
@@ -9,20 +9,15 @@ from searx.testing import SearxTestCase
 class TestWikidataEngine(SearxTestCase):
 
     def test_request(self):
-        wikidata.supported_languages = ['en', 'es']
         query = 'test_query'
         dicto = defaultdict(dict)
-        dicto['language'] = 'en-US'
         params = wikidata.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn('wikidata.org', params['url'])
-        self.assertIn('en', params['url'])
 
-        dicto['language'] = 'es-ES'
         params = wikidata.request(query, dicto)
         self.assertIn(query, params['url'])
-        self.assertIn('es', params['url'])
 
     # successful cases are not tested here to avoid sending additional requests
     def test_response(self):
@@ -31,6 +26,7 @@ class TestWikidataEngine(SearxTestCase):
         self.assertRaises(AttributeError, wikidata.response, '')
         self.assertRaises(AttributeError, wikidata.response, '[]')
 
+        wikidata.supported_languages = ['en', 'es']
         response = mock.Mock(text='<html></html>', search_params={"language": "en"})
         self.assertEqual(wikidata.response(response), [])
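
Taken together, the test updates reflect the new division of labor: request()
no longer depends on the interface language, while response() still resolves
one, so supported_languages must be set before response() is called. A short
sketch of the same flow outside the test class (mock usage as in the tests
above):

    from collections import defaultdict
    import mock
    from searx.engines import wikidata

    # request() works without any language in the params dict now
    params = wikidata.request('test_query', defaultdict(dict))
    assert 'wikidata.org' in params['url']

    # response() still needs supported_languages for match_language()
    wikidata.supported_languages = ['en', 'es']
    resp = mock.Mock(text='<html></html>', search_params={'language': 'en'})
    assert wikidata.response(resp) == []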