From 8d335dbdaedd6113242e785e8fabac86128d069a Mon Sep 17 00:00:00 2001 From: a01200356 Date: Mon, 14 Mar 2016 00:32:36 -0600 Subject: [PATCH 1/3] [enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api --- AUTHORS.rst | 1 + searx/engines/wikidata.py | 18 +-- searx/engines/wikipedia.py | 114 +++++++++++++++++++ searx/results.py | 12 +- searx/settings.yml | 4 +- searx/templates/oscar/infobox.html | 4 +- tests/unit/engines/test_wikipedia.py | 160 +++++++++++++++++++++++++++ 7 files changed, 298 insertions(+), 15 deletions(-) create mode 100644 searx/engines/wikipedia.py create mode 100644 tests/unit/engines/test_wikipedia.py diff --git a/AUTHORS.rst b/AUTHORS.rst index 974fbeb15..3605332ea 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -43,3 +43,4 @@ generally made searx better: - Kang-min Liu - Kirill Isakov - Guilhem Bonnefille +- Marc Abonce Seguin diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 9f3496b72..8aa2fcd5c 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -86,15 +86,15 @@ def getDetail(jsonresponse, wikidata_id, language, locale): results.append({'title': title, 'url': official_website}) wikipedia_link_count = 0 - if language != 'en': - wikipedia_link_count += add_url(urls, - 'Wikipedia (' + language + ')', - get_wikilink(result, language + - 'wiki')) - wikipedia_en_link = get_wikilink(result, 'enwiki') + wikipedia_link = get_wikilink(result, language + 'wiki') wikipedia_link_count += add_url(urls, - 'Wikipedia (en)', - wikipedia_en_link) + 'Wikipedia (' + language + ')', + wikipedia_link) + if language != 'en': + wikipedia_en_link = get_wikilink(result, 'enwiki') + wikipedia_link_count += add_url(urls, + 'Wikipedia (en)', + wikipedia_en_link) if wikipedia_link_count == 0: misc_language = get_wiki_firstlanguage(result, 'wiki') if misc_language is not None: @@ -188,7 +188,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale): else: results.append({ 'infobox': title, - 'id': wikipedia_en_link, + 'id': wikipedia_link, 'content': description, 'attributes': attributes, 'urls': urls diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py new file mode 100644 index 000000000..fed7b263f --- /dev/null +++ b/searx/engines/wikipedia.py @@ -0,0 +1,114 @@ +""" + Wikipedia (Web) + + @website https://{language}.wikipedia.org + @provide-api yes + + @using-api yes + @results JSON + @stable yes + @parse url, infobox +""" + +from json import loads +from urllib import urlencode, quote + +# search-url +base_url = 'https://{language}.wikipedia.org/' +search_postfix = 'w/api.php?'\ + 'action=query'\ + '&format=json'\ + '&{query}'\ + '&prop=extracts|pageimages'\ + '&exintro'\ + '&explaintext'\ + '&pithumbsize=300'\ + '&redirects' + + +# set language in base_url +def url_lang(lang): + if lang == 'all': + language = 'en' + else: + language = lang.split('_')[0] + + return base_url.format(language=language) + + +# do search-request +def request(query, params): + if query.islower(): + query += '|' + query.title() + + params['url'] = url_lang(params['language']) \ + + search_postfix.format(query=urlencode({'titles': query})) + + return params + + +# get first meaningful paragraph +# this should filter out disambiguation pages and notes above first paragraph +# "magic numbers" were obtained by fine tuning +def extract_first_paragraph(content, title, image): + first_paragraph = None + + failed_attempts = 0 + for paragraph in content.split('\n'): + + starts_with_title = paragraph.lower().find(title.lower(), 0, 
len(title) + 35) + length = len(paragraph) + + if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)): + first_paragraph = paragraph + break + + failed_attempts += 1 + if failed_attempts > 3: + return None + + return first_paragraph + + +# get response from search-request +def response(resp): + results = [] + + search_result = loads(resp.content) + + # wikipedia article's unique id + # first valid id is assumed to be the requested article + for article_id in search_result['query']['pages']: + page = search_result['query']['pages'][article_id] + if int(article_id) > 0: + break + + if int(article_id) < 0: + return [] + + title = page.get('title') + + image = page.get('thumbnail') + if image: + image = image.get('source') + + extract = page.get('extract') + + summary = extract_first_paragraph(extract, title, image) + if not summary: + return [] + + # link to wikipedia article + # parenthesis are not quoted to make infobox mergeable with wikidata's + wikipedia_link = url_lang(resp.search_params['language']) \ + + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')') + + results.append({'url': wikipedia_link, 'title': title}) + + results.append({'infobox': title, + 'id': wikipedia_link, + 'content': summary, + 'img_src': image, + 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]}) + + return results diff --git a/searx/results.py b/searx/results.py index 5d51eb5b5..c3040b305 100644 --- a/searx/results.py +++ b/searx/results.py @@ -37,7 +37,7 @@ def merge_two_infoboxes(infobox1, infobox2): urls1 = infobox1.get('urls', None) if urls1 is None: urls1 = [] - infobox1.set('urls', urls1) + infobox1['urls'] = urls1 urlSet = set() for url in infobox1.get('urls', []): @@ -47,11 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2): if url.get('url', None) not in urlSet: urls1.append(url) + if 'img_src' in infobox2: + img1 = infobox1.get('img_src', None) + img2 = infobox2.get('img_src') + if img1 is None: + infobox1['img_src'] = img2 + if 'attributes' in infobox2: attributes1 = infobox1.get('attributes', None) if attributes1 is None: attributes1 = [] - infobox1.set('attributes', attributes1) + infobox1['attributes'] = attributes1 attributeSet = set() for attribute in infobox1.get('attributes', []): @@ -68,7 +74,7 @@ def merge_two_infoboxes(infobox1, infobox2): if result_content_len(content2) > result_content_len(content1): infobox1['content'] = content2 else: - infobox1.set('content', content2) + infobox1['content'] = content2 def result_score(result): diff --git a/searx/settings.yml b/searx/settings.yml index 96ac4e716..ff85684ac 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -43,10 +43,9 @@ engines: shortcut : bs - name : wikipedia - engine : mediawiki + engine : wikipedia shortcut : wp base_url : 'https://{language}.wikipedia.org/' - number_of_results : 1 - name : bing engine : bing @@ -93,6 +92,7 @@ engines: - name : ddg definitions engine : duckduckgo_definitions shortcut : ddd + disabled : True - name : digg engine : digg diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html index d87d98453..606a5d22c 100644 --- a/searx/templates/oscar/infobox.html +++ b/searx/templates/oscar/infobox.html @@ -1,8 +1,9 @@
 &lt;div class="panel panel-default infobox"&gt;
     &lt;div class="panel-heading"&gt;
-        &lt;h4 class="panel-title"&gt;{{ infobox.infobox }}&lt;/h4&gt;
+        &lt;h4 class="panel-title infobox_part"&gt;{{ infobox.infobox }}&lt;/h4&gt;
     &lt;/div&gt;
     &lt;div class="panel-body"&gt;
+        {% if infobox.img_src %}&lt;img class="img-responsive center-block infobox_part" src="{{ infobox.img_src }}" alt="{{ infobox.infobox }}" /&gt;{% endif %}
         {% if infobox.content %}&lt;p class="infobox_part"&gt;{{ infobox.content }}&lt;/p&gt;{% endif %}
 
         {% if infobox.attributes %}
@@ -28,5 +29,6 @@
             {% endfor %}
         &lt;/table&gt;
         {% endif %}
+
     &lt;/div&gt;
 &lt;/div&gt;
diff --git a/tests/unit/engines/test_wikipedia.py b/tests/unit/engines/test_wikipedia.py new file mode 100644 index 000000000..d1c44036d --- /dev/null +++ b/tests/unit/engines/test_wikipedia.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import wikipedia +from searx.testing import SearxTestCase + + +class TestWikipediaEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['language'] = 'fr_FR' + params = wikipedia.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, params['url']) + self.assertIn('test_query', params['url']) + self.assertIn('Test_Query', params['url']) + self.assertIn('fr.wikipedia.org', params['url']) + + query = 'Test_Query' + params = wikipedia.request(query, dicto) + self.assertIn('Test_Query', params['url']) + self.assertNotIn('test_query', params['url']) + + dicto['language'] = 'all' + params = wikipedia.request(query, dicto) + self.assertIn('en', params['url']) + + def test_response(self): + dicto = defaultdict(dict) + dicto['language'] = 'fr' + + self.assertRaises(AttributeError, wikipedia.response, None) + self.assertRaises(AttributeError, wikipedia.response, []) + self.assertRaises(AttributeError, wikipedia.response, '') + self.assertRaises(AttributeError, wikipedia.response, '[]') + + # page not found + json = """ + { + "batchcomplete": "", + "query": { + "normalized": [], + "pages": { + "-1": { + "ns": 0, + "title": "", + "missing": "" + } + } + } + }""" + response = mock.Mock(content=json, search_params=dicto) + self.assertEqual(wikipedia.response(response), []) + + # normal case + json = """ + { + "batchcomplete": "", + "query": { + "normalized": [], + "pages": { + "12345": { + "pageid": 12345, + "ns": 0, + "title": "The Title", + "extract": "The Title is...", + "thumbnail": { + "source": "img_src.jpg" + }, + "pageimage": "img_name.jpg" + } + } + } + }""" + response = mock.Mock(content=json, search_params=dicto) + results = wikipedia.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['title'], u'The Title') + self.assertIn('fr.wikipedia.org/wiki/The_Title', results[0]['url']) + self.assertEqual(results[1]['infobox'], u'The Title') + self.assertIn('fr.wikipedia.org/wiki/The_Title', results[1]['id']) + self.assertIn('The Title is...', results[1]['content']) + self.assertEqual(results[1]['img_src'], 'img_src.jpg') + + # disambiguation page + json = """ + { + "batchcomplete": "", + "query": { + "normalized": [], + "pages": { + "12345": { + "pageid": 12345, + "ns": 0, + "title": "The Title", + "extract": "The Title can be:\\nThe Title 1\\nThe Title 2\\nThe Title 3\\nThe Title 4......................................................................................................................................." """ # noqa + json += """ + } + } + } + }""" + response = mock.Mock(content=json, search_params=dicto) + results = wikipedia.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 0) + + # no image + json = """ + { + "batchcomplete": "", + "query": { + "normalized": [], + "pages": { + "12345": { + "pageid": 12345, + "ns": 0, + "title": "The Title", + "extract": "The Title is......................................................................................................................................................................................." 
""" # noqa + json += """ + } + } + } + }""" + response = mock.Mock(content=json, search_params=dicto) + results = wikipedia.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertIn('The Title is...', results[1]['content']) + self.assertEqual(results[1]['img_src'], None) + + # title not in first paragraph + json = u""" + { + "batchcomplete": "", + "query": { + "normalized": [], + "pages": { + "12345": { + "pageid": 12345, + "ns": 0, + "title": "披頭四樂隊", + "extract": "披头士乐队....................................................................................................................................................................................................\\n披頭四樂隊...", """ # noqa + json += """ + "thumbnail": { + "source": "img_src.jpg" + }, + "pageimage": "img_name.jpg" + } + } + } + }""" + response = mock.Mock(content=json, search_params=dicto) + results = wikipedia.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEqual(results[1]['infobox'], u'披頭四樂隊') + self.assertIn(u'披头士乐队...', results[1]['content']) From 6dca14e95d08479fb49314cb4093be36ac49cf94 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Sun, 17 Apr 2016 16:21:44 -0500 Subject: [PATCH 2/3] [enh] multilingual duckduckgo_definitions --- searx/engines/duckduckgo_definitions.py | 7 ++++++- searx/templates/oscar/infobox.html | 6 +++--- tests/unit/engines/test_duckduckgo_definitions.py | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 793e97d22..dc25d416f 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,5 +1,6 @@ import json from urllib import urlencode +from re import sub from lxml import html from searx.utils import html_to_text from searx.engines.xpath import extract_text @@ -19,8 +20,8 @@ def result_to_text(url, text, htmlResult): def request(query, params): - # TODO add kl={locale} params['url'] = url.format(query=urlencode({'q': query})) + params['headers']['Accept-Language'] = params['language'] return params @@ -103,6 +104,10 @@ def response(resp): urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL}) + # to merge with wikidata's infobox + if infobox_id: + infobox_id = sub(r'^http:', r'https:', infobox_id) + # entity entity = search_res.get('Entity', None) # TODO continent / country / department / location / waterfall / diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html index 606a5d22c..c72cfb638 100644 --- a/searx/templates/oscar/infobox.html +++ b/searx/templates/oscar/infobox.html @@ -1,9 +1,9 @@
 &lt;div class="panel panel-default infobox"&gt;
     &lt;div class="panel-heading"&gt;
-        &lt;h4 class="panel-title infobox_part"&gt;{{ infobox.infobox }}&lt;/h4&gt;
+        &lt;bdi&gt;&lt;h4 class="panel-title infobox_part"&gt;{{ infobox.infobox }}&lt;/h4&gt;&lt;/bdi&gt;
     &lt;/div&gt;
     &lt;div class="panel-body"&gt;
-        {% if infobox.img_src %}&lt;img class="img-responsive center-block infobox_part" src="{{ infobox.img_src }}" alt="{{ infobox.infobox }}" /&gt;{% endif %}
+        {% if infobox.img_src %}&lt;img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" /&gt;{% endif %}
         {% if infobox.content %}&lt;p class="infobox_part"&gt;{{ infobox.content }}&lt;/p&gt;{% endif %}
 
         {% if infobox.attributes %}
@@ -29,6 +29,6 @@
             {% endfor %}
         &lt;/table&gt;
         {% endif %}
-
+        &lt;br /&gt;
     &lt;/div&gt;
 &lt;/div&gt;
diff --git a/tests/unit/engines/test_duckduckgo_definitions.py b/tests/unit/engines/test_duckduckgo_definitions.py index 71c84235c..39da64175 100644 --- a/tests/unit/engines/test_duckduckgo_definitions.py +++ b/tests/unit/engines/test_duckduckgo_definitions.py @@ -123,7 +123,7 @@ class TestDDGDefinitionsEngine(SearxTestCase): self.assertEqual(results[1]['url'], 'result first url') self.assertEqual(results[2]['suggestion'], 'text') self.assertEqual(results[3]['infobox'], 'heading') - self.assertEqual(results[3]['id'], 'http://definition.url') + self.assertEqual(results[3]['id'], 'https://definition.url') self.assertEqual(results[3]['entity'], 'Entity') self.assertIn('abstract', results[3]['content']) self.assertIn('this is the definition', results[3]['content']) @@ -240,7 +240,7 @@ class TestDDGDefinitionsEngine(SearxTestCase): self.assertEqual(type(results), list) self.assertEqual(len(results), 1) self.assertEqual(results[0]['infobox'], 'heading') - self.assertEqual(results[0]['id'], 'http://definition.url') + self.assertEqual(results[0]['id'], 'https://definition.url') self.assertEqual(results[0]['entity'], 'Entity') self.assertIn('abstract', results[0]['content']) self.assertIn('this is the definition', results[0]['content']) From a44faa77167980a414df2cbe936a52359351f455 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Mon, 18 Apr 2016 10:52:16 -0500 Subject: [PATCH 3/3] [fix] compile regex in ddg_definitions --- searx/engines/duckduckgo_definitions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index dc25d416f..208ccca28 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,6 +1,6 @@ import json from urllib import urlencode -from re import sub +from re import compile, sub from lxml import html from searx.utils import html_to_text from searx.engines.xpath import extract_text @@ -8,6 +8,8 @@ from searx.engines.xpath import extract_text url = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' +http_regex = compile(r'^http:') + def result_to_text(url, text, htmlResult): # TODO : remove result ending with "Meaning" or "Category" @@ -106,7 +108,7 @@ def response(resp): # to merge with wikidata's infobox if infobox_id: - infobox_id = sub(r'^http:', r'https:', infobox_id) + infobox_id = http_regex.sub('https:', infobox_id) # entity entity = search_res.get('Entity', None)
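
A quick way to poke at the new engine without a full searx instance is to drive it the same way tests/unit/engines/test_wikipedia.py does. The sketch below is illustrative only: the query string and the canned JSON are made up, and mock.Mock simply stands in for the response object searx would normally hand to the engine (Python 2, matching the rest of the code base).

    # illustrative sketch, mirroring the unit tests
    from collections import defaultdict
    import mock
    from searx.engines import wikipedia

    # build the MediaWiki API request URL for an example query
    params = wikipedia.request('paris', defaultdict(dict, language='fr_FR'))
    # params['url'] now points at https://fr.wikipedia.org/w/api.php with
    # action=query, format=json and both 'paris' and 'Paris' as titles
    print(params['url'])

    # feed a canned "page not found" reply; response() yields no results
    json = '{"query": {"pages": {"-1": {"ns": 0, "title": "", "missing": ""}}}}'
    resp = mock.Mock(content=json, search_params={'language': 'fr_FR'})
    print(wikipedia.response(resp))  # []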