1
0
mirror of https://github.com/searxng/searxng.git synced 2024-11-19 02:40:11 +01:00

[fix] Bing-Web engine: XPath to get the wikipedia result

Modify the XPath selector to get the Wikipedia result, plus small fixes.

About result content: especially with the Wikipedia result, we'd get several
paragraph elements; only the first paragraph is taken and displayed in the
search result.
This commit is contained in:
ahmad-alkadri 2023-01-03 22:59:01 +01:00 committed by Markus Heiser
parent 4e355564d2
commit 9ee99423fe

View File

@ -4,6 +4,7 @@
- https://github.com/searx/searx/issues/2019#issuecomment-648227442 - https://github.com/searx/searx/issues/2019#issuecomment-648227442
""" """
# pylint: disable=too-many-branches
import re import re
from urllib.parse import urlencode, urlparse, parse_qs from urllib.parse import urlencode, urlparse, parse_qs
@ -74,7 +75,6 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
result_len = 0 result_len = 0
@ -84,12 +84,20 @@ def response(resp):
url_to_resolve = [] url_to_resolve = []
url_to_resolve_index = [] url_to_resolve_index = []
for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')): for i, result in enumerate(eval_xpath_list(dom, '//li[contains(@class, "b_algo")]')):
link = eval_xpath(result, './/h2/a')[0] link = eval_xpath(result, './/h2/a')[0]
url = link.attrib.get('href') url = link.attrib.get('href')
title = extract_text(link) title = extract_text(link)
content = extract_text(eval_xpath(result, './/p'))
# Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
content = eval_xpath(result, '(.//p)[1]')
for p in content:
for e in p.xpath('.//a'):
e.getparent().remove(e)
for e in p.xpath('.//span[@class="algoSlug_icon"]'):
e.getparent().remove(e)
content = extract_text(content)
# get the real URL either using the URL shown to user or following the Bing URL # get the real URL either using the URL shown to user or following the Bing URL
if url.startswith('https://www.bing.com/ck/a?'): if url.startswith('https://www.bing.com/ck/a?'):