1
0
mirror of https://github.com/searxng/searxng.git synced 2024-11-22 20:17:45 +01:00

[mod] bing: resolve redirect without additional requests

Remove the usage of searx.network.multi_requests
The results from Bing contain the target URL encoded in base64.
See the u parameter: remove the first two characters "a1", and done.

Also add a comment explaining the check of the result_len / pageno
( from https://github.com/searx/searx/pull/1387 )
This commit is contained in:
Alexandre Flament 2023-08-27 09:34:18 +00:00 committed by Markus Heiser
parent f57842b05d
commit faa4280e1a

View File

@ -29,10 +29,11 @@ inaccuracies there too):
# pylint: disable=too-many-branches, invalid-name # pylint: disable=too-many-branches, invalid-name
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import base64
import datetime import datetime
import re import re
import uuid import uuid
from urllib.parse import urlencode from urllib.parse import parse_qs, urlencode, urlparse
from lxml import html from lxml import html
import babel import babel
import babel.languages import babel.languages
@ -179,9 +180,7 @@ def request(query, params):
def response(resp): def response(resp):
# pylint: disable=too-many-locals,import-outside-toplevel # pylint: disable=too-many-locals
from searx.network import Request, multi_requests # see https://github.com/searxng/searxng/issues/762
results = [] results = []
result_len = 0 result_len = 0
@ -190,9 +189,6 @@ def response(resp):
# parse results again if nothing is found yet # parse results again if nothing is found yet
url_to_resolve = []
url_to_resolve_index = []
i = 0
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'): for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
link = eval_xpath_getindex(result, './/h2/a', 0, None) link = eval_xpath_getindex(result, './/h2/a', 0, None)
@ -208,38 +204,21 @@ def response(resp):
e.getparent().remove(e) e.getparent().remove(e)
content = extract_text(content) content = extract_text(content)
# get the real URL either using the URL shown to user or following the Bing URL # get the real URL
if url.startswith('https://www.bing.com/ck/a?'): if url.startswith('https://www.bing.com/ck/a?'):
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) # get the first value of u parameter
# Bing can shorten the URL either at the end or in the middle of the string url_query = urlparse(url).query
if ( parsed_url_query = parse_qs(url_query)
url_cite param_u = parsed_url_query["u"][0]
and url_cite.startswith('https://') # remove "a1" in front
and '' not in url_cite encoded_url = param_u[2:]
and '...' not in url_cite # add padding
and '' not in url_cite encoded_url = encoded_url + '=' * (-len(encoded_url) % 4)
): # decode base64 encoded URL
# no need for an additional HTTP request url = base64.urlsafe_b64decode(encoded_url).decode()
url = url_cite
else:
# resolve the URL with an additional HTTP request
url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
url_to_resolve_index.append(i)
url = None # remove the result if the HTTP Bing redirect raise an exception
# append result # append result
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
# increment result pointer for the next iteration in this loop
i += 1
# resolve all Bing redirections in parallel
request_list = [
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
]
response_list = multi_requests(request_list)
for i, redirect_response in enumerate(response_list):
if not isinstance(redirect_response, Exception):
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
# get number_of_results # get number_of_results
try: try:
@ -258,6 +237,10 @@ def response(resp):
logger.debug('result error :\n%s', e) logger.debug('result error :\n%s', e)
if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
# Avoid reading more results than available.
# For example, if there are 100 results for some search and we try to get results from 120 to 130,
# Bing will send back the results from 0 to 10 and no error.
# If we compare the results count with the first parameter of the request we can avoid these "invalid" results.
return [] return []
results.append({'number_of_results': result_len}) results.append({'number_of_results': result_len})