mirror of
https://github.com/searxng/searxng.git
synced 2024-11-04 20:30:11 +01:00
Merge pull request #1219 from dalf/follow_bing_redirect
bing.py: remove redirection links
This commit is contained in:
commit
44f2eb50a5
@ -8,7 +8,8 @@
|
||||
import re
|
||||
from urllib.parse import urlencode, urlparse, parse_qs
|
||||
from lxml import html
|
||||
from searx.utils import eval_xpath, extract_text, match_language
|
||||
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
|
||||
from searx.network import multi_requests, Request
|
||||
|
||||
about = {
|
||||
"website": 'https://www.bing.com',
|
||||
@ -79,30 +80,48 @@ def response(resp):
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
|
||||
|
||||
# IMO //div[@class="sa_cc"] does no longer match
|
||||
logger.debug('found //div[@class="sa_cc"] --> %s', result)
|
||||
|
||||
link = eval_xpath(result, './/h3/a')[0]
|
||||
url = link.attrib.get('href')
|
||||
title = extract_text(link)
|
||||
content = extract_text(eval_xpath(result, './/p'))
|
||||
|
||||
# append result
|
||||
results.append({'url': url, 'title': title, 'content': content})
|
||||
|
||||
# parse results again if nothing is found yet
|
||||
for result in eval_xpath(dom, '//li[@class="b_algo"]'):
|
||||
|
||||
url_to_resolve = []
|
||||
url_to_resolve_index = []
|
||||
for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
|
||||
|
||||
link = eval_xpath(result, './/h2/a')[0]
|
||||
url = link.attrib.get('href')
|
||||
title = extract_text(link)
|
||||
content = extract_text(eval_xpath(result, './/p'))
|
||||
|
||||
# get the real URL either using the URL shown to user or following the Bing URL
|
||||
if url.startswith('https://www.bing.com/ck/a?'):
|
||||
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
|
||||
# Bing can shorten the URL either at the end or in the middle of the string
|
||||
if (
|
||||
url_cite.startswith('https://')
|
||||
and '…' not in url_cite
|
||||
and '...' not in url_cite
|
||||
and '›' not in url_cite
|
||||
):
|
||||
# no need for an additional HTTP request
|
||||
url = url_cite
|
||||
else:
|
||||
# resolve the URL with an additional HTTP request
|
||||
url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
|
||||
url_to_resolve_index.append(i)
|
||||
url = None # remove the result if the HTTP Bing redirect raise an exception
|
||||
|
||||
# append result
|
||||
results.append({'url': url, 'title': title, 'content': content})
|
||||
|
||||
# resolve all Bing redirections in parallel
|
||||
request_list = [
|
||||
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
|
||||
]
|
||||
response_list = multi_requests(request_list)
|
||||
for i, redirect_response in enumerate(response_list):
|
||||
if not isinstance(redirect_response, Exception):
|
||||
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
|
||||
|
||||
# get number_of_results
|
||||
try:
|
||||
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
|
||||
if "-" in result_len_container:
|
||||
|
@ -8,7 +8,8 @@ import concurrent.futures
|
||||
from queue import SimpleQueue
|
||||
from types import MethodType
|
||||
from timeit import default_timer
|
||||
from typing import Iterable, Tuple
|
||||
from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
|
||||
from contextlib import contextmanager
|
||||
|
||||
import httpx
|
||||
import anyio
|
||||
@ -48,9 +49,23 @@ def get_context_network():
|
||||
return THREADLOCAL.__dict__.get('network') or get_network()
|
||||
|
||||
|
||||
def request(method, url, **kwargs):
|
||||
"""same as requests/requests/api.py request(...)"""
|
||||
@contextmanager
|
||||
def _record_http_time():
|
||||
# pylint: disable=too-many-branches
|
||||
time_before_request = default_timer()
|
||||
start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
|
||||
try:
|
||||
yield start_time
|
||||
finally:
|
||||
# update total_time.
|
||||
# See get_time_for_thread() and reset_time_for_thread()
|
||||
if hasattr(THREADLOCAL, 'total_time'):
|
||||
time_after_request = default_timer()
|
||||
THREADLOCAL.total_time += time_after_request - time_before_request
|
||||
|
||||
|
||||
def _get_timeout(start_time, kwargs):
|
||||
# pylint: disable=too-many-branches
|
||||
|
||||
# timeout (httpx)
|
||||
if 'timeout' in kwargs:
|
||||
@ -65,45 +80,84 @@ def request(method, url, **kwargs):
|
||||
|
||||
# ajdust actual timeout
|
||||
timeout += 0.2 # overhead
|
||||
start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
|
||||
if start_time:
|
||||
timeout -= default_timer() - start_time
|
||||
|
||||
# raise_for_error
|
||||
check_for_httperror = True
|
||||
if 'raise_for_httperror' in kwargs:
|
||||
check_for_httperror = kwargs['raise_for_httperror']
|
||||
del kwargs['raise_for_httperror']
|
||||
return timeout
|
||||
|
||||
# requests compatibility
|
||||
if isinstance(url, bytes):
|
||||
url = url.decode()
|
||||
|
||||
# network
|
||||
network = get_context_network()
|
||||
def request(method, url, **kwargs):
|
||||
"""same as requests/requests/api.py request(...)"""
|
||||
with _record_http_time() as start_time:
|
||||
network = get_context_network()
|
||||
timeout = _get_timeout(start_time, kwargs)
|
||||
future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
|
||||
try:
|
||||
return future.result(timeout)
|
||||
except concurrent.futures.TimeoutError as e:
|
||||
raise httpx.TimeoutException('Timeout', request=None) from e
|
||||
|
||||
# do request
|
||||
future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
|
||||
try:
|
||||
response = future.result(timeout)
|
||||
except concurrent.futures.TimeoutError as e:
|
||||
raise httpx.TimeoutException('Timeout', request=None) from e
|
||||
|
||||
# requests compatibility
|
||||
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
|
||||
response.ok = not response.is_error
|
||||
def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
|
||||
"""send multiple HTTP requests in parallel. Wait for all requests to finish."""
|
||||
with _record_http_time() as start_time:
|
||||
# send the requests
|
||||
network = get_context_network()
|
||||
loop = get_loop()
|
||||
future_list = []
|
||||
for request_desc in request_list:
|
||||
timeout = _get_timeout(start_time, request_desc.kwargs)
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
|
||||
)
|
||||
future_list.append((future, timeout))
|
||||
|
||||
# update total_time.
|
||||
# See get_time_for_thread() and reset_time_for_thread()
|
||||
if hasattr(THREADLOCAL, 'total_time'):
|
||||
time_after_request = default_timer()
|
||||
THREADLOCAL.total_time += time_after_request - time_before_request
|
||||
# read the responses
|
||||
responses = []
|
||||
for future, timeout in future_list:
|
||||
try:
|
||||
responses.append(future.result(timeout))
|
||||
except concurrent.futures.TimeoutError:
|
||||
responses.append(httpx.TimeoutException('Timeout', request=None))
|
||||
except Exception as e: # pylint: disable=broad-except
|
||||
responses.append(e)
|
||||
return responses
|
||||
|
||||
# raise an exception
|
||||
if check_for_httperror:
|
||||
raise_for_httperror(response)
|
||||
|
||||
return response
|
||||
class Request(NamedTuple):
|
||||
"""Request description for the multi_requests function"""
|
||||
|
||||
method: str
|
||||
url: str
|
||||
kwargs: Dict[str, str] = {}
|
||||
|
||||
@staticmethod
|
||||
def get(url, **kwargs):
|
||||
return Request('GET', url, kwargs)
|
||||
|
||||
@staticmethod
|
||||
def options(url, **kwargs):
|
||||
return Request('OPTIONS', url, kwargs)
|
||||
|
||||
@staticmethod
|
||||
def head(url, **kwargs):
|
||||
return Request('HEAD', url, kwargs)
|
||||
|
||||
@staticmethod
|
||||
def post(url, **kwargs):
|
||||
return Request('POST', url, kwargs)
|
||||
|
||||
@staticmethod
|
||||
def put(url, **kwargs):
|
||||
return Request('PUT', url, kwargs)
|
||||
|
||||
@staticmethod
|
||||
def patch(url, **kwargs):
|
||||
return Request('PATCH', url, kwargs)
|
||||
|
||||
@staticmethod
|
||||
def delete(url, **kwargs):
|
||||
return Request('DELETE', url, kwargs)
|
||||
|
||||
|
||||
def get(url, **kwargs):
|
||||
|
@ -13,6 +13,7 @@ import httpx
|
||||
|
||||
from searx import logger, searx_debug
|
||||
from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
|
||||
from .raise_for_httperror import raise_for_httperror
|
||||
|
||||
|
||||
logger = logger.getChild('network')
|
||||
@ -226,6 +227,27 @@ class Network:
|
||||
kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
|
||||
return kwargs_clients
|
||||
|
||||
@staticmethod
|
||||
def extract_do_raise_for_httperror(kwargs):
|
||||
do_raise_for_httperror = True
|
||||
if 'raise_for_httperror' in kwargs:
|
||||
do_raise_for_httperror = kwargs['raise_for_httperror']
|
||||
del kwargs['raise_for_httperror']
|
||||
return do_raise_for_httperror
|
||||
|
||||
@staticmethod
|
||||
def patch_response(response, do_raise_for_httperror):
|
||||
if isinstance(response, httpx.Response):
|
||||
# requests compatibility (response is not streamed)
|
||||
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
|
||||
response.ok = not response.is_error
|
||||
|
||||
# raise an exception
|
||||
if do_raise_for_httperror:
|
||||
raise_for_httperror(response)
|
||||
|
||||
return response
|
||||
|
||||
def is_valid_response(self, response):
|
||||
# pylint: disable=too-many-boolean-expressions
|
||||
if (
|
||||
@ -239,6 +261,7 @@ class Network:
|
||||
async def call_client(self, stream, method, url, **kwargs):
|
||||
retries = self.retries
|
||||
was_disconnected = False
|
||||
do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs)
|
||||
kwargs_clients = Network.extract_kwargs_clients(kwargs)
|
||||
while retries >= 0: # pragma: no cover
|
||||
client = await self.get_client(**kwargs_clients)
|
||||
@ -248,7 +271,7 @@ class Network:
|
||||
else:
|
||||
response = await client.request(method, url, **kwargs)
|
||||
if self.is_valid_response(response) or retries <= 0:
|
||||
return response
|
||||
return Network.patch_response(response, do_raise_for_httperror)
|
||||
except httpx.RemoteProtocolError as e:
|
||||
if not was_disconnected:
|
||||
# the server has closed the connection:
|
||||
|
@ -141,28 +141,28 @@ class TestNetworkRequestRetries(SearxTestCase):
|
||||
async def test_retries_ok(self):
|
||||
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
||||
network = Network(enable_http=True, retries=1, retry_on_http_error=403)
|
||||
response = await network.request('GET', 'https://example.com/')
|
||||
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||
self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
|
||||
await network.aclose()
|
||||
|
||||
async def test_retries_fail_int(self):
|
||||
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
||||
network = Network(enable_http=True, retries=0, retry_on_http_error=403)
|
||||
response = await network.request('GET', 'https://example.com/')
|
||||
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||
self.assertEqual(response.status_code, 403)
|
||||
await network.aclose()
|
||||
|
||||
async def test_retries_fail_list(self):
|
||||
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
||||
network = Network(enable_http=True, retries=0, retry_on_http_error=[403, 429])
|
||||
response = await network.request('GET', 'https://example.com/')
|
||||
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||
self.assertEqual(response.status_code, 403)
|
||||
await network.aclose()
|
||||
|
||||
async def test_retries_fail_bool(self):
|
||||
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
||||
network = Network(enable_http=True, retries=0, retry_on_http_error=True)
|
||||
response = await network.request('GET', 'https://example.com/')
|
||||
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||
self.assertEqual(response.status_code, 403)
|
||||
await network.aclose()
|
||||
|
||||
@ -178,7 +178,7 @@ class TestNetworkRequestRetries(SearxTestCase):
|
||||
|
||||
with patch.object(httpx.AsyncClient, 'request', new=get_response):
|
||||
network = Network(enable_http=True, retries=2)
|
||||
response = await network.request('GET', 'https://example.com/')
|
||||
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
|
||||
await network.aclose()
|
||||
@ -190,7 +190,7 @@ class TestNetworkRequestRetries(SearxTestCase):
|
||||
with patch.object(httpx.AsyncClient, 'request', new=get_response):
|
||||
network = Network(enable_http=True, retries=0)
|
||||
with self.assertRaises(httpx.RequestError):
|
||||
await network.request('GET', 'https://example.com/')
|
||||
await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||
await network.aclose()
|
||||
|
||||
|
||||
@ -237,6 +237,6 @@ class TestNetworkStreamRetries(SearxTestCase):
|
||||
|
||||
with patch.object(httpx.AsyncClient, 'stream', new=stream):
|
||||
network = Network(enable_http=True, retries=0, retry_on_http_error=403)
|
||||
response = await network.stream('GET', 'https://example.com/')
|
||||
response = await network.stream('GET', 'https://example.com/', raise_for_httperror=False)
|
||||
self.assertEqual(response.status_code, 403)
|
||||
await network.aclose()
|
||||
|
Loading…
Reference in New Issue
Block a user