searxng/searx/engines/bing.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing (Web)

- https://github.com/searx/searx/issues/2019#issuecomment-648227442
"""

import re
from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
from searx.network import multi_requests, Request

# Meta information about this engine, kept in a variable (rather than in a
# comment) so that other parts of the application can read it.
about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    # The engine parses Bing's HTML result page (see ``response``) instead of
    # calling the official API, hence no API key is required.
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
# follow-up pages are requested through the ``first`` offset of ``page_query``
paging = True
time_range_support = False
safesearch = False
# engine option: send an 'Accept-Language' HTTP header with the request
send_accept_language_header = True
# NOTE(review): presumably the page handed to ``_fetch_supported_languages`` to
# collect the language tags -- confirm against the language fetching tooling
supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {}

# search-url
base_url = 'https://www.bing.com/'

# initial query:     https://www.bing.com/search?q=foo&search=&form=QBLH
# NOTE: the name is misspelled ("inital") but it is referenced by ``request``;
# keep the spelling unless every user of the name is renamed as well.
inital_query = 'search?{query}&search=&form=QBLH'

# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE
page_query = 'search?{query}&search=&first={offset}&FORM=PERE'


def _get_offset_from_pageno(pageno):
    return (pageno - 1) * 10 + 1


def request(query, params):
    """Fill ``params`` with the URL and the HTTP headers of the Bing request.

    The first page is requested with the ``inital_query`` template, any other
    page with ``page_query`` and its ``first`` offset.  For offsets beyond the
    first page a ``Referer`` header pointing at the initial query is added.
    The search term is prefixed with a ``language:XX`` filter; ``EN`` is used
    when the selected language is ``all``.

    :param query: the search term
    :param params: request parameters; ``language`` and ``headers`` are read,
        ``pageno`` is optional and defaults to 1
    :return: the same ``params`` dict, with ``url`` and headers set
    """
    offset = _get_offset_from_pageno(params.get('pageno', 1))
    url_template = inital_query if offset == 1 else page_query

    selected_language = params['language']
    lang = (
        'EN'
        if selected_language == 'all'
        else match_language(selected_language, supported_languages, language_aliases)
    )

    # only the primary language subtag goes into the ``language:`` filter
    lang_code = lang.split('-')[0].upper()
    query = 'language:{} {}'.format(lang_code, query)
    encoded_query = urlencode({'q': query})

    search_path = url_template.format(query=encoded_query, offset=offset)

    if offset > 1:
        # follow-up pages name the initial query as the page they come from
        referer = base_url + inital_query.format(query=encoded_query)
        params['headers']['Referer'] = referer
        logger.debug("headers.Referer --> %s", referer)

    params['url'] = base_url + search_path
    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    return params


def response(resp):
    """Parse a Bing result page into a list of result dicts.

    Every ``<li class="b_algo">`` entry that has a title link becomes a dict
    with the keys ``url``, ``title`` and ``content``.  Bing wraps links into
    ``https://www.bing.com/ck/a?...`` redirects: the real URL is taken from the
    ``<cite>`` shown to the user when it is complete, otherwise the redirect is
    resolved with one additional HTTP request per link (all sent in parallel).
    A link that cannot be resolved keeps ``url = None``.

    The last item of the returned list is ``{'number_of_results': <int>}``.  An
    empty list is returned when the requested page starts beyond the number of
    results Bing reports, because Bing answers such requests with the first
    results again instead of an error.

    :param resp: HTTP response; ``resp.text`` and ``resp.search_params`` are read
    :return: list of result dicts
    """

    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    url_to_resolve = []
    url_to_resolve_index = []
    for result in eval_xpath_list(dom, '//li[@class="b_algo"]'):

        links = eval_xpath(result, './/h2/a')
        if not links:
            # entry without a title link: nothing usable, skip it instead of
            # failing the whole page
            continue
        link = links[0]
        url = link.attrib.get('href')
        if url is None:
            continue
        title = extract_text(link)
        content = extract_text(eval_xpath(result, './/p'))

        # get the real URL either using the URL shown to user or following the Bing URL
        if url.startswith('https://www.bing.com/ck/a?'):
            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
            # Bing can shorten the URL either at the end or in the middle of the string
            if (
                url_cite.startswith('https://')
                and '…' not in url_cite
                and '...' not in url_cite
                and '›' not in url_cite
            ):
                # no need for an additional HTTP request
                url = url_cite
            else:
                # resolve the URL with an additional HTTP request
                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
                # position the result is about to get in ``results``; entries
                # may have been skipped, so the loop counter cannot be used
                url_to_resolve_index.append(len(results))
                # stays None when the redirect cannot be resolved
                # NOTE(review): presumably results without a valid URL are
                # dropped further down the pipeline -- confirm
                url = None

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # resolve all Bing redirections in parallel
    request_list = [
        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
    ]
    response_list = multi_requests(request_list)
    for result_index, redirect_response in zip(url_to_resolve_index, response_list):
        if isinstance(redirect_response, Exception):
            continue
        try:
            results[result_index]['url'] = redirect_response.headers['location']
        except KeyError:
            # Bing did not answer with a redirect: keep ``url = None`` for this
            # result instead of losing all results of the page
            logger.debug("no Location header in the response of a Bing redirect URL")

    # get number_of_results
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:

            # Remove the part "from-to" for paginated request ...
            # The slice start assumes the text begins with "from-to" where both
            # numbers have the same number of digits (the index of "-"),
            # followed by one separator character.
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]

        result_len_container = re.sub('[^0-9]', '', result_len_container)

        if len(result_len_container) > 0:
            result_len = int(result_len_container)

    except Exception as e:  # pylint: disable=broad-except
        logger.debug('result error :\n%s', e)

    # the requested page starts after the last result: the results on this
    # page do not belong to the request
    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Return the language tags found in the ``language-section`` of ``resp``.

    Each ``<li>`` of ``//div[@id="language-section"]`` is expected to carry a
    link whose query string has a ``setlang`` parameter (e.g. ``setlang=en-gb``
    or ``setlang=mn-Cyrl-MN``).  Entries without a link or without a non-empty
    ``setlang`` value are ignored.

    :param resp: HTTP response; only ``resp.text`` is read
    :return: list of unique language tags, in no particular order
    """

    lang_tags = set()

    dom = html.fromstring(resp.text)
    lang_links = eval_xpath(dom, '//div[@id="language-section"]//li')

    for _li in lang_links:

        hrefs = eval_xpath(_li, './/@href')
        if not hrefs:
            # list item without a link
            continue

        query = parse_qs(urlparse(hrefs[0]).query, keep_blank_values=True)
        setlang = query.get('setlang', [None])[0]
        if not setlang:
            # ``setlang`` missing or blank: no language to register
            continue

        # example: 'mn-Cyrl-MN' --> lang 'mn', nation 'Cyrl-MN'
        lang, _sep, nation = setlang.partition('-')

        tag = lang + '-' + nation if nation else lang
        lang_tags.add(tag)

    return list(lang_tags)
-												[enh] engines: add about variable

Move the meta information from the comment to the `about` variable
so that the preferences and the documentation can show this information.

											
										
										
											2021-01-13 11:31:25 +01:00
+								# SPDX-License-Identifier: AGPL-3.0-or-later
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 11:40:12 +01:00
+								# lint: pylint
 								"""Bing (Web)
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] no longer matches in a Bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 13:41:12 +01:00
 								- https://github.com/searx/searx/issues/2019#issuecomment-648227442
-												update versions.cfg to use the current up-to-date packages

											
										
										
											2015-05-02 15:45:17 +02:00
+								"""
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
-												Fix bing engine results count (#1387)

This PR fixes the result count from Bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than are available.

For example:
If there are 100 results for some search and we try to get results 120 to 130, Bing will send back results 0 to 10 and no error. By comparing the result count with the `first` parameter of the request we can avoid these "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								import re
-												[fix] bing engines: fetch_supported_languages

The Request to and the Response from https://www.bing.com/account/general has
been changed.

[1] https://github.com/searxng/searxng/pull/672#discussion_r777104919

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-01-01 16:47:47 +01:00
+								from urllib.parse import urlencode, urlparse, parse_qs
-												Improves PEP8 compatibility.

											
										
										
											2014-02-05 20:24:31 +01:00
+								from lxml import html
-												bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once

											
										
										
											2022-05-21 18:24:47 +02:00
+								from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
 								from searx.network import multi_requests, Request
-												[enh] bing engine added

											
										
										
											2013-10-24 23:52:57 +02:00
-												[enh] engines: add about variable

move meta information from comment to the about variable
so the preferences, the documentation can show these information

											
										
										
											2021-01-13 11:31:25 +01:00
+								about = {
 								    "website": 'https://www.bing.com',
 								    "wikidata_id": 'Q182496',
 								    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
 								    "use_official_api": False,
 								    "require_api_key": False,
 								    "results": 'HTML',
 								}
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
+								# engine dependent config
-												[enh] add more categories

											
										
										
											2021-12-22 16:58:52 +01:00
+								categories = ['general', 'web']
-												[enh] bing, google paging support

											
										
										
											2014-01-29 21:14:38 +01:00
+								paging = True
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 11:40:12 +01:00
+								time_range_support = False
 								safesearch = False
-												[mod] add 'Accept-Language' HTTP header to online processores

Most engines that support languages (and regions) use the Accept-Language from
the WEB browser to build a response that fits to the language (and region).

- add new engine option: send_accept_language_header

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-01 17:01:59 +02:00
+								send_accept_language_header = True
-												[mod] fetch supported languages for several engines
utils/fetch_languages.py gets languages supported by each engine and
generates engines_languages.json with each engine's supported language.

											
										
										
											2016-11-06 03:51:38 +01:00
+								supported_languages_url = 'https://www.bing.com/account/general'
-												[fix] bing engines: fetch_supported_languages

The Request to and the Response from https://www.bing.com/account/general has
been changed.

[1] https://github.com/searxng/searxng/pull/672#discussion_r777104919

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-01-01 16:47:47 +01:00
+								language_aliases = {}
-												[enh] bing, google paging support

											
										
										
											2014-01-29 21:14:38 +01:00
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
+								# search-url
 								base_url = 'https://www.bing.com/'
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 13:41:12 +01:00
 								# initial query:     https://www.bing.com/search?q=foo&search=&form=QBLH
 								inital_query = 'search?{query}&search=&form=QBLH'
 								# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE
 								page_query = 'search?{query}&search=&first={offset}&FORM=PERE'
-												[enh] bing engine added

											
										
										
											2013-10-24 23:52:57 +02:00
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								def _get_offset_from_pageno(pageno):
 								    return (pageno - 1) * 10 + 1
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
-												[enh] bing engine added

											
										
										
											2013-10-24 23:52:57 +02:00
+								def request(query, params):
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 13:41:12 +01:00
+								    offset = _get_offset_from_pageno(params.get('pageno', 1))
 								    # logger.debug("params['pageno'] --> %s", params.get('pageno'))
 								    # logger.debug("          offset --> %s", offset)
 								    search_string = page_query
 								    if offset == 1:
 								        search_string = inital_query
-												Revert "remove 'all' option from search languages"

This reverts commit 4d1770398a6af8902e75c0bd885781584d39e796.

											
										
										
											2019-01-06 15:27:46 +01:00
+								    if params['language'] == 'all':
 								        lang = 'EN'
 								    else:
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								        lang = match_language(params['language'], supported_languages, language_aliases)
-												[fix] use english as default language in bing

If no language is specified, Bing returns results in multiple languages
for one query, which isn't really useful. Set English as the default
instead of nothing.

											
										
										
											2016-12-30 18:17:14 +01:00
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								    query = 'language:{} {}'.format(lang.split('-')[0].upper(), query)
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								    search_path = search_string.format(query=urlencode({'q': query}), offset=offset)
-												fix bing "garbage" results (issue #1275)

											
										
										
											2018-05-21 01:10:22 +02:00
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 13:41:12 +01:00
+								    if offset > 1:
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								        referer = base_url + inital_query.format(query=urlencode({'q': query}))
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 13:41:12 +01:00
+								        params['headers']['Referer'] = referer
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								        logger.debug("headers.Referer --> %s", referer)
-												[enh] bing engine added

											
										
										
											2013-10-24 23:52:57 +02:00
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 13:41:12 +01:00
+								    params['url'] = base_url + search_path
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 13:41:12 +01:00
+								    return params
-												[enh] bing engine added

											
										
										
											2013-10-24 23:52:57 +02:00
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
-												[enh] bing engine added

											
										
										
											2013-10-24 23:52:57 +02:00
+								def response(resp):
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 11:40:12 +01:00
-												[enh] bing engine added

											
										
										
											2013-10-24 23:52:57 +02:00
+								    results = []
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								    result_len = 0
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
-												[fix] bing unicode encode error - fixes #408

											
										
										
											2015-08-28 14:51:32 +02:00
+								    dom = html.fromstring(resp.text)
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 11:40:12 +01:00
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
+								    # parse results again if nothing is found yet
-												bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once

											
										
										
											2022-05-21 18:24:47 +02:00
 								    url_to_resolve = []
 								    url_to_resolve_index = []
 								    for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 11:40:12 +01:00
-												[mod] speed optimization

compile XPath only once
avoid redundant call to urlparse
get_locale(webapp.py): avoid useless call to request.accept_languages.best_match

											
										
										
											2019-11-15 09:31:37 +01:00
+								        link = eval_xpath(result, './/h2/a')[0]
-												[enh] bing updates ++ language support

											
										
										
											2013-10-25 01:37:48 +02:00
+								        url = link.attrib.get('href')
-												Add bing in the test units

											
										
										
											2015-01-25 20:14:37 +01:00
+								        title = extract_text(link)
-												[mod] speed optimization

compile XPath only once
avoid redundant call to urlparse
get_locale(webapp.py): avoid useless call to request.accept_languages.best_match

											
										
										
											2019-11-15 09:31:37 +01:00
+								        content = extract_text(eval_xpath(result, './/p'))
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
-												bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once

											
										
										
											2022-05-21 18:24:47 +02:00
+								        # get the real URL either using the URL shown to user or following the Bing URL
 								        if url.startswith('https://www.bing.com/ck/a?'):
 								            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
 								            # Bing can shorten the URL either at the end or in the middle of the string
 								            if (
 								                url_cite.startswith('https://')
 								                and '…' not in url_cite
 								                and '...' not in url_cite
 								                and '›' not in url_cite
 								            ):
 								                # no need for an additional HTTP request
 								                url = url_cite
 								            else:
 								                # resolve the URL with an additional HTTP request
 								                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
 								                url_to_resolve_index.append(i)
 								                url = None  # remove the result if the HTTP Bing redirect raise an exception
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
+								        # append result
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								        results.append({'url': url, 'title': title, 'content': content})
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 14:38:59 +02:00
-												bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once

											
										
										
											2022-05-21 18:24:47 +02:00
+								    # resolve all Bing redirections in parallel
 								    request_list = [
 								        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
 								    ]
 								    response_list = multi_requests(request_list)
 								    for i, redirect_response in enumerate(response_list):
 								        if not isinstance(redirect_response, Exception):
 								            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
 								    # get number_of_results
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								    try:
-												[fix] handle missing result size

											
										
										
											2020-01-02 22:28:47 +01:00
+								        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								        if "-" in result_len_container:
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 11:40:12 +01:00
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								            # Remove the part "from-to" for paginated request ...
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
 								        result_len_container = re.sub('[^0-9]', '', result_len_container)
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 11:40:12 +01:00
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								        if len(result_len_container) > 0:
 								            result_len = int(result_len_container)
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 11:40:12 +01:00
 								    except Exception as e:  # pylint: disable=broad-except
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								        logger.debug('result error :\n%s', e)
-												[fix] handle missing result size

											
										
										
											2020-01-02 22:28:47 +01:00
+								    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
-												Fix bing engine results count (#1387)

This PR fixes the result count from Bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than are available.

For example :
If there are 100 results for some search and we try to get results 120 to 130, Bing will send back results 0 to 10 and no error. If we compare the result count with the `first` parameter of the request, we can avoid these "invalid" results.
											
										
										
											2019-08-05 16:15:40 +02:00
+								        return []
 								    results.append({'number_of_results': result_len})
-												[enh] bing engine added

											
										
										
											2013-10-24 23:52:57 +02:00
+								    return results
-												[mod] fetch supported languages for several engines
utils/fetch_languages.py gets languages supported by each engine and
generates engines_languages.json with each engine's supported language.

											
										
										
											2016-11-06 03:51:38 +01:00
 								# get supported languages from their site
-												tests for _fetch_supported_languages in engines
and refactor method to make it testable without making requests

											
										
										
											2016-12-15 07:34:43 +01:00
def _fetch_supported_languages(resp):
    """Collect the language tags offered in the language section of *resp*.

    The HTML in ``resp.text`` is searched for ``<li>`` elements below
    ``<div id="language-section">``.  The first ``href`` found in each item is
    parsed and the value of its ``setlang`` query parameter is turned into a
    language tag.

    Items without a link, or whose link has no (or an empty) ``setlang``
    value, are skipped instead of raising.

    :param resp: response object; only its ``text`` attribute is read.
    :returns: sorted list of unique tags such as ``'en'`` or ``'mn-Cyrl-MN'``.
    """
    lang_tags = set()
    dom = html.fromstring(resp.text)

    for _li in eval_xpath(dom, '//div[@id="language-section"]//li'):
        hrefs = eval_xpath(_li, './/@href')
        if not hrefs:
            # list item without a link: nothing to extract
            continue

        query = parse_qs(urlparse(hrefs[0]).query, keep_blank_values=True)
        setlang = query.get('setlang', [None])[0]
        if not setlang:
            # parameter missing, or present but blank (keep_blank_values=True)
            continue

        # example: 'mn-Cyrl-MN' --> ('mn', 'Cyrl-MN')
        lang, _sep, nation = setlang.partition('-')

        # Append the second part only when it exists, so a bare language code
        # is never reused as its own country/region.
        lang_tags.add(lang + '-' + nation if nation else lang)

    # sorted: set iteration order is arbitrary, callers get a stable list
    return sorted(lang_tags)