searxng/searx/engines/wikipedia.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This module implements the Wikipedia engine.  Some of this implementations
are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is fetched from the article linked by
:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, wikipedia
does not support one Wikipedia for all the languages, but there is one Wikipedia
for every language (:py:obj:`fetch_traits`).
"""

import urllib.parse
import babel

from lxml import html

from searx import network
from searx.locales import language_tag
from searx.enginelib.traits import EngineTraits

traits: EngineTraits

# about: meta information of this engine (website, Wikidata ID, link to the
# API documentation, result format).  Kept in a variable rather than in a
# comment so that other parts of the application can read it.
about = {
    "website": 'https://www.wikipedia.org/',
    "wikidata_id": 'Q52',
    "official_api_documentation": 'https://en.wikipedia.org/api/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# Engine option; presumably makes the online processor add an
# 'Accept-Language' HTTP header to the outgoing request -- confirm against the
# processor implementation.
send_accept_language_header = True

# URL of the page fetched by fetch_traits(); the rows of its sortable table
# are parsed to build the list of supported languages.
wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated.  The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

# URL template filled in by request(): {wiki_netloc} is the network location
# of the language specific Wikipedia, {title} the percent-encoded page title.
# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
the first paragraph of the page in plain text and HTML as well as the type of
page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
and link previews in the apps.

.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

"""


def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_).

    The query is used as the title of the page to look up.  The Wikipedia to
    ask is selected by the SearXNG locale of the request
    (``params['searxng_locale']``); when no Wikipedia is known for that
    locale, the English Wikipedia is used.

    :param query: search term, used as page title
    :param params: request parameters, ``url``, ``raise_for_httperror`` and
        ``soft_max_redirects`` are set by this function
    :return: the modified ``params``
    """
    # An all-lowercase query is title-cased before it is used as page title
    # ("albert einstein" --> "Albert Einstein").
    if query.islower():
        query = query.title()

    engine_language = traits.get_language(params['searxng_locale'], 'en')
    # The fallback has to be a bare network location: the value is inserted
    # into 'https://{wiki_netloc}/...', a full URL would result in a broken
    # 'https://https://...' request URL.
    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'en.wikipedia.org')
    title = urllib.parse.quote(query)

    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    # HTTP errors are evaluated in response(): 404 and some 400 answers are
    # regular "no result" answers and must not raise.
    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

    return params


# get response from search-request
def response(resp):
    """Build the result list from a response of the summary API.

    :param resp: HTTP response of the request assembled by ``request()``
    :return: an empty list when the page does not exist (HTTP 404) or the
        title contains invalid characters (HTTP 400 with a matching error
        document); otherwise a list with one link result and, for pages of
        type ``standard``, one infobox
    :raises: whatever ``network.raise_for_httperror`` raises for any other
        HTTP error
    """
    if resp.status_code == 404:
        # no page with this title
        return []

    if resp.status_code == 400:
        try:
            api_result = resp.json()
        except Exception:  # pylint: disable=broad-except
            # Best effort: a body that is not JSON is handled by
            # raise_for_httperror below.
            api_result = None
        # Use tolerant lookups: an error document of a different shape must
        # end up in raise_for_httperror and not in a KeyError / TypeError.
        if (
            isinstance(api_result, dict)
            and api_result.get('type') == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
            and api_result.get('detail') == 'title-invalid-characters'
        ):
            return []

    network.raise_for_httperror(resp)

    api_result = resp.json()
    # 'title' is used as plain text title of the results
    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']

    results = [{'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')}]

    if api_result.get('type') == 'standard':
        results.append(
            {
                'infobox': title,
                'id': wikipedia_link,
                'content': api_result.get('extract', ''),
                'img_src': api_result.get('thumbnail', {}).get('source'),
                'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
            }
        )

    return results


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).
#
# Maps the code used by Wikipedia to the tag that fetch_traits() hands over to
# babel.Locale.parse(); codes without an entry are passed to babel unchanged.

lang_map = {
    'be-tarask': 'bel',
    'ak': 'aka',
    'als': 'gsw',
    'bat-smg': 'sgs',
    'cbk-zam': 'cbk',
    'fiu-vro': 'vro',
    'map-bms': 'map',
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    # 'simple': invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
}

# Wikipedia language codes that fetch_traits() skips without asking babel.
unknown_langs = [
    'an',  # Aragonese
    'ba',  # Bashkir
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
    'bpy',  # Bishnupriya Manipuri is unknown by babel
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'li',  # Limburgish
    'sco',  # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
    'sh',  # Serbo-Croatian
    'simple',  # Simple English is not known as a natural language different from English (babel)
    'vo',  # Volapük
    'wa',  # Walloon
]


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.

    The languages are read from the table in the article linked by
    :py:obj:`wikipedia_article_depth`.  Wikipedias with less than 10000
    articles or an editing depth below 20 are ignored, as well as the codes
    listed in ``unknown_langs`` and codes unknown to babel.

    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  The keys are the language codes used by Wikipedia.
    Here is a reduced example:

    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "als": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }

    If the article can't be fetched, an error is printed and only the
    ``zh_Hans`` alias and an empty ``wiki_netloc`` mapping are set.

    :param engine_traits: traits object to fill, modified in place
    """

    engine_traits.custom['wiki_netloc'] = {}

    # insert alias to map from a region like zh-CN to a language zh_Hans
    engine_traits.languages['zh_Hans'] = 'zh'

    resp = network.get(wikipedia_article_depth)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")
        # Nothing useful to parse; lxml even raises on an empty document.
        return

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cells = row.xpath('./td')
        if len(cells) < 5:
            # header rows (no <td>) and rows without the columns read below
            continue
        cols = [c.text_content().strip() for c in cells]

        # a '-' in the depth column stands for "no value" and counts as 0
        depth = float(cols[3].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', ''))

        if articles < 10000:
            # exclude languages with too few articles
            continue

        if int(depth) < 20:
            # Rough indicator of a Wikipedia’s quality, showing how frequently
            # its articles are updated.
            continue

        eng_tag = cols[2]
        if eng_tag in unknown_langs:
            continue

        hrefs = row.xpath('./td[3]/a/@href')
        if not hrefs:
            print(f"ERROR: {cols[0]} [{eng_tag}] has no link to its Wikipedia")
            continue
        wiki_netloc = urllib.parse.urlparse(hrefs[0]).netloc

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            print(f"ERROR: {cols[0]} [{eng_tag}] is unknown by babel")
            continue

        # first Wikipedia registered for a SearXNG tag wins
        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {eng_tag}")
            continue

        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_netloc
-												[enh] engines: add about variable

move meta information from comment to the about variable
so the preferences, the documentation can show these information

											
										
										
											2021-01-13 11:31:25 +01:00
+								# SPDX-License-Identifier: AGPL-3.0-or-later
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								# lint: pylint
 								"""This module implements the Wikipedia engine.  Some of this implementations
 								are shared by other engines:
 								- :ref:`wikidata engine`
 								The list of supported languages is fetched from the article linked by
 								:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, wikipedia
 								does not support one Wikipedia for all the languages, but there is one Wikipedia
 								for every language (:py:obj:`fetch_traits`).
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
+								"""
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								import urllib.parse
 								import babel
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								from lxml import html
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								from searx import network
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								from searx.locales import language_tag
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								from searx.enginelib.traits import EngineTraits
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								traits: EngineTraits
-												[enh] add supported_languages on engines and auto-generate languages.py

											
										
										
											2016-08-06 06:34:56 +02:00
-												[enh] engines: add about variable

move meta information from comment to the about variable
so the preferences, the documentation can show these information

											
										
										
											2021-01-13 11:31:25 +01:00
+								# about
 								about = {
 								    "website": 'https://www.wikipedia.org/',
 								    "wikidata_id": 'Q52',
 								    "official_api_documentation": 'https://en.wikipedia.org/api/',
 								    "use_official_api": True,
 								    "require_api_key": False,
 								    "results": 'JSON',
 								}
-												[mod] add 'Accept-Language' HTTP header to online processores

Most engines that support languages (and regions) use the Accept-Language from
the WEB browser to build a response that fits to the language (and region).

- add new engine option: send_accept_language_header

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-01 17:01:59 +02:00
+								send_accept_language_header = True
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
 								"""The *editing depth* of Wikipedia is one of several possible rough indicators
 								of the encyclopedia's collaborative quality, showing how frequently its articles
 								are updated.  The measurement of depth was introduced after some limitations of
 								the classic measurement of article count were realized.
 								"""
 								# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
 								rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
 								"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
 								the first paragraph of the page in plain text and HTML as well as the type of
 								page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
 								and link previews in the apps.
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								"""
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
 								def request(query, params):
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
+								    if query.islower():
-												use Wikipedia's REST v1 API

											
										
										
											2020-09-08 07:05:21 +02:00
+								        query = query.title()
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    engine_language = traits.get_language(params['searxng_locale'], 'en')
 								    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
 								    title = urllib.parse.quote(query)
 								    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
 								    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
 								    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
-												add support for Chinese variants in Wikipedia

											
										
										
											2021-02-09 05:56:45 +01:00
-												[enh] add raise_for_httperror

check HTTP response:
* detect some comme CAPTCHA challenge (no solving). In this case the engine is suspended for long a time.
* otherwise raise HTTPError as before

the check is done in poolrequests.py (was before in search.py).

update qwant, wikipedia, wikidata to use raise_for_httperror instead of raise_for_status

											
										
										
											2020-12-09 21:23:20 +01:00
+								    params['raise_for_httperror'] = False
-												[fix] wikipedia engine: don't raise an error when the query is not found

Add a new parameter "raise_for_status", set by default to True.
When True, any HTTP status code >= 300 raise an exception ( #2332 )
When False, the engine can manage the HTTP status code by itself.

											
										
										
											2020-12-04 20:04:39 +01:00
+								    params['soft_max_redirects'] = 2
-												use Wikipedia's REST v1 API

											
										
										
											2020-09-08 07:05:21 +02:00
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
+								    return params
 								# get response from search-request
 								def response(resp):
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
 								    results = []
-												[fix] wikipedia engine: don't raise an error when the query is not found

Add a new parameter "raise_for_status", set by default to True.
When True, any HTTP status code >= 300 raise an exception ( #2332 )
When False, the engine can manage the HTTP status code by itself.

											
										
										
											2020-12-04 20:04:39 +01:00
+								    if resp.status_code == 404:
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
+								        return []
-												[upd] wikipedia engine: return an empty result on query with illegal characters

on some queries (like an IT error message), wikipedia returns an HTTP error 400.
this commit returns an empty result instead of showing an error to the user.

											
										
										
											2021-02-11 12:29:21 +01:00
+								    if resp.status_code == 400:
 								        try:
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								            api_result = resp.json()
 								        except Exception:  # pylint: disable=broad-except
-												[upd] wikipedia engine: return an empty result on query with illegal characters

on some queries (like an IT error message), wikipedia returns an HTTP error 400.
this commit returns an empty result instead of showing an error to the user.

											
										
										
											2021-02-11 12:29:21 +01:00
+								            pass
 								        else:
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								            if (
 								                api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
 								                and api_result['detail'] == 'title-invalid-characters'
 								            ):
-												[upd] wikipedia engine: return an empty result on query with illegal characters

on some queries (like an IT error message), wikipedia returns an HTTP error 400.
this commit returns an empty result instead of showing an error to the user.

											
										
										
											2021-02-11 12:29:21 +01:00
+								                return []
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								    network.raise_for_httperror(resp)
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    api_result = resp.json()
-												[fix] wikipedia: remove HTML from the title

fr.wikipedia.org (and it seems not other wikipedia websites),
adds HTML to api_result['displayTitle'].
(Search for '!wp :fr Braid' for example)

The commit uses api_result['title']

											
										
										
											2021-03-25 08:31:39 +01:00
+								    title = api_result['title']
-												use Wikipedia's REST v1 API

											
										
										
											2020-09-08 07:05:21 +02:00
+								    wikipedia_link = api_result['content_urls']['desktop']['page']
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})
 								    if api_result.get('type') == 'standard':
 								        results.append(
 								            {
 								                'infobox': title,
 								                'id': wikipedia_link,
 								                'content': api_result.get('extract', ''),
 								                'img_src': api_result.get('thumbnail', {}).get('source'),
 								                'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
 								            }
 								        )
-												[enh] wikipedia infobox

creates simple multilingual infobox using wikipedia's api

											
										
										
											2016-03-14 07:32:36 +01:00
 								    return results
-												[mod] fetch supported languages for several engines
utils/fetch_languages.py gets languages supported by each engine and
generates engines_languages.json with each engine's supported language.

											
										
										
											2016-11-06 03:51:38 +01:00
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								# Nonstandard language codes
 								#
 								# These Wikipedias use language codes that do not conform to the ISO 639
 								# standard (which is how wiki subdomains are chosen nowadays).
 								lang_map = {
 								    'be-tarask': 'bel',
 								    'ak': 'aka',
 								    'als': 'gsw',
 								    'bat-smg': 'sgs',
 								    'cbk-zam': 'cbk',
 								    'fiu-vro': 'vro',
 								    'map-bms': 'map',
 								    'nrm': 'nrf',
 								    'roa-rup': 'rup',
 								    'nds-nl': 'nds',
 								    #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
 								    'zh-min-nan': 'nan',
 								    'zh-yue': 'yue',
 								    'an': 'arg',
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								}
 								unknown_langs = [
 								    'an',  # Aragonese
 								    'ba',  # Bashkir
 								    'bar',  # Bavarian
 								    'bcl',  # Central Bicolano
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
 								    'bpy',  # Bishnupriya Manipuri is unknown by babel
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								    'hif',  # Fiji Hindi
 								    'ilo',  # Ilokano
 								    'li',  # Limburgish
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    'sco',  # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								    'sh',  # Serbo-Croatian
 								    'simple',  # simple english is not know as a natural language different to english (babel)
 								    'vo',  # Volapük
 								    'wa',  # Walloon
 								]
 								def fetch_traits(engine_traits: EngineTraits):
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    """Fetch languages from Wikipedia.
 								    The location of the Wikipedia address of a language is mapped in a
 								    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
 								    (``wiki_netloc``).  Here is a reduced example:
 								    .. code:: python
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								       traits.custom['wiki_netloc'] = {
 								           "en": "en.wikipedia.org",
 								           ..
 								           "gsw": "als.wikipedia.org",
 								           ..
 								           "zh": "zh.wikipedia.org",
 								           "zh-classical": "zh-classical.wikipedia.org"
 								       }
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fix an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    """
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in Wikipedia: SearXNG's locales zh-TW and zh-HK now
use the language `zh-classical` from Wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								    engine_traits.custom['wiki_netloc'] = {}
 								    # insert alias to map from a region like zh-CN to a language zh_Hans
 								    engine_traits.languages['zh_Hans'] = 'zh'
 								    resp = network.get(wikipedia_article_depth)
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								    if not resp.ok:
 								        print("ERROR: response from Wikipedia is not OK.")
 								    dom = html.fromstring(resp.text)
 								    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):
 								        cols = row.xpath('./td')
 								        if not cols:
 								            continue
 								        cols = [c.text_content().strip() for c in cols]
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in Wikipedia: SearXNG's locales zh-TW and zh-HK now
use the language `zh-classical` from Wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								        depth = float(cols[3].replace('-', '0').replace(',', ''))
 								        articles = int(cols[4].replace(',', '').replace(',', ''))
 								        if articles < 10000:
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								            # exclude languages with too few articles
 								            continue
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in Wikipedia: SearXNG's locales zh-TW and zh-HK now
use the language `zh-classical` from Wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								        if int(depth) < 20:
 								            # Rough indicator of a Wikipedia’s quality, showing how frequently
 								            # its articles are updated.
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								            continue
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in Wikipedia: SearXNG's locales zh-TW and zh-HK now
use the language `zh-classical` from Wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								        eng_tag = cols[2]
 								        wiki_url = row.xpath('./td[3]/a/@href')[0]
 								        wiki_url = urllib.parse.urlparse(wiki_url)
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
 								        if eng_tag in unknown_langs:
 								            continue
 								        try:
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in Wikipedia: SearXNG's locales zh-TW and zh-HK now
use the language `zh-classical` from Wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								        except babel.UnknownLocaleError:
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in Wikipedia: SearXNG's locales zh-TW and zh-HK now
use the language `zh-classical` from Wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
-												[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-08 16:22:26 +02:00
+								            continue
 								        conflict = engine_traits.languages.get(sxng_tag)
 								        if conflict:
 								            if conflict != eng_tag:
 								                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
 								            continue
-												[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in Wikipedia: SearXNG's locales zh-TW and zh-HK now
use the language `zh-classical` from Wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-28 19:12:59 +02:00
+								        engine_traits.languages[sxng_tag] = eng_tag
 								        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc