
[mod] fetch supported languages for several engines

utils/fetch_languages.py gets the languages supported by each engine and
generates engines_languages.json with each engine's supported languages.
marc 2016-11-05 20:51:38 -06:00
parent 92c6e88ad3
commit f62ce21f50
26 changed files with 3633 additions and 362 deletions
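For orientation, an abbreviated sketch of the engines_languages.json that the new script writes (not part of the commit; values illustrative). Each engine maps to either a plain list of codes or a dict of per-code metadata, depending on what its fetch_supported_languages() returns:

{
    "bing": ["de-DE", "en-GB"],
    "dailymotion": {"fr": {"name": "Français", "english_name": "French"}},
    "wikipedia": {"de": {"name": "Deutsch", "english_name": "German", "articles": 1995000}}
}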

File diff suppressed because it is too large

View File

@ -20,6 +20,7 @@ from os.path import realpath, dirname
import sys
from flask_babel import gettext
from operator import itemgetter
from json import loads
from searx import settings
from searx import logger
from searx.utils import load_module
@ -78,6 +79,9 @@ def load_engine(engine_data):
if not hasattr(engine, arg_name):
setattr(engine, arg_name, arg_value)
if engine_data['name'] in languages:
setattr(engine, 'supported_languages', languages[engine_data['name']])
# checking required variables
for engine_attr in dir(engine):
if engine_attr.startswith('_'):
@ -207,6 +211,8 @@ if 'engines' not in settings or not settings['engines']:
logger.error('No engines found. Edit your settings.yml')
exit(2)
languages = loads(open(engine_dir + '/../data/engines_languages.json').read())
for engine_data in settings['engines']:
engine = load_engine(engine_data)
if engine is not None:
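An illustrative sketch (not part of this diff) of what the hunk above achieves: once load_engine() has run, every engine named in engines_languages.json exposes the loaded data as a module attribute.

from searx.engines import engines  # name -> engine module, filled by load_engine()
print engines['bing'].supported_languages  # e.g. ['de-DE', 'en-GB'] (illustrative)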

View File

@ -15,12 +15,14 @@
from urllib import urlencode
from lxml import html
from requests import get
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['general']
paging = True
language_support = True
supported_languages_url = 'https://www.bing.com/account/general'
# search-url
base_url = 'https://www.bing.com/'
@ -81,3 +83,16 @@ def response(resp):
# return results
return results
# get supported languages from their site
def fetch_supported_languages():
supported_languages = []
response = get(supported_languages_url)
dom = html.fromstring(response.text)
options = dom.xpath('//div[@id="limit-languages"]//input')
for option in options:
code = option.xpath('./@id')[0].replace('_', '-')
supported_languages.append(code)
return supported_languages
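Sketch of the id-to-code normalization above (not part of the diff): each checkbox id on Bing's preferences page becomes a locale code.

# e.g. <input id="de_DE" ...>  ->  'de_DE'.replace('_', '-')  ->  'de-DE'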

View File

@ -19,7 +19,7 @@ from urllib import urlencode
from lxml import html
from json import loads
import re
from searx.engines.bing import supported_languages
from searx.engines.bing import fetch_supported_languages
# engine dependent config
categories = ['images']

View File

@ -17,7 +17,7 @@ from datetime import datetime
from dateutil import parser
from lxml import etree
from searx.utils import list_get
from searx.engines.bing import supported_languages
from searx.engines.bing import fetch_supported_languages
# engine dependent config
categories = ['news']

View File

@ -15,29 +15,12 @@
from urllib import urlencode
from json import loads
from datetime import datetime
from requests import get
# engine dependent config
categories = ['videos']
paging = True
language_support = True
supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
"ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
"cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
"de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
"fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
"gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
"hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
"is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
"km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
"la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
"mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
"nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
"oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
"rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
"sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
"su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
"ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
"uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
# search-url
# see http://www.dailymotion.com/doc/api/obj-video.html
@ -45,6 +28,8 @@ search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,descr
embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
supported_languages_url = 'https://api.dailymotion.com/languages'
# do search-request
def request(query, params):
@ -92,3 +77,23 @@ def response(resp):
# return results
return results
# get supported languages from their site
def fetch_supported_languages():
supported_languages = {}
response = get(supported_languages_url)
response_json = loads(response.text)
for language in response_json['list']:
supported_languages[language['code']] = {}
name = language['native_name']
if name:
supported_languages[language['code']]['name'] = name
english_name = language['name']
if english_name:
supported_languages[language['code']]['english_name'] = english_name
return supported_languages
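Illustrative shape of the dict this returns (not part of the diff; values illustrative):

# {u'fr': {'name': u'Français', 'english_name': u'French'}, u'en': {...}, ...}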

View File

@ -15,19 +15,15 @@
from urllib import urlencode
from lxml.html import fromstring
from requests import get
from json import loads
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['general']
paging = True
language_support = True
supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT",
"es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE",
"el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP",
"kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO",
"es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG",
"sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW",
"th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"]
supported_languages_url = 'https://duckduckgo.com/d2030.js'
time_range_support = True
# search-url
@ -65,8 +61,6 @@ def request(query, params):
locale = 'xa' + params['language'].split('-')[0]
elif params['language'][-2:] == 'GB':
locale = 'uk' + params['language'].split('-')[0]
elif params['language'] == 'es-419':
locale = 'xl-es'
else:
locale = params['language'].split('-')
if len(locale) == 2:
@ -120,3 +114,18 @@ def response(resp):
# return results
return results
# get supported languages from their site
def fetch_supported_languages():
response = get(supported_languages_url)
# response is a js file with regions as an embedded object
response_page = response.text
response_page = response_page[response_page.find('regions:{') + 8:]
response_page = response_page[:response_page.find('}') + 1]
regions_json = loads(response_page)
supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
return supported_languages
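Worked example of the key mapping above (not part of the diff; region keys in d2030.js are assumed to look like 'us-en'):

# x = 'us-en'  ->  x[3:] + '-' + x[:2].upper()  ->  'en-US'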

View File

@ -4,7 +4,7 @@ from re import compile, sub
from lxml import html
from searx.utils import html_to_text
from searx.engines.xpath import extract_text
from searx.engines.duckduckgo import supported_languages
from searx.engines.duckduckgo import fetch_supported_languages
url = 'https://api.duckduckgo.com/'\
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1'

View File

@ -14,6 +14,8 @@ from json import loads
from random import randint
from time import time
from urllib import urlencode
from requests import get
from lxml.html import fromstring
# engine dependent config
categories = ['general']
@ -40,11 +42,7 @@ url_xpath = './/url'
title_xpath = './/title'
content_xpath = './/sum'
supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de",
"nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
"th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
"hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
"hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
# do search-request
@ -90,3 +88,17 @@ def response(resp):
# return results
return results
# get supported languages from their site
def fetch_supported_languages():
supported_languages = []
response = get(supported_languages_url)
dom = fromstring(response.text)
links = dom.xpath('//span[@id="menu2"]/a')
for link in links:
code = link.xpath('./@href')[0][-2:]
if code != 'xx' and code not in supported_languages:
supported_languages.append(code)
return supported_languages

View File

@ -12,6 +12,7 @@ import re
from urllib import urlencode
from urlparse import urlparse, parse_qsl
from lxml import html, etree
from requests import get
from searx.engines.xpath import extract_text, extract_url
from searx.search import logger
@ -23,20 +24,6 @@ categories = ['general']
paging = True
language_support = True
use_locale_domain = True
supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca",
"ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et",
"xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr",
"gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw",
"bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw",
"tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz",
"lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso",
"ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT",
"ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st",
"sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum",
"tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk",
"mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps",
"fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te",
"kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"]
time_range_support = True
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
@ -117,6 +104,7 @@ map_hostname_start = 'maps.google.'
maps_path = '/maps'
redirect_path = '/url'
images_path = '/images'
supported_languages_url = 'https://www.google.com/preferences?#languages'
# specific xpath variables
results_xpath = '//div[@class="g"]'
@ -373,3 +361,17 @@ def attributes_to_html(attributes):
retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
retval = retval + '</table>'
return retval
# get supported languages from their site
def fetch_supported_languages():
supported_languages = {}
response = get(supported_languages_url)
dom = html.fromstring(response.text)
options = dom.xpath('//select[@name="hl"]/option')
for option in options:
code = option.xpath('./@value')[0].split('-')[0]
name = option.text[:-1].title()
supported_languages[code] = {"name": name}
return supported_languages
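Illustrative sketch (not part of the diff): each <option> is reduced to its base language code, so regional variants collapse onto one entry.

# e.g. value 'pt-BR'  ->  code 'pt'; the option text, minus a trailing marker
# character, becomes the displayed name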

View File

@ -13,7 +13,7 @@
from lxml import html
from urllib import urlencode
from json import loads
from searx.engines.google import supported_languages
from searx.engines.google import fetch_supported_languages
# search-url
categories = ['news']

View File

@ -15,7 +15,6 @@
from json import loads
from string import Formatter
from urllib import urlencode, quote
from searx.engines.wikipedia import supported_languages
# engine dependent config
categories = ['general']

View File

@ -20,11 +20,6 @@ from searx.utils import html_to_text
categories = None
paging = True
language_support = True
supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
"fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
"el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
"es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
"th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
category_to_keyword = {'general': 'web',
'images': 'images',
@ -51,15 +46,7 @@ def request(query, params):
# add language tag if specified
if params['language'] != 'all':
locale = params['language'].split('-')
if len(locale) == 2 and params['language'] in supported_languages:
params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
else:
# try to get a country code for language
for lang in supported_languages:
if locale[0] == lang.split('-')[0]:
params['url'] += '&locale=' + lang.replace('-', '_').lower()
break
params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
return params
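With the fallback loop gone, the locale is now derived directly from the request language (sketch, not part of the diff):

# e.g. 'fr-FR'.replace('-', '_').lower()  ->  'fr_fr'  ->  '&locale=fr_fr'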

View File

@ -24,11 +24,6 @@ categories = ['general']
# paging = False
language_support = True
supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
"ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
"el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
"lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
"sv", "tl", "th", "tr", "uk", "vi"]
# search-url
base_url = 'https://startpage.com/'

View File

@ -22,7 +22,7 @@ language = ""
# search-url
url = 'http://www.subtitleseeker.com/'
search_url = url + 'search/TITLES/{query}&p={pageno}'
search_url = url + 'search/TITLES/{query}?p={pageno}'
# specific xpath variables
results_xpath = '//div[@class="boxRows"]'
@ -51,7 +51,8 @@ def response(resp):
elif resp.search_params['language'] != 'all':
search_lang = [lc[3]
for lc in language_codes
if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]
if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
search_lang = search_lang[0].split(' (')[0]
# parse results
for result in dom.xpath(results_xpath):

View File

@ -13,17 +13,13 @@
from json import loads
from urllib import urlencode, unquote
import re
from requests import get
from lxml.html import fromstring
# engine dependent config
categories = ['general', 'images']
paging = True
language_support = True
supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
"es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
"zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
"en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
"en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
"tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
# search-url
base_url = 'https://swisscows.ch/'
@ -114,3 +110,16 @@ def response(resp):
# return results
return results
# get supported languages from their site
def fetch_supported_languages():
supported_languages = []
response = get(base_url)
dom = fromstring(response.text)
options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
for option in options:
code = option.xpath('./@data-val')[0]
supported_languages.append(code)
return supported_languages

View File

@ -15,7 +15,7 @@ from searx import logger
from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.utils import format_date_by_locale
from searx.engines.wikipedia import supported_languages
from searx.engines.wikipedia import fetch_supported_languages
from json import loads
from lxml.html import fromstring
@ -57,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
def request(query, params):
language = params['language'].split('_')[0]
language = params['language'].split('-')[0]
if language == 'all':
language = 'en'
@ -72,7 +72,7 @@ def response(resp):
html = fromstring(resp.content)
wikidata_ids = html.xpath(wikidata_ids_xpath)
language = resp.search_params['language'].split('_')[0]
language = resp.search_params['language'].split('-')[0]
if language == 'all':
language = 'en'

View File

@ -12,36 +12,9 @@
from json import loads
from urllib import urlencode, quote
from requests import get
from lxml.html import fromstring
supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
"pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
"ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
"eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
"lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
"el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
"mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
"be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
"mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
"cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
"scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
"ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
"am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
"os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
"sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
"gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
"tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
"zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
"udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
"ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
"kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
"pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
"rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
"pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
"roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
"ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
"xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
"ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
"dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]
# search-url
base_url = 'https://{language}.wikipedia.org/'
@ -54,6 +27,7 @@ search_postfix = 'w/api.php?'\
'&explaintext'\
'&pithumbsize=300'\
'&redirects'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# set language in base_url
@ -142,3 +116,24 @@ def response(resp):
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
return results
# get supported languages from their site
def fetch_supported_languages():
supported_languages = {}
response = get(supported_languages_url)
dom = fromstring(response.text)
tables = dom.xpath('//table[contains(@class,"sortable")]')
for table in tables:
# exclude header row
trs = table.xpath('.//tr')[1:]
for tr in trs:
td = tr.xpath('./td')
code = td[3].xpath('./a')[0].text
name = td[2].xpath('./a')[0].text
english_name = td[1].xpath('./a')[0].text
articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
if articles >= 10000:
supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
return supported_languages
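Illustrative row from the List_of_Wikipedias table as parsed above (values illustrative, not part of the diff):

# code u'de', name u'Deutsch', english_name u'German', articles 1995000
# -> supported_languages[u'de'] = {"name": u'Deutsch', "english_name": u'German', "articles": 1995000}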

View File

@ -14,16 +14,13 @@
from urllib import urlencode
from urlparse import unquote
from lxml import html
from requests import get
from searx.engines.xpath import extract_text, extract_url
# engine dependent config
categories = ['general']
paging = True
language_support = True
supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en",
"et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja",
"ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr",
"sl", "es", "sv", "th", "tr"]
time_range_support = True
# search-url
@ -31,6 +28,8 @@ base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
supported_languages_url = 'https://search.yahoo.com/web/advanced'
# specific xpath variables
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
url_xpath = './/h3/a/@href'
@ -142,3 +141,16 @@ def response(resp):
# return results
return results
# get supported languages from their site
def fetch_supported_languages():
supported_languages = []
response = get(supported_languages_url)
dom = html.fromstring(response.text)
options = dom.xpath('//div[@id="yschlang"]/span/label/input')
for option in options:
code = option.xpath('./@value')[0][5:]
supported_languages.append(code)
return supported_languages
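The advanced-search page encodes languages as 'lang_xx' input values, which the slice above strips (sketch, not part of the diff):

# e.g. value 'lang_fr'  ->  'lang_fr'[5:]  ->  'fr'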

View File

@ -12,7 +12,7 @@
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.engines.yahoo import parse_url, supported_languages
from searx.engines.yahoo import parse_url, fetch_supported_languages
from datetime import datetime, timedelta
import re
from dateutil import parser

View File

@ -4,39 +4,29 @@
language_codes = (
(u"ach", u"Acoli", u"", u""),
(u"af", u"Afrikaans", u"", u"Afrikaans"),
(u"af", u"Afrikaans", u"", u""),
(u"ak", u"Akan", u"", u""),
(u"als", u"Alemannisch", u"", u"Alemannic"),
(u"am", u"አማርኛ", u"", u"Amharic"),
(u"an", u"Aragonés", u"", u"Aragonese"),
(u"am", u"አማርኛ", u"", u""),
(u"ar-SA", u"العربية", u"المملكة العربية السعودية", u"Arabic"),
(u"arz", u"مصرى (Maṣri)", u"", u"Egyptian Arabic"),
(u"ast", u"Asturianu", u"", u"Asturian"),
(u"az", u"Azərbaycanca", u"", u"Azerbaijani"),
(u"azb", u"تۆرکجه", u"", u"South Azerbaijani"),
(u"ba", u"Башҡорт", u"", u"Bashkir"),
(u"ban", u"Balinese", u"", u""),
(u"bar", u"Boarisch", u"", u"Bavarian"),
(u"be", u"Беларуская", u"", u"Belarusian"),
(u"bem", u"Ichibemba", u"", u""),
(u"bg-BG", u"Български", u"България", u"Bulgarian"),
(u"bn", u"বাংলা", u"", u"Bengali"),
(u"bpy", u"ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী", u"", u"Bishnupriya Manipuri"),
(u"br", u"Brezhoneg", u"", u"Breton"),
(u"bs", u"Bosanski", u"", u"Bosnian"),
(u"bug", u"Basa Ugi", u"", u"Buginese"),
(u"bn", u"বাংলা", u"", u""),
(u"br", u"Brezhoneg", u"", u""),
(u"bs", u"Bosanski", u"", u""),
(u"ca", u"Català", u"", u"Catalan"),
(u"ca-CT", u"Català", u"", u"Catalan"),
(u"ca-ES", u"Català", u"Espanya", u"Catalan"),
(u"ce", u"Нохчийн", u"", u"Chechen"),
(u"ceb", u"Sinugboanong Binisaya", u"", u"Cebuano"),
(u"chr", u"ᏣᎳᎩ", u"", u""),
(u"ckb", u"Soranî / کوردی", u"", u"Sorani"),
(u"ckb", u"Central Kurdish", u"", u""),
(u"co", u"Corsican", u"", u""),
(u"crs", u"Seychellois Creole", u"", u""),
(u"cs-CZ", u"Čeština", u"Česko", u"Czech"),
(u"cv", u"Чăваш", u"", u"Chuvash"),
(u"cy", u"Cymraeg", u"", u"Welsh"),
(u"cy", u"Cymraeg", u"", u""),
(u"da-DK", u"Dansk", u"Danmark", u"Danish"),
(u"de", u"Deutsch", u"", u"German"),
(u"de-AT", u"Deutsch", u"Österreich", u"German"),
@ -70,148 +60,129 @@ language_codes = (
(u"eu", u"Euskara", u"", u"Basque"),
(u"fa", u"فارسی", u"", u"Persian"),
(u"fi-FI", u"Suomi", u"Suomi", u"Finnish"),
(u"fo", u"Føroyskt", u"", u"Faroese"),
(u"fo", u"Føroyskt", u"", u""),
(u"fr", u"Français", u"", u"French"),
(u"fr-BE", u"Français", u"Belgique", u"French"),
(u"fr-CA", u"Français", u"Canada", u"French"),
(u"fr-CH", u"Français", u"Suisse", u"French"),
(u"fr-FR", u"Français", u"France", u"French"),
(u"fy", u"Frysk", u"", u"West Frisian"),
(u"ga", u"Gaeilge", u"", u"Irish"),
(u"fy", u"West-Frysk", u"", u""),
(u"ga", u"Gaeilge", u"", u""),
(u"gaa", u"Ga", u"", u""),
(u"gd", u"Gàidhlig", u"", u"Scottish Gaelic"),
(u"gd", u"Gàidhlig", u"", u""),
(u"gl", u"Galego", u"", u"Galician"),
(u"gn", u"Guarani", u"", u""),
(u"gu", u"ગુજરાતી", u"", u"Gujarati"),
(u"gu", u"ગુજરાતી", u"", u""),
(u"ha", u"Hausa", u"", u""),
(u"haw", u"ʻŌlelo HawaiʻI", u"", u""),
(u"he-IL", u"עברית", u"ישראל", u"Hebrew"),
(u"hi", u"हिन्दी", u"", u"Hindi"),
(u"hr-HR", u"Hrvatski", u"Hrvatska", u"Croatian"),
(u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
(u"ht", u"Krèyol ayisyen", u"", u"Haitian"),
(u"ht", u"Haitian Creole", u"", u""),
(u"hu-HU", u"Magyar", u"Magyarország", u"Hungarian"),
(u"hy", u"Հայերեն", u"", u"Armenian"),
(u"ia", u"Interlingua", u"", u"Interlingua"),
(u"ia", u"Interlingua", u"", u""),
(u"id-ID", u"Bahasa Indonesia", u"Indonesia", u"Indonesian"),
(u"ig", u"Igbo", u"", u""),
(u"io", u"Ido", u"", u"Ido"),
(u"is", u"Íslenska", u"", u"Icelandic"),
(u"is", u"Íslenska", u"", u""),
(u"it", u"Italiano", u"", u"Italian"),
(u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
(u"it-IT", u"Italiano", u"Italia", u"Italian"),
(u"iw", u"עברית", u"", u""),
(u"ja-JP", u"日本語", u"日本", u"Japanese"),
(u"jv", u"Basa Jawa", u"", u"Javanese"),
(u"ka", u"ქართული", u"", u"Georgian"),
(u"kg", u"Kongo", u"", u""),
(u"kk", u"Қазақша", u"", u"Kazakh"),
(u"km", u"ខ្មែរ", u"", u""),
(u"kn", u"ಕನ್ನಡ", u"", u"Kannada"),
(u"kn", u"ಕನ್ನಡ", u"", u""),
(u"ko-KR", u"한국어", u"대한민국", u"Korean"),
(u"kri", u"Krio (Sierra Leone)", u"", u""),
(u"ku", u"Kurdî / كوردی", u"", u"Kurdish"),
(u"ky", u"Кыргызча", u"", u"Kirghiz"),
(u"kri", u"Krio", u"", u""),
(u"ky", u"Кыргызча", u"", u""),
(u"la", u"Latina", u"", u"Latin"),
(u"lb", u"Lëtzebuergesch", u"", u"Luxembourgish"),
(u"lg", u"Luganda", u"", u""),
(u"li", u"Limburgs", u"", u"Limburgish"),
(u"lmo", u"Lumbaart", u"", u"Lombard"),
(u"ln", u"Lingála", u"", u""),
(u"lo", u"ລາວ", u"", u""),
(u"loz", u"Lozi", u"", u""),
(u"lt-LT", u"Lietuvių", u"Lietuva", u"Lithuanian"),
(u"lua", u"Luba-Lulua", u"", u""),
(u"lv-LV", u"Latviešu", u"Latvijas Republika", u"Latvian"),
(u"lv-LV", u"Latviešu", u"Latvijas Republika", u""),
(u"mfe", u"Kreol Morisien", u"", u""),
(u"mg", u"Malagasy", u"", u"Malagasy"),
(u"mg", u"Malagasy", u"", u""),
(u"mi", u"Maori", u"", u""),
(u"min", u"Minangkabau", u"", u"Minangkabau"),
(u"mk", u"Македонски", u"", u"Macedonian"),
(u"ml", u"മലയാളം", u"", u"Malayalam"),
(u"mn", u"Монгол", u"", u"Mongolian"),
(u"mr", u"मराठी", u"", u"Marathi"),
(u"mrj", u"Кырык Мары (Kyryk Mary)", u"", u"Hill Mari"),
(u"mk", u"Македонски", u"", u""),
(u"ml", u"മലയാളം", u"", u""),
(u"mn", u"Монгол", u"", u""),
(u"mr", u"मराठी", u"", u""),
(u"ms-MY", u"Bahasa Melayu", u"Malaysia", u"Malay"),
(u"mt", u"Malti", u"", u""),
(u"my", u"မြန်မာဘာသာ", u"", u"Burmese"),
(u"mzn", u"مَزِروني", u"", u"Mazandarani"),
(u"nah", u"Nāhuatl", u"", u"Nahuatl"),
(u"nap", u"Nnapulitano", u"", u"Neapolitan"),
(u"nds-nl", u"Plattdüütsch", u"Nedderlannen", u"Low Saxon"),
(u"ne", u"नेपाली", u"", u"Nepali"),
(u"new", u"नेपाल भाषा", u"", u"Newar"),
(u"my", u"ဗမာ", u"", u""),
(u"nb-NO", u"Norwegian Bokmål", u"Norge", u"Norwegian Bokmål"),
(u"ne", u"नेपाली", u"", u""),
(u"nl", u"Nederlands", u"", u"Dutch"),
(u"nl-BE", u"Nederlands", u"België", u"Dutch"),
(u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
(u"nn", u"Nynorsk", u"", u"Norwegian (Nynorsk)"),
(u"no-NO", u"Norsk (Bokmål)", u"Norge", u"Norwegian (Bokmål)"),
(u"nn", u"Nynorsk", u"", u"Norwegian"),
(u"no-NO", u"Norsk", u"Norge", u"Norwegian"),
(u"nso", u"Northern Sotho", u"", u""),
(u"ny", u"Nyanja", u"", u""),
(u"nyn", u"Runyankore", u"", u""),
(u"oc", u"Occitan", u"", u"Occitan"),
(u"oc", u"Occitan", u"", u""),
(u"om", u"Oromoo", u"", u""),
(u"or", u"ଓଡ଼ିଆ", u"", u"Oriya"),
(u"os", u"Иронау", u"", u"Ossetian"),
(u"pa", u"ਪੰਜਾਬੀ", u"", u"Punjabi"),
(u"or", u"ଓଡ଼ିଆ", u"", u""),
(u"pa", u"ਪੰਜਾਬੀ", u"", u""),
(u"pcm", u"Nigerian Pidgin", u"", u""),
(u"pl-PL", u"Polski", u"Rzeczpospolita Polska", u"Polish"),
(u"pms", u"Piemontèis", u"", u"Piedmontese"),
(u"pnb", u"شاہ مکھی پنجابی (Shāhmukhī Pañjābī)", u"", u"Western Punjabi"),
(u"ps", u"پښتو", u"", u""),
(u"pt", u"Português", u"", u"Portuguese"),
(u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
(u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
(u"qu", u"Runa Simi", u"", u"Quechua"),
(u"qu", u"Runasimi", u"", u""),
(u"rm", u"Rumantsch", u"", u""),
(u"rn", u"Ikirundi", u"", u""),
(u"ro-RO", u"Română", u"România", u"Romanian"),
(u"ru-RU", u"Русский", u"Россия", u"Russian"),
(u"rw", u"Kinyarwanda", u"", u""),
(u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
(u"sah", u"Саха тыла (Saxa Tyla)", u"", u"Sakha"),
(u"scn", u"Sicilianu", u"", u"Sicilian"),
(u"sco", u"Scots", u"", u"Scots"),
(u"sd", u"Sindhi", u"", u""),
(u"sh", u"Srpskohrvatski / Српскохрватски", u"", u"Serbo-Croatian"),
(u"si", u"සිංහල", u"", u"Sinhalese"),
(u"si", u"සිංහල", u"", u""),
(u"sk-SK", u"Slovenčina", u"Slovenská republika", u"Slovak"),
(u"sl-SI", u"Slovenščina", u"Slovenija", u"Slovenian"),
(u"sl", u"Slovenščina", u"", u"Slovenian"),
(u"sn", u"Chishona", u"", u""),
(u"so", u"Soomaali", u"", u""),
(u"sq", u"Shqip", u"", u"Albanian"),
(u"sr-ME", u"Српски / Srpski", u"Црна Гора", u"Serbian"),
(u"sq", u"Shqip", u"", u""),
(u"sr", u"Српски / Srpski", u"", u"Serbian"),
(u"st", u"Southern Sotho", u"", u""),
(u"su", u"Basa Sunda", u"", u"Sundanese"),
(u"su", u"Sundanese", u"", u""),
(u"sv-SE", u"Svenska", u"Sverige", u"Swedish"),
(u"sw", u"Kiswahili", u"", u"Swahili"),
(u"ta", u"தமிழ்", u"", u"Tamil"),
(u"te", u"తెలుగు", u"", u"Telugu"),
(u"tg", u"Тоҷикӣ", u"", u"Tajik"),
(u"sw", u"Kiswahili", u"", u""),
(u"ta", u"தமிழ்", u"", u""),
(u"te", u"తెలుగు", u"", u""),
(u"tg", u"Tajik", u"", u""),
(u"th-TH", u"ไทย", u"ไทย", u"Thai"),
(u"ti", u"ትግርኛ", u"", u""),
(u"tk", u"Turkmen", u"", u""),
(u"tl-PH", u"Tagalog", u"Pilipinas", u"Tagalog"),
(u"tl-PH", u"Filipino", u"Pilipinas", u""),
(u"tlh", u"Klingon", u"", u""),
(u"tn", u"Tswana", u"", u""),
(u"to", u"Lea Fakatonga", u"", u""),
(u"tr-TR", u"Türkçe", u"Türkiye", u"Turkish"),
(u"tt", u"Tatarça / Татарча", u"", u"Tatar"),
(u"tt", u"Tatar", u"", u""),
(u"tum", u"Tumbuka", u"", u""),
(u"tw", u"Twi", u"", u""),
(u"ug", u"ئۇيغۇرچە", u"", u""),
(u"uk-UA", u"Українська", u"Україна", u"Ukrainian"),
(u"ur", u"اردو", u"", u"Urdu"),
(u"uz", u"Ozbek", u"", u"Uzbek"),
(u"vec", u"Vèneto", u"", u"Venetian"),
(u"ve", u"Venda", u"", u"Venda"),
(u"vi-VN", u"Tiếng Việt", u"Công Hòa Xã Hội Chủ Nghĩa Việt Nam", u"Vietnamese"),
(u"vo", u"Volapük", u"", u"Volapük"),
(u"wa", u"Walon", u"", u"Walloon"),
(u"war", u"Winaray", u"", u"Waray-Waray"),
(u"wo", u"Wolof", u"", u""),
(u"xh", u"Xhosa", u"", u""),
(u"yi", u"ייִדיש", u"", u"Yiddish"),
(u"yo", u"Yorùbá", u"", u"Yoruba"),
(u"yi", u"ייִדיש", u"", u""),
(u"yo", u"Èdè Yorùbá", u"", u""),
(u"zh", u"中文", u"", u"Chinese"),
(u"zh-CN", u"中文", u"中国", u"Chinese"),
(u"zh-HK", u"中文", u"香港", u"Chinese"),

View File

@ -514,7 +514,7 @@ def index():
answers=result_container.answers,
infoboxes=result_container.infoboxes,
paging=result_container.paging,
current_language=search.lang,
current_language=search_query.lang,
base_url=get_base_url(),
theme=get_current_theme_name(),
favicons=global_favicons[themes.index(get_current_theme_name())]

View File

@ -17,7 +17,7 @@ class TestSubtitleseekerEngine(SearxTestCase):
def test_response(self):
dicto = defaultdict(dict)
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
response = mock.Mock(search_params=dicto)
self.assertRaises(AttributeError, subtitleseeker.response, None)

View File

@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
class TestWikipediaEngine(SearxTestCase):
def test_request(self):
wikipedia.supported_languages = ['fr', 'en']
query = 'test_query'
dicto = defaultdict(dict)
dicto['language'] = 'fr-FR'

utils/fetch_languages.py Normal file
View File

@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
# This script generates languages.py by intersecting each engine's supported languages.
#
# The country names are obtained from http://api.geonames.org which requires registering as a user.
#
# Output files (engines_languages.json and languages.py)
# are written in current directory to avoid overwriting in case something goes wrong.
from requests import get
from urllib import urlencode
from lxml.html import fromstring
from json import loads, dumps
import io
from sys import path
path.append('../searx') # noqa
from searx.engines import engines
# Geonames API for country names.
geonames_user = '' # ADD USER NAME HERE
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
# Output files.
engines_languages_file = 'engines_languages.json'
languages_file = 'languages.py'
engines_languages = {}
languages = {}
# To filter out invalid codes and dialects.
def valid_code(lang_code):
# filter invalid codes
# sl-SL is technically not invalid, but still a mistake
if lang_code[:2] == 'xx'\
or lang_code == 'sl-SL'\
or lang_code == 'wt-WT'\
or lang_code == 'jw'\
or lang_code[-2:] == 'UK'\
or lang_code[-2:] == 'XA'\
or lang_code[-2:] == 'XL':
return False
# filter dialects
lang_code = lang_code.split('-')
if len(lang_code) > 2 or len(lang_code[0]) > 3:
return False
if len(lang_code) == 2 and len(lang_code[1]) > 2:
return False
return True
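# Editor's sketch, not part of the commit: how valid_code() behaves.
#   valid_code('en')        -> True
#   valid_code('pt-BR')     -> True
#   valid_code('xx-bork')   -> False  ('xx' prefix)
#   valid_code('be-tarask') -> False  (variant part longer than two chars)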
# Get country name in specified language.
def get_country_name(locale):
if geonames_user == '':
return ''
locale = locale.split('-')
if len(locale) != 2:
return ''
url = country_names_url.format(parameters=urlencode({'lang': locale[0],
'country': locale[1],
'username': geonames_user}))
response = get(url)
json = loads(response.text)
content = json.get('geonames', None)
if content is None or len(content) != 1:
print "No country name found for " + locale[0] + "-" + locale[1]
return ''
return content[0].get('countryName', '')
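# Editor's sketch, not part of the commit: an illustrative call, assuming
# geonames_user has been set.
#   get_country_name('de-AT')  ->  u'Österreich'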
# Fetches supported languages for each engine and writes a json file with them.
def fetch_supported_languages():
for engine_name in engines:
if hasattr(engines[engine_name], 'fetch_supported_languages'):
try:
engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
except Exception as e:
print e
# write json file
f = io.open(engines_languages_file, "w", encoding="utf-8")
f.write(unicode(dumps(engines_languages, indent=4, ensure_ascii=False, encoding="utf-8")))
f.close()
# Join all language lists.
# Iterate all languages supported by each engine.
def join_language_lists():
# include wikipedia first for more accurate language names
# exclude languages with too few articles
languages.update({code: lang for code, lang
in engines_languages['wikipedia'].iteritems()
if valid_code(code) and lang['articles'] >= 100000})
for engine_name in engines_languages:
for locale in engines_languages[engine_name]:
if not valid_code(locale):
continue
# if language is not on list or if it has no name yet
if locale not in languages or not languages[locale].get('name'):
if isinstance(engines_languages[engine_name], dict) \
and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
languages[locale] = engines_languages[engine_name][locale]
else:
languages[locale] = {}
# get locales that have no name yet
for locale in languages.keys():
if not languages[locale].get('name'):
# try to get language and country names
name = languages.get(locale.split('-')[0], {}).get('name', None)
if name:
languages[locale]['name'] = name
languages[locale]['country'] = get_country_name(locale) or ''
languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
else:
# filter out locales with no name
del languages[locale]
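# Editor's sketch, not part of the commit: a worked example. If wikipedia
# contributes 'fr' with a native name and bing contributes 'fr-FR', the loop
# above keeps 'fr-FR' with the name copied from 'fr' and a country name from
# geonames (or '' when no geonames user is configured).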
# Remove countryless language if language is featured in only one country.
def filter_single_country_languages():
prev_lang = None
for code in sorted(languages):
lang = code.split('-')[0]
if lang == prev_lang:
countries += 1
else:
if prev_lang is not None and countries == 1:
del languages[prev_lang]
countries = 0
prev_lang = lang
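# Editor's sketch, not part of the commit: with sorted codes
# ['ja', 'ja-JP', 'ka'], the check fires when 'ka' is reached; 'ja' was seen
# with exactly one country ('ja-JP'), so the bare 'ja' entry is deleted.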
# Write languages.py.
def write_languages_file():
new_file = open(languages_file, 'w')
file_content = '# -*- coding: utf-8 -*-\n'
file_content += '# list of language codes\n'
file_content += '# this file is generated automatically by utils/fetch_languages.py\n'
file_content += '\nlanguage_codes = ('
for code in sorted(languages):
file_content += '\n (u"' + code + '"'\
+ ', u"' + languages[code]['name'].split(' (')[0] + '"'\
+ ', u"' + languages[code].get('country', '') + '"'\
+ ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
# remove last comma
file_content = file_content[:-1]
file_content += '\n)\n'
new_file.write(file_content.encode('utf8'))
new_file.close()
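# Example of one generated tuple (copied from the languages.py diff above):
#   (u"de-AT", u"Deutsch", u"Österreich", u"German"),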
if __name__ == "__main__":
fetch_supported_languages()
join_language_lists()
filter_single_country_languages()
write_languages_file()

View File

@ -1,169 +0,0 @@
# -*- coding: utf-8 -*-
# This script generates languages.py from
# intersecting each engine's supported languages.
#
# The language's native names are obtained from
# Wikipedia and Google's supported languages.
#
# The country names are obtained from http://api.geonames.org
# which requires registering as a user.
#
# Output file (languages.py) is written in current directory
# to avoid overwriting in case something goes wrong.
from requests import get
from urllib import urlencode
from lxml.html import fromstring
from json import loads
from sys import path
path.append('../searx')
from searx.engines import engines
# list of names
wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
google_languages_url = 'https://www.google.com/preferences?#languages'
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
geonames_user = '' # add user name here
google_json_name = 'google.preferences.langMap'
languages = {}
# To filter out invalid codes and dialects.
def valid_code(lang_code):
# filter invalid codes
# sl-SL is technically not invalid, but still a mistake
if lang_code[:2] == 'xx'\
or lang_code == 'sl-SL'\
or lang_code == 'jw'\
or lang_code[-2:] == 'UK'\
or lang_code[-2:] == 'XA'\
or lang_code[-2:] == 'XL':
return False
# filter dialects
lang_code = lang_code.split('-')
if len(lang_code) > 2 or len(lang_code[0]) > 3:
return False
if len(lang_code) == 2 and len(lang_code[1]) > 2:
return False
return True
# Get country name in specified language.
def get_country_name(locale):
if geonames_user is '':
return ''
locale = locale.split('-')
if len(locale) != 2:
return ''
url = country_names_url.format(parameters=urlencode({'lang': locale[0],
'country': locale[1],
'username': geonames_user}))
response = get(url)
json = loads(response.text)
content = json.get('geonames', None)
if content is None or len(content) != 1:
print "No country name found for " + locale[0] + "-" + locale[1]
print json
return ''
return content[0].get('countryName', '')
# Get language names from Wikipedia.
def get_wikipedia_languages():
response = get(wiki_languages_url)
dom = fromstring(response.text)
tables = dom.xpath('//table[contains(@class,"sortable")]')
for table in tables:
# exclude header row
trs = table.xpath('.//tr')[1:]
for tr in trs:
td = tr.xpath('./td')
code = td[3].xpath('./a')[0].text
name = td[2].xpath('./a')[0].text
english_name = td[1].xpath('./a')[0].text
articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
# exclude language variants and languages with few articles
if code not in languages and articles >= 10000 and valid_code(code):
languages[code] = (name, '', english_name)
# Get language names from Google.
def get_google_languages():
response = get(google_languages_url)
dom = fromstring(response.text)
options = dom.xpath('//select[@name="hl"]/option')
for option in options:
code = option.xpath('./@value')[0].split('-')[0]
name = option.text[:-1].title()
if code not in languages and valid_code(code):
languages[code] = (name, '', '')
# Join all language lists.
# iterate all languages supported by each engine
def join_language_lists():
for engine_name in engines:
for locale in engines[engine_name].supported_languages:
locale = locale.replace('_', '-')
if locale not in languages and valid_code(locale):
# try to get language name
language = languages.get(locale.split('-')[0], None)
if language == None:
print engine_name + ": " + locale
continue
country = get_country_name(locale)
languages[locale] = (language[0], country, language[2])
# Remove countryless language if language is featured in only one country.
def filter_single_country_languages():
prev_lang = None
for code in sorted(languages):
lang = code.split('-')[0]
if lang == prev_lang:
countries += 1
else:
if prev_lang is not None and countries == 1:
del languages[prev_lang]
countries = 0
prev_lang = lang
# Write languages.py.
def write_languages_file():
new_file = open('languages.py', 'w')
file_content = '# -*- coding: utf-8 -*-\n'
file_content += '# list of language codes\n'
file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
file_content += '\nlanguage_codes = ('
for code in sorted(languages):
(name, country, english) = languages[code]
file_content += '\n (u"' + code + '"'\
+ ', u"' + name + '"'\
+ ', u"' + country + '"'\
+ ', u"' + english + '"),'
# remove last comma
file_content = file_content[:-1]
file_content += '\n)\n'
new_file.write(file_content.encode('utf8'))
new_file.close()
if __name__ == "__main__":
get_wikipedia_languages()
get_google_languages()
join_language_lists()
filter_single_country_languages()
write_languages_file()