LibreTranslate/app/language.py

import string

from argostranslate import translate
from polyglot.detect.base import Detector, UnknownLanguage

# Module-level cache of the installed languages; populated lazily by
# load_languages() the first time a non-empty result is obtained.
__languages = None


def load_languages():
    """Return the languages installed in argostranslate, memoizing the result.

    The lookup is delegated to ``translate.get_installed_languages()``. Its
    result is stored in the module-level cache and reused on later calls.
    An empty (or missing) cached value is not considered final, so the
    lookup is repeated on the next call until it yields something.
    """
    global __languages

    if not __languages:
        __languages = translate.get_installed_languages()

    return __languages

def detect_languages(text):
    """Detect the language of a text, or of a batch of texts.

    Args:
        text: A single text, or a list of texts (batch mode). A list is
            processed element by element and the results are merged.

    Returns:
        A list of ``{"confidence": ..., "language": ...}`` dicts, one per
        detected language that is also installed (see ``load_languages``),
        ordered by descending ``confidence * text_length``. If nothing
        usable was detected, ``[{"confidence": 0.0, "language": "en"}]``
        is returned.
    """
    # A list input means batch mode; a single value is wrapped so that both
    # cases share the same processing path.
    is_batch = isinstance(text, list)
    if not is_batch:
        text = [text]

    # Collect the candidates of every text. Each candidate remembers the
    # length of the text it was detected in, which is used as a weight below.
    candidates = []
    for t in text:
        try:
            detected = Detector(t).languages
        except UnknownLanguage:
            # No language could be determined for this text; skip it.
            continue
        for candidate in detected:
            candidate.text_length = len(t)
        candidates.extend(detected)

    # Sum of the text lengths attached to the candidates (a text counts once
    # per candidate detected in it). Only used to normalize the sort key.
    text_length_total = sum(c.text_length for c in candidates)

    # Load language codes. The list keeps the installed order (used for the
    # batch merge); the set gives constant-time membership tests.
    lang_codes = [lang.code for lang in load_languages()]
    supported_codes = set(lang_codes)

    # only use candidates that are supported by argostranslate
    candidate_langs = [
        c for c in candidates
        if c.text_length != 0 and c.code in supported_codes
    ]

    # this happens if no language could be detected
    if not candidate_langs:
        # use language "en" by default but with zero confidence
        return [{"confidence": 0.0, "language": "en"}]

    # In batch mode the same language can be reported for several texts.
    # Merge those into one entry carrying the average confidence and the
    # summed text length.
    if is_batch:
        # Group the candidates by language code in a single pass.
        by_code = {}
        for c in candidate_langs:
            by_code.setdefault(c.code, []).append(c)

        merged = []
        for lang_code in lang_codes:
            group = by_code.get(lang_code)
            if not group:
                continue
            # The first candidate of the group represents the language.
            lang = group[0]
            if len(group) > 1:
                lang.confidence = sum(c.confidence for c in group) / len(group)
                lang.text_length = sum(c.text_length for c in group)
            merged.append(lang)

        if merged:
            # replace the list
            candidate_langs = merged

    # Sort descending by confidence weighted with the share of text the
    # candidate was detected in.
    candidate_langs.sort(
        key=lambda c: (c.confidence * c.text_length) / text_length_total,
        reverse=True,
    )

    return [{"confidence": c.confidence, "language": c.code} for c in candidate_langs]


def improve_translation_formatting(source, translation, improve_punctuation=True):
    """Make the trailing punctuation and letter case of a translation follow the source.

    Args:
        source: The original text. Surrounding whitespace is stripped before
            it is inspected.
        translation: The translated text to adjust.
        improve_punctuation: When true, the last character of the translation
            is aligned with the source: if the source ends with one of
            ``! ? . , ;`` the translation is made to end with the same mark,
            otherwise a trailing mark on the translation is removed.

    Returns:
        The adjusted translation. An empty string is returned when the
        stripped source or the translation is empty, or when the translation
        has nothing left after its trailing punctuation mark was removed.
    """
    source = source.strip()

    if not source or not translation:
        return ""

    if improve_punctuation:
        source_last_char = source[-1]
        translation_last_char = translation[-1]

        punctuation_chars = ('!', '?', '.', ',', ';')
        if source_last_char in punctuation_chars:
            if translation_last_char != source_last_char:
                # Replace a differing mark, or append the missing one.
                if translation_last_char in punctuation_chars:
                    translation = translation[:-1]

                translation += source_last_char
        elif translation_last_char in punctuation_chars:
            translation = translation[:-1]

        # A translation that consisted only of a punctuation mark is now
        # empty; return early so the translation[0] accesses below cannot
        # raise IndexError.
        if not translation:
            return ""

    # Whole-text case takes precedence over first-letter case.
    if source.islower():
        return translation.lower()

    if source.isupper():
        return translation.upper()

    # Mixed case: only mirror the case of the first character.
    if source[0].islower():
        return translation[0].lower() + translation[1:]

    if source[0].isupper():
        return translation[0].upper() + translation[1:]

    return translation
added transliteration before actual translation -> e.g. if the source language is Russian, argostranslate expects a cyrillic text 2021-03-11 12:32:26 +01:00			`import string`

First commit 2020-12-19 23:40:37 +01:00			`from argostranslate import translate`
Catch unknown language 2021-03-12 16:53:09 +01:00			`from polyglot.detect.base import Detector, UnknownLanguage`
First commit 2020-12-19 23:40:37 +01:00
Memoize 2022-03-04 16:24:29 +01:00			`__languages = None`
use polyglot for detecting the language 2021-03-11 10:01:12 +01:00
Fix language detection error The root cause was load_installed_languages() of argostranslate being called at the top of the file instead of inside a function, this caused the list of installed languages to incorrectly be returned as an empty list. 2022-03-04 09:23:11 +01:00			`def load_languages():`
Memoize 2022-03-04 16:24:29 +01:00			`global __languages`

			`if __languages is None or len(__languages) == 0:`
Upgrade deprecated Argos Translate call - load_installed_languages has been deprecated in favor of get_installed_languages 2022-04-30 13:15:54 +02:00			`__languages = translate.get_installed_languages()`
Memoize 2022-03-04 16:24:29 +01:00
			`return __languages`
use polyglot for detecting the language 2021-03-11 10:01:12 +01:00
			`def detect_languages(text):`
allow batch processing for language detection 2021-03-11 10:52:38 +01:00			`# detect batch processing`
			`if isinstance(text, list):`
			`is_batch = True`
			`else:`
			`is_batch = False`
			`text = [text]`
use polyglot for detecting the language 2021-03-11 10:01:12 +01:00
			`# get the candidates`
allow batch processing for language detection 2021-03-11 10:52:38 +01:00			`candidates = []`
			`for t in text:`
Catch unknown language 2021-03-12 16:53:09 +01:00			`try:`
improve auto-detect for batch requests with multiple languages 2021-08-02 07:06:56 +02:00			`d = Detector(t).languages`
			`for i in range(len(d)):`
			`d[i].text_length = len(t)`
			`candidates.extend(d)`
Fixed all pep8 errors Removed unused imports, variables 2021-05-18 05:51:33 +02:00			`except UnknownLanguage:`
Catch unknown language 2021-03-12 16:53:09 +01:00			`pass`
use polyglot for detecting the language 2021-03-11 10:01:12 +01:00
allow batch processing for language detection 2021-03-11 10:52:38 +01:00			`# total read bytes of the provided text`
improve auto-detect for batch requests with multiple languages 2021-08-02 07:06:56 +02:00			`text_length_total = sum(c.text_length for c in candidates)`
allow batch processing for language detection 2021-03-11 10:52:38 +01:00
Fix language detection error The root cause was load_installed_languages() of argostranslate being called at the top of the file instead of inside a function, this caused the list of installed languages to incorrectly be returned as an empty list. 2022-03-04 09:23:11 +01:00			`# Load language codes`
			`languages = load_languages()`
Memoize 2022-03-04 16:24:29 +01:00			`lang_codes = [l.code for l in languages]`
Fix language detection error The root cause was load_installed_languages() of argostranslate being called at the top of the file instead of inside a function, this caused the list of installed languages to incorrectly be returned as an empty list. 2022-03-04 09:23:11 +01:00
allow batch processing for language detection 2021-03-11 10:52:38 +01:00			`# only use candidates that are supported by argostranslate`
Linted with black 2021-05-18 05:41:02 +02:00			`candidate_langs = list(`
Memoize 2022-03-04 16:24:29 +01:00			`filter(lambda l: l.text_length != 0 and l.code in lang_codes, candidates)`
Linted with black 2021-05-18 05:41:02 +02:00			`)`
allow batch processing for language detection 2021-03-11 10:52:38 +01:00
			`# this happens if no language could be detected`
use polyglot for detecting the language 2021-03-11 10:01:12 +01:00			`if not candidate_langs:`
			`# use language "en" by default but with zero confidence`
Linted with black 2021-05-18 05:41:02 +02:00			`return [{"confidence": 0.0, "language": "en"}]`
use polyglot for detecting the language 2021-03-11 10:01:12 +01:00
allow batch processing for language detection 2021-03-11 10:52:38 +01:00			`# for multiple occurrences of the same language (can happen on batch detection)`
			`# calculate the average confidence for each language`
			`if is_batch:`
			`temp_average_list = []`
Memoize 2022-03-04 16:24:29 +01:00			`for lang_code in lang_codes:`
allow batch processing for language detection 2021-03-11 10:52:38 +01:00			`# get all candidates for a specific language`
			`lc = list(filter(lambda l: l.code == lang_code, candidate_langs))`
			`if len(lc) > 1:`
			`# if more than one is present, calculate the average confidence`
			`lang = lc[0]`
			`lang.confidence = sum(l.confidence for l in lc) / len(lc)`
improve auto-detect for batch requests with multiple languages 2021-08-02 07:06:56 +02:00			`lang.text_length = sum(l.text_length for l in lc)`
allow batch processing for language detection 2021-03-11 10:52:38 +01:00			`temp_average_list.append(lang)`
			`elif lc:`
			`# otherwise just add it to the temporary list`
			`temp_average_list.append(lc[0])`

			`if temp_average_list:`
			`# replace the list`
			`candidate_langs = temp_average_list`

use polyglot for detecting the language 2021-03-11 10:01:12 +01:00			`# sort the candidates descending based on the detected confidence`
Linted with black 2021-05-18 05:41:02 +02:00			`candidate_langs.sort(`
improve auto-detect for batch requests with multiple languages 2021-08-02 07:06:56 +02:00			`key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True`
Linted with black 2021-05-18 05:41:02 +02:00			`)`
use polyglot for detecting the language 2021-03-11 10:01:12 +01:00
Linted with black 2021-05-18 05:41:02 +02:00			`return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs]`
added transliteration before actual translation -> e.g. if the source language is Russian, argostranslate expects a cyrillic text 2021-03-11 12:32:26 +01:00

move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`def improve_translation_formatting(source, translation, improve_punctuation=True):`
			`source = source.strip()`

Handle empty translation 2022-12-09 20:35:39 +01:00			`if not len(source) or not len(translation):`
move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`return ""`
Handle empty translation 2022-12-09 20:35:39 +01:00
move and improve_translation in language.py, use it for transliteration 2022-09-23 13:59:13 +02:00			`if improve_punctuation:`
			`source_last_char = source[len(source) - 1]`
			`translation_last_char = translation[len(translation) - 1]`

			`punctuation_chars = ['!', '?', '.', ',', ';']`
			`if source_last_char in punctuation_chars:`
			`if translation_last_char != source_last_char:`
			`if translation_last_char in punctuation_chars:`
			`translation = translation[:-1]`

			`translation += source_last_char`
			`elif translation_last_char in punctuation_chars:`
			`translation = translation[:-1]`

			`if source.islower():`
			`return translation.lower()`

			`if source.isupper():`
			`return translation.upper()`

			`if source[0].islower():`
			`return translation[0].lower() + translation[1:]`

			`if source[0].isupper():`
			`return translation[0].upper() + translation[1:]`

			`return translation`