Merge pull request #526 from pierotofy/langdetect

Use lingua for language detection
2024-11-16 20:40:10 +01:00 · 2023-10-30 13:14:04 -04:00 · 2023-10-30 13:14:04 -04:00 · f9712c800c
commit f9712c800c
parent e7347c9fef 2972292fc5
5 changed files with 52 additions and 87 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-1.4.1
+1.5.0
--- a/libretranslate/app.py
+++ b/libretranslate/app.py
@@ -644,6 +644,7 @@ def create_app(args):
                        }
                    )
        except Exception as e:
            raise e
            abort(500, description=_("Cannot translate text: %(text)s", text=str(e)))
    @bp.post("/translate_file")
--- a/libretranslate/detect.py
+++ b/libretranslate/detect.py
@@ -1,83 +1,43 @@
-# Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py
+from functools import lru_cache
-import unicodedata
+import linguars
 from lexilang.detector import detect as lldetect
 import pycld2 as cld2
 class UnknownLanguageError(Exception):
  pass
 class Language:
-  def __init__(self, choice):
+  def __init__(self, code, confidence):
    name, code, confidence, bytesize = choice
    self.code = code
    self.name = name
    self.confidence = float(confidence)
    self.read_bytes = int(bytesize)
  def __str__(self):
-    return ("name: {:<12}code: {:<9}confidence: {:>5.1f} "
+    return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ")
            "read bytes:{:>6}".format(self.name, self.code,
                                    self.confidence, self.read_bytes))
-  @staticmethod
+@lru_cache(maxsize=None)
-  def from_code(code):
+def load_detector(langcodes = ()):
-    return Language(("", code, 100, 0))
+  languages = []
  for lc in langcodes:
    try:
      languages.append(linguars.Language.from_iso_code_639_1(lc))
    except Exception:
      print(f"{lc} is not supported by lingua")
      pass # Not supported
  return linguars.LanguageDetector(languages=languages)
 class Detector:
-  """ Detect the language used in a snippet of text."""
+  def __init__(self, langcodes = ()):
-
+    self.langcodes = langcodes
-  def __init__(self, text, quiet=False):
+    self.detector = load_detector(langcodes)
    """ Detector of the language used in `text`.
    Args:
      text (string): unicode string.
    """
    self.__text = text
    self.reliable = True
    """False if the detector used Best Effort strategy in detection."""
    self.quiet = quiet
    """If true, exceptions will be silenced."""
    self.detect(text)
  @staticmethod
  def supported_languages():
    """Returns a list of the languages that can be detected by pycld2."""
    return [name.capitalize() for name,code in cld2.LANGUAGES if not name.startswith("X_")]
  def detect(self, text):
-    """Decide which language is used to write the text.
+    if len(text) < 18:
-    The method tries first to detect the language with high reliability. If
+      code, conf = lldetect(text, self.langcodes)
-    that is not possible, the method switches to best effort strategy.
+      if conf > 0:
-    Args:
+        return [Language(code, round(conf * 100))]
      text (string): A snippet of text, the longer it is the more reliable we
                     can detect the language used to write the text.
    """
    try:
      reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
    except cld2.error as e:
      if "input contains invalid UTF-8" in str(e):
        # Fix for https://github.com/LibreTranslate/LibreTranslate/issues/514
        # related to https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790
        text = ''.join([l for l in text if unicodedata.category(str(l))[0] not in ('S', 'M', 'C')])
        reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
      else:
        raise e
-    if not reliable:
+    top_3_choices = self.detector.confidence(text)[:3]
-      self.reliable = False
+    if top_3_choices[0][1] == 0:
-      reliable, index, top_3_choices = cld2.detect(text, bestEffort=True)
+      return [Language("en", 0)]
    return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices]
      if not self.quiet and not reliable:
        raise UnknownLanguageError("Try passing a longer snippet of text")
    self.languages = [Language(x) for x in top_3_choices]
    self.language = self.languages[0]
    return self.language
  def __str__(self):
    text = f"Prediction is reliable: {self.reliable}\n"
    text += "\n".join([f"Language {i+1}: {str(l)}"
                        for i,l in enumerate(self.languages)])
    return text
--- a/libretranslate/language.py
+++ b/libretranslate/language.py
@@ -1,7 +1,9 @@
 from functools import lru_cache
 from argostranslate import translate
-from libretranslate.detect import Detector, UnknownLanguageError
+from libretranslate.detect import Detector
 __languages = None
@@ -13,6 +15,11 @@ def load_languages():
    return __languages
@lru_cache(maxsize=None)
 def load_lang_codes():
    languages = load_languages()
    return tuple(l.code for l in languages)
 def detect_languages(text):
    # detect batch processing
    if isinstance(text, list):
@@ -21,31 +28,24 @@ def detect_languages(text):
        is_batch = False
        text = [text]
    lang_codes = load_lang_codes()
    # get the candidates
    candidates = []
    for t in text:
        try:
-            d = Detector(t).languages
+            d = Detector(lang_codes).detect(t)
            for i in range(len(d)):
                d[i].text_length = len(t)
            candidates.extend(d)
-        except UnknownLanguageError:
+        except Exception as e:
-            pass
+            print(str(e))
    # total read bytes of the provided text
    text_length_total = sum(c.text_length for c in candidates)
    # Load language codes
    languages = load_languages()
    lang_codes = [l.code for l in languages]
    # only use candidates that are supported by argostranslate
    candidate_langs = list(
        filter(lambda l: l.text_length != 0 and l.code in lang_codes, candidates)
    )
    # this happens if no language could be detected
-    if not candidate_langs:
+    if not candidates:
        # use language "en" by default but with zero confidence
        return [{"confidence": 0.0, "language": "en"}]
@@ -55,7 +55,7 @@ def detect_languages(text):
        temp_average_list = []
        for lang_code in lang_codes:
            # get all candidates for a specific language
-            lc = list(filter(lambda l: l.code == lang_code, candidate_langs))
+            lc = list(filter(lambda l: l.code == lang_code, candidates))
            if len(lc) > 1:
                # if more than one is present, calculate the average confidence
                lang = lc[0]
@@ -68,14 +68,14 @@ def detect_languages(text):
        if temp_average_list:
            # replace the list
-            candidate_langs = temp_average_list
+            candidates = temp_average_list
    # sort the candidates descending based on the detected confidence
-    candidate_langs.sort(
+    candidates.sort(
        key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True
    )
-    return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs]
+    return [{"confidence": l.confidence, "language": l.code} for l in candidates]
 def improve_translation_formatting(source, translation, improve_punctuation=True):
@@ -107,6 +107,9 @@ def improve_translation_formatting(source, translation, improve_punctuation=True
    if source.isupper():
        return translation.upper()
    if len(translation) == 0:
        return source
    if source[0].islower():
        return translation[0].lower() + translation[1:]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,8 @@ dependencies = [
    "Flask-Session ==0.4.0",
    "waitress ==2.1.2",
    "expiringdict ==1.2.2",
-    " LTpycld2==0.42",
+    "linguars==0.4.0",
    "lexilang==1.0.1",
    "morfessor ==2.0.6",
    "appdirs ==1.4.4",
    "APScheduler ==3.9.1",