Switch linguars for langdetect

2024-11-16 20:40:10 +01:00 · 2023-10-30 20:39:30 -04:00 · 2023-10-30 20:39:30 -04:00 · 02ea8ae011
commit 02ea8ae011
parent 5033f58e0d
3 changed files with 23 additions and 37 deletions
--- a/libretranslate/app.py
+++ b/libretranslate/app.py
@@ -554,22 +554,8 @@ def create_app(args):
                )

        if source_lang == "auto":
-            source_langs = []
-            auto_detect_texts = q if batch else [q]
-
-            overall_candidates = detect_languages(q)
-
-            for text_to_check in auto_detect_texts:
-                if len(text_to_check) > 40:
-                    candidate_langs = detect_languages(text_to_check)
-                else:
-                    # Unable to accurately detect languages for short texts
-                    candidate_langs = overall_candidates
-                source_langs.append(candidate_langs[0])
-
-                if args.debug:
-                    print(text_to_check, candidate_langs)
-                    print("Auto detected: %s" % candidate_langs[0]["language"])
+            candidate_langs = detect_languages(q if batch else [q])
+            source_langs = [candidate_langs[0]]
        else:
            if batch:
                source_langs = [ {"confidence": 100.0, "language": source_lang} for text in q]
--- a/libretranslate/detect.py
+++ b/libretranslate/detect.py
@@ -1,6 +1,9 @@
-from functools import lru_cache

-import linguars
+from langdetect import DetectorFactory
+
+DetectorFactory.seed = 0
+
+from langdetect import detect_langs
 from lexilang.detector import detect as lldetect


@@ -12,34 +15,31 @@ class Language:
  def __str__(self):
    return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ")

-@lru_cache(maxsize=None)
-def load_detector(langcodes = ()):
-  languages = []
-  for lc in langcodes:
-    if lc == 'zt':
-      continue
-    try:
-      languages.append(linguars.Language.from_iso_code_639_1(lc))
-    except Exception:
-      print(f"{lc} is not supported by lingua")
-      pass # Not supported
-
-  return linguars.LanguageDetector(languages=languages)
+def check_lang(langcodes, lang):
+  return normalized_lang_code(lang) in langcodes

+def normalized_lang_code(lang):
+  code = lang.lang
+  # Handle zh-cn
+  if code.startswith("zh"):
+    code = "zh"
+  return code

 class Detector:
  def __init__(self, langcodes = ()):
    self.langcodes = langcodes
-    self.detector = load_detector(langcodes)

  def detect(self, text):
-    if len(text) < 18:
+    if len(text) < 20:
      code, conf = lldetect(text, self.langcodes)
      if conf > 0:
        return [Language(code, round(conf * 100))]

-    top_3_choices = self.detector.confidence(text)[:3]
-    if top_3_choices[0][1] == 0:
+    top_3_choices = [lang for lang in detect_langs(text) if check_lang(self.langcodes, lang)][:3]
+    if not len(top_3_choices):
+      return [Language("en", 0)]
+    if top_3_choices[0].prob == 0:
      return [Language("en", 0)]
-    return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices]
+
+    return [Language(normalized_lang_code(lang), round(lang.prob * 100)) for lang in top_3_choices]

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,7 @@ dependencies = [
    "Flask-Session ==0.4.0",
    "waitress ==2.1.2",
    "expiringdict ==1.2.2",
-    "linguars==0.4.0",
+    "langdetect==1.0.9",
    "lexilang==1.0.1",
    "morfessor ==2.0.6",
    "appdirs ==1.4.4",