From 6ff5bba000f6fab3698e6b81c7fbc9dfdcff9fe2 Mon Sep 17 00:00:00 2001
From: Piero Toffanin
Date: Mon, 30 Oct 2023 12:52:33 -0400
Subject: [PATCH] Add lexilang for language detection on short texts

---
 libretranslate/detect.py   | 7 +++++++
 libretranslate/language.py | 2 +-
 pyproject.toml             | 1 +
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/libretranslate/detect.py b/libretranslate/detect.py
index b4ec9b5..a0b15b2 100644
--- a/libretranslate/detect.py
+++ b/libretranslate/detect.py
@@ -1,6 +1,7 @@
 from functools import lru_cache
 
 import linguars
+from lexilang.detector import detect as lldetect
 
 
 class Language:
@@ -26,9 +27,15 @@ def load_detector(langcodes = ()):
 
 class Detector:
     def __init__(self, langcodes = ()):
+        self.langcodes = langcodes
         self.detector = load_detector(langcodes)
 
     def detect(self, text):
+        if len(text) < 18:
+            code, conf = lldetect(text, self.langcodes)
+            if conf > 0:
+                return [Language(code, round(conf * 100))]
+
         top_3_choices = self.detector.confidence(text)[:3]
         if top_3_choices[0][1] == 0:
             return [Language("en", 0)]
diff --git a/libretranslate/language.py b/libretranslate/language.py
index 7d1ea2c..86921bc 100644
--- a/libretranslate/language.py
+++ b/libretranslate/language.py
@@ -18,7 +18,7 @@ def load_languages():
 @lru_cache(maxsize=None)
 def load_lang_codes():
     languages = load_languages()
-    return (l.code for l in languages)
+    return tuple(l.code for l in languages)
 
 def detect_languages(text):
     # detect batch processing
diff --git a/pyproject.toml b/pyproject.toml
index f4d3459..3e200cf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,7 @@ dependencies = [
     "waitress ==2.1.2",
     "expiringdict ==1.2.2",
     "linguars==0.4.0",
+    "lexilang==1.0.1",
     "morfessor ==2.0.6",
     "appdirs ==1.4.4",
     "APScheduler ==3.9.1",