2023-10-30 05:03:00 +01:00
|
|
|
from functools import lru_cache
|
2022-12-11 06:45:14 +01:00
|
|
|
|
2023-10-30 05:20:11 +01:00
|
|
|
import linguars
|
|
|
|
|
|
|
|
|
2023-07-09 12:29:11 +02:00
|
|
|
class Language:
    """A language guess: a language code together with a confidence score."""

    def __init__(self, code, confidence):
        """Store the language code and its confidence.

        Args:
            code: Language code; kept exactly as given.
            confidence: Any value accepted by ``float()``; stored as a float.
        """
        self.code = code
        # Coerce once here so __str__ can always apply a float format spec.
        score = float(confidence)
        self.confidence = score

    def __str__(self):
        """Return a fixed-width, human-readable one-line summary."""
        # Code is left-aligned in 9 columns, confidence right-aligned in 5
        # columns with one decimal; the trailing space is intentional output.
        return f"code: {self.code:<9} confidence: {self.confidence:>5.1f} "
|
2023-10-30 05:03:00 +01:00
|
|
|
|
|
|
|
@lru_cache(maxsize=None)
def load_detector(langcodes=()):
    """Build a ``linguars.LanguageDetector`` for the given language codes.

    The result is memoized on ``langcodes`` via ``lru_cache``, so the
    argument must be hashable (e.g. a tuple) and repeated calls with an
    equal value reuse the same detector instance.

    Args:
        langcodes: Iterable of ISO 639-1 codes to look up in ``linguars``.

    Returns:
        A ``linguars.LanguageDetector`` constructed from every code that
        ``linguars`` could resolve.
    """
    resolved = []
    for iso_code in langcodes:
        try:
            language = linguars.Language.from_iso_code_639_1(iso_code)
        except Exception:
            # Best-effort: report the code that could not be resolved and
            # carry on with the remaining ones.
            print(f"{iso_code} is not supported by lingua")
        else:
            resolved.append(language)

    return linguars.LanguageDetector(languages=resolved)
|
2022-12-11 06:45:14 +01:00
|
|
|
|
|
|
|
|
2023-07-09 12:29:11 +02:00
|
|
|
class Detector:
    """Thin wrapper that turns detector confidences into ``Language`` results."""

    def __init__(self, langcodes=()):
        """Obtain a (cached) detector for ``langcodes``.

        Args:
            langcodes: Iterable of ISO 639-1 codes to pass to ``load_detector``.
        """
        # load_detector is wrapped in lru_cache, which hashes its arguments;
        # converting to a tuple lets callers pass lists or other unhashable
        # iterables without hitting "TypeError: unhashable type".
        self.detector = load_detector(tuple(langcodes))

    def detect(self, text):
        """Return up to three ``Language`` guesses for ``text``.

        Args:
            text: The text handed to the underlying detector's ``confidence``.

        Returns:
            A list of ``Language`` objects whose confidence is
            ``round(conf * 100)``. If the detector yields no candidates, or
            the first candidate has a confidence of 0, a single
            ``Language("en", 0)`` is returned as a fallback.
        """
        # NOTE(review): presumably confidence() returns (language, confidence)
        # pairs ordered best-first with values in 0..1 -- confirm against linguars.
        top_3_choices = self.detector.confidence(text)[:3]

        # Guard the empty case before indexing so it falls back instead of
        # raising IndexError.
        if not top_3_choices or top_3_choices[0][1] == 0:
            return [Language("en", 0)]

        return [
            Language(lang.iso_code_639_1, round(conf * 100))
            for lang, conf in top_3_choices
        ]
|
2023-07-09 12:38:03 +02:00
|
|
|
|