From b825370a88a7730f20ea0a7d868f8bcd780525f2 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 9 Oct 2023 15:46:50 -0400 Subject: [PATCH] Fix pycld2 error --- libretranslate/detect.py | 13 ++++++++++++- libretranslate/static/css/main.css | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/libretranslate/detect.py b/libretranslate/detect.py index b9f6f2e..5483935 100644 --- a/libretranslate/detect.py +++ b/libretranslate/detect.py @@ -1,5 +1,7 @@ # Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py +import unicodedata + import pycld2 as cld2 @@ -52,7 +54,16 @@ class Detector: text (string): A snippet of text, the longer it is the more reliable we can detect the language used to write the text. """ - reliable, index, top_3_choices = cld2.detect(text, bestEffort=False) + try: + reliable, index, top_3_choices = cld2.detect(text, bestEffort=False) + except cld2.error as e: + if "input contains invalid UTF-8" in str(e): + # Fix for https://github.com/LibreTranslate/LibreTranslate/issues/514 + # related to https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790 + text = ''.join([l for l in text if unicodedata.category(str(l))[0] not in ('S', 'M', 'C')]) + reliable, index, top_3_choices = cld2.detect(text, bestEffort=False) + else: + raise e if not reliable: self.reliable = False diff --git a/libretranslate/static/css/main.css b/libretranslate/static/css/main.css index 1e11ad6..ea4ba2d 100644 --- a/libretranslate/static/css/main.css +++ b/libretranslate/static/css/main.css @@ -162,7 +162,7 @@ h3.header { left: 2px; } -.locale-panel a:hovselecter{ +.locale-panel a:hover{ background-color: transparent !important; }