patch: remove unidecode, which was transliterating non-Latin characters (#434)

resolves #298
2024-07-02 15:20:37 +02:00 · 2023-12-13 11:54:55 -08:00 · 2023-12-13 11:54:55 -08:00 · da0cec7aa2
commit da0cec7aa2
parent b444171ef3
1 changed files with 1 additions and 2 deletions
--- a/collector/scripts/watch/convert/as_pdf.py
+++ b/collector/scripts/watch/convert/as_pdf.py
@ -3,7 +3,6 @@ from langchain.document_loaders import PyMuPDFLoader # better UTF support and me
 from slugify import slugify
 from ..utils import guid, file_creation_time, write_to_server_documents, move_source
 from ...utils import tokenize
-from unidecode import unidecode

 # Process all PDF-related documents.
 def as_pdf(**kwargs):
@ -29,7 +28,7 @@ def as_pdf(**kwargs):
  page_content = ''
  for page in fitz.open(fullpath):
    print(f"-- Parsing content from pg {page.number} --")
-    page_content += unidecode(page.get_text('text'))
+    page_content += str(page.get_text('text'))

  if len(page_content) == 0:
    print(f"Resulting page content was empty - no text could be extracted from the document.")