patch: remove unidecode as it was transliterating non-latin chars (#434)

resolves #298
This commit is contained in:
Timothy Carambat 2023-12-13 11:54:55 -08:00 committed by GitHub
parent b444171ef3
commit da0cec7aa2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -3,7 +3,6 @@ from langchain.document_loaders import PyMuPDFLoader # better UTF support and me
from slugify import slugify
from ..utils import guid, file_creation_time, write_to_server_documents, move_source
from ...utils import tokenize
-from unidecode import unidecode
# Process all PDF-related documents.
def as_pdf(**kwargs):
@@ -29,7 +28,7 @@ def as_pdf(**kwargs):
page_content = ''
for page in fitz.open(fullpath):
print(f"-- Parsing content from pg {page.number} --")
-page_content += unidecode(page.get_text('text'))
+page_content += str(page.get_text('text'))
if len(page_content) == 0:
print(f"Resulting page content was empty - no text could be extracted from the document.")