From 3e7847673944f1cfecba58d1c3366fcc0a71e531 Mon Sep 17 00:00:00 2001
From: Timothy Carambat
Date: Tue, 19 Sep 2023 01:21:37 +0200
Subject: [PATCH] Franzbischoff document improvements (#241)

* Cosmetic changes to be compatible with hadolint

* Common configuration for most editors until better plugins come up

* Changes to PDF metadata, using PyMuPDF (faster and more compatible)

* Small changes to the other file ingestions to keep the fields consistent

* Lint and review

* Fixed unknown chars

* Use PyMuPDF for PDF loading for a 200% speed increase

* Linting

---------

Co-authored-by: Francisco Bischoff
Co-authored-by: Francisco Bischoff <984592+franzbischoff@users.noreply.github.com>
---
 .editorconfig                              | 12 ++++
 collector/requirements.txt                 |  7 ++-
 collector/scripts/gitbook.py               |  8 +--
 collector/scripts/watch/convert/as_docx.py | 20 +++---
 .../scripts/watch/convert/as_markdown.py   | 13 ++--
 collector/scripts/watch/convert/as_mbox.py | 12 ++--
 collector/scripts/watch/convert/as_pdf.py  | 61 ++++++++++++-------
 collector/scripts/watch/convert/as_text.py | 10 +--
 8 files changed, 93 insertions(+), 50 deletions(-)
 create mode 100644 .editorconfig

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 00000000..5d47c21c
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
diff --git a/collector/requirements.txt b/collector/requirements.txt
index 756cb3db..c2a1487a 100644
--- a/collector/requirements.txt
+++ b/collector/requirements.txt
@@ -54,6 +54,7 @@ mypy-extensions==1.0.0
 nltk==3.8.1
 numexpr==2.8.4
 numpy==1.23.5
+oauthlib==3.2.2
 olefile==0.46
 openapi-schema-pydantic==1.2.4
 openpyxl==3.1.2
@@ -68,8 +69,8 @@ pycparser==2.21
 pydantic==1.10.8
 pyee==8.2.2
 Pygments==2.15.1
+PyMuPDF==1.22.5
 pypandoc==1.4
-pypdf==3.9.0
 pyppeteer==1.0.2
 pyquery==2.0.0
 python-dateutil==2.8.2
@@ -83,6 +84,7 @@ PyYAML==6.0
 regex==2023.5.5
 requests==2.31.0
 requests-html==0.10.0
+requests-oauthlib==1.3.1
 rfc3986==1.5.0
 rich==13.0.1
 six==1.16.0
@@ -94,9 +96,11 @@ tenacity==8.2.2
 text-unidecode==1.3
 tiktoken==0.4.0
 tqdm==4.65.0
+tweepy==4.14.0
 typer==0.9.0
 typing-inspect==0.9.0
 typing_extensions==4.6.3
+Unidecode==1.3.6
 unstructured==0.7.1
 urllib3==1.26.16
 uuid==1.30
@@ -110,4 +114,3 @@ XlsxWriter==3.1.2
 yarl==1.9.2
 youtube-transcript-api==0.6.0
 zipp==3.15.0
-tweepy==4.14.0
diff --git a/collector/scripts/gitbook.py b/collector/scripts/gitbook.py
index 76da8050..98625bf8 100644
--- a/collector/scripts/gitbook.py
+++ b/collector/scripts/gitbook.py
@@ -29,10 +29,10 @@ def gitbook():
     data = {
       'id': str(uuid4()),
       'url': metadata.get('source'),
-      "title": metadata.get('title'),
-      "description": metadata.get('title'),
-      "published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
-      "wordCount": len(content),
+      'title': metadata.get('title'),
+      'description': metadata.get('title'),
+      'published': datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
+      'wordCount': len(content),
       'pageContent': content,
       'token_count_estimate': len(tokenize(content))
     }
diff --git a/collector/scripts/watch/convert/as_docx.py b/collector/scripts/watch/convert/as_docx.py
index ade70e57..6d16650e 100644
--- a/collector/scripts/watch/convert/as_docx.py
+++ b/collector/scripts/watch/convert/as_docx.py
@@ -18,16 +18,19 @@ def as_docx(**kwargs):
   print(f"-- Working {fullpath} --")
 
   data = {
-    'id': 
guid(),
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'docAuthor': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'docSource': 'Docx Text file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
   print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
@@ -45,16 +48,19 @@ def as_odt(**kwargs):
   print(f"-- Working {fullpath} --")
 
   data = {
-    'id': guid(),
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'docAuthor': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'docSource': 'ODT Text file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/collector/scripts/watch/convert/as_markdown.py b/collector/scripts/watch/convert/as_markdown.py
index 49cf538c..3e1a3dba 100644
--- a/collector/scripts/watch/convert/as_markdown.py
+++ b/collector/scripts/watch/convert/as_markdown.py
@@ -18,16 +18,19 @@ def as_markdown(**kwargs):
   print(f"-- Working {fullpath} --")
 
   data = {
-    'id': guid(),
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
-    'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'title': f"{filename}", # TODO: find a better metadata
+    'docAuthor': 'Unknown', # TODO: find a better metadata
+    'description': 'Unknown', # TODO: find a better metadata
+    'docSource': 'markdown file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py
index 0fa17985..96c9b25b 100644
--- a/collector/scripts/watch/convert/as_mbox.py
+++ b/collector/scripts/watch/convert/as_mbox.py
@@ -1,5 +1,5 @@
 import os
-import datetime 
+import datetime
 import email.utils
 from mailbox import mbox
 from slugify import slugify
@@ -36,12 +36,14 @@ def as_mbox(**kwargs):
       date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S")
     else:
      date_sent = None
-
+
     data = {
-      'id': guid(),
+      'id': guid(),
      'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
-      
'title': f"{filename}{ext}",
-      'description': "a custom file uploaded by the user.",
+      'title': message['Subject'],
+      'docAuthor': message['From'],
+      'description': f"email from {message['From']} to {message['To']}",
+      'docSource': "mbox file uploaded by the user.",
       'published': file_creation_time(fullpath),
       'sender': message['From'],
       'recipient': message['To'],
diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py
index 12163cf2..c5517332 100644
--- a/collector/scripts/watch/convert/as_pdf.py
+++ b/collector/scripts/watch/convert/as_pdf.py
@@ -1,38 +1,53 @@
-import os, time
-from langchain.document_loaders import PyPDFLoader
+import os, fitz
+from langchain.document_loaders import PyMuPDFLoader # better UTF support and metadata
 from slugify import slugify
 from ..utils import guid, file_creation_time, write_to_server_documents, move_source
 from ...utils import tokenize
+from unidecode import unidecode
 
-# Process all text-related documents.
+# Process all PDF-related documents.
 def as_pdf(**kwargs):
   parent_dir = kwargs.get('directory', 'hotdir')
   filename = kwargs.get('filename')
   ext = kwargs.get('ext', '.txt')
   remove = kwargs.get('remove_on_complete', False)
   fullpath = f"{parent_dir}/{filename}{ext}"
-  destination = f"../server/storage/documents/{slugify(filename)}-{int(time.time())}"
-
-  loader = PyPDFLoader(fullpath)
-  pages = loader.load_and_split()
 
   print(f"-- Working {fullpath} --")
 
-  for page in pages:
-    pg_num = page.metadata.get('page')
-    print(f"-- Working page {pg_num} --")
+  loader = PyMuPDFLoader(fullpath)
+  pages = loader.load()
 
-    content = page.page_content
-    data = {
-      'id': guid(),
-      'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
-      'title': f"{filename}_pg{pg_num}{ext}",
-      'description': "a custom file uploaded by the user.",
-      'published': file_creation_time(fullpath),
-      'wordCount': len(content),
-      'pageContent': content,
-      'token_count_estimate': len(tokenize(content))
-    }
-    write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}", destination)
+  if len(pages) == 0:
+    print(f"{fullpath} parsing resulted in no pages - nothing to do.")
+    return False
+
+  # Set doc to the first page so we can still get the metadata from PyMuPDF but without all the unicode issues.
+  doc = pages[0]
+  del loader
+  del pages
+  page_content = ''
+  for page in fitz.open(fullpath):
+    print(f"-- Parsing content from pg {page.number} --")
+    page_content += unidecode(page.get_text('text'))
+
+  title = doc.metadata.get('title')
+  author = doc.metadata.get('author')
+  subject = doc.metadata.get('subject')
+  data = {
+    'id': guid(),
+    'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
+    'title': title if title else f"{filename}{ext}",
+    'docAuthor': author if author else 'No author found',
+    'description': subject if subject else 'No description found.',
+    'docSource': 'pdf file uploaded by the user.',
+    'chunkSource': f"{filename}{ext}",
+    'published': file_creation_time(fullpath),
+    'wordCount': len(page_content), # Technically a letter count :p
+    'pageContent': page_content,
+    'token_count_estimate': len(tokenize(page_content))
+  }
+
+  write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py
index a9935b48..f3389c84 100644
--- a/collector/scripts/watch/convert/as_text.py
+++ b/collector/scripts/watch/convert/as_text.py
@@ -14,16 +14,18 @@ def as_text(**kwargs):
   print(f"-- Working {fullpath} --")
 
   data = {
-    'id': guid(),
+    'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
-    'description': "a custom file uploaded by the user.",
+    'docAuthor': 'Unknown', # TODO: Find a better author
+    'description': 'Unknown', # TODO: Find a better description
+    'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
     'pageContent': content,
     'token_count_estimate': len(tokenize(content))
   }
-
+
   write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
   move_source(parent_dir, f"{filename}{ext}", remove=remove)
-  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
+  print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
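
The sketch below is not part of the patch; it is a minimal, standalone illustration of the PyMuPDF + Unidecode flow that the new as_pdf.py follows: read title/author/subject from the PDF metadata, flatten every page to ASCII text, and assemble the same style of payload. The build_payload name and the sample.pdf path are placeholders, not identifiers from this repository.

import os
import fitz  # PyMuPDF, pinned as PyMuPDF==1.22.5 in collector/requirements.txt
from unidecode import unidecode  # pinned as Unidecode==1.3.6


def build_payload(fullpath):
    doc = fitz.open(fullpath)

    # Concatenate every page; unidecode transliterates to plain ASCII,
    # which is what removes the "unknown chars" the commit message mentions.
    page_content = ""
    for page in doc:
        page_content += unidecode(page.get_text("text"))

    # PyMuPDF exposes the PDF metadata as a plain dict; values may be None or empty.
    meta = doc.metadata or {}
    filename = os.path.basename(fullpath)
    return {
        "title": meta.get("title") or filename,
        "docAuthor": meta.get("author") or "No author found",
        "description": meta.get("subject") or "No description found.",
        "pageContent": page_content,
        "wordCount": len(page_content),  # a character count, as in the patch
    }


if __name__ == "__main__":
    print(build_payload("sample.pdf"))  # placeholder path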