mbox parsing improvements v1 (#308)

* mbox parsing improvements v1

* autobots roll out!
This commit is contained in:
Francisco Bischoff 2023-10-30 18:57:33 +00:00 committed by GitHub
parent c3fa67bc86
commit 26dba59249
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,61 +1,127 @@
import os import os
import datetime import datetime
import email.utils import email.utils
from mailbox import mbox import re
import quopri
import base64
from mailbox import mbox, mboxMessage
from slugify import slugify from slugify import slugify
from ..utils import guid, file_creation_time, write_to_server_documents, move_source
from ...utils import tokenize
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from scripts.watch.utils import (
guid,
file_creation_time,
write_to_server_documents,
move_source,
)
from scripts.utils import tokenize
def get_content(message: mboxMessage) -> str:
    """Extract the readable body text of an email message.

    Every MIME part of ``message`` is visited via ``walk()``. The first
    ``text/plain`` part wins and ends the search. ``text/html`` parts seen
    before that are reduced to their visible text with BeautifulSoup and kept
    as a fallback (a later HTML part replaces an earlier one).

    Args:
        message: The message whose body should be extracted.

    Returns:
        The body text. The literal string ``"None"`` is returned when the
        message contains neither a ``text/plain`` nor a ``text/html`` part.
    """
    content = "None"
    declared_charset = None

    for part in message.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            # get_payload(decode=True) may hand back None for an empty part;
            # normalise to empty bytes so the return value is always a str.
            content = part.get_payload(decode=True) or b""
            declared_charset = part.get_content_charset()
            break
        if content_type == "text/html":
            soup = BeautifulSoup(part.get_payload(decode=True), "html.parser")
            content = soup.get_text()

    if isinstance(content, bytes):
        # Honour the charset declared in the part's Content-Type first; the
        # utf-8 -> latin-1 chain remains as the fallback for parts with a
        # missing, unknown, or wrong charset declaration.
        for encoding in (declared_charset, "utf-8"):
            if not encoding:
                continue
            try:
                return content.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                continue
        # latin-1 maps every byte value, so this final step cannot fail.
        return content.decode("latin-1")

    return content
def parse_subject(subject: str) -> str:
    """Decode RFC 2047 encoded-words in an email ``Subject`` header.

    Handles Q (quoted-printable, including ``_`` as space) and B (base64)
    encoded-words, several encoded-words in one header, and headers that mix
    plain text with encoded-words. Subjects without encoded-words are
    returned unchanged.

    Args:
        subject: The raw header value.

    Returns:
        The decoded subject. If the header cannot be parsed it is returned
        as-is; bytes that cannot be decoded with the declared charset are
        decoded as UTF-8 with replacement characters instead of raising.
    """
    # Function-scope imports keep this block self-contained.
    from email.errors import HeaderParseError
    from email.header import decode_header

    try:
        fragments = decode_header(subject)
    except HeaderParseError:
        # Malformed base64 inside an encoded-word: keep the raw header text.
        return str(subject)

    pieces = []
    for data, charset in fragments:
        if isinstance(data, str):
            # decode_header returns the header untouched (as str) when it
            # contains no encoded-word at all.
            pieces.append(data)
            continue
        try:
            # Fragments without a charset are the unencoded stretches between
            # encoded-words; latin-1 maps each byte 1:1 and is exact for ASCII.
            pieces.append(data.decode(charset or "latin-1"))
        except (LookupError, UnicodeDecodeError):
            # Unknown charset name or bytes invalid for it: degrade gracefully.
            pieces.append(data.decode("utf-8", errors="replace"))
    return "".join(pieces)
# Process all mbox-related documents.
def as_mbox(**kwargs):
    """Convert every message of an mbox file into a server document.

    Keyword Args:
        directory: Folder containing the mbox file (default ``"hotdir"``).
        filename: Name of the mbox file without extension (required).
        ext: File extension (default ``".mbox"``).
        remove_on_complete: Forwarded to ``move_source`` as ``remove``
            (default ``False``).

    Returns:
        ``(True, None)`` once the file has been processed, or
        ``(False, "No filename provided.")`` when ``filename`` is missing.
    """
    parent_dir = kwargs.get("directory", "hotdir")
    filename = kwargs.get("filename")
    ext = kwargs.get("ext", ".mbox")
    remove = kwargs.get("remove_on_complete", False)

    if filename is None:
        print("[ERROR]: No filename provided.")
        return (False, "No filename provided.")
    filename = str(filename)

    fullpath = f"{parent_dir}/{filename}{ext}"
    print(f"-- Working {fullpath} --")
    box = mbox(fullpath)

    try:
        for message in box:
            content = get_content(message)
            content = content.strip().replace("\r\n", "\n")

            if not content:
                print("[WARNING]: Mail with no content. Ignored.")
                continue

            date_tuple = email.utils.parsedate_tz(message["Date"])
            if date_tuple:
                local_date = datetime.datetime.fromtimestamp(
                    email.utils.mktime_tz(date_tuple)
                )
                date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S")
            else:
                date_sent = None

            subject = message["Subject"]
            if subject is None:
                print("[WARNING]: Mail with no subject. But has content.")
                subject = "None"
            else:
                subject = parse_subject(subject)

            abs_path = os.path.abspath(
                f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"
            )
            data = {
                "id": guid(),
                "url": f"file://{abs_path}",
                "title": subject,
                "docAuthor": message["From"],
                "description": f"email from {message['From']} to {message['To']}",
                "docSource": "mbox file uploaded by the user.",
                "published": file_creation_time(fullpath),
                "sender": message["From"],
                "recipient": message["To"],
                "subject": subject,
                "date_sent": date_sent,
                # NOTE(review): this is a character count, not a word count;
                # confirm which one consumers of "wordCount" expect.
                "wordCount": len(content),
                "pageContent": content,
                "token_count_estimate": len(tokenize(content)),
            }

            write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
    finally:
        # Release the file handle before the source file is moved or removed.
        box.close()

    move_source(parent_dir, f"{filename}{ext}", remove=remove)
    print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
    return (True, None)