mbox parsing improvements v1 (#308)

* mbox parsing improvements v1 * autobots roll out!
2024-10-05 18:30:09 +02:00 · 2023-10-30 18:57:33 +00:00 · 2023-10-30 18:57:33 +00:00 · 26dba59249
commit 26dba59249
parent c3fa67bc86
1 changed files with 100 additions and 34 deletions
--- a/collector/scripts/watch/convert/as_mbox.py
+++ b/collector/scripts/watch/convert/as_mbox.py
@ -1,57 +1,123 @@
 import os
 import datetime
 import email.utils
-from mailbox import mbox
+import re
 import quopri
 import base64
 from mailbox import mbox, mboxMessage
 from slugify import slugify
 from ..utils import guid, file_creation_time, write_to_server_documents, move_source
 from ...utils import tokenize
 from bs4 import BeautifulSoup
 from scripts.watch.utils import (
    guid,
    file_creation_time,
    write_to_server_documents,
    move_source,
 )
 from scripts.utils import tokenize
 def get_content(message: mboxMessage) -> str:
    content = "None"
    # if message.is_multipart():
    for part in message.walk():
        if part.get_content_type() == "text/plain":
            content = part.get_payload(decode=True)
            break
        elif part.get_content_type() == "text/html":
            soup = BeautifulSoup(part.get_payload(decode=True), "html.parser")
            content = soup.get_text()
    if isinstance(content, bytes):
        try:
            content = content.decode("utf-8")
        except UnicodeDecodeError:
            content = content.decode("latin-1")
    return content
 def parse_subject(subject: str) -> str:
    # Check if subject is Quoted-Printable encoded
    if subject.startswith("=?") and subject.endswith("?="):
        # Extract character set and encoding information
        match = re.match(r"=\?(.+)\?(.)\?(.+)\?=", subject)
        if match:
            charset = match.group(1)
            encoding = match.group(2)
            encoded_text = match.group(3)
            is_quoted_printable = encoding.upper() == "Q"
            is_base64 = encoding.upper() == "B"
            if is_quoted_printable:
                # Decode Quoted-Printable encoded text
                subject = quopri.decodestring(encoded_text).decode(charset)
            elif is_base64:
                # Decode Base64 encoded text
                subject = base64.b64decode(encoded_text).decode(charset)
    return subject
 # Process all mbox-related documents.
 def as_mbox(**kwargs):
-    parent_dir = kwargs.get('directory', 'hotdir')
+    parent_dir = kwargs.get("directory", "hotdir")
-    filename = kwargs.get('filename')
+    filename = kwargs.get("filename")
-    ext = kwargs.get('ext', '.mbox')
+    ext = kwargs.get("ext", ".mbox")
-    remove = kwargs.get('remove_on_complete', False)
+    remove = kwargs.get("remove_on_complete", False)
    if filename is not None:
        filename = str(filename)
    else:
        print("[ERROR]: No filename provided.")
        return (False, "No filename provided.")
    fullpath = f"{parent_dir}/{filename}{ext}"
    print(f"-- Working {fullpath} --")
    box = mbox(fullpath)
    for message in box:
-        content = ""
+        content = get_content(message)
-        if message.is_multipart():
+        content = content.strip().replace("\r\n", "\n")
            for part in message.get_payload():
                if part.get_content_type() == 'text/plain':
                    content = part.get_payload()
                elif part.get_content_type() == 'text/html':
                    soup = BeautifulSoup(part.get_payload(), 'html.parser')
                    content = soup.get_text()
        else:
            content = message.get_payload()
-        date_tuple = email.utils.parsedate_tz(message['Date'])
+        if len(content) == 0:
            print("[WARNING]: Mail with no content. Ignored.")
            continue
        date_tuple = email.utils.parsedate_tz(message["Date"])
        if date_tuple:
-            local_date = datetime.datetime.fromtimestamp(email.utils.mktime_tz(date_tuple))
+            local_date = datetime.datetime.fromtimestamp(
                email.utils.mktime_tz(date_tuple)
            )
            date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S")
        else:
            date_sent = None
        subject = message["Subject"]
        if subject is None:
            print("[WARNING]: Mail with no subject. But has content.")
            subject = "None"
        else:
            subject = parse_subject(subject)
        abs_path = os.path.abspath(
            f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"
        )
        data = {
-            'id': guid(),
+            "id": guid(),
-            'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
+            "url": f"file://{abs_path}",
-            'title': message['Subject'],
+            "title": subject,
-            'docAuthor': message['From'],
+            "docAuthor": message["From"],
-            'description': f"email {message['From']} to {message['To']}",
+            "description": f"email from {message['From']} to {message['To']}",
-            'docSource': "mbox file uploaded by the user.",
+            "docSource": "mbox file uploaded by the user.",
-            'published': file_creation_time(fullpath),
+            "published": file_creation_time(fullpath),
-            'sender': message['From'],
+            "sender": message["From"],
-            'recipient': message['To'],
+            "recipient": message["To"],
-            'subject': message['Subject'],
+            "subject": subject,
-            'date_sent': date_sent,
+            "date_sent": date_sent,
-            'wordCount': len(content),
+            "wordCount": len(content),
-            'pageContent': content,
+            "pageContent": content,
-            'token_count_estimate': len(tokenize(content))
+            "token_count_estimate": len(tokenize(content)),
        }
        write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")