anything-llm/collector/scripts/link_utils.py

23 lines
1.1 KiB
Python
Raw Normal View History

2023-06-04 04:28:07 +02:00
import json
from datetime import datetime
from dotenv import load_dotenv
2023-11-15 01:41:39 +01:00
from .watch.utils import guid
from .utils import tokenize
2023-06-04 04:28:07 +02:00
# Runs at import time: python-dotenv's load_dotenv() reads a .env file (if one
# is found) into the process environment. Nothing in this module's visible code
# reads those variables directly; presumably other collector code relies on them.
load_dotenv()
def _meta_content(html, selector):
    """Return the ``content`` attribute of the first element matching *selector*.

    Returns ``None`` when no element matches, or when the matching element
    carries no ``content`` attribute.
    """
    element = html.find(selector, first=True)
    if element is None:
        return None
    return element.attrs.get('content')


def append_meta(request, text, metadata_only = False):
    """Build the metadata record for a scraped web page.

    Args:
        request: Response-like object exposing ``.url`` and ``.html.find(...)``
            (presumably a requests-html response; confirm against callers).
        text: Extracted page text. Stored as ``pageContent`` and used for the
            word and token counts.
        metadata_only: When truthy, return the metadata dict itself. Otherwise
            return a string holding the JSON-encoded metadata followed by the
            page text.

    Returns:
        dict | str: The metadata dict, or the combined "metadata + text" string.
    """
    html = request.html

    # Look each element up once instead of querying the DOM twice per field.
    title_element = html.find('title', first=True)
    title = title_element.text if title_element is not None else ''

    # A <meta> tag without a ``content`` attribute is treated like a missing
    # tag, so ``None`` never leaks into the description/published fields.
    description = _meta_content(html, 'meta[name="description"]')
    if description is None:
        description = ''

    published = _meta_content(html, 'meta[property="article:published_time"]')
    if published is None:
        # No publication time on the page: fall back to the local time of the scrape.
        published = datetime.today().strftime('%Y-%m-%d %H:%M:%S')

    meta = {
        'id': guid(),
        'url': request.url,
        'title': title,
        'docAuthor': 'N/A',
        'docSource': 'webpage',
        'chunkSource': request.url,
        'description': description,
        'published': published,
        # Counts pieces separated by single spaces (original convention kept
        # so the emitted metadata stays unchanged).
        'wordCount': len(text.split(' ')),
        'pageContent': text,
        'token_count_estimate': len(tokenize(text)),
    }

    if metadata_only:
        return meta
    return "Article JSON Metadata:\n" + json.dumps(meta) + "\n\n\nText Content:\n" + text