mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-14 18:40:11 +01:00
7edfccaf9a
* WIP adding url uploads to document picker * fix manual script for uploading url to custom-documents * fix metadata for url scraping * wip url parsing * update how async link scraping works * docker-compose defaults added no autocomplete on URLs --------- Co-authored-by: timothycarambat <rambat1010@gmail.com>
45 lines
1.8 KiB
Python
45 lines
1.8 KiB
Python
import json, pyppeteer
|
|
from datetime import datetime
|
|
from .watch.utils import guid
|
|
from dotenv import load_dotenv
|
|
from .watch.utils import guid
|
|
from .utils import tokenize
|
|
from requests_html import AsyncHTMLSession
|
|
|
|
# Load settings from a .env file into the process environment (python-dotenv)
# when this module is imported.
load_dotenv()
|
|
|
|
def normalize_url(url):
    """Return ``url`` with a trailing ``.web`` suffix.

    A value that already ends in ``.web`` is handed back unchanged;
    anything else gets the suffix appended.
    """
    return url if url.endswith('.web') else f"{url}.web"
|
|
|
|
def _meta_content(html, selector):
    """Return the ``content`` attribute of the first match for ``selector``.

    Returns None when no element matches or the element has no ``content``.
    """
    element = html.find(selector, first=True)
    if element is None:
        return None
    return element.attrs.get('content')


def append_meta(request, text, metadata_only=False):
    """Build the metadata record for a scraped web page.

    Args:
        request: Response object whose ``url`` and parsed ``html`` are read.
            NOTE(review): presumably a requests_html response -- confirm
            against the callers.
        text: Extracted page text; stored as ``pageContent`` and used for
            the word count and the token estimate.
        metadata_only: When truthy, return the metadata dict itself instead
            of the serialized string form.

    Returns:
        The metadata dict when ``metadata_only`` is set; otherwise a string
        holding the JSON-encoded metadata followed by ``text``.
    """
    html = request.html

    # Look each element up once; the selectors were previously run twice.
    title_element = html.find('title', first=True)
    title = title_element.text if title_element is not None else ''

    # A <meta> tag can be present without a ``content`` attribute; treat that
    # the same as a missing tag so the fallbacks below always apply.
    description = _meta_content(html, 'meta[name="description"]')
    if description is None:
        description = ''

    published = _meta_content(html, 'meta[property="article:published_time"]')
    if published is None:
        # No publication date on the page: fall back to the scrape time.
        published = datetime.today().strftime('%Y-%m-%d %H:%M:%S')

    meta = {
        'id': guid(),
        'url': normalize_url(request.url),
        'title': title,
        'docAuthor': 'N/A',
        'description': description,
        'docSource': 'web page',
        'chunkSource': request.url,
        'published': published,
        'wordCount': len(text.split(' ')),
        'pageContent': text,
        'token_count_estimate': len(tokenize(text)),
    }

    if metadata_only:
        return meta
    return "Article JSON Metadata:\n" + json.dumps(meta) + "\n\n\nText Content:\n" + text
|
|
|
|
class AsyncHTMLSessionFixed(AsyncHTMLSession):
    """
    AsyncHTMLSession variant that launches the browser with pyppeteer's
    SIGINT/SIGTERM/SIGHUP handling switched off.

    Setup note from the original author:
        pip3 install websockets==6.0 --force-reinstall
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Use --no-sandbox unless the caller supplied its own browser_args.
        self.__browser_args = kwargs.get("browser_args", ["--no-sandbox"])

    @property
    async def browser(self):
        """Return the headless browser, launching it on first access."""
        if hasattr(self, "_browser"):
            return self._browser

        launch_options = {
            "ignoreHTTPSErrors": not self.verify,
            "headless": True,
            # NOTE(review): signal handling is disabled here, presumably so
            # the browser can be started outside the main thread -- confirm.
            "handleSIGINT": False,
            "handleSIGTERM": False,
            "handleSIGHUP": False,
            "args": self.__browser_args,
        }
        self._browser = await pyppeteer.launch(**launch_options)
        return self._browser