anything-llm/collector/scripts/gitbook.py
frasergr 9f33b3dfcb
Docker support (#34)
* Updates for Linux for frontend/server

* frontend/server docker

* updated Dockerfile for deps related to node vectordb

* updates for collector in docker

* docker deps for ODT processing

* ignore another collector dir

* storage mount improvements; run as UID

* fix pypandoc version typo

* permissions fixes
2023-06-13 11:26:11 -07:00

45 lines
1.7 KiB
Python

import os, json
from langchain.document_loaders import GitbookLoader
from urllib.parse import urlparse
from datetime import datetime
from alive_progress import alive_it
from .utils import tokenize
from uuid import uuid4
def gitbook():
    """Interactively collect a GitBook and store each loaded page as JSON.

    Prompts on stdin for the GitBook URL, loads it with ``GitbookLoader``
    (``load_all_paths`` is enabled only when the URL path is empty or ``/``),
    and writes one ``doc-<name>.json`` file per loaded document into both:

    * ``./outputs/gitbook-logs/<netloc>``
    * ``../server/storage/documents/gitbook-<netloc>``

    Both directories are relative to the current working directory and are
    created when missing.

    Raises:
        SystemExit: with status 1 when the entered URL is blank or has no
            network location (e.g. the scheme was omitted).
    """
    # Local import keeps this block self-contained; sys.exit is used instead
    # of the `exit` builtin, which only exists when the `site` module is loaded.
    import sys

    url = input("Enter the URL of the GitBook you want to collect: ").strip()
    primary_source = urlparse(url)
    # A URL without a host would otherwise produce the directory names
    # "gitbook-logs/" and "gitbook-" before the loader gets a chance to fail.
    if not url or not primary_source.netloc:
        print("Not a gitbook URL")
        sys.exit(1)

    output_path = f"./outputs/gitbook-logs/{primary_source.netloc}"
    transaction_output_dir = f"../server/storage/documents/gitbook-{primary_source.netloc}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_path, exist_ok=True)
    os.makedirs(transaction_output_dir, exist_ok=True)

    loader = GitbookLoader(url, load_all_paths=primary_source.path in ('', '/'))
    for doc in alive_it(loader.load()):
        metadata = doc.metadata
        content = doc.page_content
        source = urlparse(metadata.get('source'))
        # The root page is stored as "home"; any other page uses its URL path
        # with "/" flattened to "_" so it is a single file name.
        name = 'home' if source.path in ('', '/') else source.path.replace('/', '_')
        filename = f"doc-{name}.json"

        data = {
            'id': str(uuid4()),
            'url': metadata.get('source'),
            "title": metadata.get('title'),
            # NOTE(review): description mirrors the title; confirm whether the
            # loader metadata offers a real description field.
            "description": metadata.get('title'),
            "published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
            # NOTE(review): this is len(content), i.e. a character count when
            # content is a str, not a word count. Left unchanged because
            # consumers of this JSON may rely on the current value.
            "wordCount": len(content),
            'pageContent': content,
            'token_count_estimate': len(tokenize(content))
        }

        # Serialize once; the same payload goes to the log directory and to
        # the server's document storage.
        payload = json.dumps(data, ensure_ascii=True, indent=4)
        for directory in (output_path, transaction_output_dir):
            with open(f"{directory}/{filename}", 'w', encoding='utf-8') as file:
                file.write(payload)