anything-llm/collector/scripts/gitbook.py
Timothy Carambat 3e78476739
Franzbischoff document improvements (#241)
* cosmetic changes to be compatible with hadolint

* common configuration for most editors until better plugins come up

* Changes on PDF metadata, using PyMuPDF (faster and more compatible)

* small changes on other file ingestions in order to try to keep the fields equal

* Lint, review, and review

* fixed unknown chars

* Use PyMuPDF for pdf loading for 200% speed increase
linting

---------

Co-authored-by: Francisco Bischoff <franzbischoff@gmail.com>
Co-authored-by: Francisco Bischoff <984592+franzbischoff@users.noreply.github.com>
2023-09-18 16:21:37 -07:00

45 lines
1.7 KiB
Python

import os, json
from langchain.document_loaders import GitbookLoader
from urllib.parse import urlparse
from datetime import datetime
from alive_progress import alive_it
from .utils import tokenize
from uuid import uuid4
def _gitbook_doc_name(path):
    """Return the file-name stem used for a GitBook page path.

    The site root ('' or '/') maps to 'home'; for any other path every '/'
    is replaced with '_' so the result can be used as a flat file name.
    """
    return 'home' if path in ['', '/'] else path.replace('/', '_')


def _gitbook_record(doc):
    """Build the JSON-serializable record written for one loaded page.

    `doc` is an item yielded by `GitbookLoader.load()`; only its `metadata`
    mapping ('source' and 'title' keys) and `page_content` string are read.
    """
    metadata = doc.metadata
    content = doc.page_content
    return {
        'id': str(uuid4()),
        'url': metadata.get('source'),
        'title': metadata.get('title'),
        # No separate description is read from the metadata; the title is reused.
        'description': metadata.get('title'),
        'published': datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
        # Count whitespace-separated words; len(content) would count characters.
        'wordCount': len(content.split()),
        'pageContent': content,
        'token_count_estimate': len(tokenize(content)),
    }


def gitbook():
    """Interactively collect a GitBook site and write each page as JSON.

    Prompts for a URL on stdin, loads its pages with `GitbookLoader`, and
    writes one `doc-<name>.json` file per page into both
    `./outputs/gitbook-logs/<host>` and
    `../server/storage/documents/gitbook-<host>`.

    Raises:
        SystemExit: with status 1 when the entered URL is empty or has no
            host component.
    """
    url = input("Enter the URL of the GitBook you want to collect: ").strip()
    primary_source = urlparse(url)
    # Both output directories are named after the host, so a URL without one
    # (empty input, missing scheme, ...) cannot be processed.
    if not primary_source.netloc:
        print("Not a gitbook URL")
        # Equivalent to exit(1) but does not depend on the `site` builtins.
        raise SystemExit(1)

    output_path = f"./outputs/gitbook-logs/{primary_source.netloc}"
    transaction_output_dir = f"../server/storage/documents/gitbook-{primary_source.netloc}"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_path, exist_ok=True)
    os.makedirs(transaction_output_dir, exist_ok=True)

    # load_all_paths is enabled only when the URL points at the site root.
    loader = GitbookLoader(url, load_all_paths=primary_source.path in ['', '/'])
    for doc in alive_it(loader.load()):
        source = urlparse(doc.metadata.get('source'))
        filename = f"doc-{_gitbook_doc_name(source.path)}.json"
        data = _gitbook_record(doc)
        # The same record is stored in the log directory and the server's
        # document storage under the same file name.
        for directory in (output_path, transaction_output_dir):
            with open(f"{directory}/{filename}", 'w', encoding='utf-8') as file:
                json.dump(data, file, ensure_ascii=True, indent=4)