2023-06-04 04:28:07 +02:00
|
|
|
import os, json
|
|
|
|
from langchain.document_loaders import GitbookLoader
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
from datetime import datetime
|
|
|
|
from alive_progress import alive_it
|
|
|
|
from .utils import tokenize
|
|
|
|
from uuid import uuid4
|
|
|
|
|
|
|
|
def gitbook():
    """Interactively collect a GitBook site and save every page as JSON.

    Prompts on stdin for the GitBook URL, loads its pages through
    ``GitbookLoader`` and writes one ``doc-<name>.json`` file per page into
    both ``./outputs/gitbook-logs/<host>`` and
    ``../server/storage/documents/gitbook-<host>``, where ``<host>`` is the
    network location of the entered URL.

    Raises:
        SystemExit: With status 1 when the entered value is empty,
            whitespace-only, or has no host component (for example a URL
            typed without its scheme).
    """
    url = input("Enter the URL of the GitBook you want to collect: ").strip()
    primary_source = urlparse(url)

    # Without a host the directories below would be named after an empty
    # string, so reject the input before touching the filesystem.
    if not url or not primary_source.netloc:
        print("Not a gitbook URL")
        # Same effect as exit(1) without depending on the `site` helper.
        raise SystemExit(1)

    output_path = f"./outputs/gitbook-logs/{primary_source.netloc}"
    transaction_output_dir = f"../server/storage/documents/gitbook-{primary_source.netloc}"
    output_dirs = (output_path, transaction_output_dir)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    for directory in output_dirs:
        os.makedirs(directory, exist_ok=True)

    # Ask the loader for all paths only when the URL points at the site root.
    loader = GitbookLoader(url, load_all_paths=primary_source.path in ('', '/'))

    for doc in alive_it(loader.load()):
        metadata = doc.metadata
        content = doc.page_content
        page_url = metadata.get('source')

        # Derive a flat file name from the page path; the root page is "home".
        source = urlparse(page_url)
        name = 'home' if source.path in ('', '/') else source.path.replace('/', '_')
        filename = f"doc-{name}.json"

        data = {
            'id': str(uuid4()),
            'url': page_url,
            "title": metadata.get('title'),
            # The page title is reused as the description.
            "description": metadata.get('title'),
            "published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
            # NOTE(review): len(content) looks like a character count rather
            # than a word count; kept as-is so emitted data is unchanged.
            "wordCount": len(content),
            'pageContent': content,
            'token_count_estimate': len(tokenize(content)),
        }

        # The same document is written to the log copy and the server copy.
        for directory in output_dirs:
            with open(f"{directory}/{filename}", 'w', encoding='utf-8') as file:
                json.dump(data, file, ensure_ascii=True, indent=4)
|