import os, json, tempfile
from urllib.parse import urlparse
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost
from requests.exceptions import ReadTimeout
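
# Interactive collectors that scrape one or more article/blog URLs, render
# any JavaScript with requests_html, extract the readable text with
# Unstructured, and write the result (plus a token-count estimate) to JSON
# for later embedding.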
# Example article URL: https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link():
    print("[NOTICE]: The first time running this process it will download supporting libraries.\n\n")
    fqdn_link = input("Paste in the URL of an online article or blog: ")
    if len(fqdn_link) == 0:
        print("Invalid URL!")
        exit(1)

    session = HTMLSession()
    req = session.get(fqdn_link)
    if not req.ok:
        print("Could not reach this url!")
        exit(1)

    # Render the page so JavaScript-driven content is present in the HTML.
    req.html.render()
    full_text = None
    # Write the rendered HTML to a temp file so UnstructuredHTMLLoader can
    # parse it from disk and hand back the readable page text.
    with tempfile.NamedTemporaryFile(mode="w") as tmp:
        tmp.write(req.html.html)
        tmp.seek(0)
        loader = UnstructuredHTMLLoader(tmp.name)
        data = loader.load()[0]
        full_text = data.page_content

    link = append_meta(req, full_text, True)
    if len(full_text) > 0:
        source = urlparse(req.url)
        output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"
        output_path = "./outputs/website-logs"
        transaction_output_filename = f"article-{source.path.replace('/', '_')}.json"
        transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
        if not os.path.isdir(output_path):
            os.makedirs(output_path)
        if not os.path.isdir(transaction_output_dir):
            os.makedirs(transaction_output_dir)

        full_text = append_meta(req, full_text)
        tokenCount = len(tokenize(full_text))
        link['pageContent'] = full_text
        link['token_count_estimate'] = tokenCount

        with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
            json.dump(link, file, ensure_ascii=True, indent=4)
        with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
            json.dump(link, file, ensure_ascii=True, indent=4)
    else:
        print("Could not parse any meaningful data from this link or url.")
        exit(1)

    print("\n\n[Success]: article or link content fetched!")
    print("////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(tokenCount)} using {tokenCount} tokens.")
    print("////////////////////////////")
    exit(0)
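
# `ada_v2_cost` lives in .utils and is not defined in this file. Given the
# $0.0004 / 1K-token rate quoted in the success messages above, a minimal
# sketch of the calculation it presumably performs (an assumption, not the
# actual implementation):
#
#     def ada_v2_cost(token_count: int) -> str:
#         return f"${token_count / 1000 * 0.0004}"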
def links():
    links = []
    prompt = "Paste in the URL of an online article or blog: "
    done = False

    while not done:
        new_link = input(prompt)
        if len(new_link) == 0:
            done = True
            links = [*set(links)]  # drop duplicate URLs before processing
            continue

        links.append(new_link)
        prompt = f"\n{len(links)} links in queue. Submit an empty value when done pasting in links to execute collection.\nPaste in the next URL of an online article or blog: "

    if len(links) == 0:
        print("No valid links provided!")
        exit(1)

    parse_links(links)
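
# Illustrative session (the prompts are the ones printed above; the URL is
# made up):
#
#   Paste in the URL of an online article or blog: https://example.com/post
#   1 links in queue. Submit an empty value when done pasting in links to execute collection.
#   Paste in the next URL of an online article or blog: <enter>
#   Working on https://example.com/post...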
# Parse links from an array of URLs.
def parse_links(links):
    totalTokens = 0
    for link in links:
        if link.endswith(".pdf"):
            print(f"Skipping PDF file: {link}")
            continue

        print(f"Working on {link}...")
        session = HTMLSession()
        # session.get can raise ReadTimeout if the server stalls past 20s.
        try:
            req = session.get(link, timeout=20)
        except ReadTimeout:
            print(f"Timed out fetching {link} - skipping!")
            continue

        if not req.ok:
            print(f"Could not reach {link} - skipping!")
            continue

        req.html.render(timeout=10)
        full_text = None
        with tempfile.NamedTemporaryFile(mode="w") as tmp:
            tmp.write(req.html.html)
            tmp.seek(0)
            loader = UnstructuredHTMLLoader(tmp.name)
            data = loader.load()[0]
            full_text = data.page_content

        # Use a separate name for the metadata dict so the `link` URL is
        # still available for the error message below.
        parsed = append_meta(req, full_text, True)
        if len(full_text) > 0:
            source = urlparse(req.url)
            output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"
            output_path = "./outputs/website-logs"
            transaction_output_filename = f"article-{source.path.replace('/', '_')}.json"
            transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"

            if not os.path.isdir(output_path):
                os.makedirs(output_path)
            if not os.path.isdir(transaction_output_dir):
                os.makedirs(transaction_output_dir)

            full_text = append_meta(req, full_text)
            tokenCount = len(tokenize(full_text))
            parsed['pageContent'] = full_text
            parsed['token_count_estimate'] = tokenCount
            totalTokens += tokenCount

            with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
                json.dump(parsed, file, ensure_ascii=True, indent=4)
            with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
                json.dump(parsed, file, ensure_ascii=True, indent=4)
            req.session.close()
        else:
            print(f"Could not parse any meaningful data from {link}.")
            continue

    print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
    print("////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
    print("////////////////////////////")