diff --git a/.github/workflows/check-translations.yaml b/.github/workflows/check-translations.yaml new file mode 100644 index 000000000..1dae48814 --- /dev/null +++ b/.github/workflows/check-translations.yaml @@ -0,0 +1,37 @@ +# This Github action is for validation of all languages which translations are offered for +# in the locales folder in `frontend/src`. All languages are compared to the EN translation +# schema since that is the fallback language setting. This workflow will run on all PRs that +# modify any files in the translation directory +name: Verify translations files + +concurrency: + group: build-${{ github.ref }} + cancel-in-progress: true + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - "frontend/src/locales/**.js" + +jobs: + run-script: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version: '18' + + - name: Run verifyTranslations.mjs script + run: | + cd frontend/src/locales + node verifyTranslations.mjs + + - name: Fail job on error + if: failure() + run: exit 1 diff --git a/.vscode/settings.json b/.vscode/settings.json index 4930aa2d1..aafdb17d8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,14 +3,17 @@ "adoc", "aibitat", "AIbitat", + "allm", "anythingllm", "Astra", "Chartable", + "cleancss", "comkey", "cooldown", "cooldowns", "Deduplicator", "Dockerized", + "docpath", "elevenlabs", "Embeddable", "epub", @@ -32,7 +35,9 @@ "opendocument", "openrouter", "Qdrant", + "searxng", "Serper", + "Serply", "textgenwebui", "togetherai", "vectordbs", diff --git a/README.md b/README.md index bc3e9fdd8..8039d69a0 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@

AnythingLLM: The all-in-one AI app you were looking for.
- Chat with your docs, use AI Agents, hyper-configurable, multi-user, & no fustrating set up required. + Chat with your docs, use AI Agents, hyper-configurable, multi-user, & no frustrating set up required.

diff --git a/collector/extensions/index.js b/collector/extensions/index.js index 077264646..a88b38eee 100644 --- a/collector/extensions/index.js +++ b/collector/extensions/index.js @@ -1,18 +1,41 @@ +const { setDataSigner } = require("../middleware/setDataSigner"); const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity"); const { reqBody } = require("../utils/http"); const { validURL } = require("../utils/url"); +const RESYNC_METHODS = require("./resync"); function extensions(app) { if (!app) return; app.post( - "/ext/github-repo", - [verifyPayloadIntegrity], + "/ext/resync-source-document", + [verifyPayloadIntegrity, setDataSigner], async function (request, response) { try { - const loadGithubRepo = require("../utils/extensions/GithubRepo"); + const { type, options } = reqBody(request); + if (!RESYNC_METHODS.hasOwnProperty(type)) throw new Error(`Type "${type}" is not a valid type to sync.`); + return await RESYNC_METHODS[type](options, response); + } catch (e) { + console.error(e); + response.status(200).json({ + success: false, + content: null, + reason: e.message || "A processing error occurred.", + }); + } + return; + } + ) + + app.post( + "/ext/github-repo", + [verifyPayloadIntegrity, setDataSigner], + async function (request, response) { + try { + const { loadGithubRepo } = require("../utils/extensions/GithubRepo"); const { success, reason, data } = await loadGithubRepo( - reqBody(request) + reqBody(request), + response, ); response.status(200).json({ success, @@ -67,7 +90,7 @@ function extensions(app) { [verifyPayloadIntegrity], async function (request, response) { try { - const loadYouTubeTranscript = require("../utils/extensions/YoutubeTranscript"); + const { loadYouTubeTranscript } = require("../utils/extensions/YoutubeTranscript"); const { success, reason, data } = await loadYouTubeTranscript( reqBody(request) ); @@ -108,12 +131,13 @@ function extensions(app) { app.post( "/ext/confluence", - [verifyPayloadIntegrity], + 
[verifyPayloadIntegrity, setDataSigner], async function (request, response) { try { - const loadConfluence = require("../utils/extensions/Confluence"); + const { loadConfluence } = require("../utils/extensions/Confluence"); const { success, reason, data } = await loadConfluence( - reqBody(request) + reqBody(request), + response ); response.status(200).json({ success, reason, data }); } catch (e) { diff --git a/collector/extensions/resync/index.js b/collector/extensions/resync/index.js new file mode 100644 index 000000000..ba967962e --- /dev/null +++ b/collector/extensions/resync/index.js @@ -0,0 +1,113 @@ +const { getLinkText } = require("../../processLink"); + +/** + * Fetches the content of a raw link. Returns the content as a text string of the link in question. + * @param {object} data - metadata from document (eg: link) + * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response + */ +async function resyncLink({ link }, response) { + if (!link) throw new Error('Invalid link provided'); + try { + const { success, content = null } = await getLinkText(link); + if (!success) throw new Error(`Failed to sync link content. ${reason}`); + response.status(200).json({ success, content }); + } catch (e) { + console.error(e); + response.status(200).json({ + success: false, + content: null, + }); + } +} + +/** + * Fetches the content of a YouTube link. Returns the content as a text string of the video in question. + * We offer this as there may be some videos where a transcription could be manually edited after initial scraping + * but in general - transcriptions often never change. 
+ * @param {object} data - metadata from document (eg: link) + * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response + */ +async function resyncYouTube({ link }, response) { + if (!link) throw new Error('Invalid link provided'); + try { + const { fetchVideoTranscriptContent } = require("../../utils/extensions/YoutubeTranscript"); + const { success, reason, content } = await fetchVideoTranscriptContent({ url: link }); + if (!success) throw new Error(`Failed to sync YouTube video transcript. ${reason}`); + response.status(200).json({ success, content }); + } catch (e) { + console.error(e); + response.status(200).json({ + success: false, + content: null, + }); + } +} + +/** + * Fetches the content of a specific confluence page via its chunkSource. + * Returns the content as a text string of the page in question and only that page. + * @param {object} data - metadata from document (eg: chunkSource) + * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response + */ +async function resyncConfluence({ chunkSource }, response) { + if (!chunkSource) throw new Error('Invalid source property provided'); + try { + // Confluence data is `payload` encrypted. So we need to expand its + // encrypted payload back into query params so we can reFetch the page with same access token/params. + const source = response.locals.encryptionWorker.expandPayload(chunkSource); + const { fetchConfluencePage } = require("../../utils/extensions/Confluence"); + const { success, reason, content } = await fetchConfluencePage({ + pageUrl: `https:${source.pathname}`, // need to add back the real protocol + baseUrl: source.searchParams.get('baseUrl'), + accessToken: source.searchParams.get('token'), + username: source.searchParams.get('username'), + }); + + if (!success) throw new Error(`Failed to sync Confluence page content. 
${reason}`); + response.status(200).json({ success, content }); + } catch (e) { + console.error(e); + response.status(200).json({ + success: false, + content: null, + }); + } +} + +/** + * Fetches the content of a specific confluence page via its chunkSource. + * Returns the content as a text string of the page in question and only that page. + * @param {object} data - metadata from document (eg: chunkSource) + * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response + */ +async function resyncGithub({ chunkSource }, response) { + if (!chunkSource) throw new Error('Invalid source property provided'); + try { + // Github file data is `payload` encrypted (might contain PAT). So we need to expand its + // encrypted payload back into query params so we can reFetch the page with same access token/params. + const source = response.locals.encryptionWorker.expandPayload(chunkSource); + const { fetchGithubFile } = require("../../utils/extensions/GithubRepo"); + const { success, reason, content } = await fetchGithubFile({ + repoUrl: `https:${source.pathname}`, // need to add back the real protocol + branch: source.searchParams.get('branch'), + accessToken: source.searchParams.get('pat'), + sourceFilePath: source.searchParams.get('path'), + }); + + if (!success) throw new Error(`Failed to sync Github file content. 
${reason}`); + response.status(200).json({ success, content }); + } catch (e) { + console.error(e); + response.status(200).json({ + success: false, + content: null, + }); + } +} + +module.exports = { + link: resyncLink, + youtube: resyncYouTube, + confluence: resyncConfluence, + github: resyncGithub, +} \ No newline at end of file diff --git a/collector/middleware/setDataSigner.js b/collector/middleware/setDataSigner.js new file mode 100644 index 000000000..3ea3b2f81 --- /dev/null +++ b/collector/middleware/setDataSigner.js @@ -0,0 +1,41 @@ +const { EncryptionWorker } = require("../utils/EncryptionWorker"); +const { CommunicationKey } = require("../utils/comKey"); + +/** + * Express Response Object interface with defined encryptionWorker attached to locals property. + * @typedef {import("express").Response & import("express").Response['locals'] & {encryptionWorker: EncryptionWorker} } ResponseWithSigner +*/ + +// You can use this middleware to assign the EncryptionWorker to the response locals +// property so that if can be used to encrypt/decrypt arbitrary data via response object. +// eg: Encrypting API keys in chunk sources. + +// The way this functions is that the rolling RSA Communication Key is used server-side to private-key encrypt the raw +// key of the persistent EncryptionManager credentials. Since EncryptionManager credentials do _not_ roll, we should not send them +// even between server<>collector in plaintext because if the user configured the server/collector to be public they could technically +// be exposing the key in transit via the X-Payload-Signer header. Even if this risk is minimal we should not do this. + +// This middleware uses the CommunicationKey public key to first decrypt the base64 representation of the EncryptionManager credentials +// and then loads that in to the EncryptionWorker as a buffer so we can use the same credentials across the system. 
// Differs from EncryptionManager in that is does not set or define the keys that will be used
// to encrypt or read data and it must be told the key (as base64 string) explicitly that will be used and is provided to
// the class on creation. This key should be the same `key` that is used by the EncryptionManager class.
class EncryptionWorker {
  /**
   * @param {string} presetKeyBase64 - base64 encoded AES key to encrypt/decrypt with.
   */
  constructor(presetKeyBase64 = "") {
    this.key = Buffer.from(presetKeyBase64, "base64");
    this.algorithm = "aes-256-cbc";
    this.separator = ":"; // joins `<ciphertext hex>:<iv hex>` in encrypted strings
  }

  // NOTE(review): the log tag reads "EncryptionManager" although this class is
  // EncryptionWorker - kept byte-identical in case log consumers match on it.
  log(text, ...args) {
    console.log(`\x1b[36m[EncryptionManager]\x1b[0m ${text}`, ...args);
  }

  /**
   * Give a chunk source, parse its payload query param and expand that object back into the URL
   * as additional query params.
   *
   * If the payload cannot be decrypted or is not a JSON object the URL is returned
   * with its `payload` param untouched. Payload entries whose value is `null` or
   * `undefined` are skipped, so `searchParams.get(key)` yields `null` for them
   * instead of the literal string "null".
   * @param {string} chunkSource
   * @returns {URL} Javascript URL object with query params decrypted from payload query param.
   * @throws {TypeError} when `chunkSource` itself is not a parsable URL.
   */
  expandPayload(chunkSource = "") {
    try {
      const url = new URL(chunkSource);
      if (!url.searchParams.has("payload")) return url;

      const decryptedPayload = this.decrypt(url.searchParams.get("payload"));
      // decrypt() has already logged the cause; nothing to expand.
      if (decryptedPayload === null) return url;

      const encodedParams = JSON.parse(decryptedPayload);
      if (
        encodedParams === null ||
        typeof encodedParams !== "object" ||
        Array.isArray(encodedParams)
      )
        throw new Error("Decrypted payload is not a JSON object.");

      url.searchParams.delete("payload"); // remove payload prop

      // Add all query params needed to replay as query params
      for (const [key, value] of Object.entries(encodedParams)) {
        // URLSearchParams stringifies everything, so a null credential would
        // otherwise come back as the truthy string "null".
        if (value === null || value === undefined) continue;
        url.searchParams.append(key, value);
      }
      return url;
    } catch (e) {
      console.error(e);
    }
    return new URL(chunkSource);
  }

  /**
   * Encrypts a UTF-8 string with the preset key and a fresh random 16 byte IV.
   * @param {string|null} plainTextString
   * @returns {string|null} `<ciphertext hex>:<iv hex>`, or null when the input is empty or encryption fails.
   */
  encrypt(plainTextString = null) {
    try {
      if (!plainTextString)
        throw new Error("Empty string is not valid for this method.");
      const iv = crypto.randomBytes(16);
      const cipher = crypto.createCipheriv(this.algorithm, this.key, iv);
      const encrypted = cipher.update(plainTextString, "utf8", "hex");
      return [
        encrypted + cipher.final("hex"),
        Buffer.from(iv).toString("hex"),
      ].join(this.separator);
    } catch (e) {
      this.log(e);
      return null;
    }
  }

  /**
   * Reverses `encrypt`.
   * @param {string} encryptedString - `<ciphertext hex>:<iv hex>`
   * @returns {string|null} the plaintext, or null when the input is malformed or decryption fails.
   */
  decrypt(encryptedString) {
    try {
      if (typeof encryptedString !== "string")
        throw new Error("Encrypted value must be a string.");
      const [encrypted, iv] = encryptedString.split(this.separator);
      if (!iv) throw new Error("IV not found");
      const decipher = crypto.createDecipheriv(
        this.algorithm,
        this.key,
        Buffer.from(iv, "hex")
      );
      return decipher.update(encrypted, "hex", "utf8") + decipher.final("utf8");
    } catch (e) {
      this.log(e);
      return null;
    }
  }
}
+ // Returns plaintext string of the data that was encrypted. + decrypt(base64String = "") { + return crypto + .publicDecrypt(this.#readPublicKey(), Buffer.from(base64String, "base64")) + .toString(); + } } module.exports = { CommunicationKey }; diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js index 7e9b4059d..7fdaa6d8c 100644 --- a/collector/utils/extensions/Confluence/index.js +++ b/collector/utils/extensions/Confluence/index.js @@ -3,44 +3,19 @@ const path = require("path"); const { default: slugify } = require("slugify"); const { v4 } = require("uuid"); const UrlPattern = require("url-pattern"); -const { writeToServerDocuments } = require("../../files"); +const { writeToServerDocuments, sanitizeFileName } = require("../../files"); const { tokenizeString } = require("../../tokenizer"); const { ConfluencePagesLoader, } = require("langchain/document_loaders/web/confluence"); -function validSpaceUrl(spaceUrl = "") { - // Atlassian default URL match - const atlassianPattern = new UrlPattern( - "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*" - ); - const atlassianMatch = atlassianPattern.match(spaceUrl); - if (atlassianMatch) { - return { valid: true, result: atlassianMatch }; - } - - let customMatch = null; - [ - "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*", // Custom Confluence space - "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*", // Custom Confluence space + Human-readable space tag. - ].forEach((matchPattern) => { - if (!!customMatch) return; - const pattern = new UrlPattern(matchPattern); - customMatch = pattern.match(spaceUrl); - }); - - if (customMatch) { - customMatch.customDomain = - (customMatch.subdomain ? 
`${customMatch.subdomain}.` : "") + // - `${customMatch.domain}.${customMatch.tld}`; - return { valid: true, result: customMatch, custom: true }; - } - - // No match - return { valid: false, result: null }; -} - -async function loadConfluence({ pageUrl, username, accessToken }) { +/** + * Load Confluence documents from a spaceID and Confluence credentials + * @param {object} args - forwarded request body params + * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker + * @returns + */ +async function loadConfluence({ pageUrl, username, accessToken }, response) { if (!pageUrl || !username || !accessToken) { return { success: false, @@ -49,21 +24,16 @@ async function loadConfluence({ pageUrl, username, accessToken }) { }; } - const validSpace = validSpaceUrl(pageUrl); - if (!validSpace.result) { + const { valid, result } = validSpaceUrl(pageUrl); + if (!valid) { return { success: false, reason: - "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*", + "Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*", }; } - const { subdomain, customDomain, spaceKey } = validSpace.result; - let baseUrl = `https://${subdomain}.atlassian.net/wiki`; - if (customDomain) { - baseUrl = `https://${customDomain}/wiki`; - } - + const { apiBase: baseUrl, spaceKey, subdomain } = result; console.log(`-- Working Confluence ${baseUrl} --`); const loader = new ConfluencePagesLoader({ baseUrl, @@ -106,7 +76,10 @@ async function loadConfluence({ pageUrl, username, accessToken }) { docAuthor: subdomain, description: doc.metadata.title, docSource: `${subdomain} Confluence`, - chunkSource: `confluence://${doc.metadata.url}`, + chunkSource: generateChunkSource( + { doc, baseUrl, 
accessToken, username }, + response.locals.encryptionWorker + ), published: new Date().toLocaleString(), wordCount: doc.pageContent.split(" ").length, pageContent: doc.pageContent, @@ -116,11 +89,11 @@ async function loadConfluence({ pageUrl, username, accessToken }) { console.log( `[Confluence Loader]: Saving ${doc.metadata.title} to ${outFolder}` ); - writeToServerDocuments( - data, - `${slugify(doc.metadata.title)}-${data.id}`, - outFolderPath + + const fileName = sanitizeFileName( + `${slugify(doc.metadata.title)}-${data.id}` ); + writeToServerDocuments(data, fileName, outFolderPath); }); return { @@ -133,4 +106,194 @@ async function loadConfluence({ pageUrl, username, accessToken }) { }; } -module.exports = loadConfluence; +/** + * Gets the page content from a specific Confluence page, not all pages in a workspace. + * @returns + */ +async function fetchConfluencePage({ + pageUrl, + baseUrl, + username, + accessToken, +}) { + if (!pageUrl || !baseUrl || !username || !accessToken) { + return { + success: false, + content: null, + reason: + "You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.", + }; + } + + const { valid, result } = validSpaceUrl(pageUrl); + if (!valid) { + return { + success: false, + content: null, + reason: + "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*", + }; + } + + console.log(`-- Working Confluence Page ${pageUrl} --`); + const { spaceKey } = result; + const loader = new ConfluencePagesLoader({ + baseUrl, + spaceKey, + username, + accessToken, + }); + + const { docs, error } = await loader + .load() + .then((docs) => { + return { docs, error: null }; + }) + .catch((e) => { + return { + docs: [], + error: e.message?.split("Error:")?.[1] || e.message, + }; + }); + + if (!docs.length || !!error) { + return { + success: false, + reason: error ?? 
/**
 * Generates the correct API base URL for interfacing with the Confluence REST API
 * depending on the URL pattern being used since there are various ways to host/access a
 * Confluence space.
 *
 * Host resolution:
 *  - an explicit `customDomain` on the match result wins;
 *  - otherwise, a match carrying `domain` and `tld` (self-hosted patterns) is rebuilt as
 *    `[subdomain.]domain.tld`;
 *  - otherwise the Atlassian cloud host `<subdomain>.atlassian.net` is used.
 *
 * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match
 * @param {boolean} isCustomDomain - when true no `/wiki` subpath is appended (the `/display/` style URLs)
 * @returns {string} - the resulting REST API URL
 */
function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
  const { subdomain, domain, tld, customDomain } = matchResult;
  const subpath = isCustomDomain ? `` : `/wiki`;

  // The original read an undeclared `customDomain` variable here and threw a
  // ReferenceError for every custom-domain URL. Derive it from the match instead.
  const selfHostedDomain =
    customDomain ??
    (domain && tld
      ? `${subdomain ? `${subdomain}.` : ""}${domain}.${tld}`
      : null);

  if (selfHostedDomain) return `https://${selfHostedDomain}${subpath}`;
  return `https://${subdomain}.atlassian.net${subpath}`;
}
+ matchResult = patterns.custom.match(spaceUrl); + if (matchResult) { + return { + valid: matchResult.hasOwnProperty("spaceKey"), + result: { + ...matchResult, + apiBase: generateAPIBaseUrl(matchResult, true), + }, + }; + } + + // No match + return { valid: false, result: null }; +} + +/** + * Generate the full chunkSource for a specific Confluence page so that we can resync it later. + * This data is encrypted into a single `payload` query param so we can replay credentials later + * since this was encrypted with the systems persistent password and salt. + * @param {object} chunkSourceInformation + * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker + * @returns {string} + */ +function generateChunkSource( + { doc, baseUrl, accessToken, username }, + encryptionWorker +) { + const payload = { + baseUrl, + token: accessToken, + username, + }; + return `confluence://${doc.metadata.url}?payload=${encryptionWorker.encrypt( + JSON.stringify(payload) + )}`; +} + +module.exports = { + loadConfluence, + fetchConfluencePage, +}; diff --git a/collector/utils/extensions/GithubRepo/RepoLoader/index.js b/collector/utils/extensions/GithubRepo/RepoLoader/index.js index c842f621b..af8a1dfc3 100644 --- a/collector/utils/extensions/GithubRepo/RepoLoader/index.js +++ b/collector/utils/extensions/GithubRepo/RepoLoader/index.js @@ -150,6 +150,36 @@ class RepoLoader { this.branches = [...new Set(branches.flat())]; return this.#branchPrefSort(this.branches); } + + async fetchSingleFile(sourceFilePath) { + try { + return fetch( + `https://api.github.com/repos/${this.author}/${this.project}/contents/${sourceFilePath}?ref=${this.branch}`, + { + method: "GET", + headers: { + Accept: "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + ...(!!this.accessToken + ? 
{ Authorization: `Bearer ${this.accessToken}` } + : {}), + }, + } + ) + .then((res) => { + if (res.ok) return res.json(); + throw new Error(`Failed to fetch from Github API: ${res.statusText}`); + }) + .then((json) => { + if (json.hasOwnProperty("status") || !json.hasOwnProperty("content")) + throw new Error(json?.message || "missing content"); + return atob(json.content); + }); + } catch (e) { + console.error(`RepoLoader.fetchSingleFile`, e); + return null; + } + } } module.exports = RepoLoader; diff --git a/collector/utils/extensions/GithubRepo/index.js b/collector/utils/extensions/GithubRepo/index.js index a87445dad..83b4438c8 100644 --- a/collector/utils/extensions/GithubRepo/index.js +++ b/collector/utils/extensions/GithubRepo/index.js @@ -6,7 +6,13 @@ const { v4 } = require("uuid"); const { writeToServerDocuments, documentsFolder } = require("../../files"); const { tokenizeString } = require("../../tokenizer"); -async function loadGithubRepo(args) { +/** + * Load in a Github Repo recursively or just the top level if no PAT is provided + * @param {object} args - forwarded request body params + * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker + * @returns + */ +async function loadGithubRepo(args, response) { const repo = new RepoLoader(args); await repo.init(); @@ -43,7 +49,11 @@ async function loadGithubRepo(args) { docAuthor: repo.author, description: "No description found.", docSource: doc.metadata.source, - chunkSource: `link://${doc.metadata.repository}/blob/${doc.metadata.branch}/${doc.metadata.source}`, + chunkSource: generateChunkSource( + repo, + doc, + response.locals.encryptionWorker + ), published: new Date().toLocaleString(), wordCount: doc.pageContent.split(" ").length, pageContent: doc.pageContent, @@ -72,4 +82,69 @@ async function loadGithubRepo(args) { }; } -module.exports = loadGithubRepo; +/** + * Gets the page content from a specific source file in a give 
/**
 * Generate the full chunkSource for a specific file so that we can resync it later.
 * This data is encrypted into a single `payload` query param so we can replay credentials later
 * since this was encrypted with the systems persistent password and salt.
 *
 * The `pat` key is only present when the repo was loaded with an access token.
 * Storing `pat: null` would be expanded back into the query string as the text
 * "null" (URLSearchParams stringifies values), which reads as a truthy token on resync.
 * @param {RepoLoader} repo
 * @param {import("@langchain/core/documents").Document} doc
 * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
 * @returns {string}
 */
function generateChunkSource(repo, doc, encryptionWorker) {
  const payload = {
    owner: repo.author,
    project: repo.project,
    branch: repo.branch,
    path: doc.metadata.source,
    ...(repo.accessToken ? { pat: repo.accessToken } : {}),
  };
  return `github://${repo.repo}?payload=${encryptionWorker.encrypt(
    JSON.stringify(payload)
  )}`;
}
/**
 * Strips characters that are not allowed in file names (`< > : " / \ | ? *`).
 * Falsy input (empty string, null, undefined) is handed back unchanged.
 * @param {string} fileName
 * @returns {string} the file name without the reserved characters
 */
function sanitizeFileName(fileName) {
  const reservedCharacters = /[<>:"\/\\|?*]/g;
  return fileName ? fileName.replace(reservedCharacters, "") : fileName;
}
SERVER_PORT=3001 STORAGE_DIR="/app/server/storage" UID='1000' GID='1000' +# SIG_KEY='passphrase' # Please generate random string at least 32 chars long. +# SIG_SALT='salt' # Please generate random string at least 32 chars long. # JWT_SECRET="my-random-string-for-seeding" # Only needed if AUTH_TOKEN is set. Please generate random string at least 12 chars long. ########################################### @@ -134,6 +136,12 @@ GID='1000' # LITE_LLM_BASE_PATH='http://127.0.0.1:4000' # LITE_LLM_API_KEY='sk-123abc' +# EMBEDDING_ENGINE='generic-openai' +# EMBEDDING_MODEL_PREF='text-embedding-ada-002' +# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192 +# EMBEDDING_BASE_PATH='http://127.0.0.1:4000' +# GENERIC_OPEN_AI_EMBEDDING_API_KEY='sk-123abc' + ########################################### ######## Vector Database Selection ######## ########################################### @@ -245,3 +253,6 @@ GID='1000' #------ Serply.io ----------- https://serply.io/ # AGENT_SERPLY_API_KEY= + +#------ SearXNG ----------- https://github.com/searxng/searxng +# AGENT_SEARXNG_API_URL= \ No newline at end of file diff --git a/docker/HOW_TO_USE_DOCKER.md b/docker/HOW_TO_USE_DOCKER.md index f570dce90..1e95bd8a0 100644 --- a/docker/HOW_TO_USE_DOCKER.md +++ b/docker/HOW_TO_USE_DOCKER.md @@ -87,46 +87,51 @@ mintplexlabs/anythingllm; - Docker Compose - - version: '3.8' - services: - anythingllm: - image: mintplexlabs/anythingllm - container_name: anythingllm - ports: - - "3001:3001" - cap_add: - - SYS_ADMIN - environment: - # Adjust for your environemnt - - STORAGE_DIR=/app/server/storage - - JWT_SECRET="make this a large list of random numbers and letters 20+" - - LLM_PROVIDER=ollama - - OLLAMA_BASE_PATH=http://127.0.0.1:11434 - - OLLAMA_MODEL_PREF=llama2 - - OLLAMA_MODEL_TOKEN_LIMIT=4096 - - EMBEDDING_ENGINE=ollama - - EMBEDDING_BASE_PATH=http://127.0.0.1:11434 - - EMBEDDING_MODEL_PREF=nomic-embed-text:latest - - EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192 - - VECTOR_DB=lancedb - - WHISPER_PROVIDER=local - - 
TTS_PROVIDER=native - - PASSWORDMINCHAR=8 - - AGENT_SERPER_DEV_KEY="SERPER DEV API KEY" - - AGENT_SERPLY_API_KEY="Serply.io API KEY" - volumes: - - anythingllm_storage:/app/server/storage - restart: always + Docker Compose + + + +```yaml +version: '3.8' +services: + anythingllm: + image: mintplexlabs/anythingllm + container_name: anythingllm + ports: + - "3001:3001" + cap_add: + - SYS_ADMIN + environment: + # Adjust for your environment + - STORAGE_DIR=/app/server/storage + - JWT_SECRET="make this a large list of random numbers and letters 20+" + - LLM_PROVIDER=ollama + - OLLAMA_BASE_PATH=http://127.0.0.1:11434 + - OLLAMA_MODEL_PREF=llama2 + - OLLAMA_MODEL_TOKEN_LIMIT=4096 + - EMBEDDING_ENGINE=ollama + - EMBEDDING_BASE_PATH=http://127.0.0.1:11434 + - EMBEDDING_MODEL_PREF=nomic-embed-text:latest + - EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192 + - VECTOR_DB=lancedb + - WHISPER_PROVIDER=local + - TTS_PROVIDER=native + - PASSWORDMINCHAR=8 + - AGENT_SERPER_DEV_KEY="SERPER DEV API KEY" + - AGENT_SERPLY_API_KEY="Serply.io API KEY" + volumes: + - anythingllm_storage:/app/server/storage + restart: always + +volumes: + anythingllm_storage: + driver: local + driver_opts: + type: none + o: bind + device: /path/on/local/disk +``` - volumes: - anythingllm_storage: - driver: local - driver_opts: - type: none - o: bind - device: /path/on/local/disk diff --git a/embed/package.json b/embed/package.json index 712af8e6c..111b68f8b 100644 --- a/embed/package.json +++ b/embed/package.json @@ -6,9 +6,12 @@ "scripts": { "dev": "nodemon -e js,jsx,css --watch src --exec \"yarn run dev:preview\"", "dev:preview": "yarn run dev:build && yarn serve . 
-p 3080 --no-clipboard", - "dev:build": "vite build && cat src/static/tailwind@3.4.1.js >> dist/anythingllm-chat-widget.js", - "build": "vite build && cat src/static/tailwind@3.4.1.js >> dist/anythingllm-chat-widget.js && npx terser --compress -o dist/anythingllm-chat-widget.min.js -- dist/anythingllm-chat-widget.js", - "build:publish": "yarn build && mkdir -p ../frontend/public/embed && cp -r dist/anythingllm-chat-widget.min.js ../frontend/public/embed/anythingllm-chat-widget.min.js", + "dev:build": "vite build && yarn styles", + "styles": "npx cleancss -o dist/anythingllm-chat-widget.min.css dist/style.css", + "build": "vite build && yarn styles && npx terser --compress -o dist/anythingllm-chat-widget.min.js -- dist/anythingllm-chat-widget.js", + "build:publish": "yarn build:publish:js && yarn build:publish:css", + "build:publish:js": "yarn build && mkdir -p ../frontend/public/embed && cp -r dist/anythingllm-chat-widget.min.js ../frontend/public/embed/anythingllm-chat-widget.min.js", + "build:publish:css": "cp -r dist/anythingllm-chat-widget.min.css ../frontend/public/embed/anythingllm-chat-widget.min.css", "lint": "yarn prettier --ignore-path ../.prettierignore --write ./src" }, "dependencies": { @@ -29,16 +32,20 @@ "@types/react-dom": "^18.2.15", "@vitejs/plugin-react": "^4.2.0", "autoprefixer": "^10.4.14", + "clean-css": "^5.3.3", + "clean-css-cli": "^5.6.3", "eslint": "^8.53.0", "eslint-plugin-react": "^7.33.2", "eslint-plugin-react-hooks": "^4.6.0", "eslint-plugin-react-refresh": "^0.4.4", "globals": "^13.21.0", "nodemon": "^2.0.22", + "postcss": "^8.4.23", "prettier": "^3.0.3", "serve": "^14.2.1", + "tailwindcss": "3.4.1", "terser": "^5.27.0", "vite": "^5.0.0", "vite-plugin-singlefile": "^0.13.5" } -} +} \ No newline at end of file diff --git a/embed/postcss.config.js b/embed/postcss.config.js new file mode 100644 index 000000000..568a99e36 --- /dev/null +++ b/embed/postcss.config.js @@ -0,0 +1,10 @@ +import tailwind from 'tailwindcss' +import autoprefixer 
from 'autoprefixer' +import tailwindConfig from './tailwind.config.js' + +export default { + plugins: [ + tailwind(tailwindConfig), + autoprefixer, + ], +} \ No newline at end of file diff --git a/embed/src/App.jsx b/embed/src/App.jsx index 44653031d..50e3f1f18 100644 --- a/embed/src/App.jsx +++ b/embed/src/App.jsx @@ -20,29 +20,29 @@ export default function App() { if (!embedSettings.loaded) return null; const positionClasses = { - "bottom-left": "bottom-0 left-0 ml-4", - "bottom-right": "bottom-0 right-0 mr-4", - "top-left": "top-0 left-0 ml-4 mt-4", - "top-right": "top-0 right-0 mr-4 mt-4", + "bottom-left": "allm-bottom-0 allm-left-0 allm-ml-4", + "bottom-right": "allm-bottom-0 allm-right-0 allm-mr-4", + "top-left": "allm-top-0 allm-left-0 allm-ml-4 allm-mt-4", + "top-right": "allm-top-0 allm-right-0 allm-mr-4 allm-mt-4", }; const position = embedSettings.position || "bottom-right"; - const windowWidth = embedSettings.windowWidth - ? `max-w-[${embedSettings.windowWidth}]` - : "max-w-[400px]"; - const windowHeight = embedSettings.windowHeight - ? `max-h-[${embedSettings.windowHeight}]` - : "max-h-[700px]"; + const windowWidth = embedSettings.windowWidth ?? "400px"; + const windowHeight = embedSettings.windowHeight ?? "700px"; return ( <>

{isChatOpen && ( @@ -57,7 +57,7 @@ export default function App() { {!isChatOpen && (
{ return ( -
+
{/* Other actions to go here later. */}
@@ -16,17 +16,17 @@ function CopyMessage({ message }) { const { copied, copyText } = useCopyText(); return ( <> -
+
@@ -34,7 +34,7 @@ function CopyMessage({ message }) { id="copy-assistant-text" place="bottom" delayShow={300} - className="tooltip !text-xs" + className="allm-tooltip !allm-text-xs" /> ); diff --git a/embed/src/components/ChatWindow/ChatContainer/ChatHistory/HistoricalMessage/index.jsx b/embed/src/components/ChatWindow/ChatContainer/ChatHistory/HistoricalMessage/index.jsx index 2eab8cca2..d4bc867e9 100644 --- a/embed/src/components/ChatWindow/ChatContainer/ChatHistory/HistoricalMessage/index.jsx +++ b/embed/src/components/ChatWindow/ChatContainer/ChatHistory/HistoricalMessage/index.jsx @@ -14,14 +14,14 @@ const HistoricalMessage = forwardRef( ref ) => { const textSize = !!embedderSettings.settings.textSize - ? `text-[${embedderSettings.settings.textSize}px]` - : "text-sm"; + ? `allm-text-[${embedderSettings.settings.textSize}px]` + : "allm-text-sm"; return (
{role === "assistant" && (
{embedderSettings.settings.assistantName || "Anything LLM Chat Assistant"} @@ -30,42 +30,48 @@ const HistoricalMessage = forwardRef(
{role === "assistant" && ( Anything LLM Icon )}
-
+
{error ? ( -
- - Could not - respond to message. +
+ + {" "} + Could not respond to message. -

+

{error}

) : ( {formatDate(sentAt)}
diff --git a/embed/src/components/ChatWindow/ChatContainer/ChatHistory/PromptReply/index.jsx b/embed/src/components/ChatWindow/ChatContainer/ChatHistory/PromptReply/index.jsx index 877012226..adc442851 100644 --- a/embed/src/components/ChatWindow/ChatContainer/ChatHistory/PromptReply/index.jsx +++ b/embed/src/components/ChatWindow/ChatContainer/ChatHistory/PromptReply/index.jsx @@ -11,18 +11,23 @@ const PromptReply = forwardRef( if (pending) { return ( -
+
Anything LLM Icon
-
-
+
+
@@ -31,23 +36,27 @@ const PromptReply = forwardRef( if (error) { return ( -
+
Anything LLM Icon
-
+
- Could not - respond to message. - Reason: {error || "unknown"} + {" "} + Could not respond to message. + + Reason: {error || "unknown"} +
@@ -56,9 +65,9 @@ const PromptReply = forwardRef( } return ( -
+
{embedderSettings.settings.assistantName || "Anything LLM Chat Assistant"} @@ -66,29 +75,32 @@ const PromptReply = forwardRef(
Anything LLM Icon
-
+
{formatDate(Date.now() / 1000)}
diff --git a/embed/src/components/ChatWindow/ChatContainer/ChatHistory/index.jsx b/embed/src/components/ChatWindow/ChatContainer/ChatHistory/index.jsx index 0719043ff..70bf2ca20 100644 --- a/embed/src/components/ChatWindow/ChatContainer/ChatHistory/index.jsx +++ b/embed/src/components/ChatWindow/ChatContainer/ChatHistory/index.jsx @@ -46,9 +46,9 @@ export default function ChatHistory({ settings = {}, history = [] }) { if (history.length === 0) { return ( -
-
-

+

+
+

{settings?.greeting ?? "Send a chat to get started."}

@@ -58,7 +58,7 @@ export default function ChatHistory({ settings = {}, history = [] }) { return (
@@ -97,12 +97,12 @@ export default function ChatHistory({ settings = {}, history = [] }) { ); })} {!isAtBottom && ( -
-
-
+
+
+
-
-
- +
+
+
+
diff --git a/embed/src/components/ChatWindow/ChatContainer/PromptInput/index.jsx b/embed/src/components/ChatWindow/ChatContainer/PromptInput/index.jsx index 8a8b58cf2..14961ca65 100644 --- a/embed/src/components/ChatWindow/ChatContainer/PromptInput/index.jsx +++ b/embed/src/components/ChatWindow/ChatContainer/PromptInput/index.jsx @@ -46,14 +46,17 @@ export default function PromptInput({ }; return ( -
+
-
-
-
+
+
+