diff --git a/.vscode/settings.json b/.vscode/settings.json
index 72b612b8..a87e960b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -2,6 +2,7 @@
   "cSpell.words": [
     "anythingllm",
     "Astra",
+    "comkey",
     "Dockerized",
     "Embeddable",
     "GROQ",
@@ -20,4 +21,4 @@
   ],
   "eslint.experimental.useFlatConfig": true,
   "docker.languageserver.formatter.ignoreMultilineInstructions": true
-}
+}
\ No newline at end of file
diff --git a/collector/extensions/index.js b/collector/extensions/index.js
index bcf2229f..0e91d173 100644
--- a/collector/extensions/index.js
+++ b/collector/extensions/index.js
@@ -1,9 +1,10 @@
+const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
 const { reqBody } = require("../utils/http");

 function extensions(app) {
   if (!app) return;

-  app.post("/ext/github-repo", async function (request, response) {
+  app.post("/ext/github-repo", [verifyPayloadIntegrity], async function (request, response) {
     try {
       const loadGithubRepo = require("../utils/extensions/GithubRepo");
       const { success, reason, data } = await loadGithubRepo(reqBody(request));
@@ -24,7 +25,7 @@ function extensions(app) {
   });

   // gets all branches for a specific repo
-  app.post("/ext/github-repo/branches", async function (request, response) {
+  app.post("/ext/github-repo/branches", [verifyPayloadIntegrity], async function (request, response) {
     try {
       const GithubRepoLoader = require("../utils/extensions/GithubRepo/RepoLoader");
       const allBranches = await (new GithubRepoLoader(reqBody(request))).getRepoBranches()
@@ -48,7 +49,7 @@ function extensions(app) {
     return;
   });

-  app.post("/ext/youtube-transcript", async function (request, response) {
+  app.post("/ext/youtube-transcript", [verifyPayloadIntegrity], async function (request, response) {
     try {
       const loadYouTubeTranscript = require("../utils/extensions/YoutubeTranscript");
       const { success, reason, data } = await loadYouTubeTranscript(reqBody(request));
diff --git a/collector/index.js b/collector/index.js
index a1142d75..f574730c 100644
--- a/collector/index.js
+++ b/collector/index.js
@@ -13,6 +13,7 @@ const { processLink } = require("./processLink");
 const { wipeCollectorStorage } = require("./utils/files");
 const extensions = require("./extensions");
 const { processRawText } = require("./processRawText");
+const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
 const app = express();

 app.use(cors({ origin: true }));
@@ -24,71 +25,83 @@ app.use(
   })
 );

-app.post("/process", async function (request, response) {
-  const { filename, options = {} } = reqBody(request);
-  try {
-    const targetFilename = path
-      .normalize(filename)
-      .replace(/^(\.\.(\/|\\|$))+/, "");
-    const {
-      success,
-      reason,
-      documents = [],
-    } = await processSingleFile(targetFilename, options);
-    response
-      .status(200)
-      .json({ filename: targetFilename, success, reason, documents });
-  } catch (e) {
-    console.error(e);
-    response.status(200).json({
-      filename: filename,
-      success: false,
-      reason: "A processing error occurred.",
-      documents: [],
-    });
+app.post(
+  "/process",
+  [verifyPayloadIntegrity],
+  async function (request, response) {
+    const { filename, options = {} } = reqBody(request);
+    try {
+      const targetFilename = path
+        .normalize(filename)
+        .replace(/^(\.\.(\/|\\|$))+/, "");
+      const {
+        success,
+        reason,
+        documents = [],
+      } = await processSingleFile(targetFilename, options);
+      response
+        .status(200)
+        .json({ filename: targetFilename, success, reason, documents });
+    } catch (e) {
+      console.error(e);
+      response.status(200).json({
+        filename: filename,
+        success: false,
+        reason: "A processing error occurred.",
+        documents: [],
+      });
+    }
+    return;
   }
-  return;
-});
+);

-app.post("/process-link", async function (request, response) {
-  const { link } = reqBody(request);
-  try {
-    const { success, reason, documents = [] } = await processLink(link);
-    response.status(200).json({ url: link, success, reason, documents });
-  } catch (e) {
-    console.error(e);
-    response.status(200).json({
-      url: link,
-      success: false,
-      reason: "A processing error occurred.",
-      documents: [],
-    });
+app.post(
+  "/process-link",
+  [verifyPayloadIntegrity],
+  async function (request, response) {
+    const { link } = reqBody(request);
+    try {
+      const { success, reason, documents = [] } = await processLink(link);
+      response.status(200).json({ url: link, success, reason, documents });
+    } catch (e) {
+      console.error(e);
+      response.status(200).json({
+        url: link,
+        success: false,
+        reason: "A processing error occurred.",
+        documents: [],
+      });
+    }
+    return;
   }
-  return;
-});
+);

-app.post("/process-raw-text", async function (request, response) {
-  const { textContent, metadata } = reqBody(request);
-  try {
-    const {
-      success,
-      reason,
-      documents = [],
-    } = await processRawText(textContent, metadata);
-    response
-      .status(200)
-      .json({ filename: metadata.title, success, reason, documents });
-  } catch (e) {
-    console.error(e);
-    response.status(200).json({
-      filename: metadata?.title || "Unknown-doc.txt",
-      success: false,
-      reason: "A processing error occurred.",
-      documents: [],
-    });
+app.post(
+  "/process-raw-text",
+  [verifyPayloadIntegrity],
+  async function (request, response) {
+    const { textContent, metadata } = reqBody(request);
+    try {
+      const {
+        success,
+        reason,
+        documents = [],
+      } = await processRawText(textContent, metadata);
+      response
+        .status(200)
+        .json({ filename: metadata.title, success, reason, documents });
+    } catch (e) {
+      console.error(e);
+      response.status(200).json({
+        filename: metadata?.title || "Unknown-doc.txt",
+        success: false,
+        reason: "A processing error occurred.",
+        documents: [],
+      });
+    }
+    return;
   }
-  return;
-});
+);

 extensions(app);
diff --git a/collector/middleware/verifyIntegrity.js b/collector/middleware/verifyIntegrity.js
new file mode 100644
index 00000000..0dcb3f75
--- /dev/null
+++ b/collector/middleware/verifyIntegrity.js
@@ -0,0 +1,21 @@
+const { CommunicationKey } = require("../utils/comKey");
+
+function verifyPayloadIntegrity(request, response, next) {
+  const comKey = new CommunicationKey();
+  if (process.env.NODE_ENV === "development") {
+    comKey.log('verifyPayloadIntegrity is skipped in development.')
+    next();
+    return;
+  }
+
+  const signature = request.header("X-Integrity");
+  if (!signature) return response.status(400).json({ msg: 'Failed integrity signature check.' })
+
+  const validSignedPayload = comKey.verify(signature, request.body);
+  if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' })
+  next();
+}
+
+module.exports = {
+  verifyPayloadIntegrity
+}
\ No newline at end of file
diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js
index 5d9e6a38..bdefb79e 100644
--- a/collector/processSingleFile/index.js
+++ b/collector/processSingleFile/index.js
@@ -4,11 +4,26 @@ const {
   WATCH_DIRECTORY,
   SUPPORTED_FILETYPE_CONVERTERS,
 } = require("../utils/constants");
-const { trashFile, isTextType } = require("../utils/files");
+const {
+  trashFile,
+  isTextType,
+  normalizePath,
+  isWithin,
+} = require("../utils/files");
 const RESERVED_FILES = ["__HOTDIR__.md"];

 async function processSingleFile(targetFilename, options = {}) {
-  const fullFilePath = path.resolve(WATCH_DIRECTORY, targetFilename);
+  const fullFilePath = path.resolve(
+    WATCH_DIRECTORY,
+    normalizePath(targetFilename)
+  );
+  if (!isWithin(path.resolve(WATCH_DIRECTORY), fullFilePath))
+    return {
+      success: false,
+      reason: "Filename is not a valid path to process.",
+      documents: [],
+    };
+
   if (RESERVED_FILES.includes(targetFilename))
     return {
       success: false,
diff --git a/collector/utils/comKey/index.js b/collector/utils/comKey/index.js
new file mode 100644
index 00000000..0e96a697
--- /dev/null
+++ b/collector/utils/comKey/index.js
@@ -0,0 +1,42 @@
+const crypto = require("crypto");
+const fs = require("fs");
+const path = require("path");
+
+const keyPath =
+  process.env.NODE_ENV === "development"
+    ? path.resolve(__dirname, `../../../server/storage/comkey`)
+    : path.resolve(process.env.STORAGE_DIR, `comkey`);
+
+class CommunicationKey {
+  #pubKeyName = "ipc-pub.pem";
+  #storageLoc = keyPath;
+
+  constructor() {}
+
+  log(text, ...args) {
+    console.log(`\x1b[36m[CommunicationKeyVerify]\x1b[0m ${text}`, ...args);
+  }
+
+  #readPublicKey() {
+    return fs.readFileSync(path.resolve(this.#storageLoc, this.#pubKeyName));
+  }
+
+  // Given a payload signed with the private key from /app/server/, this signature should
+  // verify against the textData provided. This class only does verification in the collector.
+  // Note: The textData is typically the JSON-stringified body sent to the document processor API.
+  verify(signature = "", textData = "") {
+    try {
+      let data = textData;
+      if (typeof textData !== "string") data = JSON.stringify(data);
+      return crypto.verify(
+        "RSA-SHA256",
+        Buffer.from(data),
+        this.#readPublicKey(),
+        Buffer.from(signature, "hex")
+      );
+    } catch {}
+    return false;
+  }
+}
+
+module.exports = { CommunicationKey };
diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js
index 4bca62f9..9b56bb5b 100644
--- a/collector/utils/files/index.js
+++ b/collector/utils/files/index.js
@@ -108,10 +108,33 @@ async function wipeCollectorStorage() {
   return;
 }

+/**
+ * Checks if a given path is within another path.
+ * @param {string} outer - The outer path (should be resolved).
+ * @param {string} inner - The inner path (should be resolved).
+ * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
+ */
+function isWithin(outer, inner) {
+  if (outer === inner) return false;
+  const rel = path.relative(outer, inner);
+  return !rel.startsWith("../") && rel !== "..";
+}
+
+function normalizePath(filepath = "") {
+  const result = path
+    .normalize(filepath.trim())
+    .replace(/^(\.\.(\/|\\|$))+/, "")
+    .trim();
+  if (["..", ".", "/"].includes(result)) throw new Error("Invalid path.");
+  return result;
+}
+
 module.exports = {
   trashFile,
   isTextType,
   createdDate,
   writeToServerDocuments,
   wipeCollectorStorage,
+  normalizePath,
+  isWithin,
 };
diff --git a/server/.gitignore b/server/.gitignore
index f9d92ce1..b22a054f 100644
--- a/server/.gitignore
+++ b/server/.gitignore
@@ -3,6 +3,7 @@
 storage/assets/*
 !storage/assets/anything-llm.png
 storage/documents/*
+storage/comkey/*
 storage/tmp/*
 storage/vector-cache/*.json
 storage/exports
diff --git a/server/utils/boot/index.js b/server/utils/boot/index.js
index 7134198e..ea95e1f5 100644
--- a/server/utils/boot/index.js
+++ b/server/utils/boot/index.js
@@ -1,4 +1,5 @@
 const { Telemetry } = require("../../models/telemetry");
+const { CommunicationKey } = require("../comKey");
 const setupTelemetry = require("../telemetry");

 function bootSSL(app, port = 3001) {
@@ -16,6 +17,7 @@ function bootSSL(app, port = 3001) {
       .createServer(credentials, app)
       .listen(port, async () => {
         await setupTelemetry();
+        new CommunicationKey(true);
         console.log(`Primary server in HTTPS mode listening on port ${port}`);
       })
       .on("error", catchSigTerms);
@@ -40,6 +42,7 @@ function bootHTTP(app, port = 3001) {
   app
     .listen(port, async () => {
       await setupTelemetry();
+      new CommunicationKey(true);
       console.log(`Primary server in HTTP mode listening on port ${port}`);
     })
     .on("error", catchSigTerms);
diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js
index ed27a928..d96cd1e6 100644
--- a/server/utils/collectorApi/index.js
+++ b/server/utils/collectorApi/index.js
@@ -5,6 +5,8 @@
 class CollectorApi {
   constructor() {
+    const { CommunicationKey } = require("../comKey");
+    this.comkey = new CommunicationKey();
     this.endpoint = `http://0.0.0.0:${process.env.COLLECTOR_PORT || 8888}`;
   }

@@ -40,15 +42,19 @@ class CollectorApi {
   async processDocument(filename = "") {
     if (!filename) return false;
+
+    const data = JSON.stringify({
+      filename,
+      options: this.#attachOptions(),
+    });
+
     return await fetch(`${this.endpoint}/process`, {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
+        "X-Integrity": this.comkey.sign(data),
       },
-      body: JSON.stringify({
-        filename,
-        options: this.#attachOptions(),
-      }),
+      body: data,
     })
       .then((res) => {
Error("Response could not be completed"); @@ -110,6 +120,7 @@ class CollectorApi { body, // Stringified JSON! headers: { "Content-Type": "application/json", + "X-Integrity": this.comkey.sign(body), }, }) .then((res) => { diff --git a/server/utils/comKey/index.js b/server/utils/comKey/index.js new file mode 100644 index 00000000..e545a6be --- /dev/null +++ b/server/utils/comKey/index.js @@ -0,0 +1,75 @@ +const crypto = require("crypto"); +const fs = require("fs"); +const path = require("path"); +const keyPath = + process.env.NODE_ENV === "development" + ? path.resolve(__dirname, `../../storage/comkey`) + : path.resolve(process.env.STORAGE_DIR, `comkey`); + +// What does this class do? +// This class generates a hashed version of some text (typically a JSON payload) using a rolling RSA key +// that can then be appended as a header value to do integrity checking on a payload. Given the +// nature of this class and that keys are rolled constantly, this protects the request +// integrity of requests sent to the collector as only the server can sign these requests. +// This keeps accidental misconfigurations of AnythingLLM that leaving port 8888 open from +// being abused or SSRF'd by users scraping malicious sites who have a loopback embedded in a