From b35feede879c543e2d6cb58c89f973b29073ecc0 Mon Sep 17 00:00:00 2001 From: Timothy Carambat Date: Tue, 16 Jan 2024 16:04:22 -0800 Subject: [PATCH] 570 document api return object (#608) * Add support for fetching single document in documents folder * Add document object to upload + support link scraping via API * hotfixes for documentation * update api docs --- collector/index.js | 16 ++- collector/processLink/convert/generic.js | 13 +- .../processSingleFile/convert/asAudio.js | 14 +- collector/processSingleFile/convert/asDocx.js | 13 +- collector/processSingleFile/convert/asMbox.js | 15 ++- .../processSingleFile/convert/asOfficeMime.js | 13 +- collector/processSingleFile/convert/asPDF.js | 13 +- collector/processSingleFile/convert/asTxt.js | 13 +- collector/processSingleFile/index.js | 4 + collector/utils/files/index.js | 19 ++- server/endpoints/api/document/index.js | 121 +++++++++++++++++- server/endpoints/api/workspace/index.js | 4 +- server/swagger/openapi.json | 105 ++++++++++++++- server/utils/files/documentProcessor.js | 4 +- 14 files changed, 324 insertions(+), 43 deletions(-) diff --git a/collector/index.js b/collector/index.js index 5070ae72..062d7895 100644 --- a/collector/index.js +++ b/collector/index.js @@ -29,14 +29,21 @@ app.post("/process", async function (request, response) { const targetFilename = path .normalize(filename) .replace(/^(\.\.(\/|\\|$))+/, ""); - const { success, reason } = await processSingleFile(targetFilename); - response.status(200).json({ filename: targetFilename, success, reason }); + const { + success, + reason, + documents = [], + } = await processSingleFile(targetFilename); + response + .status(200) + .json({ filename: targetFilename, success, reason, documents }); } catch (e) { console.error(e); response.status(200).json({ filename: filename, success: false, reason: "A processing error occurred.", + documents: [], }); } return; @@ -45,14 +52,15 @@ app.post("/process", async function (request, response) { 
app.post("/process-link", async function (request, response) { const { link } = reqBody(request); try { - const { success, reason } = await processLink(link); - response.status(200).json({ url: link, success, reason }); + const { success, reason, documents = [] } = await processLink(link); + response.status(200).json({ url: link, success, reason, documents }); } catch (e) { console.error(e); response.status(200).json({ url: link, success: false, reason: "A processing error occurred.", + documents: [], }); } return; diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index f42dcd17..c6431d73 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -12,7 +12,11 @@ async function scrapeGenericUrl(link) { if (!content.length) { console.error(`Resulting URL content was empty at ${link}.`); - return { success: false, reason: `No URL content found at ${link}.` }; + return { + success: false, + reason: `No URL content found at ${link}.`, + documents: [], + }; } const url = new URL(link); @@ -32,9 +36,12 @@ async function scrapeGenericUrl(link) { token_count_estimate: tokenizeString(content).length, }; - writeToServerDocuments(data, `url-${slugify(filename)}-${data.id}`); + const document = writeToServerDocuments( + data, + `url-${slugify(filename)}-${data.id}` + ); console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`); - return { success: true, reason: null }; + return { success: true, reason: null, documents: [document] }; } async function getPageContent(link) { diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index a15207fb..7688d7b8 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -31,6 +31,7 @@ async function asAudio({ fullFilePath = "", filename = "" }) { return { success: false, reason: `Failed to parse content from ${filename}.`, + 
documents: [], }; } @@ -43,7 +44,11 @@ async function asAudio({ fullFilePath = "", filename = "" }) { if (!content.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); - return { success: false, reason: `No text content found in ${filename}.` }; + return { + success: false, + reason: `No text content found in ${filename}.`, + documents: [], + }; } const data = { @@ -60,12 +65,15 @@ async function asAudio({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; - writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); + const document = writeToServerDocuments( + data, + `${slugify(filename)}-${data.id}` + ); trashFile(fullFilePath); console.log( `[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n` ); - return { success: true, reason: null }; + return { success: true, reason: null, documents: [document] }; } async function convertToWavAudioData(sourcePath) { diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index 7a64a042..b4fe7d2c 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -24,7 +24,11 @@ async function asDocX({ fullFilePath = "", filename = "" }) { if (!pageContent.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); - return { success: false, reason: `No text content found in ${filename}.` }; + return { + success: false, + reason: `No text content found in ${filename}.`, + documents: [], + }; } const content = pageContent.join(""); @@ -42,10 +46,13 @@ async function asDocX({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; - writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); + const document = writeToServerDocuments( + data, + `${slugify(filename)}-${data.id}` + ); trashFile(fullFilePath); console.log(`[SUCCESS]: 
${filename} converted & ready for embedding.\n`); - return { success: true, reason: null }; + return { success: true, reason: null, documents: [document] }; } module.exports = asDocX; diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index 30883f21..f62f6b2b 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -22,10 +22,15 @@ async function asMbox({ fullFilePath = "", filename = "" }) { if (!mails.length) { console.error(`Resulting mail items was empty for ${filename}.`); trashFile(fullFilePath); - return { success: false, reason: `No mail items found in ${filename}.` }; + return { + success: false, + reason: `No mail items found in ${filename}.`, + documents: [], + }; } let item = 1; + const documents = []; for (const mail of mails) { if (!mail.hasOwnProperty("text")) continue; @@ -52,14 +57,18 @@ async function asMbox({ fullFilePath = "", filename = "" }) { }; item++; - writeToServerDocuments(data, `${slugify(filename)}-${data.id}-msg-${item}`); + const document = writeToServerDocuments( + data, + `${slugify(filename)}-${data.id}-msg-${item}` + ); + documents.push(document); } trashFile(fullFilePath); console.log( `[SUCCESS]: ${filename} messages converted & ready for embedding.\n` ); - return { success: true, reason: null }; + return { success: true, reason: null, documents }; } module.exports = asMbox; diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index a6eb0351..45b31661 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -20,7 +20,11 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) { if (!content.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); - return { success: false, reason: `No text content found in ${filename}.` }; + 
return { + success: false, + reason: `No text content found in ${filename}.`, + documents: [], + }; } const data = { @@ -37,10 +41,13 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; - writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); + const document = writeToServerDocuments( + data, + `${slugify(filename)}-${data.id}` + ); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); - return { success: true, reason: null }; + return { success: true, reason: null, documents: [document] }; } module.exports = asOfficeMime; diff --git a/collector/processSingleFile/convert/asPDF.js b/collector/processSingleFile/convert/asPDF.js index f6d869d5..b89b9741 100644 --- a/collector/processSingleFile/convert/asPDF.js +++ b/collector/processSingleFile/convert/asPDF.js @@ -29,7 +29,11 @@ async function asPDF({ fullFilePath = "", filename = "" }) { if (!pageContent.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); - return { success: false, reason: `No text content found in ${filename}.` }; + return { + success: false, + reason: `No text content found in ${filename}.`, + documents: [], + }; } const content = pageContent.join(""); @@ -47,10 +51,13 @@ async function asPDF({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; - writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); + const document = writeToServerDocuments( + data, + `${slugify(filename)}-${data.id}` + ); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); - return { success: true, reason: null }; + return { success: true, reason: null, documents: [document] }; } module.exports = asPDF; diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index ad35e547..cf7260d4 100644 --- 
a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -19,7 +19,11 @@ async function asTxt({ fullFilePath = "", filename = "" }) { if (!content?.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); - return { success: false, reason: `No text content found in ${filename}.` }; + return { + success: false, + reason: `No text content found in ${filename}.`, + documents: [], + }; } console.log(`-- Working ${filename} --`); @@ -37,10 +41,13 @@ async function asTxt({ fullFilePath = "", filename = "" }) { token_count_estimate: tokenizeString(content).length, }; - writeToServerDocuments(data, `${slugify(filename)}-${data.id}`); + const document = writeToServerDocuments( + data, + `${slugify(filename)}-${data.id}` + ); trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); - return { success: true, reason: null }; + return { success: true, reason: null, documents: [document] }; } module.exports = asTxt; diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index 37c9fd5c..9efd3a70 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -13,11 +13,13 @@ async function processSingleFile(targetFilename) { return { success: false, reason: "Filename is a reserved filename and cannot be processed.", + documents: [], }; if (!fs.existsSync(fullFilePath)) return { success: false, reason: "File does not exist in upload directory.", + documents: [], }; const fileExtension = path.extname(fullFilePath).toLowerCase(); @@ -25,6 +27,7 @@ async function processSingleFile(targetFilename) { return { success: false, reason: `No file extension found. 
This file cannot be processed.`, + documents: [], }; } @@ -33,6 +36,7 @@ async function processSingleFile(targetFilename) { return { success: false, reason: `File extension ${fileExtension} not supported for parsing.`, + documents: [], }; } diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index 915c4ac1..caf33c88 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -38,14 +38,19 @@ function writeToServerDocuments( ); if (!fs.existsSync(destination)) fs.mkdirSync(destination, { recursive: true }); - const destinationFilePath = path.resolve(destination, filename); + const destinationFilePath = path.resolve(destination, filename) + ".json"; - fs.writeFileSync( - destinationFilePath + ".json", - JSON.stringify(data, null, 4), - { encoding: "utf-8" } - ); - return; + fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), { + encoding: "utf-8", + }); + + return { + ...data, + // Relative location string that can be passed into the /update-embeddings API. + // It is valid because the file was just written to this path and document folders + // are only one level deep. This also holds for integrations like GitHub and YouTube. 
+ location: destinationFilePath.split("/").slice(-2).join("/"), + }; } // When required we can wipe the entire collector hotdir and tmp storage in case diff --git a/server/endpoints/api/document/index.js b/server/endpoints/api/document/index.js index f1282e7c..81704352 100644 --- a/server/endpoints/api/document/index.js +++ b/server/endpoints/api/document/index.js @@ -5,11 +5,13 @@ const { checkProcessorAlive, acceptedFileTypes, processDocument, + processLink, } = require("../../../utils/files/documentProcessor"); const { viewLocalFiles, findDocumentInDocuments, } = require("../../../utils/files"); +const { reqBody } = require("../../../utils/http"); const { handleUploads } = setupMulter(); function apiDocumentEndpoints(app) { @@ -23,7 +25,6 @@ function apiDocumentEndpoints(app) { /* #swagger.tags = ['Documents'] #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.' - #swagger.requestBody = { description: 'File to be uploaded.', required: true, @@ -50,6 +51,21 @@ function apiDocumentEndpoints(app) { example: { success: true, error: null, + documents: [ + { + "location": "custom-documents/anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json", + "name": "anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json", + "url": "file:///Users/tim/Documents/anything-llm/collector/hotdir/anythingllm.txt", + "title": "anythingllm.txt", + "docAuthor": "Unknown", + "description": "Unknown", + "docSource": "a text file uploaded by the user.", + "chunkSource": "anythingllm.txt", + "published": "1/16/2024, 3:07:00 PM", + "wordCount": 93, + "token_count_estimate": 115, + } + ] } } } @@ -75,16 +91,113 @@ function apiDocumentEndpoints(app) { .end(); } - const { success, reason } = await processDocument(originalname); + const { success, reason, documents } = + await processDocument(originalname); if (!success) { - response.status(500).json({ success: false, error: reason }).end(); + response + .status(500) + .json({ success: false, 
error: reason, documents }) + .end(); + return; } console.log( `Document ${originalname} uploaded processed and successfully. It is now available in documents.` ); await Telemetry.sendTelemetry("document_uploaded"); - response.status(200).json({ success: true, error: null }); + response.status(200).json({ success: true, error: null, documents }); + } catch (e) { + console.log(e.message, e); + response.sendStatus(500).end(); + } + } + ); + + app.post( + "/v1/document/upload-link", + [validApiKey], + async (request, response) => { + /* + #swagger.tags = ['Documents'] + #swagger.description = 'Upload a valid URL for AnythingLLM to scrape and prepare for embedding.' + #swagger.requestBody = { + description: 'Link of web address to be scraped.', + required: true, + type: 'file', + content: { + "application/json": { + schema: { + type: 'object', + example: { + "link": "https://useanything.com" + } + } + } + } + } + #swagger.responses[200] = { + content: { + "application/json": { + schema: { + type: 'object', + example: { + success: true, + error: null, + documents: [ + { + "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc", + "url": "file://useanything_com.html", + "title": "useanything_com.html", + "docAuthor": "no author found", + "description": "No description found.", + "docSource": "URL link uploaded by the user.", + "chunkSource": "https:useanything.com.html", + "published": "1/16/2024, 3:46:33 PM", + "wordCount": 252, + "pageContent": "AnythingLLM is the best....", + "token_count_estimate": 447, + "location": "custom-documents/url-useanything_com-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json" + } + ] + } + } + } + } + } + #swagger.responses[403] = { + schema: { + "$ref": "#/definitions/InvalidAPIKey" + } + } + */ + try { + const { link } = reqBody(request); + const processingOnline = await checkProcessorAlive(); + + if (!processingOnline) { + response + .status(500) + .json({ + success: false, + error: `Document processing API is not online. 
Link ${link} will not be processed automatically.`, + }) + .end(); + } + + const { success, reason, documents } = await processLink(link); + if (!success) { + response + .status(500) + .json({ success: false, error: reason, documents }) + .end(); + return; + } + + console.log( + `Link ${link} uploaded processed and successfully. It is now available in documents.` + ); + await Telemetry.sendTelemetry("document_uploaded"); + response.status(200).json({ success: true, error: null, documents }); } catch (e) { console.log(e.message, e); response.sendStatus(500).end(); diff --git a/server/endpoints/api/workspace/index.js b/server/endpoints/api/workspace/index.js index 365e8b01..c1642ce4 100644 --- a/server/endpoints/api/workspace/index.js +++ b/server/endpoints/api/workspace/index.js @@ -381,8 +381,8 @@ function apiWorkspaceEndpoints(app) { content: { "application/json": { example: { - adds: [], - deletes: ["custom-documents/anythingllm-hash.json"] + adds: ["custom-documents/my-pdf.pdf-hash.json"], + deletes: ["custom-documents/anythingllm.txt-hash.json"] } } } diff --git a/server/swagger/openapi.json b/server/swagger/openapi.json index 7d91579f..c7532059 100644 --- a/server/swagger/openapi.json +++ b/server/swagger/openapi.json @@ -845,7 +845,22 @@ "type": "object", "example": { "success": true, - "error": null + "error": null, + "documents": [ + { + "location": "custom-documents/anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json", + "name": "anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json", + "url": "file:///Users/tim/Documents/anything-llm/collector/hotdir/anythingllm.txt", + "title": "anythingllm.txt", + "docAuthor": "Unknown", + "description": "Unknown", + "docSource": "a text file uploaded by the user.", + "chunkSource": "anythingllm.txt", + "published": "1/16/2024, 3:07:00 PM", + "wordCount": 93, + "token_count_estimate": 115 + } + ] } } } @@ -890,6 +905,88 @@ } } }, + "/v1/document/upload-link": { + "post": { + "tags": [ + "Documents" + ], + 
"description": "Upload a valid URL for AnythingLLM to scrape and prepare for embedding.", + "parameters": [ + { + "name": "Authorization", + "in": "header", + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "type": "object", + "example": { + "success": true, + "error": null, + "documents": [ + { + "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc", + "url": "file://useanything_com.html", + "title": "useanything_com.html", + "docAuthor": "no author found", + "description": "No description found.", + "docSource": "URL link uploaded by the user.", + "chunkSource": "https:useanything.com.html", + "published": "1/16/2024, 3:46:33 PM", + "wordCount": 252, + "pageContent": "AnythingLLM is the best....", + "token_count_estimate": 447, + "location": "custom-documents/url-useanything_com-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json" + } + ] + } + } + } + } + }, + "403": { + "description": "Forbidden", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + }, + "application/xml": { + "schema": { + "$ref": "#/components/schemas/InvalidAPIKey" + } + } + } + }, + "500": { + "description": "Internal Server Error" + } + }, + "requestBody": { + "description": "Link of web address to be scraped.", + "required": true, + "type": "file", + "content": { + "application/json": { + "schema": { + "type": "object", + "example": { + "link": "https://useanything.com" + } + } + } + } + } + } + }, "/v1/documents": { "get": { "tags": [ @@ -1593,9 +1690,11 @@ "content": { "application/json": { "example": { - "adds": [], + "adds": [ + "custom-documents/my-pdf.pdf-hash.json" + ], "deletes": [ - "custom-documents/anythingllm-hash.json" + "custom-documents/anythingllm.txt-hash.json" ] } } diff --git a/server/utils/files/documentProcessor.js b/server/utils/files/documentProcessor.js index 5239a870..27d0f5f2 100644 --- a/server/utils/files/documentProcessor.js 
+++ b/server/utils/files/documentProcessor.js @@ -35,7 +35,7 @@ async function processDocument(filename = "") { .then((res) => res) .catch((e) => { console.log(e.message); - return { success: false, reason: e.message }; + return { success: false, reason: e.message, documents: [] }; }); } @@ -55,7 +55,7 @@ async function processLink(link = "") { .then((res) => res) .catch((e) => { console.log(e.message); - return { success: false, reason: e.message }; + return { success: false, reason: e.message, documents: [] }; }); }