570 document api return object (#608)

* Add support for fetching a single document in the documents folder
* Add document object to upload response + support link scraping via API
* Hotfixes for documentation
* Update API docs
2024-11-05 06:20:10 +01:00 · 2024-01-16 16:04:22 -08:00 · 2024-01-16 16:04:22 -08:00 · b35feede87
commit b35feede87
parent c61cbd1502
14 changed files with 324 additions and 43 deletions
--- a/collector/index.js
+++ b/collector/index.js
@ -29,14 +29,21 @@ app.post("/process", async function (request, response) {
    const targetFilename = path
      .normalize(filename)
      .replace(/^(\.\.(\/|\\|$))+/, "");
-    const { success, reason } = await processSingleFile(targetFilename);
-    response.status(200).json({ filename: targetFilename, success, reason });
+    const {
+      success,
+      reason,
+      documents = [],
+    } = await processSingleFile(targetFilename);
+    response
+      .status(200)
+      .json({ filename: targetFilename, success, reason, documents });
  } catch (e) {
    console.error(e);
    response.status(200).json({
      filename: filename,
      success: false,
      reason: "A processing error occurred.",
+      documents: [],
    });
  }
  return;
@ -45,14 +52,15 @@ app.post("/process", async function (request, response) {
 app.post("/process-link", async function (request, response) {
  const { link } = reqBody(request);
  try {
-    const { success, reason } = await processLink(link);
-    response.status(200).json({ url: link, success, reason });
+    const { success, reason, documents = [] } = await processLink(link);
+    response.status(200).json({ url: link, success, reason, documents });
  } catch (e) {
    console.error(e);
    response.status(200).json({
      url: link,
      success: false,
      reason: "A processing error occurred.",
+      documents: [],
    });
  }
  return;
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@ -12,7 +12,11 @@ async function scrapeGenericUrl(link) {

  if (!content.length) {
    console.error(`Resulting URL content was empty at ${link}.`);
-    return { success: false, reason: `No URL content found at ${link}.` };
+    return {
+      success: false,
+      reason: `No URL content found at ${link}.`,
+      documents: [],
+    };
  }

  const url = new URL(link);
@ -32,9 +36,12 @@ async function scrapeGenericUrl(link) {
    token_count_estimate: tokenizeString(content).length,
  };

-  writeToServerDocuments(data, `url-${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `url-${slugify(filename)}-${data.id}`
+  );
  console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }

 async function getPageContent(link) {
--- a/collector/processSingleFile/convert/asAudio.js
+++ b/collector/processSingleFile/convert/asAudio.js
@ -31,6 +31,7 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
    return {
      success: false,
      reason: `Failed to parse content from ${filename}.`,
+      documents: [],
    };
  }

@ -43,7 +44,11 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
  if (!content.length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
  }

  const data = {
@ -60,12 +65,15 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
    token_count_estimate: tokenizeString(content).length,
  };

-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
  trashFile(fullFilePath);
  console.log(
    `[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n`
  );
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }

 async function convertToWavAudioData(sourcePath) {
--- a/collector/processSingleFile/convert/asDocx.js
+++ b/collector/processSingleFile/convert/asDocx.js
@ -24,7 +24,11 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
  if (!pageContent.length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
  }

  const content = pageContent.join("");
@ -42,10 +46,13 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
    token_count_estimate: tokenizeString(content).length,
  };

-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }

 module.exports = asDocX;
--- a/collector/processSingleFile/convert/asMbox.js
+++ b/collector/processSingleFile/convert/asMbox.js
@ -22,10 +22,15 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
  if (!mails.length) {
    console.error(`Resulting mail items was empty for ${filename}.`);
    trashFile(fullFilePath);
-    return { success: false, reason: `No mail items found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No mail items found in ${filename}.`,
+      documents: [],
+    };
  }

  let item = 1;
+  const documents = [];
  for (const mail of mails) {
    if (!mail.hasOwnProperty("text")) continue;

@ -52,14 +57,18 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
    };

    item++;
-    writeToServerDocuments(data, `${slugify(filename)}-${data.id}-msg-${item}`);
+    const document = writeToServerDocuments(
+      data,
+      `${slugify(filename)}-${data.id}-msg-${item}`
+    );
+    documents.push(document);
  }

  trashFile(fullFilePath);
  console.log(
    `[SUCCESS]: ${filename} messages converted & ready for embedding.\n`
  );
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents };
 }

 module.exports = asMbox;
--- a/collector/processSingleFile/convert/asOfficeMime.js
+++ b/collector/processSingleFile/convert/asOfficeMime.js
@ -20,7 +20,11 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
  if (!content.length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
  }

  const data = {
@ -37,10 +41,13 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
    token_count_estimate: tokenizeString(content).length,
  };

-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }

 module.exports = asOfficeMime;
--- a/collector/processSingleFile/convert/asPDF.js
+++ b/collector/processSingleFile/convert/asPDF.js
@ -29,7 +29,11 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
  if (!pageContent.length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
  }

  const content = pageContent.join("");
@ -47,10 +51,13 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
    token_count_estimate: tokenizeString(content).length,
  };

-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }

 module.exports = asPDF;
--- a/collector/processSingleFile/convert/asTxt.js
+++ b/collector/processSingleFile/convert/asTxt.js
@ -19,7 +19,11 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
  if (!content?.length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
-    return { success: false, reason: `No text content found in ${filename}.` };
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
  }

  console.log(`-- Working ${filename} --`);
@ -37,10 +41,13 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
    token_count_estimate: tokenizeString(content).length,
  };

-  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
-  return { success: true, reason: null };
+  return { success: true, reason: null, documents: [document] };
 }

 module.exports = asTxt;
--- a/collector/processSingleFile/index.js
+++ b/collector/processSingleFile/index.js
@ -13,11 +13,13 @@ async function processSingleFile(targetFilename) {
    return {
      success: false,
      reason: "Filename is a reserved filename and cannot be processed.",
+      documents: [],
    };
  if (!fs.existsSync(fullFilePath))
    return {
      success: false,
      reason: "File does not exist in upload directory.",
+      documents: [],
    };

  const fileExtension = path.extname(fullFilePath).toLowerCase();
@ -25,6 +27,7 @@ async function processSingleFile(targetFilename) {
    return {
      success: false,
      reason: `No file extension found. This file cannot be processed.`,
+      documents: [],
    };
  }

@ -33,6 +36,7 @@ async function processSingleFile(targetFilename) {
    return {
      success: false,
      reason: `File extension ${fileExtension} not supported for parsing.`,
+      documents: [],
    };
  }

--- a/collector/utils/files/index.js
+++ b/collector/utils/files/index.js
@ -38,14 +38,19 @@ function writeToServerDocuments(
      );
  if (!fs.existsSync(destination))
    fs.mkdirSync(destination, { recursive: true });
-  const destinationFilePath = path.resolve(destination, filename);
+  const destinationFilePath = path.resolve(destination, filename) + ".json";

-  fs.writeFileSync(
-    destinationFilePath + ".json",
-    JSON.stringify(data, null, 4),
-    { encoding: "utf-8" }
-  );
-  return;
+  fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), {
+    encoding: "utf-8",
+  });
+
+  return {
+    ...data,
+    // Relative location string that can be passed into the /update-embeddings API.
+    // It always resolves because the file was just written above and we only allow
+    // 1-level deep folders. This still works for integrations like GitHub and YouTube.
+    location: destinationFilePath.split("/").slice(-2).join("/"),
+  };
 }

 // When required we can wipe the entire collector hotdir and tmp storage in case
--- a/server/endpoints/api/document/index.js
+++ b/server/endpoints/api/document/index.js
@ -5,11 +5,13 @@ const {
  checkProcessorAlive,
  acceptedFileTypes,
  processDocument,
+  processLink,
 } = require("../../../utils/files/documentProcessor");
 const {
  viewLocalFiles,
  findDocumentInDocuments,
 } = require("../../../utils/files");
+const { reqBody } = require("../../../utils/http");
 const { handleUploads } = setupMulter();

 function apiDocumentEndpoints(app) {
@ -23,7 +25,6 @@ function apiDocumentEndpoints(app) {
      /* 
    #swagger.tags = ['Documents']
    #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.'
-
    #swagger.requestBody = {
      description: 'File to be uploaded.',
      required: true,
@ -50,6 +51,21 @@ function apiDocumentEndpoints(app) {
            example: {
              success: true,
              error: null,
+              documents: [
+                {
+                  "location": "custom-documents/anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
+                  "name": "anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
+                  "url": "file:///Users/tim/Documents/anything-llm/collector/hotdir/anythingllm.txt",
+                  "title": "anythingllm.txt",
+                  "docAuthor": "Unknown",
+                  "description": "Unknown",
+                  "docSource": "a text file uploaded by the user.",
+                  "chunkSource": "anythingllm.txt",
+                  "published": "1/16/2024, 3:07:00 PM",
+                  "wordCount": 93,
+                  "token_count_estimate": 115,
+                }
+              ]
            }
          }
        }           
@ -75,16 +91,113 @@ function apiDocumentEndpoints(app) {
            .end();
        }

-        const { success, reason } = await processDocument(originalname);
+        const { success, reason, documents } =
+          await processDocument(originalname);
        if (!success) {
-          response.status(500).json({ success: false, error: reason }).end();
+          response
+            .status(500)
+            .json({ success: false, error: reason, documents })
+            .end();
+          return;
        }

        console.log(
          `Document ${originalname} uploaded processed and successfully. It is now available in documents.`
        );
        await Telemetry.sendTelemetry("document_uploaded");
-        response.status(200).json({ success: true, error: null });
+        response.status(200).json({ success: true, error: null, documents });
+      } catch (e) {
+        console.log(e.message, e);
+        response.sendStatus(500).end();
+      }
+    }
+  );
+
+  app.post(
+    "/v1/document/upload-link",
+    [validApiKey],
+    async (request, response) => {
+      /* 
+    #swagger.tags = ['Documents']
+    #swagger.description = 'Upload a valid URL for AnythingLLM to scrape and prepare for embedding.'
+    #swagger.requestBody = {
+      description: 'Link of web address to be scraped.',
+      required: true,
+      type: 'file',
+      content: {
+          "application/json": {
+            schema: {
+              type: 'object',
+              example: {
+                "link": "https://useanything.com"
+              }
+            }
+          }           
+        }
+    }
+    #swagger.responses[200] = {
+      content: {
+        "application/json": {
+          schema: {
+            type: 'object',
+            example: {
+              success: true,
+              error: null,
+              documents: [
+                {
+                  "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
+                  "url": "file://useanything_com.html",
+                  "title": "useanything_com.html",
+                  "docAuthor": "no author found",
+                  "description": "No description found.",
+                  "docSource": "URL link uploaded by the user.",
+                  "chunkSource": "https:useanything.com.html",
+                  "published": "1/16/2024, 3:46:33 PM",
+                  "wordCount": 252,
+                  "pageContent": "AnythingLLM is the best....",
+                  "token_count_estimate": 447,
+                  "location": "custom-documents/url-useanything_com-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
+                }
+              ]
+            }
+          }
+        }           
+      }
+    }  
+    #swagger.responses[403] = {
+      schema: {
+        "$ref": "#/definitions/InvalidAPIKey"
+      }
+    }
+    */
+      try {
+        const { link } = reqBody(request);
+        const processingOnline = await checkProcessorAlive();
+
+        if (!processingOnline) {
+          response
+            .status(500)
+            .json({
+              success: false,
+              error: `Document processing API is not online. Link ${link} will not be processed automatically.`,
+            })
+            .end();
+        }
+
+        const { success, reason, documents } = await processLink(link);
+        if (!success) {
+          response
+            .status(500)
+            .json({ success: false, error: reason, documents })
+            .end();
+          return;
+        }
+
+        console.log(
+          `Link ${link} uploaded processed and successfully. It is now available in documents.`
+        );
+        await Telemetry.sendTelemetry("document_uploaded");
+        response.status(200).json({ success: true, error: null, documents });
      } catch (e) {
        console.log(e.message, e);
        response.sendStatus(500).end();
--- a/server/endpoints/api/workspace/index.js
+++ b/server/endpoints/api/workspace/index.js
@ -381,8 +381,8 @@ function apiWorkspaceEndpoints(app) {
      content: {
        "application/json": {
          example: {
-            adds: [],
-            deletes: ["custom-documents/anythingllm-hash.json"]
+            adds: ["custom-documents/my-pdf.pdf-hash.json"],
+            deletes: ["custom-documents/anythingllm.txt-hash.json"]
          }
        }
      }
--- a/server/swagger/openapi.json
+++ b/server/swagger/openapi.json
@ -845,7 +845,22 @@
                  "type": "object",
                  "example": {
                    "success": true,
-                    "error": null
+                    "error": null,
+                    "documents": [
+                      {
+                        "location": "custom-documents/anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
+                        "name": "anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
+                        "url": "file:///Users/tim/Documents/anything-llm/collector/hotdir/anythingllm.txt",
+                        "title": "anythingllm.txt",
+                        "docAuthor": "Unknown",
+                        "description": "Unknown",
+                        "docSource": "a text file uploaded by the user.",
+                        "chunkSource": "anythingllm.txt",
+                        "published": "1/16/2024, 3:07:00 PM",
+                        "wordCount": 93,
+                        "token_count_estimate": 115
+                      }
+                    ]
                  }
                }
              }
@ -890,6 +905,88 @@
        }
      }
    },
+    "/v1/document/upload-link": {
+      "post": {
+        "tags": [
+          "Documents"
+        ],
+        "description": "Upload a valid URL for AnythingLLM to scrape and prepare for embedding.",
+        "parameters": [
+          {
+            "name": "Authorization",
+            "in": "header",
+            "schema": {
+              "type": "string"
+            }
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "OK",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "type": "object",
+                  "example": {
+                    "success": true,
+                    "error": null,
+                    "documents": [
+                      {
+                        "id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
+                        "url": "file://useanything_com.html",
+                        "title": "useanything_com.html",
+                        "docAuthor": "no author found",
+                        "description": "No description found.",
+                        "docSource": "URL link uploaded by the user.",
+                        "chunkSource": "https:useanything.com.html",
+                        "published": "1/16/2024, 3:46:33 PM",
+                        "wordCount": 252,
+                        "pageContent": "AnythingLLM is the best....",
+                        "token_count_estimate": 447,
+                        "location": "custom-documents/url-useanything_com-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
+                      }
+                    ]
+                  }
+                }
+              }
+            }
+          },
+          "403": {
+            "description": "Forbidden",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/InvalidAPIKey"
+                }
+              },
+              "application/xml": {
+                "schema": {
+                  "$ref": "#/components/schemas/InvalidAPIKey"
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "Internal Server Error"
+          }
+        },
+        "requestBody": {
+          "description": "Link of web address to be scraped.",
+          "required": true,
+          "type": "file",
+          "content": {
+            "application/json": {
+              "schema": {
+                "type": "object",
+                "example": {
+                  "link": "https://useanything.com"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
    "/v1/documents": {
      "get": {
        "tags": [
@ -1593,9 +1690,11 @@
          "content": {
            "application/json": {
              "example": {
-                "adds": [],
+                "adds": [
+                  "custom-documents/my-pdf.pdf-hash.json"
+                ],
                "deletes": [
-                  "custom-documents/anythingllm-hash.json"
+                  "custom-documents/anythingllm.txt-hash.json"
                ]
              }
            }
--- a/server/utils/files/documentProcessor.js
+++ b/server/utils/files/documentProcessor.js
@ -35,7 +35,7 @@ async function processDocument(filename = "") {
    .then((res) => res)
    .catch((e) => {
      console.log(e.message);
-      return { success: false, reason: e.message };
+      return { success: false, reason: e.message, documents: [] };
    });
 }

@ -55,7 +55,7 @@ async function processLink(link = "") {
    .then((res) => res)
    .catch((e) => {
      console.log(e.message);
-      return { success: false, reason: e.message };
+      return { success: false, reason: e.message, documents: [] };
    });
 }