570 document api return object (#608)

* Add support for fetching single document in documents folder

* Add document object to upload + support link scraping via API

* hotfixes for documentation

* update api docs
This commit is contained in:
Timothy Carambat 2024-01-16 16:04:22 -08:00 committed by GitHub
parent c61cbd1502
commit b35feede87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 324 additions and 43 deletions

View File

@ -29,14 +29,21 @@ app.post("/process", async function (request, response) {
const targetFilename = path
.normalize(filename)
.replace(/^(\.\.(\/|\\|$))+/, "");
const { success, reason } = await processSingleFile(targetFilename);
response.status(200).json({ filename: targetFilename, success, reason });
const {
success,
reason,
documents = [],
} = await processSingleFile(targetFilename);
response
.status(200)
.json({ filename: targetFilename, success, reason, documents });
} catch (e) {
console.error(e);
response.status(200).json({
filename: filename,
success: false,
reason: "A processing error occurred.",
documents: [],
});
}
return;
@ -45,14 +52,15 @@ app.post("/process", async function (request, response) {
app.post("/process-link", async function (request, response) {
const { link } = reqBody(request);
try {
const { success, reason } = await processLink(link);
response.status(200).json({ url: link, success, reason });
const { success, reason, documents = [] } = await processLink(link);
response.status(200).json({ url: link, success, reason, documents });
} catch (e) {
console.error(e);
response.status(200).json({
url: link,
success: false,
reason: "A processing error occurred.",
documents: [],
});
}
return;

View File

@ -12,7 +12,11 @@ async function scrapeGenericUrl(link) {
if (!content.length) {
console.error(`Resulting URL content was empty at ${link}.`);
return { success: false, reason: `No URL content found at ${link}.` };
return {
success: false,
reason: `No URL content found at ${link}.`,
documents: [],
};
}
const url = new URL(link);
@ -32,9 +36,12 @@ async function scrapeGenericUrl(link) {
token_count_estimate: tokenizeString(content).length,
};
writeToServerDocuments(data, `url-${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`url-${slugify(filename)}-${data.id}`
);
console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}
async function getPageContent(link) {

View File

@ -31,6 +31,7 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
return {
success: false,
reason: `Failed to parse content from ${filename}.`,
documents: [],
};
}
@ -43,7 +44,11 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
if (!content.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}
const data = {
@ -60,12 +65,15 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};
writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(
`[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n`
);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}
async function convertToWavAudioData(sourcePath) {

View File

@ -24,7 +24,11 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
if (!pageContent.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}
const content = pageContent.join("");
@ -42,10 +46,13 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};
writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}
module.exports = asDocX;

View File

@ -22,10 +22,15 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
if (!mails.length) {
console.error(`Resulting mail items was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No mail items found in ${filename}.` };
return {
success: false,
reason: `No mail items found in ${filename}.`,
documents: [],
};
}
let item = 1;
const documents = [];
for (const mail of mails) {
if (!mail.hasOwnProperty("text")) continue;
@ -52,14 +57,18 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
};
item++;
writeToServerDocuments(data, `${slugify(filename)}-${data.id}-msg-${item}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}-msg-${item}`
);
documents.push(document);
}
trashFile(fullFilePath);
console.log(
`[SUCCESS]: ${filename} messages converted & ready for embedding.\n`
);
return { success: true, reason: null };
return { success: true, reason: null, documents };
}
module.exports = asMbox;

View File

@ -20,7 +20,11 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
if (!content.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}
const data = {
@ -37,10 +41,13 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};
writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}
module.exports = asOfficeMime;

View File

@ -29,7 +29,11 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
if (!pageContent.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}
const content = pageContent.join("");
@ -47,10 +51,13 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};
writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}
module.exports = asPDF;

View File

@ -19,7 +19,11 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
if (!content?.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}
console.log(`-- Working ${filename} --`);
@ -37,10 +41,13 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};
writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}
module.exports = asTxt;

View File

@ -13,11 +13,13 @@ async function processSingleFile(targetFilename) {
return {
success: false,
reason: "Filename is a reserved filename and cannot be processed.",
documents: [],
};
if (!fs.existsSync(fullFilePath))
return {
success: false,
reason: "File does not exist in upload directory.",
documents: [],
};
const fileExtension = path.extname(fullFilePath).toLowerCase();
@ -25,6 +27,7 @@ async function processSingleFile(targetFilename) {
return {
success: false,
reason: `No file extension found. This file cannot be processed.`,
documents: [],
};
}
@ -33,6 +36,7 @@ async function processSingleFile(targetFilename) {
return {
success: false,
reason: `File extension ${fileExtension} not supported for parsing.`,
documents: [],
};
}

View File

@ -38,14 +38,19 @@ function writeToServerDocuments(
);
if (!fs.existsSync(destination))
fs.mkdirSync(destination, { recursive: true });
const destinationFilePath = path.resolve(destination, filename);
const destinationFilePath = path.resolve(destination, filename) + ".json";
fs.writeFileSync(
destinationFilePath + ".json",
JSON.stringify(data, null, 4),
{ encoding: "utf-8" }
);
return;
fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), {
encoding: "utf-8",
});
return {
...data,
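// Relative location string ("<folder>/<file>.json") that can be passed directly to the
// /update-embeddings API. It always resolves because the file was just written above and
// document folders are only ever one level deep. The same scheme still works for
// integrations like GitHub and YouTube.
// NOTE(review): splitting on "/" assumes POSIX-style separators in the resolved path — confirm behavior on Windows.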
location: destinationFilePath.split("/").slice(-2).join("/"),
};
}
// When required we can wipe the entire collector hotdir and tmp storage in case

View File

@ -5,11 +5,13 @@ const {
checkProcessorAlive,
acceptedFileTypes,
processDocument,
processLink,
} = require("../../../utils/files/documentProcessor");
const {
viewLocalFiles,
findDocumentInDocuments,
} = require("../../../utils/files");
const { reqBody } = require("../../../utils/http");
const { handleUploads } = setupMulter();
function apiDocumentEndpoints(app) {
@ -23,7 +25,6 @@ function apiDocumentEndpoints(app) {
/*
#swagger.tags = ['Documents']
#swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.'
#swagger.requestBody = {
description: 'File to be uploaded.',
required: true,
@ -50,6 +51,21 @@ function apiDocumentEndpoints(app) {
example: {
success: true,
error: null,
documents: [
{
"location": "custom-documents/anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
"name": "anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
"url": "file:///Users/tim/Documents/anything-llm/collector/hotdir/anythingllm.txt",
"title": "anythingllm.txt",
"docAuthor": "Unknown",
"description": "Unknown",
"docSource": "a text file uploaded by the user.",
"chunkSource": "anythingllm.txt",
"published": "1/16/2024, 3:07:00PM",
"wordCount": 93,
"token_count_estimate": 115,
}
]
}
}
}
@ -75,16 +91,113 @@ function apiDocumentEndpoints(app) {
.end();
}
const { success, reason } = await processDocument(originalname);
const { success, reason, documents } =
await processDocument(originalname);
if (!success) {
response.status(500).json({ success: false, error: reason }).end();
response
.status(500)
.json({ success: false, error: reason, documents })
.end();
return;
}
console.log(
`Document ${originalname} uploaded processed and successfully. It is now available in documents.`
);
await Telemetry.sendTelemetry("document_uploaded");
response.status(200).json({ success: true, error: null });
response.status(200).json({ success: true, error: null, documents });
} catch (e) {
console.log(e.message, e);
response.sendStatus(500).end();
}
}
);
app.post(
"/v1/document/upload-link",
[validApiKey],
async (request, response) => {
/*
#swagger.tags = ['Documents']
#swagger.description = 'Upload a valid URL for AnythingLLM to scrape and prepare for embedding.'
#swagger.requestBody = {
description: 'Link of web address to be scraped.',
required: true,
type: 'file',
content: {
"application/json": {
schema: {
type: 'object',
example: {
"link": "https://useanything.com"
}
}
}
}
}
#swagger.responses[200] = {
content: {
"application/json": {
schema: {
type: 'object',
example: {
success: true,
error: null,
documents: [
{
"id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
"url": "file://useanything_com.html",
"title": "useanything_com.html",
"docAuthor": "no author found",
"description": "No description found.",
"docSource": "URL link uploaded by the user.",
"chunkSource": "https:useanything.com.html",
"published": "1/16/2024, 3:46:33PM",
"wordCount": 252,
"pageContent": "AnythingLLM is the best....",
"token_count_estimate": 447,
"location": "custom-documents/url-useanything_com-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
}
]
}
}
}
}
}
#swagger.responses[403] = {
schema: {
"$ref": "#/definitions/InvalidAPIKey"
}
}
*/
try {
const { link } = reqBody(request);
const processingOnline = await checkProcessorAlive();
if (!processingOnline) {
response
.status(500)
.json({
success: false,
error: `Document processing API is not online. Link ${link} will not be processed automatically.`,
})
.end();
}
const { success, reason, documents } = await processLink(link);
if (!success) {
response
.status(500)
.json({ success: false, error: reason, documents })
.end();
return;
}
console.log(
`Link ${link} uploaded processed and successfully. It is now available in documents.`
);
await Telemetry.sendTelemetry("document_uploaded");
response.status(200).json({ success: true, error: null, documents });
} catch (e) {
console.log(e.message, e);
response.sendStatus(500).end();

View File

@ -381,8 +381,8 @@ function apiWorkspaceEndpoints(app) {
content: {
"application/json": {
example: {
adds: [],
deletes: ["custom-documents/anythingllm-hash.json"]
adds: ["custom-documents/my-pdf.pdf-hash.json"],
deletes: ["custom-documents/anythingllm.txt-hash.json"]
}
}
}

View File

@ -845,7 +845,22 @@
"type": "object",
"example": {
"success": true,
"error": null
"error": null,
"documents": [
{
"location": "custom-documents/anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
"name": "anythingllm.txt-6e8be64c-c162-4b43-9997-b068c0071e8b.json",
"url": "file:///Users/tim/Documents/anything-llm/collector/hotdir/anythingllm.txt",
"title": "anythingllm.txt",
"docAuthor": "Unknown",
"description": "Unknown",
"docSource": "a text file uploaded by the user.",
"chunkSource": "anythingllm.txt",
"published": "1/16/2024, 3:07:00PM",
"wordCount": 93,
"token_count_estimate": 115
}
]
}
}
}
@ -890,6 +905,88 @@
}
}
},
"/v1/document/upload-link": {
"post": {
"tags": [
"Documents"
],
"description": "Upload a valid URL for AnythingLLM to scrape and prepare for embedding.",
"parameters": [
{
"name": "Authorization",
"in": "header",
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"type": "object",
"example": {
"success": true,
"error": null,
"documents": [
{
"id": "c530dbe6-bff1-4b9e-b87f-710d539d20bc",
"url": "file://useanything_com.html",
"title": "useanything_com.html",
"docAuthor": "no author found",
"description": "No description found.",
"docSource": "URL link uploaded by the user.",
"chunkSource": "https:useanything.com.html",
"published": "1/16/2024, 3:46:33PM",
"wordCount": 252,
"pageContent": "AnythingLLM is the best....",
"token_count_estimate": 447,
"location": "custom-documents/url-useanything_com-c530dbe6-bff1-4b9e-b87f-710d539d20bc.json"
}
]
}
}
}
}
},
"403": {
"description": "Forbidden",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
},
"application/xml": {
"schema": {
"$ref": "#/components/schemas/InvalidAPIKey"
}
}
}
},
"500": {
"description": "Internal Server Error"
}
},
"requestBody": {
"description": "Link of web address to be scraped.",
"required": true,
"type": "file",
"content": {
"application/json": {
"schema": {
"type": "object",
"example": {
"link": "https://useanything.com"
}
}
}
}
}
}
},
"/v1/documents": {
"get": {
"tags": [
@ -1593,9 +1690,11 @@
"content": {
"application/json": {
"example": {
"adds": [],
"adds": [
"custom-documents/my-pdf.pdf-hash.json"
],
"deletes": [
"custom-documents/anythingllm-hash.json"
"custom-documents/anythingllm.txt-hash.json"
]
}
}

View File

@ -35,7 +35,7 @@ async function processDocument(filename = "") {
.then((res) => res)
.catch((e) => {
console.log(e.message);
return { success: false, reason: e.message };
return { success: false, reason: e.message, documents: [] };
});
}
@ -55,7 +55,7 @@ async function processLink(link = "") {
.then((res) => res)
.catch((e) => {
console.log(e.message);
return { success: false, reason: e.message };
return { success: false, reason: e.message, documents: [] };
});
}