2024-04-26 02:53:38 +02:00
|
|
|
const fs = require("fs");
|
|
|
|
const path = require("path");
|
|
|
|
const { default: slugify } = require("slugify");
|
|
|
|
const { v4 } = require("uuid");
|
2024-06-26 00:45:09 +02:00
|
|
|
const { writeToServerDocuments, sanitizeFileName } = require("../../files");
|
2024-04-26 02:53:38 +02:00
|
|
|
const { tokenizeString } = require("../../tokenizer");
|
2024-07-03 23:00:44 +02:00
|
|
|
const { ConfluencePagesLoader } = require("./ConfluenceLoader");
|
2024-04-26 02:53:38 +02:00
|
|
|
|
2024-06-21 22:38:50 +02:00
|
|
|
/**
 * Load Confluence documents from a spaceID and Confluence credentials and
 * persist every page that has content into a freshly created documents folder.
 *
 * Validation happens in three independent steps (credentials, base URL, space
 * key) so that the returned `reason` always describes the field that is
 * actually missing or malformed.
 *
 * @param {object} args - forwarded request body params
 * @param {string|null} [args.baseUrl] - Any URL on the Confluence instance; only its origin is used.
 * @param {string|null} [args.spaceKey] - Key of the Confluence space to load.
 * @param {string|null} [args.username] - Username paired with the access token.
 * @param {string|null} [args.accessToken] - Access token used to authenticate.
 * @param {boolean} [args.cloud=true] - Forwarded unchanged to the loader and the chunk source payload.
 * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
 * @returns {Promise<{success: boolean, reason: string|null, data?: {spaceKey: string, destination: string}}>}
 * `data` is only present on success; `destination` is the name of the output folder.
 */
async function loadConfluence(
  {
    baseUrl = null,
    spaceKey = null,
    username = null,
    accessToken = null,
    cloud = true,
  },
  response
) {
  // Only the credentials belong to this guard. Folding baseUrl/spaceKey into it
  // made the dedicated space key check below unreachable and reported a
  // misleading credentials error for a missing URL or space key.
  if (!username || !accessToken) {
    return {
      success: false,
      reason:
        "You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.",
    };
  }

  if (!baseUrl || !validBaseUrl(baseUrl)) {
    return {
      success: false,
      reason: "Provided base URL is not a valid URL.",
    };
  }

  if (!spaceKey) {
    return {
      success: false,
      reason: "You need to provide a Confluence space key.",
    };
  }

  const { origin, hostname } = new URL(baseUrl);
  console.log(`-- Working Confluence ${origin} --`);
  const loader = new ConfluencePagesLoader({
    baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc.
    spaceKey,
    username,
    accessToken,
    cloud,
  });

  // Convert a loader rejection into a result object so the caller always gets
  // a `{ success, reason }` response instead of an unhandled exception.
  const { docs, error } = await loader
    .load()
    .then((docs) => {
      return { docs, error: null };
    })
    .catch((e) => {
      return {
        docs: [],
        error: e.message?.split("Error:")?.[1] || e.message,
      };
    });

  if (!docs.length || !!error) {
    return {
      success: false,
      reason: error ?? "No pages found for that Confluence space.",
    };
  }

  // A short random suffix keeps repeated imports of the same host in separate folders.
  const outFolder = slugify(
    `confluence-${hostname}-${v4().slice(0, 4)}`
  ).toLowerCase();

  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../../server/storage/documents/${outFolder}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  docs.forEach((doc) => {
    // Pages without any content are skipped entirely.
    if (!doc.pageContent) return;

    // Resolve the display title once so the stored title, the log line and the
    // file name agree. slugify() throws on non-string input, so slugging the raw
    // (possibly missing) metadata title would abort the whole import.
    const title = doc.metadata.title || doc.metadata.source;

    const data = {
      id: v4(),
      url: doc.metadata.url + ".page",
      title,
      docAuthor: origin,
      description: doc.metadata.title,
      docSource: `${origin} Confluence`,
      chunkSource: generateChunkSource(
        { doc, baseUrl: origin, spaceKey, accessToken, username, cloud },
        response.locals.encryptionWorker
      ),
      published: new Date().toLocaleString(),
      wordCount: doc.pageContent.split(" ").length,
      pageContent: doc.pageContent,
      token_count_estimate: tokenizeString(doc.pageContent).length,
    };

    console.log(`[Confluence Loader]: Saving ${title} to ${outFolder}`);

    const fileName = sanitizeFileName(`${slugify(title)}-${data.id}`);
    writeToServerDocuments(data, fileName, outFolderPath);
  });

  return {
    success: true,
    reason: null,
    data: {
      spaceKey,
      destination: outFolder,
    },
  };
}
|
|
|
|
|
2024-06-21 22:38:50 +02:00
|
|
|
/**
 * Gets the page content from a specific Confluence page, not all pages in a workspace.
 *
 * The loader is still asked to load the given space; the page whose
 * `metadata.url` equals `pageUrl` (and which has content) is then picked out
 * of the loaded documents.
 *
 * Each required input is validated on its own so the returned `reason` names
 * the field that is actually missing or malformed.
 *
 * @param {object} args
 * @param {string} args.pageUrl - URL of the page to fetch; compared strictly against each loaded document's `metadata.url`.
 * @param {string} args.baseUrl - Base URL handed to the loader as-is (should be the origin of the Confluence instance).
 * @param {string} args.spaceKey - Key of the Confluence space that contains the page.
 * @param {string} args.username - Username paired with the access token.
 * @param {string} args.accessToken - Access token used to authenticate.
 * @param {boolean} [args.cloud=true] - Forwarded unchanged to the loader.
 * @returns {Promise<{success: boolean, reason: string|null, content: string|null}>}
 * `content` is the page content on success and `null` on any failure.
 */
async function fetchConfluencePage({
  pageUrl,
  baseUrl,
  spaceKey,
  username,
  accessToken,
  cloud = true,
}) {
  // Only the credentials belong to this guard. Folding the other fields into it
  // made the space key check below unreachable and reported a misleading
  // credentials error for a missing page URL, base URL or space key.
  if (!username || !accessToken) {
    return {
      success: false,
      content: null,
      reason:
        "You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.",
    };
  }

  if (!pageUrl) {
    return {
      success: false,
      content: null,
      reason: "You need to provide a Confluence page URL.",
    };
  }

  if (!baseUrl || !validBaseUrl(baseUrl)) {
    return {
      success: false,
      content: null,
      reason: "Provided base URL is not a valid URL.",
    };
  }

  if (!spaceKey) {
    return {
      success: false,
      content: null,
      reason: "You need to provide a Confluence space key.",
    };
  }

  console.log(`-- Working Confluence Page ${pageUrl} --`);
  const loader = new ConfluencePagesLoader({
    baseUrl, // Should be the origin of the baseUrl
    spaceKey,
    username,
    accessToken,
    cloud,
  });

  // Convert a loader rejection into a result object so the caller always gets
  // a `{ success, reason, content }` response instead of an unhandled exception.
  const { docs, error } = await loader
    .load()
    .then((docs) => {
      return { docs, error: null };
    })
    .catch((e) => {
      return {
        docs: [],
        error: e.message?.split("Error:")?.[1] || e.message,
      };
    });

  if (!docs.length || !!error) {
    return {
      success: false,
      reason: error ?? "No pages found for that Confluence space.",
      content: null,
    };
  }

  // Pages without content are ignored even when their URL matches.
  const targetDocument = docs.find(
    (doc) => doc.pageContent && doc.metadata.url === pageUrl
  );
  if (!targetDocument) {
    return {
      success: false,
      reason: "Target page could not be found in Confluence space.",
      content: null,
    };
  }

  return {
    success: true,
    reason: null,
    content: targetDocument.pageContent,
  };
}
|
|
|
|
|
2024-06-18 01:04:20 +02:00
|
|
|
/**
 * Checks whether the given value can be parsed as a URL by the `URL`
 * constructor. No restriction is placed on the protocol or host - anything
 * the constructor accepts counts as valid.
 * @param {string} baseUrl - Candidate URL string.
 * @returns {boolean} `true` when parsing succeeds, `false` when it throws.
 */
function validBaseUrl(baseUrl) {
  let parsed = null;
  try {
    parsed = new URL(baseUrl);
  } catch {
    // The constructor throws for anything it cannot parse.
    parsed = null;
  }
  return parsed !== null;
}
|
|
|
|
|
2024-06-21 22:38:50 +02:00
|
|
|
/**
 * Build the `confluence://` chunkSource string for a single Confluence page so
 * the page can be resynced later.
 *
 * The connection details (base URL, space key, token, username, cloud flag)
 * are serialized to JSON, passed through `encryptionWorker.encrypt`, and
 * appended as a single `payload` query param after the page URL.
 * @param {object} chunkSourceInformation
 * @param {object} chunkSourceInformation.doc - Loaded document; its `metadata.url` becomes the address part.
 * @param {string} chunkSourceInformation.baseUrl
 * @param {string} chunkSourceInformation.spaceKey
 * @param {string} chunkSourceInformation.accessToken - Stored in the payload under the key `token`.
 * @param {string} chunkSourceInformation.username
 * @param {boolean} chunkSourceInformation.cloud
 * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
 * @returns {string}
 */
function generateChunkSource(
  { doc, baseUrl, spaceKey, accessToken, username, cloud },
  encryptionWorker
) {
  const { url: pageAddress } = doc.metadata;

  // Key order is kept stable because it determines the serialized JSON.
  const serializedCredentials = JSON.stringify({
    baseUrl,
    spaceKey,
    token: accessToken,
    username,
    cloud,
  });
  const encryptedPayload = encryptionWorker.encrypt(serializedCredentials);

  return `confluence://${pageAddress}?payload=${encryptedPayload}`;
}
|
|
|
|
|
|
|
|
// Public surface of the Confluence connector: bulk space import and single-page fetch.
module.exports = { loadConfluence, fetchConfluencePage };
|