mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-15 02:50:10 +01:00
Merge branch 'master' of github.com:Mintplex-Labs/anything-llm
This commit is contained in:
commit
3fa00ad7b6
134
collector/utils/extensions/Confluence/ConfluenceLoader/index.js
Normal file
134
collector/utils/extensions/Confluence/ConfluenceLoader/index.js
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
/*
 * This is a custom implementation of the Confluence langchain loader. There was an issue where
 * code blocks were not being extracted. This is a temporary fix until this issue is resolved.
 */
|
||||||
|
|
||||||
|
const { htmlToText } = require("html-to-text");
|
||||||
|
|
||||||
|
/**
 * Loads every page of a Confluence space through the REST content endpoint
 * (`/rest/api/content`) and converts each page into a plain-text document
 * object, turning Confluence "code" macros into fenced code blocks.
 *
 * Relies on `htmlToText` being in scope in the enclosing module.
 */
class ConfluencePagesLoader {
  /**
   * @param {Object} config
   * @param {string} config.baseUrl - Prefix used verbatim in front of the REST and page paths.
   * @param {string} config.spaceKey - Key of the space whose pages are requested.
   * @param {string} [config.username] - Used together with `accessToken` for Basic auth.
   * @param {string} [config.accessToken] - Used together with `username` for Basic auth.
   * @param {number} [config.limit=25] - Default `limit` query value for each request.
   * @param {string} [config.expand="body.storage,version"] - Value of the `expand` query parameter.
   * @param {string} [config.personalAccessToken] - When set, Bearer auth is used instead of Basic.
   */
  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
    expand = "body.storage,version",
    personalAccessToken,
  }) {
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
    this.expand = expand;
    this.personalAccessToken = personalAccessToken;
  }

  /**
   * Value for the `Authorization` request header.
   *
   * @returns {string|undefined} `Bearer <token>` when a personal access token
   *   is configured, otherwise `Basic <base64(username:accessToken)>` when both
   *   are configured, otherwise `undefined`.
   */
  get authorizationHeader() {
    if (this.personalAccessToken) {
      return `Bearer ${this.personalAccessToken}`;
    }
    if (this.username && this.accessToken) {
      const authToken = Buffer.from(
        `${this.username}:${this.accessToken}`
      ).toString("base64");
      return `Basic ${authToken}`;
    }
    return undefined;
  }

  /**
   * Fetches all pages of the space and maps them to documents.
   *
   * Best-effort by design: any failure is logged and an empty list is
   * returned instead of rejecting.
   *
   * @param {{start?: number, limit?: number}} [options] - Optional paging overrides.
   * @returns {Promise<Array<{pageContent: string, metadata: Object}>>}
   */
  async load(options) {
    try {
      const pages = await this.fetchAllPagesInSpace(
        options?.start,
        options?.limit
      );
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * Fetches `url` with JSON headers (plus `Authorization` when available) and
   * returns the parsed JSON body.
   *
   * @param {string} url
   * @returns {Promise<any>} Parsed JSON response body.
   * @throws {Error} When the request fails, the response status is not OK, or
   *   the body cannot be parsed. Transport/parse failures keep the underlying
   *   error as `cause`.
   */
  async fetchConfluenceData(url) {
    const headers = {
      "Content-Type": "application/json",
      Accept: "application/json",
    };
    const authHeader = this.authorizationHeader;
    if (authHeader) {
      headers.Authorization = authHeader;
    }

    let response;
    try {
      response = await fetch(url, { headers });
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`, {
        cause: error,
      });
    }

    // Checked outside the try blocks so this error is not caught and wrapped
    // a second time (which used to duplicate the message prefix).
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${url} from Confluence: ${response.status}`
      );
    }

    try {
      return await response.json();
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`, {
        cause: error,
      });
    }
  }

  /**
   * Requests result pages one after another, starting at `start`, until a
   * response reports no results, and returns all results in order.
   *
   * @param {number} [start=0] - Offset of the first result to request.
   * @param {number} [limit=this.limit] - `limit` query value for each request.
   * @returns {Promise<Array<Object>>} Concatenated `results` of every response.
   * @throws {Error} Propagates failures from `fetchConfluenceData`.
   */
  async fetchAllPagesInSpace(start = 0, limit = this.limit) {
    const pages = [];
    let offset = start;

    for (;;) {
      const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${offset}&expand=${this.expand}`;
      const data = await this.fetchConfluenceData(url);

      const results = Array.isArray(data?.results) ? data.results : [];
      // Advance by the reported size; fall back to the actual result count so
      // a response without a numeric `size` cannot turn the offset into NaN.
      const size = Number.isInteger(data?.size) ? data.size : results.length;
      if (size <= 0 || results.length === 0) {
        break;
      }

      for (const result of results) {
        pages.push(result);
      }
      offset += size;
    }

    return pages;
  }

  /**
   * Converts one page object from the content API into a document.
   *
   * Code macros in the storage-format body are rewritten to fenced code
   * blocks before the markup is converted to text, and runs of three or more
   * newlines are collapsed to a single blank line.
   *
   * @param {Object} page - Page object; `body.storage.value` is read when present.
   * @returns {{pageContent: string, metadata: Object}}
   */
  createDocumentFromPage(page) {
    const codeBlockRegex =
      /<ac:structured-macro ac:name="code"[^>]*>[\s\S]*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body>[\s\S]*?<\/ac:structured-macro>/g;
    const languageRegex =
      /<ac:parameter ac:name="language">(.*?)<\/ac:parameter>/;

    // Replaces each code macro with a fenced block; the CDATA payload is
    // already captured by the macro regex, so no second match is needed.
    const extractCodeBlocks = (content) =>
      content.replace(codeBlockRegex, (macro, code) => {
        const language = macro.match(languageRegex)?.[1] || "";
        return `\n\`\`\`${language}\n${(code || "").trim()}\n\`\`\`\n`;
      });

    // A page fetched without an expanded storage body yields empty content
    // rather than throwing (which would make load() drop every page).
    const rawBody = page.body?.storage?.value;
    const storageValue = typeof rawBody === "string" ? rawBody : "";

    const contentWithCodeBlocks = extractCodeBlocks(storageValue);
    const plainTextContent = htmlToText(contentWithCodeBlocks, {
      wordwrap: false,
      preserveNewlines: true,
    });
    const textWithPreservedStructure = plainTextContent.replace(
      /\n{3,}/g,
      "\n\n"
    );
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

    return {
      pageContent: textWithPreservedStructure,
      metadata: {
        id: page.id,
        status: page.status,
        title: page.title,
        type: page.type,
        url: pageUrl,
        version: page.version?.number,
        updated_by: page.version?.by?.displayName,
        updated_at: page.version?.when,
      },
    };
  }
}
|
||||||
|
|
||||||
|
module.exports = { ConfluencePagesLoader };
|
@ -5,9 +5,7 @@ const { v4 } = require("uuid");
|
|||||||
const UrlPattern = require("url-pattern");
|
const UrlPattern = require("url-pattern");
|
||||||
const { writeToServerDocuments, sanitizeFileName } = require("../../files");
|
const { writeToServerDocuments, sanitizeFileName } = require("../../files");
|
||||||
const { tokenizeString } = require("../../tokenizer");
|
const { tokenizeString } = require("../../tokenizer");
|
||||||
const {
|
const { ConfluencePagesLoader } = require("./ConfluenceLoader");
|
||||||
ConfluencePagesLoader,
|
|
||||||
} = require("langchain/document_loaders/web/confluence");
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load Confluence documents from a spaceID and Confluence credentials
|
* Load Confluence documents from a spaceID and Confluence credentials
|
||||||
|
Loading…
Reference in New Issue
Block a user