mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-15 02:50:10 +01:00
[FIX] Confluence code snippet blocks not being extracted (#1804)
implement custom confluence loader to extract code blocks properly from documents Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
parent
fbc9c868c4
commit
f205d51fe9
134
collector/utils/extensions/Confluence/ConfluenceLoader/index.js
Normal file
134
collector/utils/extensions/Confluence/ConfluenceLoader/index.js
Normal file
@ -0,0 +1,134 @@
|
||||
/*
|
||||
* This is a custom implementation of the Confluence langchain loader. There was an issue where
|
||||
* code blocks were not being extracted. This is a temporary fix until this issue is resolved.*/
|
||||
|
||||
const { htmlToText } = require("html-to-text");
|
||||
|
||||
/**
 * Loads every page of a Confluence space through the REST content API and
 * converts each page's storage-format body into plain text, keeping the
 * contents of `code` macros as fenced code blocks.
 */
class ConfluencePagesLoader {
  /**
   * @param {Object} params
   * @param {string} params.baseUrl - Base URL that `/rest/api/content` is appended to.
   * @param {string} params.spaceKey - Key of the space whose pages are loaded.
   * @param {string} [params.username] - Username for Basic auth (used with `accessToken`).
   * @param {string} [params.accessToken] - Token/password for Basic auth.
   * @param {number} [params.limit=25] - Page size requested per API call.
   * @param {string} [params.expand="body.storage,version"] - Value of the `expand` query parameter.
   * @param {string} [params.personalAccessToken] - Bearer token; takes precedence over Basic auth.
   */
  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
    expand = "body.storage,version",
    personalAccessToken,
  }) {
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
    this.expand = expand;
    this.personalAccessToken = personalAccessToken;
  }

  /**
   * Value for the `Authorization` header.
   * @returns {string|undefined} `Bearer …` when a personal access token is set,
   *   `Basic …` when both username and access token are set, otherwise `undefined`.
   */
  get authorizationHeader() {
    if (this.personalAccessToken) {
      return `Bearer ${this.personalAccessToken}`;
    }
    if (this.username && this.accessToken) {
      const authToken = Buffer.from(
        `${this.username}:${this.accessToken}`
      ).toString("base64");
      return `Basic ${authToken}`;
    }
    return undefined;
  }

  /**
   * Fetch all pages of the space and convert them to documents.
   * Best-effort: any failure is logged and an empty array is returned.
   * @param {{start?: number, limit?: number}} [options]
   * @returns {Promise<Array<{pageContent: string, metadata: Object}>>}
   */
  async load(options) {
    try {
      const pages = await this.fetchAllPagesInSpace(
        options?.start,
        options?.limit
      );
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * GET a Confluence REST URL and return the parsed JSON body.
   * @param {string} url
   * @returns {Promise<any>}
   * @throws {Error} On network failure, non-2xx status, or an unparsable body.
   *   Network/parse failures keep the underlying error as `cause`.
   */
  async fetchConfluenceData(url) {
    const headers = {
      "Content-Type": "application/json",
      Accept: "application/json",
    };
    const authHeader = this.authorizationHeader;
    if (authHeader) {
      headers.Authorization = authHeader;
    }

    let response;
    try {
      response = await fetch(url, { headers });
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`, {
        cause: error,
      });
    }

    // Checked outside the try/catch so the status error is not re-wrapped
    // into a doubled "Failed to fetch … Error: Failed to fetch …" message.
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${url} from Confluence: ${response.status}`
      );
    }

    try {
      return await response.json();
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`, {
        cause: error,
      });
    }
  }

  /**
   * Page through `/rest/api/content` until an empty result page is returned.
   * @param {number} [start=0] - Offset of the first result to request.
   * @param {number} [limit=this.limit] - Page size per request.
   * @returns {Promise<Array<Object>>} Raw page objects in API order.
   */
  async fetchAllPagesInSpace(start = 0, limit = this.limit) {
    // Only the space key is encoded: it originates from user input, and
    // encodeURIComponent leaves ordinary keys (alphanumerics, "~") untouched.
    const spaceKey = encodeURIComponent(this.spaceKey);
    const pages = [];
    let offset = start;

    for (;;) {
      const url = `${this.baseUrl}/rest/api/content?spaceKey=${spaceKey}&limit=${limit}&start=${offset}&expand=${this.expand}`;
      const data = await this.fetchConfluenceData(url);
      const results = Array.isArray(data?.results) ? data.results : [];
      // Fall back to the result count when `size` is absent so the next
      // offset never becomes NaN.
      const size = Number.isInteger(data?.size) ? data.size : results.length;
      if (size === 0) {
        break;
      }
      pages.push(...results);
      offset += size;
    }

    return pages;
  }

  /**
   * Convert one raw Confluence page into a `{ pageContent, metadata }` document.
   * `code` macros become fenced code blocks; the rest is run through `htmlToText`.
   * A page without an expanded `body.storage.value` yields empty `pageContent`.
   * @param {Object} page - Page object as returned by the content API.
   * @returns {{pageContent: string, metadata: Object}}
   */
  createDocumentFromPage(page) {
    const codeBlockRegex =
      /<ac:structured-macro ac:name="code"[^>]*>[\s\S]*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body>[\s\S]*?<\/ac:structured-macro>/g;
    const languageRegex =
      /<ac:parameter ac:name="language">(.*?)<\/ac:parameter>/;

    // The fenced block is still fed to htmlToText, which parses tags and
    // decodes entities. Escaping here makes the code come out verbatim
    // instead of having e.g. "<div>" stripped as markup.
    const escapeHtml = (text) =>
      text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");

    const storageValue = page?.body?.storage?.value ?? "";
    const contentWithCodeBlocks = storageValue.replace(
      codeBlockRegex,
      (macro, code = "") => {
        const language = macro.match(languageRegex)?.[1] || "";
        return `\n\`\`\`${language}\n${escapeHtml(code.trim())}\n\`\`\`\n`;
      }
    );

    const plainTextContent = htmlToText(contentWithCodeBlocks, {
      wordwrap: false,
      preserveNewlines: true,
    });
    // Collapse runs of blank lines left behind by removed markup.
    const textWithPreservedStructure = plainTextContent.replace(
      /\n{3,}/g,
      "\n\n"
    );
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

    return {
      pageContent: textWithPreservedStructure,
      metadata: {
        id: page.id,
        status: page.status,
        title: page.title,
        type: page.type,
        url: pageUrl,
        version: page.version?.number,
        updated_by: page.version?.by?.displayName,
        updated_at: page.version?.when,
      },
    };
  }
}
|
||||
|
||||
module.exports = { ConfluencePagesLoader };
|
@ -5,9 +5,7 @@ const { v4 } = require("uuid");
|
||||
const UrlPattern = require("url-pattern");
|
||||
const { writeToServerDocuments, sanitizeFileName } = require("../../files");
|
||||
const { tokenizeString } = require("../../tokenizer");
|
||||
const {
|
||||
ConfluencePagesLoader,
|
||||
} = require("langchain/document_loaders/web/confluence");
|
||||
const { ConfluencePagesLoader } = require("./ConfluenceLoader");
|
||||
|
||||
/**
|
||||
* Load Confluence documents from a spaceID and Confluence credentials
|
||||
|
Loading…
Reference in New Issue
Block a user