From f205d51fe9e610e5b37090e3587c03c79bde157f Mon Sep 17 00:00:00 2001 From: Sean Hatfield Date: Wed, 3 Jul 2024 14:00:44 -0700 Subject: [PATCH] [FIX] Confluence code snippet blocks not being extracted (#1804) implement custom confluence loader to extract code blocks properly from documents Co-authored-by: Timothy Carambat --- .../Confluence/ConfluenceLoader/index.js | 134 ++++++++++++++++++ .../utils/extensions/Confluence/index.js | 4 +- 2 files changed, 135 insertions(+), 3 deletions(-) create mode 100644 collector/utils/extensions/Confluence/ConfluenceLoader/index.js diff --git a/collector/utils/extensions/Confluence/ConfluenceLoader/index.js b/collector/utils/extensions/Confluence/ConfluenceLoader/index.js new file mode 100644 index 00000000..77018598 --- /dev/null +++ b/collector/utils/extensions/Confluence/ConfluenceLoader/index.js @@ -0,0 +1,134 @@ +/* + * This is a custom implementation of the Confluence langchain loader. There was an issue where + * code blocks were not being extracted. This is a temporary fix until this issue is resolved.*/ + +const { htmlToText } = require("html-to-text"); + +class ConfluencePagesLoader { + constructor({ + baseUrl, + spaceKey, + username, + accessToken, + limit = 25, + expand = "body.storage,version", + personalAccessToken, + }) { + this.baseUrl = baseUrl; + this.spaceKey = spaceKey; + this.username = username; + this.accessToken = accessToken; + this.limit = limit; + this.expand = expand; + this.personalAccessToken = personalAccessToken; + } + + get authorizationHeader() { + if (this.personalAccessToken) { + return `Bearer ${this.personalAccessToken}`; + } else if (this.username && this.accessToken) { + const authToken = Buffer.from( + `${this.username}:${this.accessToken}` + ).toString("base64"); + return `Basic ${authToken}`; + } + return undefined; + } + + async load(options) { + try { + const pages = await this.fetchAllPagesInSpace( + options?.start, + options?.limit + ); + return pages.map((page) => this.createDocumentFromPage(page)); + } catch (error) { + console.error("Error:", error); + return []; + } + } + + async fetchConfluenceData(url) { + try { + const initialHeaders = { + "Content-Type": "application/json", + Accept: "application/json", + }; + const authHeader = this.authorizationHeader; + if (authHeader) { + initialHeaders.Authorization = authHeader; + } + const response = await fetch(url, { + headers: initialHeaders, + }); + if (!response.ok) { + throw new Error( + `Failed to fetch ${url} from Confluence: ${response.status}` + ); + } + return await response.json(); + } catch (error) { + throw new Error(`Failed to fetch ${url} from Confluence: ${error}`); + } + } + + async fetchAllPagesInSpace(start = 0, limit = this.limit) { + const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`; + const data = await this.fetchConfluenceData(url); + if (data.size === 0) { + return []; + } + const nextPageStart = start + data.size; + const nextPageResults = await this.fetchAllPagesInSpace( + nextPageStart, + limit + ); + return data.results.concat(nextPageResults); + } + + createDocumentFromPage(page) { + // Function to extract code blocks + const extractCodeBlocks = (content) => { + const codeBlockRegex = + /]*>[\s\S]*?<\/ac:plain-text-body>[\s\S]*?<\/ac:structured-macro>/g; + const languageRegex = + /(.*?)<\/ac:parameter>/; + + return content.replace(codeBlockRegex, (match) => { + const language = match.match(languageRegex)?.[1] || ""; + const code = + match.match( + /<\/ac:plain-text-body>/ + )?.[1] || ""; + return `\n\`\`\`${language}\n${code.trim()}\n\`\`\`\n`; + }); + }; + + const contentWithCodeBlocks = extractCodeBlocks(page.body.storage.value); + const plainTextContent = htmlToText(contentWithCodeBlocks, { + wordwrap: false, + preserveNewlines: true, + }); + const textWithPreservedStructure = plainTextContent.replace( + /\n{3,}/g, + "\n\n" + ); + const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`; + + return { + pageContent: textWithPreservedStructure, + metadata: { + id: page.id, + status: page.status, + title: page.title, + type: page.type, + url: pageUrl, + version: page.version?.number, + updated_by: page.version?.by?.displayName, + updated_at: page.version?.when, + }, + }; + } +} + +module.exports = { ConfluencePagesLoader }; diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js index 22df1c7f..4436fc88 100644 --- a/collector/utils/extensions/Confluence/index.js +++ b/collector/utils/extensions/Confluence/index.js @@ -5,9 +5,7 @@ const { v4 } = require("uuid"); const UrlPattern = require("url-pattern"); const { writeToServerDocuments, sanitizeFileName } = require("../../files"); const { tokenizeString } = require("../../tokenizer"); -const { - ConfluencePagesLoader, -} = require("langchain/document_loaders/web/confluence"); +const { ConfluencePagesLoader } = require("./ConfluenceLoader"); /** * Load Confluence documents from a spaceID and Confluence credentials