2024-04-26 02:53:38 +02:00
const fs = require ( "fs" ) ;
const path = require ( "path" ) ;
const { default : slugify } = require ( "slugify" ) ;
const { v4 } = require ( "uuid" ) ;
2024-05-14 19:21:04 +02:00
const UrlPattern = require ( "url-pattern" ) ;
2024-04-26 02:53:38 +02:00
const { writeToServerDocuments } = require ( "../../files" ) ;
const { tokenizeString } = require ( "../../tokenizer" ) ;
const {
ConfluencePagesLoader ,
} = require ( "langchain/document_loaders/web/confluence" ) ;
2024-06-21 22:38:50 +02:00
/**
 * Load every page of a Confluence space and persist each non-empty page as a document.
 *
 * Flow:
 *  1. Validate that a URL and credentials were provided.
 *  2. Parse the URL with `validSpaceUrl` to obtain the REST API base and the space key.
 *  3. Load all pages of the space through `ConfluencePagesLoader`.
 *  4. Write one document per page (pages without content are skipped) into a freshly named
 *     output folder via `writeToServerDocuments`.
 *
 * @param {object} args - forwarded request body params
 * @param {string} args.pageUrl - URL of (or inside) the Confluence space to collect
 * @param {string} args.username - Confluence username
 * @param {string} args.accessToken - Confluence access token
 * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
 * @returns {Promise<{success: boolean, reason: (string|null), data?: {spaceKey: string, destination: string}}>}
 */
async function loadConfluence({ pageUrl, username, accessToken }, response) {
  if (!pageUrl || !username || !accessToken) {
    return {
      success: false,
      reason:
        "You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.",
    };
  }

  const { valid, result } = validSpaceUrl(pageUrl);
  if (!valid) {
    return {
      success: false,
      reason:
        "Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/spaces/~SPACEID/* or https://customDomain/wiki/spaces/~SPACEID/* or https://customDomain/display/~SPACEID/*",
    };
  }

  const { apiBase: baseUrl, spaceKey } = result;
  // `subdomain` is optional in the self-hosted URL patterns; fall back to the domain so the
  // folder name and document attribution never contain the literal string "undefined".
  const siteName = result.subdomain ?? result.domain ?? "confluence";
  console.log(`-- Working Confluence ${baseUrl} --`);

  const loader = new ConfluencePagesLoader({
    baseUrl,
    spaceKey,
    username,
    accessToken,
  });

  let docs = [];
  let error = null;
  try {
    docs = (await loader.load()) ?? [];
  } catch (e) {
    // Prefer the text after a nested "Error:" prefix when there is one.
    error = e?.message?.split("Error:")?.[1] || e?.message;
  }

  if (!docs.length || !!error) {
    return {
      success: false,
      reason: error ?? "No pages found for that Confluence space.",
    };
  }

  // The short random suffix keeps repeated imports of the same site in separate folders.
  const outFolder = slugify(
    `${siteName}-confluence-${v4().slice(0, 4)}`
  ).toLowerCase();
  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../../server/storage/documents/${outFolder}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
  // `recursive: true` is a no-op when the directory already exists.
  fs.mkdirSync(outFolderPath, { recursive: true });

  for (const doc of docs) {
    if (!doc.pageContent) continue;

    // Use the same fallback for the record title and the file name so a page without a
    // title does not hand `undefined` to slugify.
    const title = doc.metadata.title || doc.metadata.source;
    const data = {
      id: v4(),
      url: `${doc.metadata.url}.page`,
      title,
      docAuthor: siteName,
      description: doc.metadata.title,
      docSource: `${siteName} Confluence`,
      chunkSource: generateChunkSource(
        { doc, baseUrl, accessToken, username },
        response.locals.encryptionWorker
      ),
      published: new Date().toLocaleString(),
      wordCount: doc.pageContent.split(" ").length,
      pageContent: doc.pageContent,
      token_count_estimate: tokenizeString(doc.pageContent).length,
    };

    console.log(`[Confluence Loader]: Saving ${title} to ${outFolder}`);
    writeToServerDocuments(
      data,
      `${slugify(title)}-${data.id}`,
      outFolderPath
    );
  }

  return {
    success: true,
    reason: null,
    data: {
      spaceKey,
      destination: outFolder,
    },
  };
}
2024-06-21 22:38:50 +02:00
/**
 * Gets the page content from a specific Confluence page, not all pages in a workspace.
 *
 * The space key is parsed out of `pageUrl`, every page of that space is loaded through
 * `ConfluencePagesLoader`, and the page whose `metadata.url` equals `pageUrl` exactly is
 * returned.
 *
 * @param {object} args
 * @param {string} args.pageUrl - full URL of the page to fetch
 * @param {string} args.baseUrl - Confluence REST API base URL handed to the loader
 * @param {string} args.username - Confluence username
 * @param {string} args.accessToken - Confluence access token
 * @returns {Promise<{success: boolean, reason: (string|null), content: (string|null)}>}
 */
async function fetchConfluencePage({
  pageUrl,
  baseUrl,
  username,
  accessToken,
}) {
  if (!pageUrl || !baseUrl || !username || !accessToken) {
    return {
      success: false,
      content: null,
      reason:
        "You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.",
    };
  }

  const { valid, result } = validSpaceUrl(pageUrl);
  if (!valid) {
    return {
      success: false,
      content: null,
      reason:
        "Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/spaces/~SPACEID/* or https://customDomain/wiki/spaces/~SPACEID/* or https://customDomain/display/~SPACEID/*",
    };
  }

  console.log(`-- Working Confluence Page ${pageUrl} --`);
  const { spaceKey } = result;
  const loader = new ConfluencePagesLoader({
    baseUrl,
    spaceKey,
    username,
    accessToken,
  });

  let docs = [];
  let error = null;
  try {
    docs = (await loader.load()) ?? [];
  } catch (e) {
    // Prefer the text after a nested "Error:" prefix when there is one.
    error = e?.message?.split("Error:")?.[1] || e?.message;
  }

  if (!docs.length || !!error) {
    return {
      success: false,
      reason: error ?? "No pages found for that Confluence space.",
      content: null,
    };
  }

  // Only a page that has content and whose URL matches exactly is accepted.
  const targetDocument = docs.find(
    (doc) => doc.pageContent && doc.metadata.url === pageUrl
  );
  if (!targetDocument) {
    return {
      success: false,
      reason: "Target page could not be found in Confluence space.",
      content: null,
    };
  }

  return {
    success: true,
    reason: null,
    content: targetDocument.pageContent,
  };
}
2024-06-18 01:04:20 +02:00
/**
 * A match result for a url-pattern of a Confluence URL
 * @typedef {Object} ConfluenceMatchResult
 * @property {string} subdomain - the subdomain of an organization's Confluence space
 * @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect.
 * @property {string} apiBase - the correct REST API url to use for loader.
 */
/**
 * Generates the correct API base URL for interfacing with the Confluence REST API
 * depending on the URL pattern being used, since there are various ways to host/access a
 * Confluence space.
 *
 * Host selection, in order:
 *  1. `matchResult.customDomain`, when the caller attached one, is used verbatim.
 *  2. When the match captured `domain` and `tld` (a self-hosted URL), the host is rebuilt
 *     as `[subdomain.]domain.tld`.
 *  3. Otherwise the Atlassian cloud host `{subdomain}.atlassian.net` is used.
 *
 * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match
 * @param {boolean} isCustomDomain - when true the `/wiki` subpath is NOT appended to the host
 * @returns {string} - the resulting REST API URL
 * @throws {Error} when `isCustomDomain` is true but the match result carries no domain information
 */
function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
  const { subdomain, domain, tld, customDomain } = matchResult ?? {};
  const subpath = isCustomDomain ? "" : "/wiki";

  // `subdomain` is optional in the self-hosted patterns, so drop it when it is absent.
  const selfHostedHost =
    customDomain ||
    (domain && tld ? [subdomain, domain, tld].filter(Boolean).join(".") : null);
  if (selfHostedHost) return `https://${selfHostedHost}${subpath}`;

  if (isCustomDomain) {
    throw new Error(
      "Cannot build a custom domain Confluence API base URL: the match result has no domain information."
    );
  }
  return `https://${subdomain}.atlassian.net${subpath}`;
}
/**
 * Validates a Confluence URL and extracts the pieces needed to talk to its REST API.
 *
 * The URL is tried against three `url-pattern` shapes, in this order, and the first shape
 * that matches decides the outcome:
 *  1. Atlassian cloud:            https://{subdomain}.atlassian.net/wiki/spaces/{spaceKey}...
 *  2. Own domain, `/wiki/spaces`: https://[{subdomain}.]{domain}.{tld}/wiki/spaces/{spaceKey}...
 *  3. Own domain, `/display`:     https://[{subdomain}.]{domain}.{tld}/display/{spaceKey}...
 *
 * Only the `/display` shape asks `generateAPIBaseUrl` for a custom-domain base URL.
 *
 * @param {string} spaceUrl - The organization's Confluence URL to parse
 * @returns {{
 *  valid: boolean,
 *  result: (ConfluenceMatchResult|null),
 * }} `valid` is true only when the matching shape captured a `spaceKey`; `result` is the
 * captured segments plus `apiBase`, or null when no shape matched.
 */
function validSpaceUrl(spaceUrl = "") {
  // Ordered candidates: [pattern, whether the API base must be built for a custom domain].
  const candidates = [
    [
      new UrlPattern(
        "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
      ),
      false,
    ],
    [
      new UrlPattern(
        "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
      ),
      false,
    ],
    [
      new UrlPattern(
        "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
      ),
      true,
    ],
  ];

  for (const [pattern, isCustomDomain] of candidates) {
    const captured = pattern.match(spaceUrl);
    if (!captured) continue;

    // First matching shape wins, even if it did not capture a spaceKey.
    return {
      valid: captured.hasOwnProperty("spaceKey"),
      result: {
        ...captured,
        apiBase: generateAPIBaseUrl(captured, isCustomDomain),
      },
    };
  }

  // No match
  return { valid: false, result: null };
}
2024-06-21 22:38:50 +02:00
/**
 * Generate the full chunkSource for a specific Confluence page so that we can resync it later.
 * The connection details (`baseUrl`, `token`, `username`) are serialized to JSON, passed through
 * `encryptionWorker.encrypt`, and appended as a single `payload` query param so the credentials
 * can be replayed later.
 *
 * The result has the shape `confluence://{page url}?payload={encrypted payload}` with no
 * whitespace around any of its parts.
 *
 * @param {object} chunkSourceInformation
 * @param {object} chunkSourceInformation.doc - loaded page; only `doc.metadata.url` is read
 * @param {string} chunkSourceInformation.baseUrl - Confluence REST API base URL
 * @param {string} chunkSourceInformation.accessToken - stored in the payload as `token`
 * @param {string} chunkSourceInformation.username - Confluence username
 * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
 * @returns {string}
 */
function generateChunkSource(
  { doc, baseUrl, accessToken, username },
  encryptionWorker
) {
  const payload = {
    baseUrl,
    token: accessToken,
    username,
  };
  const encryptedPayload = encryptionWorker.encrypt(JSON.stringify(payload));
  return `confluence://${doc.metadata.url}?payload=${encryptedPayload}`;
}
// Public API of this module: the space-wide loader and the single-page fetcher.
// The URL parsing and chunk-source helpers defined above are not exported.
module . exports = {
loadConfluence ,
fetchConfluencePage ,
} ;