2024-04-26 02:53:38 +02:00
const fs = require ( "fs" ) ;
const path = require ( "path" ) ;
const { default : slugify } = require ( "slugify" ) ;
const { v4 } = require ( "uuid" ) ;
2024-05-14 19:21:04 +02:00
const UrlPattern = require ( "url-pattern" ) ;
2024-04-26 02:53:38 +02:00
const { writeToServerDocuments } = require ( "../../files" ) ;
const { tokenizeString } = require ( "../../tokenizer" ) ;
const {
ConfluencePagesLoader ,
} = require ( "langchain/document_loaders/web/confluence" ) ;
/**
 * Check whether a URL points at a Confluence space and extract its parts.
 *
 * The Atlassian-hosted form is tried first; if it does not match, the
 * custom-domain forms are tried in order and the first hit wins.
 *
 * @param {string} [spaceUrl=""] - URL to inspect.
 * @returns {{valid: boolean, result: Object|null, custom?: boolean}}
 *   `result` is the match object produced by `UrlPattern#match`. For
 *   custom-domain hits it additionally carries `customDomain`
 *   (`[subdomain.]domain.tld`) and `custom` is `true`.
 */
function validSpaceUrl(spaceUrl = "") {
  // Atlassian default URL match
  const cloudMatch = new UrlPattern(
    "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
  ).match(spaceUrl);
  if (cloudMatch) return { valid: true, result: cloudMatch };

  const customDomainPatterns = [
    "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*", // Custom Confluence space
    "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*", // Custom Confluence space + Human-readable space tag.
  ];

  for (const candidate of customDomainPatterns) {
    const hit = new UrlPattern(candidate).match(spaceUrl);
    if (!hit) continue;

    // Rebuild the host from its captured pieces; the subdomain is optional.
    const hostPrefix = hit.subdomain ? `${hit.subdomain}.` : "";
    hit.customDomain = `${hostPrefix}${hit.domain}.${hit.tld}`;
    return { valid: true, result: hit, custom: true };
  }

  // No match
  return { valid: false, result: null };
}
/**
 * Load the pages of a Confluence space and write every page that has content
 * to the server's documents folder.
 *
 * @param {Object} args
 * @param {string} args.pageUrl - URL inside the Confluence space; parsed by
 *   `validSpaceUrl` to obtain the host and the space key.
 * @param {string} args.username - Username handed to `ConfluencePagesLoader`.
 * @param {string} args.accessToken - Access token handed to `ConfluencePagesLoader`.
 * @returns {Promise<{success: boolean, reason: string|null, data?: {spaceKey: string, destination: string}}>}
 *   On success `data.destination` is the name of the folder created under the
 *   documents directory. On failure `reason` describes what went wrong.
 */
async function loadConfluence({ pageUrl, username, accessToken }) {
  if (!pageUrl || !username || !accessToken) {
    return {
      success: false,
      reason:
        "You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.",
    };
  }

  const validSpace = validSpaceUrl(pageUrl);
  // The space key is an optional segment in the URL patterns, so a URL can
  // match without one; such a URL cannot be loaded and is rejected here.
  if (!validSpace.result || !validSpace.result.spaceKey) {
    return {
      success: false,
      reason:
        "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/spaces/~SPACEID/*, https://customDomain/wiki/spaces/~SPACEID/* or https://customDomain/display/~SPACEID/*",
    };
  }

  const { subdomain, customDomain, spaceKey } = validSpace.result;
  const baseUrl = customDomain
    ? `https://${customDomain}/wiki`
    : `https://${subdomain}.atlassian.net/wiki`;
  console.log(`-- Working Confluence ${baseUrl} --`);

  const loader = new ConfluencePagesLoader({
    baseUrl,
    spaceKey,
    username,
    accessToken,
  });

  let docs = [];
  let error = null;
  try {
    docs = await loader.load();
  } catch (e) {
    // Prefer the text following "Error:" in the message when it is present.
    error = e.message?.split("Error:")?.[1] || e.message;
  }

  if (!docs.length || !!error) {
    return {
      success: false,
      reason: error ?? "No pages found for that Confluence space.",
    };
  }

  // Custom domains may have no subdomain; fall back to the full custom host
  // so folder names and document metadata never contain "undefined".
  const sourceName = subdomain || customDomain;
  const outFolder = slugify(
    `${sourceName}-confluence-${v4().slice(0, 4)}`
  ).toLowerCase();
  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../../server/storage/documents/${outFolder}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  for (const doc of docs) {
    // Pages with no content are not written.
    if (!doc.pageContent) continue;

    const title = doc.metadata.title || doc.metadata.source;
    const data = {
      id: v4(),
      url: `${doc.metadata.url}.page`,
      title,
      docAuthor: sourceName,
      description: doc.metadata.title,
      docSource: `${sourceName} Confluence`,
      chunkSource: `confluence://${doc.metadata.url}`,
      published: new Date().toLocaleString(),
      wordCount: doc.pageContent.split(" ").length,
      pageContent: doc.pageContent,
      token_count_estimate: tokenizeString(doc.pageContent).length,
    };

    console.log(`[Confluence Loader]: Saving ${title} to ${outFolder}`);
    // slugify throws on non-string input, so use the same fallback as the
    // stored title and a fixed placeholder when neither value is available.
    writeToServerDocuments(
      data,
      `${slugify(title || "untitled")}-${data.id}`,
      outFolderPath
    );
  }

  return {
    success: true,
    reason: null,
    data: {
      spaceKey,
      destination: outFolder,
    },
  };
}
// Expose loadConfluence as this module's export.
module . exports = loadConfluence ;