2024-04-26 02:53:38 +02:00
const fs = require ( "fs" ) ;
const path = require ( "path" ) ;
const { default : slugify } = require ( "slugify" ) ;
const { v4 } = require ( "uuid" ) ;
2024-05-14 19:21:04 +02:00
const UrlPattern = require ( "url-pattern" ) ;
2024-04-26 02:53:38 +02:00
const { writeToServerDocuments } = require ( "../../files" ) ;
const { tokenizeString } = require ( "../../tokenizer" ) ;
const {
ConfluencePagesLoader ,
} = require ( "langchain/document_loaders/web/confluence" ) ;
async function loadConfluence ( { pageUrl , username , accessToken } ) {
if ( ! pageUrl || ! username || ! accessToken ) {
return {
success : false ,
reason :
"You need either a username and access token, or a personal access token (PAT), to use the Confluence connector." ,
} ;
}
2024-06-18 01:04:20 +02:00
const { valid , result } = validSpaceUrl ( pageUrl ) ;
if ( ! valid ) {
2024-04-26 02:53:38 +02:00
return {
success : false ,
reason :
2024-06-18 01:04:20 +02:00
"Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*" ,
2024-04-26 02:53:38 +02:00
} ;
}
2024-06-18 01:04:20 +02:00
const { apiBase : baseUrl , spaceKey , subdomain } = result ;
2024-05-14 19:21:04 +02:00
console . log ( ` -- Working Confluence ${ baseUrl } -- ` ) ;
2024-04-26 02:53:38 +02:00
const loader = new ConfluencePagesLoader ( {
2024-05-14 19:21:04 +02:00
baseUrl ,
2024-04-26 02:53:38 +02:00
spaceKey ,
username ,
accessToken ,
} ) ;
const { docs , error } = await loader
. load ( )
. then ( ( docs ) => {
return { docs , error : null } ;
} )
. catch ( ( e ) => {
return {
docs : [ ] ,
error : e . message ? . split ( "Error:" ) ? . [ 1 ] || e . message ,
} ;
} ) ;
if ( ! docs . length || ! ! error ) {
return {
success : false ,
reason : error ? ? "No pages found for that Confluence space." ,
} ;
}
const outFolder = slugify (
` ${ subdomain } -confluence- ${ v4 ( ) . slice ( 0 , 4 ) } `
) . toLowerCase ( ) ;
2024-05-02 23:03:10 +02:00
const outFolderPath =
process . env . NODE _ENV === "development"
? path . resolve (
_ _dirname ,
` ../../../../server/storage/documents/ ${ outFolder } `
)
: path . resolve ( process . env . STORAGE _DIR , ` documents/ ${ outFolder } ` ) ;
if ( ! fs . existsSync ( outFolderPath ) )
fs . mkdirSync ( outFolderPath , { recursive : true } ) ;
2024-04-26 02:53:38 +02:00
docs . forEach ( ( doc ) => {
2024-05-14 19:22:13 +02:00
if ( ! doc . pageContent ) return ;
2024-04-26 02:53:38 +02:00
const data = {
id : v4 ( ) ,
url : doc . metadata . url + ".page" ,
title : doc . metadata . title || doc . metadata . source ,
docAuthor : subdomain ,
description : doc . metadata . title ,
docSource : ` ${ subdomain } Confluence ` ,
chunkSource : ` confluence:// ${ doc . metadata . url } ` ,
published : new Date ( ) . toLocaleString ( ) ,
wordCount : doc . pageContent . split ( " " ) . length ,
pageContent : doc . pageContent ,
token _count _estimate : tokenizeString ( doc . pageContent ) . length ,
} ;
console . log (
` [Confluence Loader]: Saving ${ doc . metadata . title } to ${ outFolder } `
) ;
writeToServerDocuments (
data ,
` ${ slugify ( doc . metadata . title ) } - ${ data . id } ` ,
outFolderPath
) ;
} ) ;
return {
success : true ,
reason : null ,
data : {
spaceKey ,
destination : outFolder ,
} ,
} ;
}
2024-06-18 01:04:20 +02:00
/ * *
* A match result for a url - pattern of a Confluence URL
* @ typedef { Object } ConfluenceMatchResult
* @ property { string } subdomain - the subdomain of an organization ' s Confluence space
* @ property { string } spaceKey - the spaceKey of an organization that determines the documents to collect .
* @ property { string } apiBase - the correct REST API url to use for loader .
* /
/ * *
* Generates the correct API base URL for interfacing with the Confluence REST API
* depending on the URL pattern being used since there are various ways to host / access a
* Confluence space .
* @ param { ConfluenceMatchResult } matchResult - result from ` url-pattern ` . match
* @ param { boolean } isCustomDomain - determines if we need to coerce the subpath of the provided URL
* @ returns { string } - the resulting REST API URL
* /
function generateAPIBaseUrl ( matchResult = { } , isCustomDomain = false ) {
const { subdomain } = matchResult ;
let subpath = isCustomDomain ? ` ` : ` /wiki ` ;
if ( isCustomDomain ) return ` https:// ${ customDomain } ${ subpath } ` ;
return ` https:// ${ subdomain } .atlassian.net ${ subpath } ` ;
}
/ * *
* Validates and parses the correct information from a given Confluence URL
* @ param { string } spaceUrl - The organization ' s Confluence URL to parse
* @ returns { {
* valid : boolean ,
* result : ( ConfluenceMatchResult | null ) ,
* } }
* /
function validSpaceUrl ( spaceUrl = "" ) {
let matchResult ;
const patterns = {
default : new UrlPattern (
"https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
) ,
subdomain : new UrlPattern (
"https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
) ,
custom : new UrlPattern (
"https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
) ,
} ;
// If using the default Atlassian Confluence URL pattern.
// We can proceed because the Library/API can use this base url scheme.
matchResult = patterns . default . match ( spaceUrl ) ;
if ( matchResult )
return {
valid : matchResult . hasOwnProperty ( "spaceKey" ) ,
result : {
... matchResult ,
apiBase : generateAPIBaseUrl ( matchResult ) ,
} ,
} ;
// If using a custom subdomain Confluence URL pattern.
// We need to attach the customDomain as a property to the match result
// so we can form the correct REST API base from the subdomain.
matchResult = patterns . subdomain . match ( spaceUrl ) ;
if ( matchResult ) {
return {
valid : matchResult . hasOwnProperty ( "spaceKey" ) ,
result : {
... matchResult ,
apiBase : generateAPIBaseUrl ( matchResult ) ,
} ,
} ;
}
// If using a base FQDN Confluence URL pattern.
// We need to attach the customDomain as a property to the match result
// so we can form the correct REST API base from the root domain since /display/ is basically a URL mask.
matchResult = patterns . custom . match ( spaceUrl ) ;
if ( matchResult ) {
return {
valid : matchResult . hasOwnProperty ( "spaceKey" ) ,
result : {
... matchResult ,
apiBase : generateAPIBaseUrl ( matchResult , true ) ,
} ,
} ;
}
// No match
return { valid : false , result : null } ;
}
2024-04-26 02:53:38 +02:00
module . exports = loadConfluence ;