1347 human readable confluence url (#1706)

* chore: confluence data connector can now handle custom urls, in addition to default {subdomain}.atlassian.net ones

* chore: formatting as per yarn lint

* chore: fixing the human readable confluence url fetch baseUrl

* chore: fixing the human readable confluence url fetch baseUrl

* chore: fixing the human readable confluence url fetch baseUrl

* chore: fixing the human readable confluence url fetch baseUrl

* chore: fixing the human readable confluence url fetch baseUrl

* refactor implementation of various types of Confluence URL patterns

---------

Co-authored-by: Predrag Stojadinovic <predrag@stojadinovic.net>
Co-authored-by: Predrag Stojadinović <cope@users.noreply.github.com>
Co-authored-by: Predrag Stojadinovic <predrags@nvidia.com>
This commit is contained in:
Timothy Carambat 2024-06-17 16:04:20 -07:00 committed by GitHub
parent c8c618137f
commit a598c8e04c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -9,37 +9,6 @@ const {
ConfluencePagesLoader,
} = require("langchain/document_loaders/web/confluence");
/**
 * Checks a Confluence space URL against the known URL shapes and returns the parsed match.
 *
 * The default Atlassian-hosted shape is tried first. If it does not match, the
 * self-hosted shapes are tried in order; the first one that matches has a
 * `customDomain` property written onto the match object (the optional subdomain
 * followed by `domain.tld`) and the result is flagged with `custom: true`.
 * @param {string} spaceUrl - the Confluence URL to parse
 * @returns {{valid: boolean, result: (Object|null), custom?: boolean}}
 */
function validSpaceUrl(spaceUrl = "") {
  // Atlassian default URL match
  const hostedHit = new UrlPattern(
    "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
  ).match(spaceUrl);
  if (hostedHit) return { valid: true, result: hostedHit };

  const selfHostedSources = [
    "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*", // Custom Confluence space
    "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*", // Custom Confluence space + Human-readable space tag.
  ];

  // Patterns are built one at a time so nothing after the first hit is constructed.
  for (const source of selfHostedSources) {
    const hit = new UrlPattern(source).match(spaceUrl);
    if (!hit) continue;

    const hostPrefix = hit.subdomain ? `${hit.subdomain}.` : "";
    hit.customDomain = hostPrefix + `${hit.domain}.${hit.tld}`;
    return { valid: true, result: hit, custom: true };
  }

  // No match
  return { valid: false, result: null };
}
async function loadConfluence({ pageUrl, username, accessToken }) {
if (!pageUrl || !username || !accessToken) {
return {
@ -49,21 +18,16 @@ async function loadConfluence({ pageUrl, username, accessToken }) {
};
}
const validSpace = validSpaceUrl(pageUrl);
if (!validSpace.result) {
const { valid, result } = validSpaceUrl(pageUrl);
if (!valid) {
return {
success: false,
reason:
"Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*",
"Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*",
};
}
const { subdomain, customDomain, spaceKey } = validSpace.result;
let baseUrl = `https://${subdomain}.atlassian.net/wiki`;
if (customDomain) {
baseUrl = `https://${customDomain}/wiki`;
}
const { apiBase: baseUrl, spaceKey, subdomain } = result;
console.log(`-- Working Confluence ${baseUrl} --`);
const loader = new ConfluencePagesLoader({
baseUrl,
@ -142,4 +106,93 @@ async function loadConfluence({ pageUrl, username, accessToken }) {
};
}
/**
* A match result for a url-pattern of a Confluence URL
* @typedef {Object} ConfluenceMatchResult
* @property {string} subdomain - the subdomain of an organization's Confluence space
* @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect.
* @property {string} apiBase - the correct REST API url to use for loader.
*/
/**
 * Generates the correct API base URL for interfacing with the Confluence REST API
 * depending on the URL pattern being used since there are various ways to host/access a
 * Confluence space.
 *
 * The host is taken from the match result itself: if it carries a `customDomain`, or the
 * `domain` and `tld` parts captured by the self-hosted URL patterns, that host is used
 * (prefixed with `subdomain.` when one was captured). Otherwise the default
 * `{subdomain}.atlassian.net` host is used.
 * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match
 * @param {boolean} isCustomDomain - determines if we need to coerce the subpath of the provided URL;
 * when true the `/wiki` subpath is omitted from the returned base.
 * @returns {string} - the resulting REST API URL
 */
function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
  const { subdomain, domain, tld } = matchResult;
  const subpath = isCustomDomain ? "" : "/wiki";

  // Build the self-hosted host from the captured parts. Previously this function read an
  // undeclared `customDomain` variable and threw a ReferenceError for every custom URL.
  const hostPrefix = subdomain ? `${subdomain}.` : "";
  const customDomain =
    matchResult.customDomain ||
    (domain && tld ? `${hostPrefix}${domain}.${tld}` : "");

  if (customDomain) return `https://${customDomain}${subpath}`;
  return `https://${subdomain}.atlassian.net${subpath}`;
}
/**
 * Validates and parses the correct information from a given Confluence URL.
 *
 * The URL is tested against three shapes in a fixed order and the first one that matches
 * wins. The returned `result` is a copy of the match with an added `apiBase` produced by
 * `generateAPIBaseUrl`; `valid` is true only when the match captured a `spaceKey`.
 * @param {string} spaceUrl - The organization's Confluence URL to parse
 * @returns {{
 *   valid: boolean,
 *   result: (ConfluenceMatchResult|null),
 * }}
 */
function validSpaceUrl(spaceUrl = "") {
  // [pattern, isCustomDomain] pairs in priority order. The flag is forwarded to
  // generateAPIBaseUrl and is only set for the /display/ shape.
  const candidates = [
    // Default Atlassian-hosted Confluence URL.
    [
      new UrlPattern(
        "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
      ),
      false,
    ],
    // Confluence served from another host under /wiki/spaces/.
    [
      new UrlPattern(
        "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
      ),
      false,
    ],
    // Confluence served from another host under the /display/ path.
    [
      new UrlPattern(
        "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
      ),
      true,
    ],
  ];

  for (const [pattern, isCustomDomain] of candidates) {
    const matched = pattern.match(spaceUrl);
    if (!matched) continue;

    return {
      valid: matched.hasOwnProperty("spaceKey"),
      result: {
        ...matched,
        apiBase: generateAPIBaseUrl(matched, isCustomDomain),
      },
    };
  }

  // No match
  return { valid: false, result: null };
}
module.exports = loadConfluence;