anything-llm/collector/extensions/resync/index.js

const { getLinkText } = require("../../processLink");

/**
 * Re-fetches the text content of a raw link and replies with it as JSON.
 * Every handled outcome is answered with HTTP 200: `{ success, content }` when the
 * fetch succeeds, `{ success: false, content: null }` when it fails for any reason.
 * @param {object} data - metadata from document (eg: link)
 * @param {string} data.link - the link whose content should be re-fetched
 * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
 * @throws {Error} when no link is provided (raised before any response is sent)
 */
async function resyncLink({ link }, response) {
  if (!link) throw new Error('Invalid link provided');
  try {
    // `reason` has to be destructured alongside the other fields: referencing it
    // undeclared raised a ReferenceError that replaced the real failure message in the logs.
    const { success, reason, content = null } = await getLinkText(link);
    if (!success) throw new Error(`Failed to sync link content. ${reason}`);
    response.status(200).json({ success, content });
  } catch (e) {
    // Failures are logged here and reported to the caller as an unsuccessful sync.
    console.error(e);
    response.status(200).json({
      success: false,
      content: null,
    });
  }
}

/**
 * Fetches the content of a YouTube link. Returns the content as a text string of the video in question.
 * We offer this as there may be some videos where a transcription could be manually edited after initial scraping
 * but in general - transcriptions often never change.
 * @param {object} data - metadata from document (eg: link) 
 * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
 */
async function resyncYouTube({ link }, response) {
  if (!link) throw new Error('Invalid link provided');
  try {
    const { fetchVideoTranscriptContent } = require("../../utils/extensions/YoutubeTranscript");
    const { success, reason, content } = await fetchVideoTranscriptContent({ url: link });
    if (!success) throw new Error(`Failed to sync YouTube video transcript. ${reason}`);
    response.status(200).json({ success, content });
  } catch (e) {
    console.error(e);
    response.status(200).json({
      success: false,
      content: null,
    });
  }
}

/**
 * Fetches the content of a specific confluence page via its chunkSource. 
 * Returns the content as a text string of the page in question and only that page.
 * @param {object} data - metadata from document (eg: chunkSource) 
 * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
 */
async function resyncConfluence({ chunkSource }, response) {
  if (!chunkSource) throw new Error('Invalid source property provided');
  try {
    // Confluence data is `payload` encrypted. So we need to expand its
    // encrypted payload back into query params so we can reFetch the page with same access token/params.
    const source = response.locals.encryptionWorker.expandPayload(chunkSource);
    const { fetchConfluencePage } = require("../../utils/extensions/Confluence");
    const { success, reason, content } = await fetchConfluencePage({
      pageUrl: `https:${source.pathname}`, // need to add back the real protocol
      baseUrl: source.searchParams.get('baseUrl'),
      spaceKey: source.searchParams.get('spaceKey'),
      accessToken: source.searchParams.get('token'),
      username: source.searchParams.get('username'),
    });

    if (!success) throw new Error(`Failed to sync Confluence page content. ${reason}`);
    response.status(200).json({ success, content });
  } catch (e) {
    console.error(e);
    response.status(200).json({
      success: false,
      content: null,
    });
  }
}

/**
 * Fetches the content of a specific confluence page via its chunkSource. 
 * Returns the content as a text string of the page in question and only that page.
 * @param {object} data - metadata from document (eg: chunkSource) 
 * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
 */
async function resyncGithub({ chunkSource }, response) {
  if (!chunkSource) throw new Error('Invalid source property provided');
  try {
    // Github file data is `payload` encrypted (might contain PAT). So we need to expand its
    // encrypted payload back into query params so we can reFetch the page with same access token/params.
    const source = response.locals.encryptionWorker.expandPayload(chunkSource);
    const { fetchGithubFile } = require("../../utils/extensions/RepoLoader/GithubRepo");
    const { success, reason, content } = await fetchGithubFile({
      repoUrl: `https:${source.pathname}`, // need to add back the real protocol
      branch: source.searchParams.get('branch'),
      accessToken: source.searchParams.get('pat'),
      sourceFilePath: source.searchParams.get('path'),
    });

    if (!success) throw new Error(`Failed to sync Github file content. ${reason}`);
    response.status(200).json({ success, content });
  } catch (e) {
    console.error(e);
    response.status(200).json({
      success: false,
      content: null,
    });
  }
}

// Public surface of this module: each resync handler exposed under a short key.
const resyncHandlers = {
  link: resyncLink,
  youtube: resyncYouTube,
  confluence: resyncConfluence,
  github: resyncGithub,
};

module.exports = resyncHandlers;
[BETA] Live document sync (#1719) * wip bg workers for live document sync * Add ability to re-embed specific documents across many workspaces via background queue bgworkser is gated behind expieremental system setting flag that needs to be explictly enabled UI for watching/unwatching docments that are embedded. TODO: UI to easily manage all bg tasks and see run results TODO: UI to enable this feature and background endpoints to manage it * create frontend views and paths Move elements to correct experimental scope * update migration to delete runs on removal of watched document * Add watch support to YouTube transcripts (#1716) * Add watch support to YouTube transcripts refactor how sync is done for supported types * Watch specific files in Confluence space (#1718) Add failure-prune check for runs * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * dual build update copy of alert modals * update job interval * Add support for live-sync of Github files * update copy for document sync feature * hide Experimental features from UI * update docs links * [FEAT] Implement new settings menu for experimental features (#1735) * implement new settings menu for experimental features * remove unused context save bar --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * dont run job on boot * unset workflow changes * Add persistent encryption service Relay key to collector so persistent encryption can be used Encrypt any private data in chunkSources used for replay during resync jobs * update jsDOC * Linting and organization * update modal copy for feature --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com> 2024-06-21 22:38:50 +02:00			`const { getLinkText } = require("../../processLink");`

			`/**`
			`* Fetches the content of a raw link. Returns the content as a text string of the link in question.`
			`* @param {object} data - metadata from document (eg: link)`
			`* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response`
			`*/`
			`async function resyncLink({ link }, response) {`
			`if (!link) throw new Error('Invalid link provided');`
			`try {`
			`const { success, content = null } = await getLinkText(link);`
			if (!success) throw new Error(`Failed to sync link content. ${reason}`);
			`response.status(200).json({ success, content });`
			`} catch (e) {`
			`console.error(e);`
			`response.status(200).json({`
			`success: false,`
			`content: null,`
			`});`
			`}`
			`}`

			`/**`
			`* Fetches the content of a YouTube link. Returns the content as a text string of the video in question.`
			`* We offer this as there may be some videos where a transcription could be manually edited after initial scraping`
			`* but in general - transcriptions often never change.`
			`* @param {object} data - metadata from document (eg: link)`
			`* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response`
			`*/`
			`async function resyncYouTube({ link }, response) {`
			`if (!link) throw new Error('Invalid link provided');`
			`try {`
			`const { fetchVideoTranscriptContent } = require("../../utils/extensions/YoutubeTranscript");`
			`const { success, reason, content } = await fetchVideoTranscriptContent({ url: link });`
			if (!success) throw new Error(`Failed to sync YouTube video transcript. ${reason}`);
			`response.status(200).json({ success, content });`
			`} catch (e) {`
			`console.error(e);`
			`response.status(200).json({`
			`success: false,`
			`content: null,`
			`});`
			`}`
			`}`

			`/**`
			`* Fetches the content of a specific confluence page via its chunkSource.`
			`* Returns the content as a text string of the page in question and only that page.`
			`* @param {object} data - metadata from document (eg: chunkSource)`
			`* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response`
			`*/`
			`async function resyncConfluence({ chunkSource }, response) {`
			`if (!chunkSource) throw new Error('Invalid source property provided');`
			`try {`
			// Confluence data is `payload` encrypted. So we need to expand its
			`// encrypted payload back into query params so we can reFetch the page with same access token/params.`
			`const source = response.locals.encryptionWorker.expandPayload(chunkSource);`
			`const { fetchConfluencePage } = require("../../utils/extensions/Confluence");`
			`const { success, reason, content } = await fetchConfluencePage({`
			pageUrl: `https:${source.pathname}`, // need to add back the real protocol
			`baseUrl: source.searchParams.get('baseUrl'),`
Support more Confluence URL formats (#2118) * support more confluence url formats * use pattern matching for confluence urls and manual splitting as fallback * rework entire Confluence flow to prevent issues with custom, local, and cloud spaces * remove dep --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> 2024-09-26 01:12:17 +02:00			`spaceKey: source.searchParams.get('spaceKey'),`
[BETA] Live document sync (#1719) * wip bg workers for live document sync * Add ability to re-embed specific documents across many workspaces via background queue bgworkser is gated behind expieremental system setting flag that needs to be explictly enabled UI for watching/unwatching docments that are embedded. TODO: UI to easily manage all bg tasks and see run results TODO: UI to enable this feature and background endpoints to manage it * create frontend views and paths Move elements to correct experimental scope * update migration to delete runs on removal of watched document * Add watch support to YouTube transcripts (#1716) * Add watch support to YouTube transcripts refactor how sync is done for supported types * Watch specific files in Confluence space (#1718) Add failure-prune check for runs * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * dual build update copy of alert modals * update job interval * Add support for live-sync of Github files * update copy for document sync feature * hide Experimental features from UI * update docs links * [FEAT] Implement new settings menu for experimental features (#1735) * implement new settings menu for experimental features * remove unused context save bar --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * dont run job on boot * unset workflow changes * Add persistent encryption service Relay key to collector so persistent encryption can be used Encrypt any private data in chunkSources used for replay during resync jobs * update jsDOC * Linting and organization * update modal copy for feature --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com> 2024-06-21 22:38:50 +02:00			`accessToken: source.searchParams.get('token'),`
			`username: source.searchParams.get('username'),`
			`});`

			if (!success) throw new Error(`Failed to sync Confluence page content. ${reason}`);
			`response.status(200).json({ success, content });`
			`} catch (e) {`
			`console.error(e);`
			`response.status(200).json({`
			`success: false,`
			`content: null,`
			`});`
			`}`
			`}`

			`/**`
			`* Fetches the content of a specific confluence page via its chunkSource.`
			`* Returns the content as a text string of the page in question and only that page.`
			`* @param {object} data - metadata from document (eg: chunkSource)`
			`* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response`
			`*/`
			`async function resyncGithub({ chunkSource }, response) {`
			`if (!chunkSource) throw new Error('Invalid source property provided');`
			`try {`
			// Github file data is `payload` encrypted (might contain PAT). So we need to expand its
			`// encrypted payload back into query params so we can reFetch the page with same access token/params.`
			`const source = response.locals.encryptionWorker.expandPayload(chunkSource);`
GitLab Hosted and Local Connector (#1932) * Add support for GitLab repo collection as well as Github Repo collection * Refactor for repo collectors to be more compact --------- Co-authored-by: Emil Rofors <emirof@gmail.com> 2024-07-23 21:23:51 +02:00			`const { fetchGithubFile } = require("../../utils/extensions/RepoLoader/GithubRepo");`
[BETA] Live document sync (#1719) * wip bg workers for live document sync * Add ability to re-embed specific documents across many workspaces via background queue bgworkser is gated behind expieremental system setting flag that needs to be explictly enabled UI for watching/unwatching docments that are embedded. TODO: UI to easily manage all bg tasks and see run results TODO: UI to enable this feature and background endpoints to manage it * create frontend views and paths Move elements to correct experimental scope * update migration to delete runs on removal of watched document * Add watch support to YouTube transcripts (#1716) * Add watch support to YouTube transcripts refactor how sync is done for supported types * Watch specific files in Confluence space (#1718) Add failure-prune check for runs * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * dual build update copy of alert modals * update job interval * Add support for live-sync of Github files * update copy for document sync feature * hide Experimental features from UI * update docs links * [FEAT] Implement new settings menu for experimental features (#1735) * implement new settings menu for experimental features * remove unused context save bar --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * dont run job on boot * unset workflow changes * Add persistent encryption service Relay key to collector so persistent encryption can be used Encrypt any private data in chunkSources used for replay during resync jobs * update jsDOC * Linting and organization * update modal copy for feature --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com> 2024-06-21 22:38:50 +02:00			`const { success, reason, content } = await fetchGithubFile({`
			repoUrl: `https:${source.pathname}`, // need to add back the real protocol
			`branch: source.searchParams.get('branch'),`
			`accessToken: source.searchParams.get('pat'),`
			`sourceFilePath: source.searchParams.get('path'),`
			`});`

			if (!success) throw new Error(`Failed to sync Github file content. ${reason}`);
			`response.status(200).json({ success, content });`
			`} catch (e) {`
			`console.error(e);`
			`response.status(200).json({`
			`success: false,`
			`content: null,`
			`});`
			`}`
			`}`

			`module.exports = {`
			`link: resyncLink,`
			`youtube: resyncYouTube,`
			`confluence: resyncConfluence,`
			`github: resyncGithub,`
			`}`