Improve RAG responses via source backfilling (#1477)

* Improve RAG responses via source backfilling * Hide irrelevant citations from UI
2024-11-10 17:00:11 +01:00 · 2024-05-23 11:56:57 -05:00 · 2024-05-23 11:56:57 -05:00 · 13fb63930b
commit 13fb63930b
parent cc7e7fb3ac
3 changed files with 135 additions and 17 deletions
--- a/server/utils/chats/index.js
+++ b/server/utils/chats/index.js
@ -151,16 +151,27 @@ async function chatWithWorkspace(
    };
  }
-  contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts];
+  const { fillSourceWindow } = require("../helpers/chat");
  const filledSources = fillSourceWindow({
    nDocs: workspace?.topN || 4,
    searchResults: vectorSearchResults.sources,
    history: rawHistory,
    filterIdentifiers: pinnedDocIdentifiers,
  });
  // Why does contextTexts get all the info, but sources only get the current search results?
  // This lets the LLM "comprehend" a contextual response without populating the Citations under
  // a response with documents the user "thinks" are irrelevant - a side effect of how we backfill
  // the context to keep the LLM's responses more correct across a chat.
  // If a past citation was used to answer the question, it is already visible in the history, so it logically makes sense
  // and it does not appear to the user that a new response used information that is otherwise irrelevant for the given prompt.
  // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keeping answers highly accurate.
  contextTexts = [...contextTexts, ...filledSources.contextTexts];
  sources = [...sources, ...vectorSearchResults.sources];
-  // If in query mode and no sources are found from the vector search and no pinned documents, do not
+  // If in query mode and no context chunks are found from search, backfill, or pins -  do not
  // let the LLM try to hallucinate a response or use general knowledge and exit early
-  if (
+  if (chatMode === "query" && contextTexts.length === 0) {
    chatMode === "query" &&
    vectorSearchResults.sources.length === 0 &&
    pinnedDocIdentifiers.length === 0
  ) {
    return {
      id: uuid,
      type: "textResponse",
@ -224,9 +235,7 @@ async function recentChatHistory({
  workspace,
  thread = null,
  messageLimit = 20,
  chatMode = null,
 }) {
  if (chatMode === "query") return { rawHistory: [], chatHistory: [] };
  const rawHistory = (
    await WorkspaceChats.where(
      {
--- a/server/utils/chats/stream.js
+++ b/server/utils/chats/stream.js
@ -100,7 +100,6 @@ async function streamChatWithWorkspace(
    workspace,
    thread,
    messageLimit,
    chatMode,
  });
  // Look for pinned documents and see if the user decided to use this feature. We will also do a vector search
@ -157,16 +156,27 @@ async function streamChatWithWorkspace(
    return;
  }
-  contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts];
+  const { fillSourceWindow } = require("../helpers/chat");
  const filledSources = fillSourceWindow({
    nDocs: workspace?.topN || 4,
    searchResults: vectorSearchResults.sources,
    history: rawHistory,
    filterIdentifiers: pinnedDocIdentifiers,
  });
  // Why does contextTexts get all the info, but sources only get the current search results?
  // This lets the LLM "comprehend" a contextual response without populating the Citations under
  // a response with documents the user "thinks" are irrelevant - a side effect of how we backfill
  // the context to keep the LLM's responses more correct across a chat.
  // If a past citation was used to answer the question, it is already visible in the history, so it logically makes sense
  // and it does not appear to the user that a new response used information that is otherwise irrelevant for the given prompt.
  // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keeping answers highly accurate.
  contextTexts = [...contextTexts, ...filledSources.contextTexts];
  sources = [...sources, ...vectorSearchResults.sources];
-  // If in query mode and no sources are found from the vector search and no pinned documents, do not
+  // If in query mode and no context chunks are found from search, backfill, or pins -  do not
  // let the LLM try to hallucinate a response or use general knowledge and exit early
-  if (
+  if (chatMode === "query" && contextTexts.length === 0) {
    chatMode === "query" &&
    sources.length === 0 &&
    pinnedDocIdentifiers.length === 0
  ) {
    writeResponseChunk(response, {
      id: uuid,
      type: "textResponse",
--- a/server/utils/helpers/chat/index.js
+++ b/server/utils/helpers/chat/index.js
@ -1,3 +1,5 @@
 const { sourceIdentifier } = require("../../chats");
 const { safeJsonParse } = require("../../http");
 const { TokenManager } = require("../tiktoken");
 const { convertToPromptHistory } = require("./responses");
@ -343,7 +345,104 @@ function cannonball({
  return truncatedText;
 }
 /**
  * Fill the sources window with the priority of
  * 1. Pinned documents (handled prior to this function)
  * 2. VectorSearch results
  * 3. Previous sources found in the chat history - starting from the most recent chat.
  *
  * This ensures the window always has the desired amount of sources so that follow-up questions
  * in any chat mode have relevant sources, but not infinite sources. It is used during chatting
  * and allows follow-up questions within a query chat that otherwise would have zero sources and would fail.
  * The added benefit is that during regular RAG chat we get better coherence of citations that otherwise
  * would yield no results, with no need for a ReRanker to run and take much longer to return a response.
  *
  * The side effect is that unrelated follow-up questions may now carry sources that look totally irrelevant,
  * however we would rather optimize for the correctness of a response than avoid showing extraneous sources.
  * Since search results always take priority, a good unrelated question that produces RAG results will still
  * function as desired, and because history is only used as backfill, "changing context" mid-chat is handled.
  *
  * example:
  * ---previous implementation---
  * prompt 1: "What is anythingllm?" -> possibly get 4 good sources
  * prompt 2: "Tell me some features" -> possibly get 0 - 1 maybe relevant source + previous answer response -> bad response due to bad context mgmt
  * ---next implementation---
  * prompt 1: "What is anythingllm?" -> possibly get 4 good sources
  * prompt 2: "Tell me some features" -> possibly get 0 - 1 maybe relevant source + previous answer response -> backfill with 3 good sources from previous -> much better response
  *
  * Neither `searchResults` nor `history` is mutated by this function.
  *
  * @param {Object} [config] - params to call; every field is optional
  * @param {number} [config.nDocs=4] - fill size of the window
  * @param {object[]} [config.searchResults=[]] - vector similarityResponse results for .sources
  * @param {object[]} [config.history=[]] - rawHistory of chat; each item's `response` is a JSON string that may contain `sources`
  * @param {string[]} [config.filterIdentifiers=[]] - Pinned document identifiers to prevent duplicate context
  * @returns {{
  *   contextTexts: string[],
  *   sources: object[],
  * }} - The sources that should be in the window and their `text` values, in the same order
  */
 function fillSourceWindow({
   nDocs = 4, // Number of documents
   searchResults = [], // Sources from similarity search
   history = [], // Raw history
   filterIdentifiers = [], // pinned document sources
 } = {}) {
   const sources = [...searchResults];
   const buildResult = () => ({
     sources,
     contextTexts: sources.map((src) => src.text),
   });

   // Nothing to backfill: the search already filled the window or there is no history to pull from.
   if (sources.length >= nDocs || history.length === 0) return buildResult();

   const log = (text, ...args) => {
     console.log(`\x1b[36m[fillSourceWindow]\x1b[0m ${text}`, ...args);
   };
   log(
     `Need to backfill ${nDocs - searchResults.length} chunks to fill in the source window for RAG!`
   );

   const pinnedIdentifiers = new Set(filterIdentifiers);
   const seenChunks = new Set(searchResults.map((source) => source.id));
   // Sources come from parsed JSON, so never call methods directly on them.
   const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

   // Walk a reversed COPY so we start at the most recent chat without reordering the caller's array.
   // This loop is bounded: per the original author's note, `history` comes from `recentChatHistory`,
   // which limits the number of chats returned (default: 20).
   for (const chat of [...history].reverse()) {
     if (sources.length >= nDocs) break;

     const chatSources = safeJsonParse(chat.response, { sources: [] })?.sources;
     if (!Array.isArray(chatSources) || chatSources.length === 0) continue;

     for (const source of chatSources) {
       if (sources.length >= nDocs) break;
       if (source === null || typeof source !== "object") continue; // malformed entry
       if (!hasOwn(source, "score")) continue; // source cannot have come from a previously pinned document
       if (!hasOwn(source, "text")) continue; // source needs a text property we can use as context
       if (seenChunks.has(source.id)) continue; // must be unique within the window
       if (pinnedIdentifiers.has(sourceIdentifier(source))) continue; // source cannot be in current pins

       sources.push(source);
       seenChunks.add(source.id);
     }
   }

   if (sources.length >= nDocs) {
     log(
       `Citations backfilled to ${nDocs} references from ${searchResults.length} original citations.`
     );
   }

   return buildResult();
 }
 // CommonJS exports for this chat-helper module. Of these, only fillSourceWindow is
 // defined in the visible part of the file; the two compressors are presumably
 // defined earlier in the file - verify before renaming or removing them.
 module.exports = {
  messageArrayCompressor,
  messageStringCompressor,
  fillSourceWindow,
 };