mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-11 09:10:13 +01:00
13fb63930b
* Improve RAG responses via source backfilling * Hide irrelevant citations from UI
449 lines
18 KiB
JavaScript
449 lines
18 KiB
JavaScript
const { sourceIdentifier } = require("../../chats");
|
|
const { safeJsonParse } = require("../../http");
|
|
const { TokenManager } = require("../tiktoken");
|
|
const { convertToPromptHistory } = require("./responses");
|
|
|
|
/*
|
|
What is the message Array compressor?
|
|
TLDR: So anyway, i started blasting (your prompts & stuff)
|
|
|
|
messageArrayCompressor arose out of a need for users to be able to insert unlimited token prompts
|
|
and also maintain coherent history, system instructions and context, if applicable.
|
|
|
|
We took an opinionated approach that after much back-testing we have found retained a highly coherent answer
|
|
under most user conditions that a user would take while using this specific system. While other systems may
|
|
use a more advanced model for compressing message history or simplify text through a recursive approach - our is much more simple.
|
|
|
|
We "cannonball" the input.
|
|
Cannonball (verb): To ensure a prompt fits through a model window we blast a hole in the center of any inputs blocking our path to doing so.
|
|
This starts by dissecting the input as tokens and delete from the middle-out bi-directionally until the prompt window is satisfied.
|
|
You may think: "Doesn't this result in massive data loss?" - yes & no.
|
|
Under the use cases we expect the tool to be used, which is mostly chatting with documents, we are able to use this approach with minimal blowback
|
|
on the quality of responses.
|
|
|
|
We accomplish this by taking a rate-limit approach that is proportional to the model capacity. Since we support more than openAI models, this needs to
|
|
be generic and reliance on a "better summary" model just is not a luxury we can afford. The added latency overhead during prompting is also unacceptable.
|
|
In general:
|
|
system: at best 15% of token capacity
|
|
history: at best 15% of token capacity
|
|
prompt: at best 70% of token capacity.
|
|
|
|
we handle overflows by taking an aggressive path for two main cases.
|
|
|
|
1. Very large user prompt
|
|
- Likely uninterested in context, history, or even system prompt. This is a "standalone" prompt that highjacks the whole thread.
|
|
- We run this prompt on its own since a prompt that is over 70% of context window certainly is standalone.
|
|
|
|
2. Context window is exceeded in regular use.
|
|
- We do not touch prompt since it is very likely to be <70% of window.
|
|
- We check system prompt is not outrageous - if it is we cannonball it and keep context if present.
|
|
- We check a sliding window of history, only allowing up to 15% of the history to pass through if it fits, with a
|
|
preference for recent history if we can cannonball to fit it, otherwise it is omitted.
|
|
|
|
We end up with a rather large prompt that fits through a given window with a lot of room for response in most use-cases.
|
|
We also take the approach that history is the least important and most flexible of the items in this array of responses.
|
|
|
|
There is a supplemental version of this function that also returns a formatted string for models like Claude-2
|
|
*/
|
|
|
|
async function messageArrayCompressor(llm, messages = [], rawHistory = []) {
|
|
// assume the response will be at least 600 tokens. If the total prompt + reply is over we need to proactively
|
|
// run the compressor to ensure the prompt has enough space to reply.
|
|
// realistically - most users will not be impacted by this.
|
|
const tokenBuffer = 600;
|
|
const tokenManager = new TokenManager(llm.model);
|
|
// If no work needs to be done, just pass through.
|
|
if (tokenManager.statsFrom(messages) + tokenBuffer < llm.promptWindowLimit())
|
|
return messages;
|
|
|
|
const system = messages.shift();
|
|
const user = messages.pop();
|
|
const userPromptSize = tokenManager.countFromString(user.content);
|
|
|
|
// User prompt is the main focus here - we we prioritize it and allow
|
|
// it to highjack the entire conversation thread. We are going to
|
|
// cannonball the prompt through to ensure the reply has at least 20% of
|
|
// the token supply to reply with.
|
|
if (userPromptSize > llm.limits.user) {
|
|
return [
|
|
{
|
|
role: "user",
|
|
content: cannonball({
|
|
input: user.content,
|
|
targetTokenSize: llm.promptWindowLimit() * 0.8,
|
|
tiktokenInstance: tokenManager,
|
|
}),
|
|
},
|
|
];
|
|
}
|
|
|
|
const compressedSystem = new Promise(async (resolve) => {
|
|
const count = tokenManager.countFromString(system.content);
|
|
if (count < llm.limits.system) {
|
|
resolve(system);
|
|
return;
|
|
}
|
|
|
|
// Split context from system prompt - cannonball since its over the window.
|
|
// We assume the context + user prompt is enough tokens to fit.
|
|
const [prompt, context = ""] = system.content.split("Context:");
|
|
let compressedPrompt;
|
|
let compressedContext;
|
|
|
|
// If the user system prompt contribution's to the system prompt is more than
|
|
// 25% of the system limit, we will cannonball it - this favors the context
|
|
// over the instruction from the user.
|
|
if (tokenManager.countFromString(prompt) >= llm.limits.system * 0.25) {
|
|
compressedPrompt = cannonball({
|
|
input: prompt,
|
|
targetTokenSize: llm.limits.system * 0.25,
|
|
tiktokenInstance: tokenManager,
|
|
});
|
|
} else {
|
|
compressedPrompt = prompt;
|
|
}
|
|
|
|
if (tokenManager.countFromString(context) >= llm.limits.system * 0.75) {
|
|
compressedContext = cannonball({
|
|
input: context,
|
|
targetTokenSize: llm.limits.system * 0.75,
|
|
tiktokenInstance: tokenManager,
|
|
});
|
|
} else {
|
|
compressedContext = context;
|
|
}
|
|
|
|
system.content = `${compressedPrompt}${
|
|
compressedContext ? `\nContext: ${compressedContext}` : ""
|
|
}`;
|
|
resolve(system);
|
|
});
|
|
|
|
// Prompt is allowed to take up to 70% of window - we know its under
|
|
// if we are here, so passthrough.
|
|
const compressedPrompt = new Promise(async (resolve) => resolve(user));
|
|
|
|
// We always aggressively compress history because it is the least
|
|
// important data to retain in full-fidelity.
|
|
const compressedHistory = new Promise((resolve) => {
|
|
const eligibleHistoryItems = [];
|
|
var historyTokenCount = 0;
|
|
|
|
for (const [i, history] of rawHistory.reverse().entries()) {
|
|
const [user, assistant] = convertToPromptHistory([history]);
|
|
const [userTokens, assistantTokens] = [
|
|
tokenManager.countFromString(user.content),
|
|
tokenManager.countFromString(assistant.content),
|
|
];
|
|
const total = userTokens + assistantTokens;
|
|
|
|
// If during the loop the token cost of adding this history
|
|
// is small, we can add it to history and move onto next.
|
|
if (historyTokenCount + total < llm.limits.history) {
|
|
eligibleHistoryItems.unshift(user, assistant);
|
|
historyTokenCount += total;
|
|
continue;
|
|
}
|
|
|
|
// If we reach here the overhead of adding this history item will
|
|
// be too much of the limit. So now, we are prioritizing
|
|
// the most recent 3 message pairs - if we are already past those - exit loop and stop
|
|
// trying to make history work.
|
|
if (i > 2) break;
|
|
|
|
// We are over the limit and we are within the first 3 most recent chats.
|
|
// so now we cannonball them to make them fit into the window.
|
|
// max size = llm.limit.history; Each component of the message, can at most
|
|
// be 50% of the history. We cannonball whichever is the problem.
|
|
// The math isnt perfect for tokens, so we have to add a fudge factor for safety.
|
|
const maxTargetSize = Math.floor(llm.limits.history / 2.2);
|
|
if (userTokens > maxTargetSize) {
|
|
user.content = cannonball({
|
|
input: user.content,
|
|
targetTokenSize: maxTargetSize,
|
|
tiktokenInstance: tokenManager,
|
|
});
|
|
}
|
|
|
|
if (assistantTokens > maxTargetSize) {
|
|
assistant.content = cannonball({
|
|
input: assistant.content,
|
|
targetTokenSize: maxTargetSize,
|
|
tiktokenInstance: tokenManager,
|
|
});
|
|
}
|
|
|
|
const newTotal = tokenManager.statsFrom([user, assistant]);
|
|
if (historyTokenCount + newTotal > llm.limits.history) continue;
|
|
eligibleHistoryItems.unshift(user, assistant);
|
|
historyTokenCount += newTotal;
|
|
}
|
|
resolve(eligibleHistoryItems);
|
|
});
|
|
|
|
const [cSystem, cHistory, cPrompt] = await Promise.all([
|
|
compressedSystem,
|
|
compressedHistory,
|
|
compressedPrompt,
|
|
]);
|
|
return [cSystem, ...cHistory, cPrompt];
|
|
}
|
|
|
|
// Implementation of messageArrayCompressor, but for string only completion models
|
|
async function messageStringCompressor(llm, promptArgs = {}, rawHistory = []) {
|
|
const tokenBuffer = 600;
|
|
const tokenManager = new TokenManager(llm.model);
|
|
const initialPrompt = llm.constructPrompt(promptArgs);
|
|
if (
|
|
tokenManager.statsFrom(initialPrompt) + tokenBuffer <
|
|
llm.promptWindowLimit()
|
|
)
|
|
return initialPrompt;
|
|
|
|
const system = promptArgs.systemPrompt;
|
|
const user = promptArgs.userPrompt;
|
|
const userPromptSize = tokenManager.countFromString(user);
|
|
|
|
// User prompt is the main focus here - we we prioritize it and allow
|
|
// it to highjack the entire conversation thread. We are going to
|
|
// cannonball the prompt through to ensure the reply has at least 20% of
|
|
// the token supply to reply with.
|
|
if (userPromptSize > llm.limits.user) {
|
|
return llm.constructPrompt({
|
|
userPrompt: cannonball({
|
|
input: user,
|
|
targetTokenSize: llm.promptWindowLimit() * 0.8,
|
|
tiktokenInstance: tokenManager,
|
|
}),
|
|
});
|
|
}
|
|
|
|
const compressedSystem = new Promise(async (resolve) => {
|
|
const count = tokenManager.countFromString(system);
|
|
if (count < llm.limits.system) {
|
|
resolve(system);
|
|
return;
|
|
}
|
|
resolve(
|
|
cannonball({
|
|
input: system,
|
|
targetTokenSize: llm.limits.system,
|
|
tiktokenInstance: tokenManager,
|
|
})
|
|
);
|
|
});
|
|
|
|
// Prompt is allowed to take up to 70% of window - we know its under
|
|
// if we are here, so passthrough.
|
|
const compressedPrompt = new Promise(async (resolve) => resolve(user));
|
|
|
|
// We always aggressively compress history because it is the least
|
|
// important data to retain in full-fidelity.
|
|
const compressedHistory = new Promise((resolve) => {
|
|
const eligibleHistoryItems = [];
|
|
var historyTokenCount = 0;
|
|
|
|
for (const [i, history] of rawHistory.reverse().entries()) {
|
|
const [user, assistant] = convertToPromptHistory([history]);
|
|
const [userTokens, assistantTokens] = [
|
|
tokenManager.countFromString(user.content),
|
|
tokenManager.countFromString(assistant.content),
|
|
];
|
|
const total = userTokens + assistantTokens;
|
|
|
|
// If during the loop the token cost of adding this history
|
|
// is small, we can add it to history and move onto next.
|
|
if (historyTokenCount + total < llm.limits.history) {
|
|
eligibleHistoryItems.unshift(user, assistant);
|
|
historyTokenCount += total;
|
|
continue;
|
|
}
|
|
|
|
// If we reach here the overhead of adding this history item will
|
|
// be too much of the limit. So now, we are prioritizing
|
|
// the most recent 3 message pairs - if we are already past those - exit loop and stop
|
|
// trying to make history work.
|
|
if (i > 2) break;
|
|
|
|
// We are over the limit and we are within the first 3 most recent chats.
|
|
// so now we cannonball them to make them fit into the window.
|
|
// max size = llm.limit.history; Each component of the message, can at most
|
|
// be 50% of the history. We cannonball whichever is the problem.
|
|
// The math isnt perfect for tokens, so we have to add a fudge factor for safety.
|
|
const maxTargetSize = Math.floor(llm.limits.history / 2.2);
|
|
if (userTokens > maxTargetSize) {
|
|
user.content = cannonball({
|
|
input: user.content,
|
|
targetTokenSize: maxTargetSize,
|
|
tiktokenInstance: tokenManager,
|
|
});
|
|
}
|
|
|
|
if (assistantTokens > maxTargetSize) {
|
|
assistant.content = cannonball({
|
|
input: assistant.content,
|
|
targetTokenSize: maxTargetSize,
|
|
tiktokenInstance: tokenManager,
|
|
});
|
|
}
|
|
|
|
const newTotal = tokenManager.statsFrom([user, assistant]);
|
|
if (historyTokenCount + newTotal > llm.limits.history) continue;
|
|
eligibleHistoryItems.unshift(user, assistant);
|
|
historyTokenCount += newTotal;
|
|
}
|
|
resolve(eligibleHistoryItems);
|
|
});
|
|
|
|
const [cSystem, cHistory, cPrompt] = await Promise.all([
|
|
compressedSystem,
|
|
compressedHistory,
|
|
compressedPrompt,
|
|
]);
|
|
|
|
return llm.constructPrompt({
|
|
systemPrompt: cSystem,
|
|
contextTexts: promptArgs?.contextTexts || [],
|
|
chatHistory: cHistory,
|
|
userPrompt: cPrompt,
|
|
});
|
|
}
|
|
|
|
// Cannonball prompting: aka where we shoot a proportionally big cannonball through a proportional large prompt
|
|
// Nobody should be sending prompts this big, but there is no reason we shouldn't allow it if results are good even by doing it.
|
|
function cannonball({
|
|
input = "",
|
|
targetTokenSize = 0,
|
|
tiktokenInstance = null,
|
|
ellipsesStr = null,
|
|
}) {
|
|
if (!input || !targetTokenSize) return input;
|
|
const tokenManager = tiktokenInstance || new TokenManager();
|
|
const truncText = ellipsesStr || "\n\n--prompt truncated for brevity--\n\n";
|
|
const initialInputSize = tokenManager.countFromString(input);
|
|
if (initialInputSize < targetTokenSize) return input;
|
|
|
|
// if the delta is the token difference between where our prompt is in size
|
|
// and where we ideally need to land.
|
|
const delta = initialInputSize - targetTokenSize;
|
|
const tokenChunks = tokenManager.tokensFromString(input);
|
|
const middleIdx = Math.floor(tokenChunks.length / 2);
|
|
|
|
// middle truncate the text going left and right of midpoint
|
|
const leftChunks = tokenChunks.slice(0, middleIdx - Math.round(delta / 2));
|
|
const rightChunks = tokenChunks.slice(middleIdx + Math.round(delta / 2));
|
|
const truncatedText =
|
|
tokenManager.bytesFromTokens(leftChunks) +
|
|
truncText +
|
|
tokenManager.bytesFromTokens(rightChunks);
|
|
|
|
console.log(
|
|
`Cannonball results ${initialInputSize} -> ${tokenManager.countFromString(
|
|
truncatedText
|
|
)} tokens.`
|
|
);
|
|
return truncatedText;
|
|
}
|
|
|
|
/**
|
|
* Fill the sources window with the priority of
|
|
* 1. Pinned documents (handled prior to function)
|
|
* 2. VectorSearch results
|
|
* 3. prevSources in chat history - starting from most recent.
|
|
*
|
|
* Ensuring the window always has the desired amount of sources so that followup questions
|
|
* in any chat mode have relevant sources, but not infinite sources. This function is used during chatting
|
|
* and allows follow-up questions within a query chat that otherwise would have zero sources and would fail.
|
|
* The added benefit is that during regular RAG chat, we have better coherence of citations that otherwise would
|
|
* also yield no results with no need for a ReRanker to run and take much longer to return a response.
|
|
*
|
|
* The side effect of this is follow-up unrelated questions now have citations that would look totally irrelevant, however
|
|
* we would rather optimize on the correctness of a response vs showing extraneous sources during a response. Given search
|
|
* results always take a priority a good unrelated question that produces RAG results will still function as desired and due to previous
|
|
* history backfill sources "changing context" mid-chat is handled appropriately.
|
|
* example:
|
|
* ---previous implementation---
|
|
* prompt 1: "What is anythingllm?" -> possibly get 4 good sources
|
|
* prompt 2: "Tell me some features" -> possible get 0 - 1 maybe relevant source + previous answer response -> bad response due to bad context mgmt
|
|
* ---next implementation---
|
|
* prompt 1: "What is anythingllm?" -> possibly get 4 good sources
|
|
* prompt 2: "Tell me some features" -> possible get 0 - 1 maybe relevant source + previous answer response -> backfill with 3 good sources from previous -> much better response
|
|
*
|
|
* @param {Object} config - params to call
|
|
* @param {object} config.nDocs = fill size of the window
|
|
* @param {object} config.searchResults = vector similarityResponse results for .sources
|
|
* @param {object[]} config.history - rawHistory of chat containing sources
|
|
* @param {string[]} config.filterIdentifiers - Pinned document identifiers to prevent duplicate context
|
|
* @returns {{
|
|
* contextTexts: string[],
|
|
* sources: object[],
|
|
* }} - Array of sources that should be added to window
|
|
*/
|
|
function fillSourceWindow({
|
|
nDocs = 4, // Number of documents
|
|
searchResults = [], // Sources from similarity search
|
|
history = [], // Raw history
|
|
filterIdentifiers = [], // pinned document sources
|
|
} = config) {
|
|
const sources = [...searchResults];
|
|
|
|
if (sources.length >= nDocs || history.length === 0) {
|
|
return {
|
|
sources,
|
|
contextTexts: sources.map((src) => src.text),
|
|
};
|
|
}
|
|
|
|
const log = (text, ...args) => {
|
|
console.log(`\x1b[36m[fillSourceWindow]\x1b[0m ${text}`, ...args);
|
|
};
|
|
|
|
log(
|
|
`Need to backfill ${nDocs - searchResults.length} chunks to fill in the source window for RAG!`
|
|
);
|
|
const seenChunks = new Set(searchResults.map((source) => source.id));
|
|
|
|
// We need to reverse again because we need to iterate from bottom of array (most recent chats)
|
|
// Looking at this function by itself you may think that this loop could be extreme for long history chats,
|
|
// but this was already handled where `history` we derived. This comes from `recentChatHistory` which
|
|
// includes a limit for history (default: 20). So this loop does not look as extreme as on first glance.
|
|
for (const chat of history.reverse()) {
|
|
if (sources.length >= nDocs) {
|
|
log(
|
|
`Citations backfilled to ${nDocs} references from ${searchResults.length} original citations.`
|
|
);
|
|
break;
|
|
}
|
|
|
|
const chatSources =
|
|
safeJsonParse(chat.response, { sources: [] })?.sources || [];
|
|
if (!chatSources?.length || !Array.isArray(chatSources)) continue;
|
|
|
|
const validSources = chatSources.filter((source) => {
|
|
return (
|
|
filterIdentifiers.includes(sourceIdentifier(source)) == false && // source cannot be in current pins
|
|
source.hasOwnProperty("score") && // source cannot have come from a pinned document that was previously pinned
|
|
source.hasOwnProperty("text") && // source has a valid text property we can use
|
|
seenChunks.has(source.id) == false // is unique
|
|
);
|
|
});
|
|
|
|
for (const validSource of validSources) {
|
|
if (sources.length >= nDocs) break;
|
|
sources.push(validSource);
|
|
seenChunks.add(validSource.id);
|
|
}
|
|
}
|
|
|
|
return {
|
|
sources,
|
|
contextTexts: sources.map((src) => src.text),
|
|
};
|
|
}
|
|
|
|
module.exports = {
|
|
messageArrayCompressor,
|
|
messageStringCompressor,
|
|
fillSourceWindow,
|
|
};
|