anything-llm/server/utils/helpers/chat/index.js
const { convertToPromptHistory } = require("../../chats");
const { TokenManager } = require("../tiktoken");

/*
What is the message Array compressor?
TLDR: So anyway, I started blasting (your prompts & stuff)

messageArrayCompressor arose out of the need for users to be able to insert unlimited-token prompts
while also maintaining coherent history, system instructions, and context, if applicable.

We took an opinionated approach that, after much back-testing, we found retains a highly coherent answer
under most conditions a user would encounter while using this specific system. While other systems may
use a more advanced model for compressing message history, or simplify text through a recursive approach, ours is much simpler.

We "cannonball" the input.
Cannonball (verb): To ensure a prompt fits through a model window, we blast a hole in the center of any inputs blocking our path to doing so.
This starts by dissecting the input into tokens and deleting from the middle outward, bi-directionally, until the prompt window is satisfied.

You may think: "Doesn't this result in massive data loss?" - yes & no.
Under the use cases we expect the tool to be used for, which is mostly chatting with documents, we are able to take this approach with minimal blowback
on the quality of responses.

We accomplish this by taking a rate-limit approach that is proportional to the model capacity. Since we support more than OpenAI models, this needs to
be generic, and relying on a "better summary" model is not a luxury we can afford. The added latency overhead during prompting is also unacceptable.

In general:
system: at most 15% of token capacity
history: at most 15% of token capacity
prompt: at most 70% of token capacity

We handle overflows by taking an aggressive path for two main cases.
1. Very large user prompt
  - The user is likely uninterested in context, history, or even the system prompt. This is a "standalone" prompt that hijacks the whole thread.
  - We run this prompt on its own, since a prompt that is over 70% of the context window is certainly standalone.
2. Context window is exceeded in regular use.
  - We do not touch the prompt since it is very likely to be <70% of the window.
  - We check that the system prompt is not outrageously large - if it is, we cannonball it and keep the context if present.
  - We check a sliding window of history, only letting history through while it fits within 15% of the window, with a
    preference for recent history if we can cannonball it to fit; otherwise it is omitted.

We end up with a rather large prompt that fits through a given window with plenty of room for a response in most use cases.
We also take the position that history is the least important and most flexible item in this message array.

There is a supplemental version of this function that instead returns a formatted string for models like Claude-2.
*/
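
/*
Illustrative sketch (not part of this module): the `llm` connectors that call into these
helpers are assumed to expose a prompt window size plus the 15/15/70 budget split described
above via `promptWindowLimit()` and `limits`. The class name, model name, and window size
below are assumptions for illustration only.

class ExampleLLMConnector {
  constructor(model = "example-model") {
    this.model = model;
  }
  promptWindowLimit() {
    return 4096; // assumed context window for the example model
  }
  get limits() {
    return {
      system: Math.floor(this.promptWindowLimit() * 0.15), // ~15% for system prompt
      history: Math.floor(this.promptWindowLimit() * 0.15), // ~15% for chat history
      user: Math.floor(this.promptWindowLimit() * 0.7), // ~70% for the user prompt
    };
  }
}
*/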

async function messageArrayCompressor(llm, messages = [], rawHistory = []) {
  // Assume the response will need at least 600 tokens. If the total prompt + reply would exceed
  // the window, we proactively run the compressor to ensure the prompt leaves enough space for the reply.
  // Realistically - most users will not be impacted by this.
  const tokenBuffer = 600;
  const tokenManager = new TokenManager(llm.model);

  // If no work needs to be done, just pass through.
  if (tokenManager.statsFrom(messages) + tokenBuffer < llm.promptWindowLimit())
    return messages;

  const system = messages.shift();
  const user = messages.pop();
  const userPromptSize = tokenManager.countFromString(user.content);

  // The user prompt is the main focus here - we prioritize it and allow
  // it to hijack the entire conversation thread. We are going to
  // cannonball the prompt through to ensure the reply still has at least 20% of
  // the token supply to respond with.
  if (userPromptSize > llm.limits.user) {
    return [
      {
        role: "user",
        content: cannonball({
          input: user.content,
          targetTokenSize: llm.promptWindowLimit() * 0.8,
          tiktokenInstance: tokenManager,
        }),
      },
    ];
  }

  const compressedSystem = new Promise(async (resolve) => {
    const count = tokenManager.countFromString(system.content);
    if (count < llm.limits.system) {
      resolve(system);
      return;
    }

    // Split context from the system prompt - cannonball it since it is over the window.
    // We assume the context + user prompt will still fit in the remaining tokens.
    const [prompt, context = ""] = system.content.split("Context:");
    system.content = `${cannonball({
      input: prompt,
      targetTokenSize: llm.limits.system,
      tiktokenInstance: tokenManager,
    })}${context ? `\nContext: ${context}` : ""}`;
    resolve(system);
  });

  // The prompt is allowed to take up to 70% of the window - we know it is under
  // that if we are here, so pass it through.
  const compressedPrompt = new Promise(async (resolve) => resolve(user));

  // We always aggressively compress history because it is the least
  // important data to retain in full fidelity.
  const compressedHistory = new Promise((resolve) => {
    const eligibleHistoryItems = [];
    var historyTokenCount = 0;

    for (const [i, history] of rawHistory.reverse().entries()) {
      const [user, assistant] = convertToPromptHistory([history]);
      const [userTokens, assistantTokens] = [
        tokenManager.countFromString(user.content),
        tokenManager.countFromString(assistant.content),
      ];
      const total = userTokens + assistantTokens;

      // If, during the loop, the token cost of adding this history item
      // is small, we can add it to history and move on to the next.
      if (historyTokenCount + total < llm.limits.history) {
        eligibleHistoryItems.unshift(user, assistant);
        historyTokenCount += total;
        continue;
      }

      // If we reach here, the overhead of adding this history item would
      // push us over the limit. So now we prioritize
      // the most recent 3 message pairs - if we are already past those, exit the loop and stop
      // trying to make history fit.
      if (i > 2) break;

      // We are over the limit and we are within the 3 most recent chats,
      // so we cannonball them to make them fit into the window.
      // Max size = llm.limits.history; each component of the message pair can be at most
      // 50% of the history budget. We cannonball whichever side is the problem.
      // The math isn't perfect for tokens, so we add a fudge factor for safety.
      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
      if (userTokens > maxTargetSize) {
        user.content = cannonball({
          input: user.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }
      if (assistantTokens > maxTargetSize) {
        assistant.content = cannonball({
          input: assistant.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }

      const newTotal = tokenManager.statsFrom([user, assistant]);
      if (historyTokenCount + newTotal > llm.limits.history) continue;
      eligibleHistoryItems.unshift(user, assistant);
      historyTokenCount += newTotal;
    }
    resolve(eligibleHistoryItems);
  });

  const [cSystem, cHistory, cPrompt] = await Promise.all([
    compressedSystem,
    compressedHistory,
    compressedPrompt,
  ]);
  return [cSystem, ...cHistory, cPrompt];
}
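
/*
Illustrative usage (assumed caller, not part of this module): a chat-style connector
could run its message array through the compressor right before dispatching to its API.
`llm.client.chatCompletion` and `sendChat` are hypothetical names used only for illustration.

async function sendChat(llm, messages, rawHistory) {
  // Compress system prompt, history, and user prompt so the request fits the model window.
  const safeMessages = await messageArrayCompressor(llm, messages, rawHistory);
  return await llm.client.chatCompletion({
    model: llm.model,
    messages: safeMessages,
  });
}
*/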

// Implementation of messageArrayCompressor, but for string-only completion models.
async function messageStringCompressor(llm, promptArgs = {}, rawHistory = []) {
  const tokenBuffer = 600;
  const tokenManager = new TokenManager(llm.model);
  const initialPrompt = llm.constructPrompt(promptArgs);
  if (
    tokenManager.statsFrom(initialPrompt) + tokenBuffer <
    llm.promptWindowLimit()
  )
    return initialPrompt;

  const system = promptArgs.systemPrompt;
  const user = promptArgs.userPrompt;
  const userPromptSize = tokenManager.countFromString(user);

  // The user prompt is the main focus here - we prioritize it and allow
  // it to hijack the entire conversation thread. We are going to
  // cannonball the prompt through to ensure the reply still has at least 20% of
  // the token supply to respond with.
  if (userPromptSize > llm.limits.user) {
    return llm.constructPrompt({
      userPrompt: cannonball({
        input: user,
        targetTokenSize: llm.promptWindowLimit() * 0.8,
        tiktokenInstance: tokenManager,
      }),
    });
  }

  const compressedSystem = new Promise(async (resolve) => {
    const count = tokenManager.countFromString(system);
    if (count < llm.limits.system) {
      resolve(system);
      return;
    }
    resolve(
      cannonball({
        input: system,
        targetTokenSize: llm.limits.system,
        tiktokenInstance: tokenManager,
      })
    );
  });

  // The prompt is allowed to take up to 70% of the window - we know it is under
  // that if we are here, so pass it through.
  const compressedPrompt = new Promise(async (resolve) => resolve(user));

  // We always aggressively compress history because it is the least
  // important data to retain in full fidelity.
  const compressedHistory = new Promise((resolve) => {
    const eligibleHistoryItems = [];
    var historyTokenCount = 0;

    for (const [i, history] of rawHistory.reverse().entries()) {
      const [user, assistant] = convertToPromptHistory([history]);
      const [userTokens, assistantTokens] = [
        tokenManager.countFromString(user.content),
        tokenManager.countFromString(assistant.content),
      ];
      const total = userTokens + assistantTokens;

      // If, during the loop, the token cost of adding this history item
      // is small, we can add it to history and move on to the next.
      if (historyTokenCount + total < llm.limits.history) {
        eligibleHistoryItems.unshift(user, assistant);
        historyTokenCount += total;
        continue;
      }

      // If we reach here, the overhead of adding this history item would
      // push us over the limit. So now we prioritize
      // the most recent 3 message pairs - if we are already past those, exit the loop and stop
      // trying to make history fit.
      if (i > 2) break;

      // We are over the limit and we are within the 3 most recent chats,
      // so we cannonball them to make them fit into the window.
      // Max size = llm.limits.history; each component of the message pair can be at most
      // 50% of the history budget. We cannonball whichever side is the problem.
      // The math isn't perfect for tokens, so we add a fudge factor for safety.
      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
      if (userTokens > maxTargetSize) {
        user.content = cannonball({
          input: user.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }
      if (assistantTokens > maxTargetSize) {
        assistant.content = cannonball({
          input: assistant.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }

      const newTotal = tokenManager.statsFrom([user, assistant]);
      if (historyTokenCount + newTotal > llm.limits.history) continue;
      eligibleHistoryItems.unshift(user, assistant);
      historyTokenCount += newTotal;
    }
    resolve(eligibleHistoryItems);
  });

  const [cSystem, cHistory, cPrompt] = await Promise.all([
    compressedSystem,
    compressedHistory,
    compressedPrompt,
  ]);
  return llm.constructPrompt({
    systemPrompt: cSystem,
    contextTexts: promptArgs?.contextTexts || [],
    chatHistory: cHistory,
    userPrompt: cPrompt,
  });
}
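
/*
Illustrative usage (assumed caller): a string-completion connector (e.g. a Claude-style
model) would compress the constructed prompt string before completing. `llm.client.complete`
and `sendCompletion` are hypothetical names used only for illustration.

async function sendCompletion(llm, promptArgs, rawHistory) {
  // Returns a single prompt string that fits the model window.
  const prompt = await messageStringCompressor(llm, promptArgs, rawHistory);
  return await llm.client.complete({ model: llm.model, prompt });
}
*/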

// Cannonball prompting: aka where we shoot a proportionally big cannonball through a proportionally large prompt.
// Nobody should be sending prompts this big, but there is no reason we shouldn't allow it if the results are still good.
function cannonball({
  input = "",
  targetTokenSize = 0,
  tiktokenInstance = null,
  ellipsesStr = null,
}) {
  if (!input || !targetTokenSize) return input;
  const tokenManager = tiktokenInstance || new TokenManager();
  const truncText = ellipsesStr || "\n\n--prompt truncated for brevity--\n\n";
  const initialInputSize = tokenManager.countFromString(input);
  if (initialInputSize < targetTokenSize) return input;

  // The delta is the token difference between the prompt's current size
  // and where we ideally need to land.
  const delta = initialInputSize - targetTokenSize;
  const tokenChunks = tokenManager.tokensFromString(input);
  const middleIdx = Math.floor(tokenChunks.length / 2);

  // Middle-truncate the text, removing tokens to the left and right of the midpoint.
  const leftChunks = tokenChunks.slice(0, middleIdx - Math.round(delta / 2));
  const rightChunks = tokenChunks.slice(middleIdx + Math.round(delta / 2));
  const truncatedText =
    tokenManager.bytesFromTokens(leftChunks) +
    truncText +
    tokenManager.bytesFromTokens(rightChunks);

  console.log(
    `Cannonball results ${initialInputSize} -> ${tokenManager.countFromString(
      truncatedText
    )} tokens.`
  );
  return truncatedText;
}
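
/*
Worked example (illustrative; token counts are approximate): with a ~10,000 token input
and a 2,000 token target, delta is ~8,000, so cannonball keeps roughly the first and last
~1,000 tokens and stitches them together around the truncation marker. Uses the
TokenManager already required above; the model name and input are assumptions.

const tokenManager = new TokenManager("gpt-3.5-turbo"); // assumed model for tokenization
const veryLongDocumentText = "..."; // assume this resolves to a ~10,000 token string
const truncated = cannonball({
  input: veryLongDocumentText,
  targetTokenSize: 2000,
  tiktokenInstance: tokenManager,
});
// -> logs something like "Cannonball results 10000 -> ~2000 tokens."
*/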

module.exports = {
  messageArrayCompressor,
  messageStringCompressor,
};