diff --git a/server/utils/chats/embed.js b/server/utils/chats/embed.js
index 94df306f..533ea0c3 100644
--- a/server/utils/chats/embed.js
+++ b/server/utils/chats/embed.js
@@ -77,11 +77,10 @@ async function streamChatWithForEmbed(
     chatMode
   );
 
-  // Look for pinned documents and see if the user decided to use this feature. We will also do a vector search
-  // as pinning is a supplemental tool but it should be used with caution since it can easily blow up a context window.
+  // See stream.js comment for more information on this implementation.
   await new DocumentManager({
     workspace: embed.workspace,
-    maxTokens: LLMConnector.limits.system,
+    maxTokens: LLMConnector.promptWindowLimit(),
   })
     .pinnedDocs()
     .then((pinnedDocs) => {
diff --git a/server/utils/chats/index.js b/server/utils/chats/index.js
index 87d96c47..38ce6c9b 100644
--- a/server/utils/chats/index.js
+++ b/server/utils/chats/index.js
@@ -89,11 +89,10 @@ async function chatWithWorkspace(
     chatMode,
   });
 
-  // Look for pinned documents and see if the user decided to use this feature. We will also do a vector search
-  // as pinning is a supplemental tool but it should be used with caution since it can easily blow up a context window.
+  // See stream.js comment for more information on this implementation.
   await new DocumentManager({
     workspace,
-    maxTokens: LLMConnector.limits.system,
+    maxTokens: LLMConnector.promptWindowLimit(),
   })
     .pinnedDocs()
     .then((pinnedDocs) => {
diff --git a/server/utils/chats/stream.js b/server/utils/chats/stream.js
index 0e471161..57f32666 100644
--- a/server/utils/chats/stream.js
+++ b/server/utils/chats/stream.js
@@ -105,9 +105,13 @@ async function streamChatWithWorkspace(
 
   // Look for pinned documents and see if the user decided to use this feature. We will also do a vector search
   // as pinning is a supplemental tool but it should be used with caution since it can easily blow up a context window.
+  // However, we limit the maximum appended context to 80% of its overall size, mostly because anything beyond
+  // that would undergo prompt compression anyway to make it fit. If so much is pinned that the context here is bigger
+  // than what the model can support, it would get compressed anyway, and that defeats the point of pinning. Pinning is
+  // really best suited for high-context models.
   await new DocumentManager({
     workspace,
-    maxTokens: LLMConnector.limits.system,
+    maxTokens: LLMConnector.promptWindowLimit(),
   })
     .pinnedDocs()
     .then((pinnedDocs) => {
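
The switch from `LLMConnector.limits.system` to `LLMConnector.promptWindowLimit()` gives the pinned-document pass a budget based on the model's full prompt window, with the 80% cap described in the stream.js comment keeping pinned context from swallowing the whole window. The sketch below is a minimal illustration of that budgeting idea, not the actual `DocumentManager` internals; `PinnedDocBudget`, `countTokens`, and the character-based token estimate are hypothetical names invented for the example.

```js
// Hypothetical sketch of a pinned-document token budget; this is NOT the real
// DocumentManager implementation, just an illustration of the 80% cap idea.

// Assumed helper: very rough token estimate (~4 characters per token).
function countTokens(text) {
  return Math.ceil(text.length / 4);
}

class PinnedDocBudget {
  /**
   * @param {{pageContent: string}[]} pinnedDocs - documents the user pinned
   * @param {number} promptWindowLimit - the model's full prompt window, e.g.
   *   the value returned by LLMConnector.promptWindowLimit()
   */
  constructor(pinnedDocs, promptWindowLimit) {
    this.pinnedDocs = pinnedDocs;
    // Cap appended context at 80% of the window. Anything larger would be
    // prompt-compressed downstream anyway, which defeats the point of pinning.
    this.maxTokens = Math.floor(promptWindowLimit * 0.8);
  }

  // Return pinned docs in order until the token budget is exhausted.
  select() {
    const selected = [];
    let used = 0;
    for (const doc of this.pinnedDocs) {
      const cost = countTokens(doc.pageContent);
      if (used + cost > this.maxTokens) break; // budget exhausted, stop appending
      selected.push(doc);
      used += cost;
    }
    return selected;
  }
}

// Usage: budget against the prompt window rather than the old system-prompt limit.
// const docs = new PinnedDocBudget(pinnedDocs, LLMConnector.promptWindowLimit()).select();
```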