Update the Milvus schema to store text content separately

This commit is contained in:
shatfield4 2024-08-27 15:30:50 -07:00
parent f519a4b48c
commit 9344c834f3

View File

@ -121,6 +121,14 @@ const Milvus = {
decription: "metadata", decription: "metadata",
data_type: DataType.JSON, data_type: DataType.JSON,
}, },
{
name: "text",
description: "text",
data_type: DataType.VarChar,
// Max length of a text field in Milvus is 65535
// https://milvus.io/docs/limitations.md
max_length: 65535,
},
], ],
}); });
await client.createIndex({ await client.createIndex({
@ -156,13 +164,19 @@ const Milvus = {
vectorDimension = chunks[0][0].values.length || null; vectorDimension = chunks[0][0].values.length || null;
await this.getOrCreateCollection(client, namespace, vectorDimension); await this.getOrCreateCollection(client, namespace, vectorDimension);
try {
for (const chunk of chunks) { for (const chunk of chunks) {
// Before sending to Pinecone and saving the records to our db // Before sending to Milvus and saving the records to our db
// we need to assign the id of each chunk that is stored in the cached file. // we need to assign the id of each chunk that is stored in the cached file.
const newChunks = chunk.map((chunk) => { const newChunks = chunk.map((chunk) => {
const id = uuidv4(); const id = uuidv4();
documentVectors.push({ docId, vectorId: id }); documentVectors.push({ docId, vectorId: id });
return { id, vector: chunk.values, metadata: chunk.metadata }; return {
id,
vector: chunk.values,
metadata: chunk.metadata,
text: chunk.text,
};
}); });
const insertResult = await client.insert({ const insertResult = await client.insert({
collection_name: this.normalize(namespace), collection_name: this.normalize(namespace),
@ -170,9 +184,13 @@ const Milvus = {
}); });
if (insertResult?.status.error_code !== "Success") { if (insertResult?.status.error_code !== "Success") {
throw new Error( console.error(
`Error embedding into Milvus! Reason:${insertResult?.status.reason}` `Error embedding into Milvus: ${insertResult?.status.reason}`
); );
return {
vectorized: false,
error: insertResult?.status.reason,
};
} }
} }
await DocumentVectors.bulkInsert(documentVectors); await DocumentVectors.bulkInsert(documentVectors);
@ -180,6 +198,13 @@ const Milvus = {
collection_names: [this.normalize(namespace)], collection_names: [this.normalize(namespace)],
}); });
return { vectorized: true, error: null }; return { vectorized: true, error: null };
} catch (insertError) {
console.error(
"Error inserting cached chunks:",
insertError.message
);
return { vectorized: false, error: insertError.message };
}
} }
} }
@ -233,6 +258,7 @@ const Milvus = {
await this.getOrCreateCollection(client, namespace, vectorDimension); await this.getOrCreateCollection(client, namespace, vectorDimension);
console.log("Inserting vectorized chunks into Milvus."); console.log("Inserting vectorized chunks into Milvus.");
try {
for (const chunk of toChunks(vectors, 100)) { for (const chunk of toChunks(vectors, 100)) {
chunks.push(chunk); chunks.push(chunk);
const insertResult = await client.insert({ const insertResult = await client.insert({
@ -240,26 +266,32 @@ const Milvus = {
data: chunk.map((item) => ({ data: chunk.map((item) => ({
id: item.id, id: item.id,
vector: item.values, vector: item.values,
metadata: chunk.metadata, metadata: JSON.stringify(item.metadata),
text: item.metadata.text,
})), })),
}); });
if (insertResult?.status.error_code !== "Success") { if (insertResult?.status.error_code !== "Success") {
throw new Error( console.error(
`Error embedding into Milvus! Reason:${insertResult?.status.reason}` `Error embedding into Milvus: ${insertResult?.status.reason}`
); );
return { vectorized: false, error: insertResult?.status.reason };
} }
} }
await storeVectorResult(chunks, fullFilePath); await storeVectorResult(chunks, fullFilePath);
await client.flushSync({ await client.flushSync({
collection_names: [this.normalize(namespace)], collection_names: [this.normalize(namespace)],
}); });
} catch (insertError) {
console.error("Error inserting new chunks:", insertError.message);
return { vectorized: false, error: insertError.message };
}
} }
await DocumentVectors.bulkInsert(documentVectors); await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null }; return { vectorized: true, error: null };
} catch (e) { } catch (e) {
console.error("addDocumentToNamespace", e.message); console.error("addDocumentToNamespace error:", e.message);
return { vectorized: false, error: e.message }; return { vectorized: false, error: e.message };
} }
}, },
@ -342,6 +374,7 @@ const Milvus = {
collection_name: this.normalize(namespace), collection_name: this.normalize(namespace),
vectors: queryVector, vectors: queryVector,
limit: topN, limit: topN,
output_fields: ["id", "metadata", "text"],
}); });
response.results.forEach((match) => { response.results.forEach((match) => {
if (match.score < similarityThreshold) return; if (match.score < similarityThreshold) return;
@ -352,8 +385,11 @@ const Milvus = {
return; return;
} }
result.contextTexts.push(match.metadata.text); result.contextTexts.push(match.text);
result.sourceDocuments.push(match); result.sourceDocuments.push({
...match,
metadata: JSON.parse(match.metadata),
});
result.scores.push(match.score); result.scores.push(match.score);
}); });
return result; return result;
@ -385,17 +421,14 @@ const Milvus = {
curateSources: function (sources = []) { curateSources: function (sources = []) {
const documents = []; const documents = [];
for (const source of sources) { for (const source of sources) {
const { metadata = {} } = source; const { metadata = {}, text } = source;
if (Object.keys(metadata).length > 0) { if (Object.keys(metadata).length > 0 || text) {
documents.push({ documents.push({
...metadata, ...metadata,
...(source.hasOwnProperty("pageContent") text: text || source.pageContent,
? { text: source.pageContent }
: {}),
}); });
} }
} }
return documents; return documents;
}, },
}; };