mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-19 04:30:10 +01:00
update schema of milvus to store text content separately
This commit is contained in:
parent
f519a4b48c
commit
9344c834f3
@ -121,6 +121,14 @@ const Milvus = {
|
||||
decription: "metadata",
|
||||
data_type: DataType.JSON,
|
||||
},
|
||||
{
|
||||
name: "text",
|
||||
description: "text",
|
||||
data_type: DataType.VarChar,
|
||||
// Max length of a text field in Milvus is 65535
|
||||
// https://milvus.io/docs/limitations.md
|
||||
max_length: 65535,
|
||||
},
|
||||
],
|
||||
});
|
||||
await client.createIndex({
|
||||
@ -156,13 +164,19 @@ const Milvus = {
|
||||
vectorDimension = chunks[0][0].values.length || null;
|
||||
|
||||
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
||||
try {
|
||||
for (const chunk of chunks) {
|
||||
// Before sending to Pinecone and saving the records to our db
|
||||
// Before sending to Milvus and saving the records to our db
|
||||
// we need to assign the id of each chunk that is stored in the cached file.
|
||||
const newChunks = chunk.map((chunk) => {
|
||||
const id = uuidv4();
|
||||
documentVectors.push({ docId, vectorId: id });
|
||||
return { id, vector: chunk.values, metadata: chunk.metadata };
|
||||
return {
|
||||
id,
|
||||
vector: chunk.values,
|
||||
metadata: chunk.metadata,
|
||||
text: chunk.text,
|
||||
};
|
||||
});
|
||||
const insertResult = await client.insert({
|
||||
collection_name: this.normalize(namespace),
|
||||
@ -170,9 +184,13 @@ const Milvus = {
|
||||
});
|
||||
|
||||
if (insertResult?.status.error_code !== "Success") {
|
||||
throw new Error(
|
||||
`Error embedding into Milvus! Reason:${insertResult?.status.reason}`
|
||||
console.error(
|
||||
`Error embedding into Milvus: ${insertResult?.status.reason}`
|
||||
);
|
||||
return {
|
||||
vectorized: false,
|
||||
error: insertResult?.status.reason,
|
||||
};
|
||||
}
|
||||
}
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
@ -180,6 +198,13 @@ const Milvus = {
|
||||
collection_names: [this.normalize(namespace)],
|
||||
});
|
||||
return { vectorized: true, error: null };
|
||||
} catch (insertError) {
|
||||
console.error(
|
||||
"Error inserting cached chunks:",
|
||||
insertError.message
|
||||
);
|
||||
return { vectorized: false, error: insertError.message };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -233,6 +258,7 @@ const Milvus = {
|
||||
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
||||
|
||||
console.log("Inserting vectorized chunks into Milvus.");
|
||||
try {
|
||||
for (const chunk of toChunks(vectors, 100)) {
|
||||
chunks.push(chunk);
|
||||
const insertResult = await client.insert({
|
||||
@ -240,26 +266,32 @@ const Milvus = {
|
||||
data: chunk.map((item) => ({
|
||||
id: item.id,
|
||||
vector: item.values,
|
||||
metadata: chunk.metadata,
|
||||
metadata: JSON.stringify(item.metadata),
|
||||
text: item.metadata.text,
|
||||
})),
|
||||
});
|
||||
|
||||
if (insertResult?.status.error_code !== "Success") {
|
||||
throw new Error(
|
||||
`Error embedding into Milvus! Reason:${insertResult?.status.reason}`
|
||||
console.error(
|
||||
`Error embedding into Milvus: ${insertResult?.status.reason}`
|
||||
);
|
||||
return { vectorized: false, error: insertResult?.status.reason };
|
||||
}
|
||||
}
|
||||
await storeVectorResult(chunks, fullFilePath);
|
||||
await client.flushSync({
|
||||
collection_names: [this.normalize(namespace)],
|
||||
});
|
||||
} catch (insertError) {
|
||||
console.error("Error inserting new chunks:", insertError.message);
|
||||
return { vectorized: false, error: insertError.message };
|
||||
}
|
||||
}
|
||||
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
console.error("addDocumentToNamespace error:", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
@ -342,6 +374,7 @@ const Milvus = {
|
||||
collection_name: this.normalize(namespace),
|
||||
vectors: queryVector,
|
||||
limit: topN,
|
||||
output_fields: ["id", "metadata", "text"],
|
||||
});
|
||||
response.results.forEach((match) => {
|
||||
if (match.score < similarityThreshold) return;
|
||||
@ -352,8 +385,11 @@ const Milvus = {
|
||||
return;
|
||||
}
|
||||
|
||||
result.contextTexts.push(match.metadata.text);
|
||||
result.sourceDocuments.push(match);
|
||||
result.contextTexts.push(match.text);
|
||||
result.sourceDocuments.push({
|
||||
...match,
|
||||
metadata: JSON.parse(match.metadata),
|
||||
});
|
||||
result.scores.push(match.score);
|
||||
});
|
||||
return result;
|
||||
@ -385,17 +421,14 @@ const Milvus = {
|
||||
curateSources: function (sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
const { metadata = {} } = source;
|
||||
if (Object.keys(metadata).length > 0) {
|
||||
const { metadata = {}, text } = source;
|
||||
if (Object.keys(metadata).length > 0 || text) {
|
||||
documents.push({
|
||||
...metadata,
|
||||
...(source.hasOwnProperty("pageContent")
|
||||
? { text: source.pageContent }
|
||||
: {}),
|
||||
text: text || source.pageContent,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return documents;
|
||||
},
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user