mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-19 04:30:10 +01:00
update schema of milvus to store text content separately
This commit is contained in:
parent
f519a4b48c
commit
9344c834f3
@ -121,6 +121,14 @@ const Milvus = {
|
|||||||
decription: "metadata",
|
decription: "metadata",
|
||||||
data_type: DataType.JSON,
|
data_type: DataType.JSON,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "text",
|
||||||
|
description: "text",
|
||||||
|
data_type: DataType.VarChar,
|
||||||
|
// Max length of a text field in Milvus is 65535
|
||||||
|
// https://milvus.io/docs/limitations.md
|
||||||
|
max_length: 65535,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
await client.createIndex({
|
await client.createIndex({
|
||||||
@ -156,30 +164,47 @@ const Milvus = {
|
|||||||
vectorDimension = chunks[0][0].values.length || null;
|
vectorDimension = chunks[0][0].values.length || null;
|
||||||
|
|
||||||
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
||||||
for (const chunk of chunks) {
|
try {
|
||||||
// Before sending to Pinecone and saving the records to our db
|
for (const chunk of chunks) {
|
||||||
// we need to assign the id of each chunk that is stored in the cached file.
|
// Before sending to Milvus and saving the records to our db
|
||||||
const newChunks = chunk.map((chunk) => {
|
// we need to assign the id of each chunk that is stored in the cached file.
|
||||||
const id = uuidv4();
|
const newChunks = chunk.map((chunk) => {
|
||||||
documentVectors.push({ docId, vectorId: id });
|
const id = uuidv4();
|
||||||
return { id, vector: chunk.values, metadata: chunk.metadata };
|
documentVectors.push({ docId, vectorId: id });
|
||||||
});
|
return {
|
||||||
const insertResult = await client.insert({
|
id,
|
||||||
collection_name: this.normalize(namespace),
|
vector: chunk.values,
|
||||||
data: newChunks,
|
metadata: chunk.metadata,
|
||||||
});
|
text: chunk.text,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
const insertResult = await client.insert({
|
||||||
|
collection_name: this.normalize(namespace),
|
||||||
|
data: newChunks,
|
||||||
|
});
|
||||||
|
|
||||||
if (insertResult?.status.error_code !== "Success") {
|
if (insertResult?.status.error_code !== "Success") {
|
||||||
throw new Error(
|
console.error(
|
||||||
`Error embedding into Milvus! Reason:${insertResult?.status.reason}`
|
`Error embedding into Milvus: ${insertResult?.status.reason}`
|
||||||
);
|
);
|
||||||
|
return {
|
||||||
|
vectorized: false,
|
||||||
|
error: insertResult?.status.reason,
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
await DocumentVectors.bulkInsert(documentVectors);
|
||||||
|
await client.flushSync({
|
||||||
|
collection_names: [this.normalize(namespace)],
|
||||||
|
});
|
||||||
|
return { vectorized: true, error: null };
|
||||||
|
} catch (insertError) {
|
||||||
|
console.error(
|
||||||
|
"Error inserting cached chunks:",
|
||||||
|
insertError.message
|
||||||
|
);
|
||||||
|
return { vectorized: false, error: insertError.message };
|
||||||
}
|
}
|
||||||
await DocumentVectors.bulkInsert(documentVectors);
|
|
||||||
await client.flushSync({
|
|
||||||
collection_names: [this.normalize(namespace)],
|
|
||||||
});
|
|
||||||
return { vectorized: true, error: null };
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -233,33 +258,40 @@ const Milvus = {
|
|||||||
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
||||||
|
|
||||||
console.log("Inserting vectorized chunks into Milvus.");
|
console.log("Inserting vectorized chunks into Milvus.");
|
||||||
for (const chunk of toChunks(vectors, 100)) {
|
try {
|
||||||
chunks.push(chunk);
|
for (const chunk of toChunks(vectors, 100)) {
|
||||||
const insertResult = await client.insert({
|
chunks.push(chunk);
|
||||||
collection_name: this.normalize(namespace),
|
const insertResult = await client.insert({
|
||||||
data: chunk.map((item) => ({
|
collection_name: this.normalize(namespace),
|
||||||
id: item.id,
|
data: chunk.map((item) => ({
|
||||||
vector: item.values,
|
id: item.id,
|
||||||
metadata: chunk.metadata,
|
vector: item.values,
|
||||||
})),
|
metadata: JSON.stringify(item.metadata),
|
||||||
});
|
text: item.metadata.text,
|
||||||
|
})),
|
||||||
|
});
|
||||||
|
|
||||||
if (insertResult?.status.error_code !== "Success") {
|
if (insertResult?.status.error_code !== "Success") {
|
||||||
throw new Error(
|
console.error(
|
||||||
`Error embedding into Milvus! Reason:${insertResult?.status.reason}`
|
`Error embedding into Milvus: ${insertResult?.status.reason}`
|
||||||
);
|
);
|
||||||
|
return { vectorized: false, error: insertResult?.status.reason };
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
await storeVectorResult(chunks, fullFilePath);
|
||||||
|
await client.flushSync({
|
||||||
|
collection_names: [this.normalize(namespace)],
|
||||||
|
});
|
||||||
|
} catch (insertError) {
|
||||||
|
console.error("Error inserting new chunks:", insertError.message);
|
||||||
|
return { vectorized: false, error: insertError.message };
|
||||||
}
|
}
|
||||||
await storeVectorResult(chunks, fullFilePath);
|
|
||||||
await client.flushSync({
|
|
||||||
collection_names: [this.normalize(namespace)],
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
await DocumentVectors.bulkInsert(documentVectors);
|
await DocumentVectors.bulkInsert(documentVectors);
|
||||||
return { vectorized: true, error: null };
|
return { vectorized: true, error: null };
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("addDocumentToNamespace", e.message);
|
console.error("addDocumentToNamespace error:", e.message);
|
||||||
return { vectorized: false, error: e.message };
|
return { vectorized: false, error: e.message };
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -342,6 +374,7 @@ const Milvus = {
|
|||||||
collection_name: this.normalize(namespace),
|
collection_name: this.normalize(namespace),
|
||||||
vectors: queryVector,
|
vectors: queryVector,
|
||||||
limit: topN,
|
limit: topN,
|
||||||
|
output_fields: ["id", "metadata", "text"],
|
||||||
});
|
});
|
||||||
response.results.forEach((match) => {
|
response.results.forEach((match) => {
|
||||||
if (match.score < similarityThreshold) return;
|
if (match.score < similarityThreshold) return;
|
||||||
@ -352,8 +385,11 @@ const Milvus = {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
result.contextTexts.push(match.metadata.text);
|
result.contextTexts.push(match.text);
|
||||||
result.sourceDocuments.push(match);
|
result.sourceDocuments.push({
|
||||||
|
...match,
|
||||||
|
metadata: JSON.parse(match.metadata),
|
||||||
|
});
|
||||||
result.scores.push(match.score);
|
result.scores.push(match.score);
|
||||||
});
|
});
|
||||||
return result;
|
return result;
|
||||||
@ -385,19 +421,16 @@ const Milvus = {
|
|||||||
curateSources: function (sources = []) {
|
curateSources: function (sources = []) {
|
||||||
const documents = [];
|
const documents = [];
|
||||||
for (const source of sources) {
|
for (const source of sources) {
|
||||||
const { metadata = {} } = source;
|
const { metadata = {}, text } = source;
|
||||||
if (Object.keys(metadata).length > 0) {
|
if (Object.keys(metadata).length > 0 || text) {
|
||||||
documents.push({
|
documents.push({
|
||||||
...metadata,
|
...metadata,
|
||||||
...(source.hasOwnProperty("pageContent")
|
text: text || source.pageContent,
|
||||||
? { text: source.pageContent }
|
|
||||||
: {}),
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports.Milvus = Milvus;
|
module.exports.Milvus = Milvus;
|
Loading…
Reference in New Issue
Block a user