const { v4: uuidv4 } = require("uuid");
const { getVectorDbClass } = require("../utils/helpers");
const prisma = require("../utils/prisma");
const { Telemetry } = require("./telemetry");
const { EventLogs } = require("./eventLogs");
const { safeJsonParse } = require("../utils/http");

/**
 * Data-access helpers for the `workspace_documents` table, plus the
 * embed/remove workflows that pair each row with its vectors in the
 * configured vector database.
 */
const Document = {
  /** Column names that `update` is allowed to write. */
  writable: ["pinned", "watched", "lastUpdatedAt"],

  /**
   * Splits a document's `metadata.chunkSource` (`<type>://<source>`) into its
   * type and source parts.
   *
   * @param {import("@prisma/client").workspace_documents} document - Document PrismaRecord
   * @returns {{
   *   metadata: (null|object),
   *   type: (null|import("./documentSyncQueue.js").validFileType),
   *   source: (null|string)
   * }} `type` and `source` are null when the metadata cannot be parsed or the
   *   chunkSource is missing or has no `://` separator.
   */
  parseDocumentTypeAndSource: function (document) {
    const metadata = safeJsonParse(document.metadata, null);
    if (!metadata) return { metadata: null, type: null, source: null };

    // Guard against a missing or scheme-less chunkSource: slicing around an
    // index of -1 would otherwise produce a bogus type and source.
    const chunkSource = metadata.chunkSource;
    const idx =
      typeof chunkSource === "string" ? chunkSource.indexOf("://") : -1;
    if (idx === -1) return { metadata, type: null, source: null };

    // Parse the correct type of source and its original source path.
    const type = chunkSource.slice(0, idx);
    const source = chunkSource.slice(idx + 3);
    return { metadata, type, source: this._stripSource(source, type) };
  },

  /**
   * Lists every document row belonging to a workspace.
   *
   * @param {number|null} workspaceId
   * @returns {Promise<object[]>} Empty array when no workspaceId is given.
   */
  forWorkspace: async function (workspaceId = null) {
    if (!workspaceId) return [];
    return await prisma.workspace_documents.findMany({
      where: { workspaceId },
    });
  },

  /**
   * Deletes all document rows matching the clause.
   *
   * @param {object} clause - Prisma `where` clause.
   * @returns {Promise<boolean>} true on success, false if the query threw.
   */
  delete: async function (clause = {}) {
    try {
      await prisma.workspace_documents.deleteMany({ where: clause });
      return true;
    } catch (error) {
      console.error(error.message);
      return false;
    }
  },

  /**
   * Fetches the first document row matching the clause.
   *
   * @param {object} clause - Prisma `where` clause.
   * @returns {Promise<object|null>} The row, or null when none matches or the
   *   query threw.
   */
  get: async function (clause = {}) {
    try {
      const document = await prisma.workspace_documents.findFirst({
        where: clause,
      });
      return document || null;
    } catch (error) {
      console.error(error.message);
      return null;
    }
  },

  /**
   * Returns the `workspaceId` of every document row matching the clause
   * (one entry per row, so ids may repeat).
   *
   * @param {object} clause - Prisma `where` clause.
   * @returns {Promise<Array>} Empty array if the query threw.
   */
  getOnlyWorkspaceIds: async function (clause = {}) {
    try {
      const records = await prisma.workspace_documents.findMany({
        where: clause,
        select: { workspaceId: true },
      });
      return records.map((record) => record.workspaceId);
    } catch (error) {
      console.error(error.message);
      return [];
    }
  },

  /**
   * Generic finder. `limit`, `orderBy` and `include` are only forwarded to
   * Prisma when they are not null.
   *
   * @param {object} clause - Prisma `where` clause.
   * @param {number|null} limit - Forwarded as `take`.
   * @param {object|null} orderBy
   * @param {object|null} include
   * @returns {Promise<object[]>} Empty array if the query threw.
   */
  where: async function (
    clause = {},
    limit = null,
    orderBy = null,
    include = null
  ) {
    try {
      const results = await prisma.workspace_documents.findMany({
        where: clause,
        ...(limit !== null ? { take: limit } : {}),
        ...(orderBy !== null ? { orderBy } : {}),
        ...(include !== null ? { include } : {}),
      });
      return results;
    } catch (error) {
      console.error(error.message);
      return [];
    }
  },

  /**
   * Embeds each file in `additions` into the workspace's vector namespace and
   * creates a `workspace_documents` row for every file that was vectorized.
   * Afterwards sends telemetry and writes a `workspace_documents_added` event.
   *
   * @param {object} workspace - Must expose `id` and `slug`; `name` is used for the event log.
   * @param {string[]} additions - Document paths to embed.
   * @param {number|null} userId - Forwarded to the event log.
   * @returns {Promise<{failedToEmbed: string[], errors: Array, embedded: string[]}>}
   *   For empty input the result additionally carries a legacy `failed: []` key.
   */
  addDocuments: async function (workspace, additions = [], userId = null) {
    const VectorDb = getVectorDbClass();
    // Return the same keys as the normal path so callers can destructure
    // safely; `failed` is kept for backward compatibility.
    if (additions.length === 0)
      return { failed: [], failedToEmbed: [], errors: [], embedded: [] };

    const { fileData } = require("../utils/files");
    const embedded = [];
    const failedToEmbed = [];
    const errors = new Set();

    for (const path of additions) {
      const data = await fileData(path);
      if (!data) continue; // Nothing readable at this path - skip it.

      const docId = uuidv4();
      const { pageContent, ...metadata } = data;
      const newDoc = {
        docId,
        // Second path segment; assumes paths look like `<folder>/<file>` - TODO confirm.
        filename: path.split("/")[1],
        docpath: path,
        workspaceId: workspace.id,
        metadata: JSON.stringify(metadata),
      };

      const { vectorized, error } = await VectorDb.addDocumentToNamespace(
        workspace.slug,
        { ...data, docId },
        path
      );
      if (!vectorized) {
        console.error(
          "Failed to vectorize",
          metadata?.title || newDoc.filename
        );
        failedToEmbed.push(metadata?.title || newDoc.filename);
        errors.add(error);
        continue;
      }

      try {
        await prisma.workspace_documents.create({ data: newDoc });
        embedded.push(path);
      } catch (error) {
        // Vectorization already reported success; a failed insert is only logged.
        console.error(error.message);
      }
    }

    await Telemetry.sendTelemetry("documents_embedded_in_workspace", {
      LLMSelection: process.env.LLM_PROVIDER || "openai",
      Embedder: process.env.EMBEDDING_ENGINE || "inherit",
      VectorDbSelection: process.env.VECTOR_DB || "lancedb",
      TTSSelection: process.env.TTS_PROVIDER || "native",
    });
    await EventLogs.logEvent(
      "workspace_documents_added",
      {
        workspaceName: workspace?.name || "Unknown Workspace",
        // Counts the requested additions, not only the successful ones.
        numberOfDocumentsAdded: additions.length,
      },
      userId
    );
    return { failedToEmbed, errors: Array.from(errors), embedded };
  },

  /**
   * Removes each document in `removals` from the workspace: deletes its
   * vectors from the namespace, then its `workspace_documents` row and its
   * `document_vectors` rows. Paths with no matching row are skipped.
   * Afterwards sends telemetry and writes a `workspace_documents_removed` event.
   *
   * @param {object} workspace - Must expose `id` and `slug`; `name` is used for the event log.
   * @param {string[]} removals - Document paths to remove.
   * @param {number|null} userId - Forwarded to the event log.
   * @returns {Promise<true|undefined>} undefined when `removals` is empty, otherwise true.
   */
  removeDocuments: async function (workspace, removals = [], userId = null) {
    const VectorDb = getVectorDbClass();
    if (removals.length === 0) return;

    for (const path of removals) {
      const document = await this.get({
        docpath: path,
        workspaceId: workspace.id,
      });
      if (!document) continue;

      await VectorDb.deleteDocumentFromNamespace(
        workspace.slug,
        document.docId
      );

      try {
        await prisma.workspace_documents.delete({
          where: { id: document.id, workspaceId: workspace.id },
        });
        await prisma.document_vectors.deleteMany({
          where: { docId: document.docId },
        });
      } catch (error) {
        console.error(error.message);
      }
    }

    await Telemetry.sendTelemetry("documents_removed_in_workspace", {
      LLMSelection: process.env.LLM_PROVIDER || "openai",
      Embedder: process.env.EMBEDDING_ENGINE || "inherit",
      VectorDbSelection: process.env.VECTOR_DB || "lancedb",
      TTSSelection: process.env.TTS_PROVIDER || "native",
    });
    await EventLogs.logEvent(
      "workspace_documents_removed",
      {
        workspaceName: workspace?.name || "Unknown Workspace",
        // Counts the requested removals, not only the rows actually deleted.
        numberOfDocuments: removals.length,
      },
      userId
    );
    return true;
  },

  /**
   * Counts document rows matching the clause.
   *
   * @param {object} clause - Prisma `where` clause.
   * @param {number|null} limit - Forwarded as `take` when not null.
   * @returns {Promise<number>} 0 if the query threw.
   */
  count: async function (clause = {}, limit = null) {
    try {
      const count = await prisma.workspace_documents.count({
        where: clause,
        ...(limit !== null ? { take: limit } : {}),
      });
      return count;
    } catch (error) {
      console.error("FAILED TO COUNT DOCUMENTS.", error.message);
      return 0;
    }
  },

  /**
   * Updates a single document row. Only keys listed in `writable` are sent to
   * the database; every other key in `data` is dropped.
   *
   * @param {number|null} id - Row id of the document.
   * @param {object} data - Candidate field updates.
   * @returns {Promise<{document: (object|null), message: (string|null)}>}
   *   `message` is null on success, otherwise it explains the failure.
   * @throws {Error} When no id is provided.
   */
  update: async function (id = null, data = {}) {
    if (!id) throw new Error("No workspace document id provided for update");

    // Build the payload from whitelisted keys only, so a caller cannot
    // smuggle in non-writable columns alongside a valid one.
    const validData = Object.fromEntries(
      Object.entries(data).filter(([key]) => this.writable.includes(key))
    );
    if (Object.keys(validData).length === 0)
      return { document: { id }, message: "No valid fields to update!" };

    try {
      const document = await prisma.workspace_documents.update({
        where: { id },
        data: validData,
      });
      return { document, message: null };
    } catch (error) {
      console.error(error.message);
      return { document: null, message: error.message };
    }
  },

  /**
   * Bulk update with no `writable` filtering - `data` is passed to Prisma as-is.
   *
   * @param {object} clause - Prisma `where` clause.
   * @param {object} data - Field updates.
   * @returns {Promise<boolean>} true on success, false if the query threw.
   */
  _updateAll: async function (clause = {}, data = {}) {
    try {
      await prisma.workspace_documents.updateMany({
        where: clause,
        data,
      });
      return true;
    } catch (error) {
      console.error(error.message);
      return false;
    }
  },

  /**
   * Loads the title and page content of the document with the given docId.
   *
   * @param {string} docId
   * @returns {Promise<{title: *, content: *}>}
   * @throws {Error} When docId is missing, no document row matches, or the
   *   document's file data cannot be loaded.
   */
  content: async function (docId) {
    if (!docId) throw new Error("No workspace docId provided!");
    const document = await this.get({ docId: String(docId) });
    if (!document) throw new Error(`Could not find a document by id ${docId}`);

    const { fileData } = require("../utils/files");
    const data = await fileData(document.docpath);
    // fileData can come back empty (addDocuments guards for the same case).
    if (!data)
      throw new Error(`Could not load file data for document ${docId}`);
    return { title: data.title, content: data.pageContent };
  },

  /**
   * Loads the title and page content of the file at `docPath`.
   *
   * @param {string} docPath
   * @returns {Promise<{title: *, content: *}>}
   * @throws {Error} When the file data cannot be loaded.
   */
  contentByDocPath: async function (docPath) {
    const { fileData } = require("../utils/files");
    const data = await fileData(docPath);
    // fileData can come back empty (addDocuments guards for the same case).
    if (!data) throw new Error(`Could not load file data for ${docPath}`);
    return { title: data.title, content: data.pageContent };
  },

  /**
   * Some data sources have encoded params in them we don't want to log - so
   * strip those details. Only `confluence` and `github` sources are altered;
   * for those, `sourceString` must be a parseable URL (`new URL` throws
   * otherwise).
   *
   * @param {string} sourceString
   * @param {string} type
   * @returns {string}
   */
  _stripSource: function (sourceString, type) {
    if (["confluence", "github"].includes(type)) {
      const _src = new URL(sourceString);
      _src.search = ""; // remove all search params that are encoded for resync.
      return _src.toString();
    }
    return sourceString;
  },
};

module.exports = { Document };