2023-12-15 00:14:56 +01:00
|
|
|
const fs = require("fs");
|
|
|
|
const path = require("path");
|
2024-02-26 22:43:54 +01:00
|
|
|
const { getType } = require("mime");
|
2023-12-19 20:35:20 +01:00
|
|
|
// Absolute path of the folder that holds processed documents.
// In production this is a fixed location (hardcoded to the Render storage
// mount); in every other environment it is resolved relative to this file's
// directory via `../../../server/storage/documents`.
const documentsFolder = (() => {
  if (process.env.NODE_ENV !== "production") {
    return path.resolve(__dirname, "../../../server/storage/documents");
  }
  return path.resolve("/storage/documents");
})();
|
2023-12-15 00:14:56 +01:00
|
|
|
|
2024-02-26 22:43:54 +01:00
|
|
|
/**
 * Decide whether the file at `filepath` may be treated as a text document,
 * judged by the mime type that `getType` reports for the path.
 *
 * @param {string} filepath - Path of the file to inspect.
 * @returns {boolean} `false` when the file does not exist, when its mime is
 *   explicitly deny-listed, when its primary mime class is a non-text class,
 *   or when anything throws while inspecting it; `true` otherwise.
 */
function isTextType(filepath) {
  if (!fs.existsSync(filepath)) return false;

  // Primary mime classes that can never be coerced into text.
  const binaryPrimaryClasses = ["multipart", "image", "model", "audio", "video"];

  // Complete mime strings that cannot be parsed or interpreted as text documents.
  const unparseableMimes = [
    "application/octet-stream",
    "application/zip",
    "application/pkcs8",
    "application/vnd.microsoft.portable-executable",
    "application/x-msdownload",
  ];

  try {
    const mime = getType(filepath);
    // If `mime` is not a string, `split` throws and the catch below reports
    // the file as non-text.
    const [primaryClass] = mime.split("/");
    const isDenied =
      unparseableMimes.includes(mime) ||
      binaryPrimaryClasses.includes(primaryClass);
    return !isDenied;
  } catch {
    return false;
  }
}
|
2023-12-15 00:14:56 +01:00
|
|
|
|
|
|
|
/**
 * Delete a single file from disk. Missing paths and directories are left
 * untouched, as is any path whose stats cannot be read.
 *
 * @param {string} filepath - Path of the file to remove.
 * @returns {void}
 * @throws Whatever `fs.rmSync` throws if the removal itself fails.
 */
function trashFile(filepath) {
  if (!fs.existsSync(filepath)) return;

  let stats;
  try {
    stats = fs.lstatSync(filepath);
  } catch {
    // Could not stat the path - do nothing rather than guess.
    return;
  }

  // Only plain (non-directory) entries are removed.
  if (!stats.isDirectory()) fs.rmSync(filepath);
}
|
|
|
|
|
|
|
|
/**
 * Return the creation (birth) time of a file as a locale-formatted string.
 *
 * @param {string} filepath - Path of the file to inspect.
 * @returns {string} `birthtime.toLocaleString()` for the file, or the literal
 *   "unknown" when the file cannot be stat'ed or reports a birth time of 0.
 */
function createdDate(filepath) {
  const fallback = "unknown";
  try {
    const stats = fs.statSync(filepath);
    // A zero birth time is treated as an invalid stat result.
    return stats.birthtimeMs === 0
      ? fallback
      : stats.birthtime.toLocaleString();
  } catch {
    return fallback;
  }
}
|
|
|
|
|
|
|
|
/**
 * Serialize `data` as pretty-printed JSON into `<destination>/<filename>.json`
 * and return the data together with its relative storage location.
 *
 * @param {object} [data={}] - Document payload to persist.
 * @param {string} filename - File name without extension; ".json" is appended.
 * @param {string|null} [destinationOverride=null] - Folder to write into.
 *   When omitted, the "custom-documents" folder inside `documentsFolder` is used.
 * @returns {object} A copy of `data` with an added `location` property of the
 *   form "<folder>/<file>.json".
 * @throws Whatever `fs.mkdirSync` / `fs.writeFileSync` throw on failure.
 */
function writeToServerDocuments(
  data = {},
  filename,
  destinationOverride = null
) {
  const destination = destinationOverride
    ? path.resolve(destinationOverride)
    : path.resolve(documentsFolder, "custom-documents");

  // A recursive mkdir is a no-op when the folder already exists, so no
  // separate existence check is needed.
  fs.mkdirSync(destination, { recursive: true });

  const destinationFilePath = path.resolve(destination, filename) + ".json";
  fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), {
    encoding: "utf-8",
  });

  return {
    ...data,
    // relative location string that can be passed into the /update-embeddings api
    // that will work since we know the location exists and since we only allow
    // 1-level deep folders this will always work. This still works for integrations like GitHub and YouTube.
    // `path.resolve` yields platform-specific separators, so split on
    // `path.sep` and always re-join with "/" to keep the location portable.
    location: destinationFilePath.split(path.sep).slice(-2).join("/"),
  };
}
|
|
|
|
|
2023-12-15 20:20:13 +01:00
|
|
|
/**
 * Best-effort removal of every entry directly inside `directory`, except the
 * one named `keepFile`. Never rejects: an unreadable or missing directory and
 * individual removal failures are ignored.
 *
 * @param {string} directory - Absolute path of the folder to empty.
 * @param {string} keepFile - Name of the single entry that must be preserved.
 * @returns {Promise<void>}
 */
async function emptyDirectoryExcept(directory, keepFile) {
  let files;
  try {
    files = await fs.promises.readdir(directory);
  } catch {
    // Nothing to clean if the directory cannot be listed. Returning here is
    // essential: there is no file list to iterate after a failed read.
    return;
  }

  for (const file of files) {
    if (file === keepFile) continue;
    try {
      // No `recursive` flag: entries that rmSync refuses to remove this way
      // (e.g. sub-directories) are skipped by the catch below.
      fs.rmSync(path.join(directory, file));
    } catch {
      // Deliberately ignored - wiping is best-effort.
    }
  }
}

/**
 * Wipe the collector hot directory and tmp storage. Useful when large files
 * failed to process and could not be removed at the time: running this (for
 * example on reboot) force-removes whatever was left behind.
 *
 * The `__HOTDIR__.md` and `.placeholder` marker files are preserved.
 *
 * @returns {Promise<void>} Resolves once both folders have been processed.
 */
async function wipeCollectorStorage() {
  const hotDirectory = path.resolve(__dirname, "../../hotdir");
  const tmpDirectory = path.resolve(__dirname, "../../storage/tmp");

  await Promise.all([
    emptyDirectoryExcept(hotDirectory, "__HOTDIR__.md"),
    emptyDirectoryExcept(tmpDirectory, ".placeholder"),
  ]);
  console.log(`Collector hot directory and tmp storage wiped!`);
  return;
}
|
|
|
|
|
2023-12-15 00:14:56 +01:00
|
|
|
module.exports = {
|
2023-12-19 20:35:20 +01:00
|
|
|
documentsFolder,
|
2023-12-15 00:14:56 +01:00
|
|
|
trashFile,
|
2024-02-26 22:43:54 +01:00
|
|
|
isTextType,
|
2023-12-15 00:14:56 +01:00
|
|
|
createdDate,
|
|
|
|
writeToServerDocuments,
|
2023-12-15 20:20:13 +01:00
|
|
|
wipeCollectorStorage,
|
2023-12-15 00:14:56 +01:00
|
|
|
};
|