diff --git a/.vscode/settings.json b/.vscode/settings.json index aafdb17d8..ce350ca2f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -29,11 +29,13 @@ "Milvus", "Mintplex", "moderations", + "numpages", "Ollama", "Oobabooga", "openai", "opendocument", "openrouter", + "pagerender", "Qdrant", "searxng", "Serper", diff --git a/collector/package.json b/collector/package.json index 72deb4abd..5e3873d1e 100644 --- a/collector/package.json +++ b/collector/package.json @@ -37,7 +37,7 @@ "node-html-parser": "^6.1.13", "officeparser": "^4.0.5", "openai": "4.38.5", - "pdfjs-dist": "3.4.120", + "pdf-parse": "^1.1.1", "puppeteer": "~21.5.2", "slugify": "^1.6.6", "url-pattern": "^1.0.3", diff --git a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js new file mode 100644 index 000000000..698769062 --- /dev/null +++ b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js @@ -0,0 +1,62 @@ +const fs = require("fs").promises; +const pdf = require("pdf-parse"); + +class PDFLoader { + constructor(filePath, { splitPages = true } = {}) { + this.filePath = filePath; + this.splitPages = splitPages; + } + + async load() { + const buffer = await fs.readFile(this.filePath); + + const options = { + pagerender: this.splitPages ? this.renderPage : null, + }; + + const { text, numpages, info, metadata, version } = await pdf( + buffer, + options + ); + + if (!this.splitPages) { + return [ + { + pageContent: text.trim(), + metadata: { + source: this.filePath, + pdf: { version, info, metadata, totalPages: numpages }, + }, + }, + ]; + } + + return this.pages.map((pageContent, index) => ({ + pageContent: pageContent.trim(), + metadata: { + source: this.filePath, + pdf: { version, info, metadata, totalPages: numpages }, + loc: { pageNumber: index + 1 }, + }, + })); + } + + pages = []; + + renderPage = async (pageData) => { + const textContent = await pageData.getTextContent(); + let lastY, + text = ""; + for (const item of textContent.items) { + if (lastY !== item.transform[5] && lastY !== undefined) { + text += "\n"; + } + text += item.str; + lastY = item.transform[5]; + } + this.pages.push(text); + return text; + }; +} + +module.exports = PDFLoader; diff --git a/collector/processSingleFile/convert/asPDF.js b/collector/processSingleFile/convert/asPDF/index.js similarity index 54% rename from collector/processSingleFile/convert/asPDF.js rename to collector/processSingleFile/convert/asPDF/index.js index 0521a8fbb..bf1451641 100644 --- a/collector/processSingleFile/convert/asPDF.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -3,29 +3,28 @@ const { createdDate, trashFile, writeToServerDocuments, -} = require("../../utils/files"); -const { tokenizeString } = require("../../utils/tokenizer"); +} = require("../../../utils/files"); +const { tokenizeString } = require("../../../utils/tokenizer"); const { default: slugify } = require("slugify"); +const PDFLoader = require("./PDFLoader"); + +async function asPdf({ fullFilePath = "", filename = "" }) { + const pdfLoader = new PDFLoader(fullFilePath, { + splitPages: true, + }); -async function asPDF({ fullFilePath = "", filename = "" }) { - const pdfjsLib = await import("pdfjs-dist"); console.log(`-- Working ${filename} --`); - - const loadingTask = pdfjsLib.default.getDocument(fullFilePath); - const pdf = await loadingTask.promise; - - const numPages = pdf.numPages; const pageContent = []; + const docs = await pdfLoader.load(); - for (let i = 1; i <= numPages; i++) { - console.log(`-- Parsing content from pg ${i} --`); - const page = await pdf.getPage(i); - const content = await page.getTextContent(); - const text = content.items.map((item) => item.str).join(" "); - - if (text.length) { - pageContent.push(text); - } + for (const doc of docs) { + console.log( + `-- Parsing content from pg ${ + doc.metadata?.loc?.pageNumber || "unknown" + } --` + ); + if (!doc.pageContent || !doc.pageContent.length) continue; + pageContent.push(doc.pageContent); } if (!pageContent.length) { @@ -38,15 +37,13 @@ async function asPDF({ fullFilePath = "", filename = "" }) { }; } - const content = pageContent.join(" "); - const metadata = await pdf.getMetadata(); - + const content = pageContent.join(""); const data = { id: v4(), url: "file://" + fullFilePath, title: filename, - docAuthor: metadata?.info?.Creator || "no author found", - description: metadata?.info?.Title || "No description found.", + docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found", + description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.", docSource: "pdf file uploaded by the user.", chunkSource: "", published: createdDate(fullFilePath), @@ -64,4 +61,4 @@ async function asPDF({ fullFilePath = "", filename = "" }) { return { success: true, reason: null, documents: [document] }; } -module.exports = asPDF; +module.exports = asPdf; diff --git a/collector/utils/constants.js b/collector/utils/constants.js index ddcee800f..ee9ad22ae 100644 --- a/collector/utils/constants.js +++ b/collector/utils/constants.js @@ -33,7 +33,7 @@ const SUPPORTED_FILETYPE_CONVERTERS = { ".rst": "./convert/asTxt.js", ".html": "./convert/asTxt.js", - ".pdf": "./convert/asPDF.js", + ".pdf": "./convert/asPDF/index.js", ".docx": "./convert/asDocx.js", ".pptx": "./convert/asOfficeMime.js", diff --git a/collector/yarn.lock b/collector/yarn.lock index 394d8954d..24dfd435f 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -122,7 +122,7 @@ "@langchain/core" "~0.1" js-tiktoken "^1.0.11" -"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11": +"@mapbox/node-pre-gyp@^1.0.11": version "1.0.11" resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa" integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ== @@ -662,15 +662,6 @@ camelcase@6: resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== -canvas@^2.11.0: - version "2.11.2" - resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860" - integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw== - dependencies: - "@mapbox/node-pre-gyp" "^1.0.0" - nan "^2.17.0" - simple-get "^3.0.3" - chalk@^2.4.2: version "2.4.2" resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" @@ -936,13 +927,6 @@ decamelize@1.2.0: resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290" integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA== -decompress-response@^4.2.0: - version "4.2.1" - resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986" - integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw== - dependencies: - mimic-response "^2.0.0" - decompress-response@^6.0.0: version "6.0.0" resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc" @@ -2237,11 +2221,6 @@ mime@^3.0.0: resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7" integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A== -mimic-response@^2.0.0: - version "2.1.0" - resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43" - integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA== - mimic-response@^3.1.0: version "3.1.0" resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9" @@ -2375,11 +2354,6 @@ mustache@^4.2.0: resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64" integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ== -nan@^2.17.0: - version "2.20.0" - resolved "https://registry.yarnpkg.com/nan/-/nan-2.20.0.tgz#08c5ea813dd54ed16e5bd6505bf42af4f7838ca3" - integrity sha512-bk3gXBZDGILuuo/6sKtr0DQmSThYHLtNCdSdXk9YkxD/jK6X2vmCyyXBBxyqZ4XcnzTyYEAThfX3DCEnLf6igw== - napi-build-utils@^1.0.1: version "1.0.2" resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806" @@ -2715,18 +2689,6 @@ path-type@^4.0.0: resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b" integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw== -path2d-polyfill@^2.0.1: - version "2.1.1" - resolved "https://registry.yarnpkg.com/path2d-polyfill/-/path2d-polyfill-2.1.1.tgz#6098b7bf2fc24c306c6377bcd558b17ba437ea27" - integrity sha512-4Rka5lN+rY/p0CdD8+E+BFv51lFaFvJOrlOhyQ+zjzyQrzyh3ozmxd1vVGGDdIbUFSBtIZLSnspxTgPT0iJhvA== - dependencies: - path2d "0.1.1" - -path2d@0.1.1: - version "0.1.1" - resolved "https://registry.yarnpkg.com/path2d/-/path2d-0.1.1.tgz#d3c3886cd2252fb2a7830c27ea7bb9a862d937ea" - integrity sha512-/+S03c8AGsDYKKBtRDqieTJv2GlkMb0bWjnqOgtF6MkjdUQ9a8ARAtxWf9NgKLGm2+WQr6+/tqJdU8HNGsIDoA== - pdf-parse@^1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7" @@ -2735,16 +2697,6 @@ pdf-parse@^1.1.1: debug "^3.1.0" node-ensure "^0.0.0" -pdfjs-dist@3.4.120: - version "3.4.120" - resolved "https://registry.yarnpkg.com/pdfjs-dist/-/pdfjs-dist-3.4.120.tgz#6f4222117157498f179c95dc4569fad6336a8fdd" - integrity sha512-B1hw9ilLG4m/jNeFA0C2A0PZydjxslP8ylU+I4XM7Bzh/xWETo9EiBV848lh0O0hLut7T6lK1V7cpAXv5BhxWw== - dependencies: - path2d-polyfill "^2.0.1" - web-streams-polyfill "^3.2.1" - optionalDependencies: - canvas "^2.11.0" - peberminta@^0.9.0: version "0.9.0" resolved "https://registry.yarnpkg.com/peberminta/-/peberminta-0.9.0.tgz#8ec9bc0eb84b7d368126e71ce9033501dca2a352" @@ -3175,15 +3127,6 @@ simple-concat@^1.0.0: resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f" integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q== -simple-get@^3.0.3: - version "3.1.1" - resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55" - integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA== - dependencies: - decompress-response "^4.2.0" - once "^1.3.1" - simple-concat "^1.0.0" - simple-get@^4.0.0, simple-get@^4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543" diff --git a/frontend/src/components/EmbeddingSelection/AzureAiOptions/index.jsx b/frontend/src/components/EmbeddingSelection/AzureAiOptions/index.jsx index 209c0aa21..6a15e1ab6 100644 --- a/frontend/src/components/EmbeddingSelection/AzureAiOptions/index.jsx +++ b/frontend/src/components/EmbeddingSelection/AzureAiOptions/index.jsx @@ -1,15 +1,15 @@ export default function AzureAiOptions({ settings }) { return (
-
+
-