diff --git a/.github/workflows/build-and-push-image.yaml b/.github/workflows/build-and-push-image.yaml index bbc2d0064..f29fa511d 100644 --- a/.github/workflows/build-and-push-image.yaml +++ b/.github/workflows/build-and-push-image.yaml @@ -81,13 +81,14 @@ jobs: type=ref,event=tag type=ref,event=pr - - name: Build and push multi-platform Docker image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . file: ./docker/Dockerfile push: true + sbom: true + provenance: mode=max platforms: linux/amd64,linux/arm64 tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} diff --git a/cloud-deployments/digitalocean/terraform/DEPLOY.md b/cloud-deployments/digitalocean/terraform/DEPLOY.md index 1877abc2e..7baf5d401 100644 --- a/cloud-deployments/digitalocean/terraform/DEPLOY.md +++ b/cloud-deployments/digitalocean/terraform/DEPLOY.md @@ -12,16 +12,18 @@ The output of this Terraform configuration will be: - Follow the instructions in the [official Terraform documentation](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) for your operating system. ## How to deploy on DigitalOcean -Open your terminal and navigate to the `digitalocean/terraform` folder -1. Replace the token value in the provider "digitalocean" block in main.tf with your DigitalOcean API token. -2. Run the following commands to initialize Terraform, review the infrastructure changes, and apply them: +Open your terminal and navigate to the `docker` folder +1. Create a `.env` file by cloning the `.env.example`. +2. Navigate to `digitalocean/terraform` folder. +3. Replace the token value in the provider "digitalocean" block in main.tf with your DigitalOcean API token. +4. Run the following commands to initialize Terraform, review the infrastructure changes, and apply them: ``` terraform init terraform plan terraform apply ``` Confirm the changes by typing yes when prompted. -4. Once the deployment is complete, Terraform will output the public IP address of your droplet. You can access your application using this IP address. +5. Once the deployment is complete, Terraform will output the public IP address of your droplet. You can access your application using this IP address. ## How to deploy on DigitalOcean To delete the resources created by Terraform, run the following command in the terminal: diff --git a/cloud-deployments/digitalocean/terraform/main.tf b/cloud-deployments/digitalocean/terraform/main.tf index 331d254e3..7a76f57b2 100644 --- a/cloud-deployments/digitalocean/terraform/main.tf +++ b/cloud-deployments/digitalocean/terraform/main.tf @@ -16,7 +16,7 @@ provider "digitalocean" { resource "digitalocean_droplet" "anything_llm_instance" { - image = "ubuntu-22-10-x64" + image = "ubuntu-24-04-x64" name = "anything-llm-instance" region = "nyc3" size = "s-2vcpu-2gb" diff --git a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js index 698769062..26bcf2b1c 100644 --- a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js +++ b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js @@ -1,5 +1,4 @@ const fs = require("fs").promises; -const pdf = require("pdf-parse"); class PDFLoader { constructor(filePath, { splitPages = true } = {}) { @@ -9,54 +8,90 @@ class PDFLoader { async load() { const buffer = await fs.readFile(this.filePath); + const { getDocument, version } = await this.getPdfJS(); - const options = { - pagerender: this.splitPages ? this.renderPage : null, - }; + const pdf = await getDocument({ + data: new Uint8Array(buffer), + useWorkerFetch: false, + isEvalSupported: false, + useSystemFonts: true, + }).promise; - const { text, numpages, info, metadata, version } = await pdf( - buffer, - options - ); + const meta = await pdf.getMetadata().catch(() => null); + const documents = []; - if (!this.splitPages) { - return [ - { - pageContent: text.trim(), - metadata: { - source: this.filePath, - pdf: { version, info, metadata, totalPages: numpages }, + for (let i = 1; i <= pdf.numPages; i += 1) { + const page = await pdf.getPage(i); + const content = await page.getTextContent(); + + if (content.items.length === 0) { + continue; + } + + let lastY; + const textItems = []; + for (const item of content.items) { + if ("str" in item) { + if (lastY === item.transform[5] || !lastY) { + textItems.push(item.str); + } else { + textItems.push(`\n${item.str}`); + } + lastY = item.transform[5]; + } + } + + const text = textItems.join(""); + documents.push({ + pageContent: text.trim(), + metadata: { + source: this.filePath, + pdf: { + version, + info: meta?.info, + metadata: meta?.metadata, + totalPages: pdf.numPages, + }, + loc: { pageNumber: i }, + }, + }); + } + + if (this.splitPages) { + return documents; + } + + if (documents.length === 0) { + return []; + } + + return [ + { + pageContent: documents.map((doc) => doc.pageContent).join("\n\n"), + metadata: { + source: this.filePath, + pdf: { + version, + info: meta?.info, + metadata: meta?.metadata, + totalPages: pdf.numPages, }, }, - ]; - } - - return this.pages.map((pageContent, index) => ({ - pageContent: pageContent.trim(), - metadata: { - source: this.filePath, - pdf: { version, info, metadata, totalPages: numpages }, - loc: { pageNumber: index + 1 }, }, - })); + ]; } - pages = []; - - renderPage = async (pageData) => { - const textContent = await pageData.getTextContent(); - let lastY, - text = ""; - for (const item of textContent.items) { - if (lastY !== item.transform[5] && lastY !== undefined) { - text += "\n"; - } - text += item.str; - lastY = item.transform[5]; + async getPdfJS() { + try { + const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js"); + return { getDocument: pdfjs.getDocument, version: pdfjs.version }; + } catch (e) { + console.error(e); + throw new Error( + "Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`." + ); } - this.pages.push(text); - return text; - }; + } } module.exports = PDFLoader; diff --git a/docker/Dockerfile b/docker/Dockerfile index 2edbadb2c..e531e4c87 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,5 +1,5 @@ # Setup base image -FROM ubuntu:jammy-20230916 AS base +FROM ubuntu:jammy-20240627.1 AS base # Build arguments ARG ARG_UID=1000 @@ -138,7 +138,7 @@ USER anythingllm FROM frontend-deps AS build-stage COPY ./frontend/ ./frontend/ WORKDIR /app/frontend -RUN yarn build && yarn cache clean +RUN yarn build && yarn cache clean && rm -rf node_modules WORKDIR /app # Setup the server diff --git a/embed/src/components/ChatWindow/ChatContainer/ChatHistory/HistoricalMessage/index.jsx b/embed/src/components/ChatWindow/ChatContainer/ChatHistory/HistoricalMessage/index.jsx index d4bc867e9..09d352053 100644 --- a/embed/src/components/ChatWindow/ChatContainer/ChatHistory/HistoricalMessage/index.jsx +++ b/embed/src/components/ChatWindow/ChatContainer/ChatHistory/HistoricalMessage/index.jsx @@ -16,6 +16,7 @@ const HistoricalMessage = forwardRef( const textSize = !!embedderSettings.settings.textSize ? `allm-text-[${embedderSettings.settings.textSize}px]` : "allm-text-sm"; + if (error) console.error(`ANYTHING_LLM_CHAT_WIDGET_ERROR: ${error}`); return (
- {error} + Server error
"+$n(e[t].content)+"
\n"},zt.fence=function(e,t,n,r,o){var u,c,p,d,g,a=e[t],s=a.info?Fb(a.info).trim():"",i="",l="";return s&&(i=(p=s.split(/(\s+)/g))[0],l=p.slice(2).join("")),0===(u=n.highlight&&n.highlight(a.content,i,l)||$n(a.content)).indexOf(""+u+"
\n"):""+u+"
\n"},zt.image=function(e,t,n,r,o){var a=e[t];return a.attrs[a.attrIndex("alt")][1]=o.renderInlineAsText(a.children,n,r),o.renderToken(e,t,n)},zt.hardbreak=function(e,t,n){return n.xhtmlOut?""u"&&(t.env.references={}),typeof t.env.references[d]>"u"&&(t.env.references[d]={title:v,href:u}),t.parentType=A,t.line=n+N+1),!0)}],["html_block",function(t,n,r,o){var a,s,i,l,u=t.bMarks[n]+t.tShift[n],c=t.eMarks[n];if(t.sCount[n]-t.blkIndent>=4||!t.md.options.html||60!==t.src.charCodeAt(u))return!1;for(l=t.src.slice(u,c),a=0;a=e.pos)throw new Error("inline rule didn't increment state.pos");break}}else e.pos=e.posMax;t||e.pos++,i[r]=e.pos}},Mo.prototype.tokenize=function(e){for(var t,n,r,o=this.ruler.getRules(""),a=o.length,s=e.posMax,i=e.md.options.maxNesting;e.pos=e.pos)throw new Error("inline rule didn't increment state.pos");break}if(t){if(e.pos>=s)break}else e.pending+=e.src[e.pos++]}e.pending&&e.pushPending()},Mo.prototype.parse=function(e,t,n,r){var o,a,s,i=new this.State(e,t,n,r);for(this.tokenize(i),s=(a=this.ruler2.getRules("")).length,o=0;o=3&&":"===e[t-3]||t>=3&&"/"===e[t-3]?0:r.match(n.re.no_http)[0].length:0}},"mailto:":{validate:function(e,t,n){var r=e.slice(t);return n.re.mailto||(n.re.mailto=new RegExp("^"+n.re.src_email_name+"@"+n.re.src_host_strict,"i")),n.re.mailto.test(r)?r.match(n.re.mailto)[0].length:0}}},Tv="a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]",Cv="biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф".split("|");function Ns(e){var t=e.re=(ff||(ff=1,yu=function(e){var t={};e=e||{},t.src_Any=kp().source,t.src_Cc=Mp().source,t.src_Z=Pp().source,t.src_P=su.source,t.src_ZPCc=[t.src_Z,t.src_P,t.src_Cc].join("|"),t.src_ZCc=[t.src_Z,t.src_Cc].join("|");var n="[><|]";return t.src_pseudo_letter="(?:(?![><|]|"+t.src_ZPCc+")"+t.src_Any+")",t.src_ip4="(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",t.src_auth="(?:(?:(?!"+t.src_ZCc+"|[@/\\[\\]()]).)+@)?",t.src_port="(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?",t.src_host_terminator="(?=$|[><|]|"+t.src_ZPCc+")(?!"+(e["---"]?"-(?!--)|":"-|")+"_|:\\d|\\.-|\\.(?!$|"+t.src_ZPCc+"))",t.src_path="(?:[/?#](?:(?!"+t.src_ZCc+"|"+n+"|[()[\\]{}.,\"'?!\\-;]).|\\[(?:(?!"+t.src_ZCc+"|\\]).)*\\]|\\((?:(?!"+t.src_ZCc+"|[)]).)*\\)|\\{(?:(?!"+t.src_ZCc+'|[}]).)*\\}|\\"(?:(?!'+t.src_ZCc+'|["]).)+\\"|\\\'(?:(?!'+t.src_ZCc+"|[']).)+\\'|\\'(?="+t.src_pseudo_letter+"|[-])|\\.{2,}[a-zA-Z0-9%/&]|\\.(?!"+t.src_ZCc+"|[.]|$)|"+(e["---"]?"\\-(?!--(?:[^-]|$))(?:-*)|":"\\-+|")+",(?!"+t.src_ZCc+"|$)|;(?!"+t.src_ZCc+"|$)|\\!+(?!"+t.src_ZCc+"|[!]|$)|\\?(?!"+t.src_ZCc+"|[?]|$))+|\\/)?",t.src_email_name='[\\-;:&=\\+\\$,\\.a-zA-Z0-9_][\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]*',t.src_xn="xn--[a-z0-9\\-]{1,59}",t.src_domain_root="(?:"+t.src_xn+"|"+t.src_pseudo_letter+"{1,63})",t.src_domain="(?:"+t.src_xn+"|(?:"+t.src_pseudo_letter+")|(?:"+t.src_pseudo_letter+"(?:-|"+t.src_pseudo_letter+"){0,61}"+t.src_pseudo_letter+"))",t.src_host="(?:(?:(?:(?:"+t.src_domain+")\\.)*"+t.src_domain+"))",t.tpl_host_fuzzy="(?:"+t.src_ip4+"|(?:(?:(?:"+t.src_domain+")\\.)+(?:%TLDS%)))",t.tpl_host_no_ip_fuzzy="(?:(?:(?:"+t.src_domain+")\\.)+(?:%TLDS%))",t.src_host_strict=t.src_host+t.src_host_terminator,t.tpl_host_fuzzy_strict=t.tpl_host_fuzzy+t.src_host_terminator,t.src_host_port_strict=t.src_host+t.src_port+t.src_host_terminator,t.tpl_host_port_fuzzy_strict=t.tpl_host_fuzzy+t.src_port+t.src_host_terminator,t.tpl_host_port_no_ip_fuzzy_strict=t.tpl_host_no_ip_fuzzy+t.src_port+t.src_host_terminator,t.tpl_host_fuzzy_test="localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:"+t.src_ZPCc+"|>|$))",t.tpl_email_fuzzy='(^|[><|]|"|\\(|'+t.src_ZCc+")("+t.src_email_name+"@"+t.tpl_host_fuzzy_strict+")",t.tpl_link_fuzzy="(^|(?![.:/\\-_@])(?:[$+<=>^`||]|"+t.src_ZPCc+"))((?![$+<=>^`||])"+t.tpl_host_port_fuzzy_strict+t.src_path+")",t.tpl_link_no_ip_fuzzy="(^|(?![.:/\\-_@])(?:[$+<=>^`||]|"+t.src_ZPCc+"))((?![$+<=>^`||])"+t.tpl_host_port_no_ip_fuzzy_strict+t.src_path+")",t}),yu)(e.__opts__),n=e.__tlds__.slice();function r(i){return i.replace("%TLDS%",t.src_tlds)}e.onCompile(),e.__tlds_replaced__||n.push(Tv),n.push(t.src_xn),t.src_tlds=n.join("|"),t.email_fuzzy=RegExp(r(t.tpl_email_fuzzy),"i"),t.link_fuzzy=RegExp(r(t.tpl_link_fuzzy),"i"),t.link_no_ip_fuzzy=RegExp(r(t.tpl_link_no_ip_fuzzy),"i"),t.host_fuzzy_test=RegExp(r(t.tpl_host_fuzzy_test),"i");var o=[];function a(i,l){throw new Error('(LinkifyIt) Invalid schema "'+i+'": '+l)}e.__compiled__={},Object.keys(e.__schemas__).forEach((function(i){var l=e.__schemas__[i];if(null!==l){var u={validate:null,link:null};if(e.__compiled__[i]=u,function(e){return"[object Object]"===Cs(e)}(l))return!function(e){return"[object RegExp]"===Cs(e)}(l.validate)?mf(l.validate)?u.validate=l.validate:a(i,l):u.validate=function(e){return function(t,n){var r=t.slice(n);return e.test(r)?r.match(e)[0].length:0}}(l.validate),void(mf(l.normalize)?u.normalize=l.normalize:l.normalize?a(i,l):u.normalize=function(e,t){t.normalize(e)});if(function(e){return"[object String]"===Cs(e)}(l))return void o.push(i);a(i,l)}})),o.forEach((function(i){e.__compiled__[e.__schemas__[i]]&&(e.__compiled__[i].validate=e.__compiled__[e.__schemas__[i]].validate,e.__compiled__[i].normalize=e.__compiled__[e.__schemas__[i]].normalize)})),e.__compiled__[""]={validate:null,normalize:function(e,t){t.normalize(e)}};var s=Object.keys(e.__compiled__).filter((function(i){return i.length>0&&e.__compiled__[i]})).map(vv).join("|");e.re.schema_test=RegExp("(^|(?!_)(?:[><|]|"+t.src_ZPCc+"))("+s+")","i"),e.re.schema_search=RegExp("(^|(?!_)(?:[><|]|"+t.src_ZPCc+"))("+s+")","ig"),e.re.schema_at_start=RegExp("^"+e.re.schema_search.source,"i"),e.re.pretest=RegExp("("+e.re.schema_test.source+")|("+e.re.host_fuzzy_test.source+")|@","i"),function(e){e.__index__=-1,e.__text_cache__=""}(e)}function wv(e,t){var n=e.__index__,r=e.__last_index__,o=e.__text_cache__.slice(n,r);this.schema=e.__schema__.toLowerCase(),this.index=n+t,this.lastIndex=r+t,this.raw=o,this.text=o,this.url=o}function Cu(e,t){var n=new wv(e,t);return e.__compiled__[n.schema].normalize(n,e),n}function ft(e,t){if(!(this instanceof ft))return new ft(e,t);t||function(e){return Object.keys(e||{}).reduce((function(t,n){return t||gf.hasOwnProperty(n)}),!1)}(e)&&(t=e,e={}),this.__opts__=Tu({},gf,t),this.__index__=-1,this.__last_index__=-1,this.__schema__="",this.__text_cache__="",this.__schemas__=Tu({},yv,e),this.__compiled__={},this.__tlds__=Cv,this.__tlds_replaced__=!1,this.re={},Ns(this)}ft.prototype.add=function(t,n){return this.__schemas__[t]=n,Ns(this),this},ft.prototype.set=function(t){return this.__opts__=Tu(this.__opts__,t),this},ft.prototype.test=function(t){if(this.__text_cache__=t,this.__index__=-1,!t.length)return!1;var n,r,o,a,s,i,l,u;if(this.re.schema_test.test(t))for((l=this.re.schema_search).lastIndex=0;null!==(n=l.exec(t));)if(a=this.testSchemaAt(t,n[2],l.lastIndex)){this.__schema__=n[2],this.__index__=n.index+n[1].length,this.__last_index__=n.index+n[0].length+a;break}return this.__opts__.fuzzyLink&&this.__compiled__["http:"]&&((u=t.search(this.re.host_fuzzy_test))>=0&&(this.__index__<0||u=128&&Sn("not-basic"),t.push(e.charCodeAt(i));for(let i=s>0?s+1:0;i