anything-llm/collector/utils/tokenizer/index.js
Timothy Carambat 719521c307
Document Processor v2 (#442)
* wip: init refactor of document processor to JS

* add NodeJs PDF support

* wip: parity with python processor
feat: add pptx support

* fix: forgot files

* Remove python scripts totally

* wip:update docker to boot new collector

* add package.json support

* update dockerfile for new build

* update gitignore and linting

* add more protections on file lookup

* update package.json

* test build

* update docker commands to use cap-add=SYS_ADMIN so web scraper can run
update all scripts to reflect this
remove docker build for branch
2023-12-14 15:14:56 -08:00

16 lines
304 B
JavaScript

const { getEncoding } = require("js-tiktoken");
// Encoder instance shared by every call. It is created lazily on first use so
// that the constant "cl100k_base" encoding is not re-requested per call, and so
// that a failure to create it is still handled inside tokenizeString.
let cachedEncoder = null;

/**
 * Tokenize a string with the "cl100k_base" encoding.
 *
 * Best-effort: any error raised while creating the encoder or encoding the
 * input is logged and an empty array is returned instead of throwing.
 *
 * @param {string} [input=""] - Text to tokenize.
 * @returns {Array} Whatever the encoder's `encode` returns for `input`
 *   (presumably an array of numeric token ids), or `[]` on failure.
 */
function tokenizeString(input = "") {
  try {
    // Only cache on success: if getEncoding throws, the next call retries.
    if (cachedEncoder === null) {
      cachedEncoder = getEncoding("cl100k_base");
    }
    return cachedEncoder.encode(input);
  } catch (e) {
    // Include the underlying reason so failures can be diagnosed from logs.
    console.error(
      "Could not tokenize string!",
      e instanceof Error ? e.message : e
    );
    return [];
  }
}
// Public API of this module: the tokenizer helper is the only export.
module.exports = { tokenizeString: tokenizeString };