From 2fff3083ae9d608397582abbde86cff0e79d123d Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Tue, 26 Mar 2024 19:25:16 +0000 Subject: [PATCH] Update TextFinder.java (#980) --- .../software/SPDF/pdf/TextFinder.java | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index cdfb5501..f9e339c2 100644 --- a/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -19,6 +19,16 @@ public class TextFinder extends PDFTextStripper { private final boolean wholeWordSearch; private final List textOccurrences = new ArrayList<>(); + private class MatchInfo { + int startIndex; + int matchLength; + + MatchInfo(int startIndex, int matchLength) { + this.startIndex = startIndex; + this.matchLength = matchLength; + } + } + public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) throws IOException { this.searchText = searchText.toLowerCase(); @@ -27,36 +37,37 @@ public class TextFinder extends PDFTextStripper { setSortByPosition(true); } - private List findOccurrencesInText(String searchText, String content) { - List indexes = new ArrayList<>(); + private List findOccurrencesInText(String searchText, String content) { + List matches = new ArrayList<>(); + Pattern pattern; if (useRegex) { // Use regex-based search pattern = wholeWordSearch - ? Pattern.compile("(\\b|_|\\.)" + searchText + "(\\b|_|\\.)") + ? Pattern.compile("\\b" + searchText + "\\b") : Pattern.compile(searchText); } else { // Use normal text search pattern = wholeWordSearch - ? Pattern.compile( - "(\\b|_|\\.)" + Pattern.quote(searchText) + "(\\b|_|\\.)") + ? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b") : Pattern.compile(Pattern.quote(searchText)); } Matcher matcher = pattern.matcher(content); while (matcher.find()) { - indexes.add(matcher.start()); + matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start())); } - return indexes; + return matches; } @Override protected void writeString(String text, List textPositions) { - for (Integer index : findOccurrencesInText(searchText, text.toLowerCase())) { - if (index + searchText.length() <= textPositions.size()) { + for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) { + int index = match.startIndex; + if (index + match.matchLength <= textPositions.size()) { // Initial values based on the first character TextPosition first = textPositions.get(index); float minX = first.getX(); @@ -65,7 +76,7 @@ public class TextFinder extends PDFTextStripper { float maxY = first.getY() + first.getHeight(); // Loop over the rest of the characters and adjust bounding box values - for (int i = index; i < index + searchText.length(); i++) { + for (int i = index; i < index + match.matchLength; i++) { TextPosition position = textPositions.get(i); minX = Math.min(minX, position.getX()); minY = Math.min(minY, position.getY());