Stirling-PDF/src/main/java/stirling/software/SPDF/pdf/TextFinder.java

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

105 lines
3.7 KiB
Java
Raw Normal View History

2023-08-25 00:23:25 +02:00
package stirling.software.SPDF.pdf;
2023-12-30 20:11:27 +01:00
2023-08-25 00:23:25 +02:00
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
2023-08-27 01:39:22 +02:00
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import stirling.software.SPDF.model.PDFText;
2023-08-25 00:23:25 +02:00
public class TextFinder extends PDFTextStripper {
private final String searchText;
private final boolean useRegex;
private final boolean wholeWordSearch;
private final List<PDFText> textOccurrences = new ArrayList<>();
2024-03-26 20:25:16 +01:00
private class MatchInfo {
int startIndex;
int matchLength;
MatchInfo(int startIndex, int matchLength) {
this.startIndex = startIndex;
this.matchLength = matchLength;
}
}
2023-08-25 00:23:25 +02:00
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
throws IOException {
this.searchText = searchText.toLowerCase();
this.useRegex = useRegex;
this.wholeWordSearch = wholeWordSearch;
setSortByPosition(true);
}
2024-03-26 20:25:16 +01:00
private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
List<MatchInfo> matches = new ArrayList<>();
2023-08-25 00:23:25 +02:00
Pattern pattern;
if (useRegex) {
// Use regex-based search
pattern =
wholeWordSearch
2024-03-26 20:25:16 +01:00
? Pattern.compile("\\b" + searchText + "\\b")
2023-08-25 00:23:25 +02:00
: Pattern.compile(searchText);
} else {
// Use normal text search
pattern =
wholeWordSearch
2024-03-26 20:25:16 +01:00
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
2023-08-25 00:23:25 +02:00
: Pattern.compile(Pattern.quote(searchText));
}
Matcher matcher = pattern.matcher(content);
while (matcher.find()) {
2024-03-26 20:25:16 +01:00
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
2023-08-25 00:23:25 +02:00
}
2024-03-26 20:25:16 +01:00
return matches;
2023-08-25 00:23:25 +02:00
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) {
2024-03-26 20:25:16 +01:00
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
int index = match.startIndex;
if (index + match.matchLength <= textPositions.size()) {
2023-08-25 00:23:25 +02:00
// Initial values based on the first character
TextPosition first = textPositions.get(index);
float minX = first.getX();
float minY = first.getY();
float maxX = first.getX() + first.getWidth();
float maxY = first.getY() + first.getHeight();
// Loop over the rest of the characters and adjust bounding box values
2024-03-26 20:25:16 +01:00
for (int i = index; i < index + match.matchLength; i++) {
2023-08-25 00:23:25 +02:00
TextPosition position = textPositions.get(i);
minX = Math.min(minX, position.getX());
minY = Math.min(minY, position.getY());
maxX = Math.max(maxX, position.getX() + position.getWidth());
maxY = Math.max(maxY, position.getY() + position.getHeight());
}
textOccurrences.add(
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
2023-12-30 20:11:27 +01:00
}
}
}
2023-08-25 00:23:25 +02:00
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
this.getText(document);
System.out.println(
2023-12-30 20:11:27 +01:00
"Found "
2023-08-25 00:23:25 +02:00
+ textOccurrences.size()
+ " occurrences of '"
+ searchText
+ "' in the document.");
return textOccurrences;
}
2023-12-30 20:11:27 +01:00
}