From c8ac1f702938b8073331a8858bc5f73fcdc1a5b1 Mon Sep 17 00:00:00 2001 From: Atrem Petrenko <64548375+Artem-ka-create@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:50:50 +0100 Subject: [PATCH] implementing extracting tables from pdf by pdfbox --- build.gradle | 8 - .../controller/api/ExtractController.java | 120 ++++++ .../api/strippers/PDFTableStripper.java | 354 ++++++++++++++++++ .../SPDF/model/api/extract/PDFFilePage.java | 18 + .../templates/convert/pdf-to-csv.html | 165 +++++++- 5 files changed, 639 insertions(+), 26 deletions(-) create mode 100644 src/main/java/stirling/software/SPDF/controller/api/ExtractController.java create mode 100644 src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java create mode 100644 src/main/java/stirling/software/SPDF/model/api/extract/PDFFilePage.java diff --git a/build.gradle b/build.gradle index 42298136..a0cdea0c 100644 --- a/build.gradle +++ b/build.gradle @@ -102,14 +102,6 @@ dependencies { developmentOnly("org.springframework.boot:spring-boot-devtools") compileOnly 'org.projectlombok:lombok:1.18.28' annotationProcessor 'org.projectlombok:lombok:1.18.28' - -//// https://mvnrepository.com/artifact/technology.tabula/tabula -// implementation group: 'technology.tabula', name: 'tabula', version: '1.0.5' - - -// implementation files('/Users/artempetrenko/Java/Stirling-PDF/tabula-1.0.5-jar-with-dependencies.jar') - implementation fileTree(include: ['tabula-1.0.5-jar-with-dependencies.jar'],dir: 'libs') - } diff --git a/src/main/java/stirling/software/SPDF/controller/api/ExtractController.java b/src/main/java/stirling/software/SPDF/controller/api/ExtractController.java new file mode 100644 index 00000000..20cc7328 --- /dev/null +++ b/src/main/java/stirling/software/SPDF/controller/api/ExtractController.java @@ -0,0 +1,120 @@ +package stirling.software.SPDF.controller.api; + +import com.opencsv.CSVWriter; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.http.ContentDisposition; +import org.springframework.http.HttpHeaders; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; +import stirling.software.SPDF.controller.api.strippers.PDFTableStripper; +import stirling.software.SPDF.model.api.extract.PDFFilePage; + +import java.awt.*; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.List; + +@RestController +@RequestMapping("/api/v1/extract/pdf-to-csv") +@Tag(name = "General", description = "General APIs") +public class ExtractController { + + private static final Logger logger = LoggerFactory.getLogger(CropController.class); + + @PostMapping(consumes = "multipart/form-data") + @Operation(summary = "Extracts a PDF document to csv", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO") + public ResponseEntity PdfToCsv(@ModelAttribute PDFFilePage form) + throws IOException { + + ArrayList tableData = new ArrayList<>(); + int columnsCount = 0; + + try (PDDocument document = PDDocument.load(new ByteArrayInputStream(form.getFileInput().getBytes()))) { + final double res = 72; // PDF units are at 72 DPI + PDFTableStripper stripper = new PDFTableStripper(); + stripper.setSortByPosition(true); + stripper.setRegion(new Rectangle((int) Math.round(1.0 * res), (int) Math.round(1 * res), (int) Math.round(6 * res), (int) Math.round(9.0 * res))); + + PDPage pdPage = document.getPage(form.getPageId() - 1); + stripper.extractTable(pdPage); + columnsCount = stripper.getColumns(); + + for (int c = 0; c < columnsCount; ++c) { + for(int r=0; r notEmptyColumns = new ArrayList<>(); + + for (String item: tableData) { + if(!item.trim().isEmpty()){ + notEmptyColumns.add(item); + }else{ + columnsCount--; + } + } + + List fullTable = notEmptyColumns.stream().map((entity)-> + entity.replace('\n',' ').replace('\r',' ').trim().replaceAll("\\s{2,}", "|")).toList(); + + int rowsCount = fullTable.get(0).split("\\|").length; + + ArrayList headersList = getTableHeaders(columnsCount,fullTable); + ArrayList recordList = getRecordsList(rowsCount,fullTable); + + + StringWriter writer = new StringWriter(); + try (CSVWriter csvWriter = new CSVWriter(writer)) { + csvWriter.writeNext(headersList.toArray(new String[0])); + for (String record : recordList) { + csvWriter.writeNext(record.split("\\|")); + } + } + + HttpHeaders headers = new HttpHeaders(); + headers.setContentDisposition(ContentDisposition.builder("attachment").filename(form.getFileInput().getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted.csv").build()); + headers.setContentType(MediaType.parseMediaType("text/csv")); + + return ResponseEntity.ok() + .headers(headers) + .body(writer.toString()); + } + + private ArrayList getRecordsList( int rowsCounts ,List items){ + ArrayList recordsList = new ArrayList<>(); + + for (int b=1; b getTableHeaders(int columnsCount, List items){ + ArrayList resultList = new ArrayList<>(); + for (int i=0;i boxes; + + // Border to allow when finding intersections + private double dx = 1.0; // This value works for me, feel free to tweak (or add setter) + private double dy = 0.000; // Rows of text tend to overlap, so need to extend + + /** + * Region in which to find table (otherwise whole page) + */ + private Rectangle2D regionArea; + + /** + * Number of rows in inferred table + */ + private int nRows=0; + + /** + * Number of columns in inferred table + */ + private int nCols=0; + + /** + * This is the object that does the text extraction + */ + private PDFTextStripperByArea regionStripper; + + /** + * 1D intervals - used for calculateTableRegions() + * @author Beldaz + * + */ + public static class Interval { + double start; + double end; + public Interval(double start, double end) { + this.start=start; this.end = end; + } + public void add(Interval col) { + if(col.startend) + end = col.end; + } + public static void addTo(Interval x, LinkedList columns) { + int p = 0; + Iterator it = columns.iterator(); + // Find where x should go + while(it.hasNext()) { + Interval col = it.next(); + if(x.end>=col.start) { + if(x.start<=col.end) { // overlaps + x.add(col); + it.remove(); + } + break; + } + ++p; + } + while(it.hasNext()) { + Interval col = it.next(); + if(x.start>col.end) + break; + x.add(col); + it.remove(); + } + columns.add(p, x); + } + + } + + + /** + * Instantiate a new PDFTableStripper object. + * + * @param document + * @throws IOException If there is an error loading the properties. + */ + public PDFTableStripper() throws IOException + { + super.setShouldSeparateByBeads(false); + regionStripper = new PDFTextStripperByArea(); + regionStripper.setSortByPosition( true ); + } + + /** + * Define the region to group text by. + * + * @param rect The rectangle area to retrieve the text from. + */ + public void setRegion(Rectangle2D rect ) + { + regionArea = rect; + } + + public int getRows() + { + return nRows; + } + + public int getColumns() + { + return nCols; + } + + /** + * Get the text for the region, this should be called after extractTable(). + * + * @return The text that was identified in that region. + */ + public String getText(int row, int col) + { + return regionStripper.getTextForRegion("el"+col+"x"+row); + } + + public void extractTable(PDPage pdPage) throws IOException + { + setStartPage(getCurrentPageNo()); + setEndPage(getCurrentPageNo()); + + boxes = new HashSet(); + // flip y-axis + flipAT = new AffineTransform(); + flipAT.translate(0, pdPage.getBBox().getHeight()); + flipAT.scale(1, -1); + + // page may be rotated + rotateAT = new AffineTransform(); + int rotation = pdPage.getRotation(); + if (rotation != 0) + { + PDRectangle mediaBox = pdPage.getMediaBox(); + switch (rotation) + { + case 90: + rotateAT.translate(mediaBox.getHeight(), 0); + break; + case 270: + rotateAT.translate(0, mediaBox.getWidth()); + break; + case 180: + rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight()); + break; + default: + break; + } + rotateAT.rotate(Math.toRadians(rotation)); + } + // Trigger processing of the document so that writeString is called. + try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) { + super.output = dummy; + super.processPage(pdPage); + } + + Rectangle2D[][] regions = calculateTableRegions(); + +// System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + " regions"); + for(int i=0; i columns = new LinkedList(); + LinkedList rows = new LinkedList(); + + for(Rectangle2D box: boxes) { + Interval x = new Interval(box.getMinX(), box.getMaxX()); + Interval y = new Interval(box.getMinY(), box.getMaxY()); + + Interval.addTo(x, columns); + Interval.addTo(y, rows); + } + + nRows = rows.size(); + nCols = columns.size(); + Rectangle2D[][] regions = new Rectangle2D[nCols][nRows]; + int i=0; + // Label regions from top left, rather than the transformed orientation + for(Interval column: columns) { + int j=0; + for(Interval row: rows) { + regions[nCols-i-1][nRows-j-1] = new Rectangle2D.Double(column.start, row.start, column.end - column.start, row.end - row.start); + ++j; + } + ++i; + } + + return regions; + } + + /** + * Register each character's bounding box, updating boxes field to maintain + * a list of all distinct groups of characters. + * + * Overrides the default functionality of PDFTextStripper. + * Most of this is taken from DrawPrintTextLocations.java, with extra steps + * at end of main loop + */ + @Override + protected void writeString(String string, List textPositions) throws IOException + { + for (TextPosition text : textPositions) + { + // glyph space -> user space + // note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix + AffineTransform at = text.getTextMatrix().createAffineTransform(); + PDFont font = text.getFont(); + BoundingBox bbox = font.getBoundingBox(); + + // advance width, bbox height (glyph space) + float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars + Rectangle2D.Float rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight()); + + if (font instanceof PDType3Font) + { + // bbox and font matrix are unscaled + at.concatenate(font.getFontMatrix().createAffineTransform()); + } + else + { + // bbox and font matrix are already scaled to 1000 + at.scale(1/1000f, 1/1000f); + } + Shape s = at.createTransformedShape(rect); + s = flipAT.createTransformedShape(s); + s = rotateAT.createTransformedShape(s); + + + // + // Merge character's bounding box with boxes field + // + Rectangle2D bounds = s.getBounds2D(); + // Pad sides to detect almost touching boxes + Rectangle2D hitbox = bounds.getBounds2D(); + hitbox.add(bounds.getMinX() - dx , bounds.getMinY() - dy); + hitbox.add(bounds.getMaxX() + dx , bounds.getMaxY() + dy); + + // Find all overlapping boxes + List intersectList = new ArrayList(); + for(Rectangle2D box: boxes) { + if(box.intersects(hitbox)) { + intersectList.add(box); + } + } + + // Combine all touching boxes and update + // (NOTE: Potentially this could leave some overlapping boxes un-merged, + // but it's sufficient for now and get's fixed up in calculateTableRegions) + for(Rectangle2D box: intersectList) { + bounds.add(box); + boxes.remove(box); + } + boxes.add(bounds); + + } + + } + + /** + * This method does nothing in this derived class, because beads and regions are incompatible. Beads are + * ignored when stripping by area. + * + * @param aShouldSeparateByBeads The new grouping of beads. + */ + @Override + public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) + { + } + + /** + * Adapted from PDFTextStripperByArea + * {@inheritDoc} + */ + @Override + protected void processTextPosition( TextPosition text ) + { + if(regionArea!=null && !regionArea.contains( text.getX(), text.getY() ) ) { + // skip character + } else { + super.processTextPosition( text ); + } + } +} \ No newline at end of file diff --git a/src/main/java/stirling/software/SPDF/model/api/extract/PDFFilePage.java b/src/main/java/stirling/software/SPDF/model/api/extract/PDFFilePage.java new file mode 100644 index 00000000..bfe87a16 --- /dev/null +++ b/src/main/java/stirling/software/SPDF/model/api/extract/PDFFilePage.java @@ -0,0 +1,18 @@ +package stirling.software.SPDF.model.api.extract; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; +import lombok.EqualsAndHashCode; +import stirling.software.SPDF.model.api.PDFFile; + +@Data +@EqualsAndHashCode(callSuper=true) +public class PDFFilePage extends PDFFile { + + + @Schema(description = "Number of chosen page", type = "number") + private int pageId; + + +} + diff --git a/src/main/resources/templates/convert/pdf-to-csv.html b/src/main/resources/templates/convert/pdf-to-csv.html index bad849fb..cbd87ad8 100644 --- a/src/main/resources/templates/convert/pdf-to-csv.html +++ b/src/main/resources/templates/convert/pdf-to-csv.html @@ -1,29 +1,158 @@ - - - -
-
-
-

-
-
-
-

-
-
-
- + -
-

+ + +
+
+ +
+
+
+

+
+ +
+ +
+ + +
+
+ + + + +
+
+ + +
-
+
+