From 0b1cdf6a689451c36f64619e1f39e05cb821e994 Mon Sep 17 00:00:00 2001
From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
Date: Mon, 8 May 2023 12:18:48 +0100
Subject: [PATCH] remove blanks

---
 scripts/detect-blank-pages.py                 | 47 +++++++++++
 .../api/other/BlankPageController.java        | 87 +++++++++++--------
 .../controller/web/OtherWebController.java    |  7 ++
 src/main/resources/templates/home.html        |  2 +
 .../templates/other/remove-blanks.html        | 28 ++++++
 5 files changed, 136 insertions(+), 35 deletions(-)
 create mode 100644 scripts/detect-blank-pages.py
 create mode 100644 src/main/resources/templates/other/remove-blanks.html

diff --git a/scripts/detect-blank-pages.py b/scripts/detect-blank-pages.py
new file mode 100644
index 00000000..1fb80187
--- /dev/null
+++ b/scripts/detect-blank-pages.py
@@ -0,0 +1,47 @@
+import cv2
+import numpy as np
+import sys
+
+def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5):
+    """Return True when the image at image_path is almost entirely white.
+
+    The image is read as grayscale, Gaussian-blurred with a
+    (blur_size, blur_size) kernel, then binary-thresholded at
+    white_value - threshold. It counts as blank when more than 99% of
+    the thresholded pixels equal white_value. An unreadable file is
+    reported on stdout and treated as not blank.
+    """
+    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
+
+    if image is None:
+        print(f"Error: Unable to read the image file: {image_path}")
+        return False
+
+    # Apply Gaussian blur to reduce noise
+    blurred_image = cv2.GaussianBlur(image, (blur_size, blur_size), 0)
+
+    _, thresholded_image = cv2.threshold(blurred_image, white_value - threshold, white_value, cv2.THRESH_BINARY)
+
+    # Calculate the percentage of white pixels in the thresholded image
+    white_pixels = np.sum(thresholded_image == white_value)
+    total_pixels = thresholded_image.size
+    white_pixel_percentage = (white_pixels / total_pixels) * 100
+
+    return white_pixel_percentage > 99
+
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python3 detect-blank-pages.py <image_path>")
+        sys.exit(1)
+
+    image_path = sys.argv[1]
+    blank = is_blank_image(image_path)
+
+    if blank:
+        # Return code 1: The image is considered blank.
+        sys.exit(1)
+    else:
+        # Return code 0: The image is not considered blank.
+        sys.exit(0)
\ No newline at end of file
diff --git a/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java b/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java
index 0934418c..40b6bdb1 100644
--- a/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java
@@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.other;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.rendering.PDFRenderer;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.springframework.http.HttpStatus;
 import org.springframework.http.ResponseEntity;
@@ -10,38 +11,37 @@ import org.springframework.web.bind.annotation.RequestPart;
 import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.multipart.MultipartFile;
 import stirling.software.SPDF.utils.ImageFinder;
+import stirling.software.SPDF.utils.ProcessExecutor;
+
+import java.awt.image.BufferedImage;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
+import javax.imageio.ImageIO;
+
 @RestController
 public class BlankPageController {
 
+    /**
+     * Collects the non-blank pages of the uploaded PDF into a new document.
+     *
+     * A page is a candidate to keep when it has extractable text or when
+     * hasImagesOnPage reports images; each candidate is then rendered to a
+     * PNG and checked by the OpenCV script, whose exit code 0 means "not blank".
+     */
     @PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
-    public ResponseEntity removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, @RequestPart(required = true, value = "processType") int processType) throws IOException, InterruptedException {
-        boolean removeNoText = false;
-        boolean removeNoTextOrImages = false;
-
-        if(processType == 0) {
-            removeNoText = true;
-        } else if (processType == 1) {
-            removeNoTextOrImages = true;
-        } else if (processType == 2) {
-            //run OCR
-            OCRController ocr = new OCRController();
-            ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
-
-            removeNoText = true;
-        }
-
+    public ResponseEntity removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
         try {
             PDDocument document = PDDocument.load(inputFile.getInputStream());
             PDPageTree pages = document.getDocumentCatalog().getPages();
             PDFTextStripper textStripper = new PDFTextStripper();
 
-            List<PDPage> pagesToKeep = new ArrayList<>();
+            List<Integer> pagesToKeepIndex = new ArrayList<>();
             int pageIndex = 0;
 
             for (PDPage page : pages) {
@@ -50,28 +50,45 @@ public class BlankPageController {
                 textStripper.setEndPage(pageIndex);
                 String pageText = textStripper.getText(document);
                 boolean hasText = !pageText.trim().isEmpty();
+                if (hasText) {
+                    pagesToKeepIndex.add(pageIndex);
+                    System.out.println("page " + pageIndex + " has text");
+                    continue;
+                }
                 boolean hasImages = hasImagesOnPage(page);
-
-                if (removeNoText && removeNoTextOrImages) {
-                    if (hasText || hasImages) {
-                        pagesToKeep.add(page);
-                    }
-                } else if (removeNoText) {
-                    if (hasText) {
-                        pagesToKeep.add(page);
-                    }
-                } else if (removeNoTextOrImages) {
-                    if (hasText && hasImages) {
-                        pagesToKeep.add(page);
-                    }
-                } else {
-                    pagesToKeep.add(page);
+                if (hasImages) {
+                    pagesToKeepIndex.add(pageIndex);
+                    System.out.println("page " + pageIndex + " has image");
+                    continue;
                 }
             }
-
+            System.out.println(pagesToKeepIndex.size());
             PDDocument outputDocument = new PDDocument();
-            for (PDPage page : pagesToKeep) {
-                outputDocument.addPage(page);
+            PDFRenderer pdfRenderer = new PDFRenderer(document);
+            for (Integer i : pagesToKeepIndex) {
+                // Create temp file to save the image
+                Path tempFile = Files.createTempFile("image_", ".png");
+
+                try {
+                    // Render image and save as temp file (i is 1-based, PDFBox pages are 0-based)
+                    BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
+                    ImageIO.write(image, "png", tempFile.toFile());
+
+                    List<String> command = new ArrayList<>(Arrays.asList("python3", "/scripts/detect-blank-pages.py", tempFile.toString()));
+
+                    // Run CLI command
+                    int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
+
+                    // Exit code 0: the page does contain data
+                    if (returnCode == 0) {
+                        outputDocument.addPage(document.getPage(i - 1));
+                    } else {
+                        System.out.println("Found blank page skipping, page #" + i);
+                    }
+                } finally {
+                    // The rendered PNG is only needed for the OpenCV check; never leave it behind.
+                    Files.deleteIfExists(tempFile);
+                }
             }
 
             ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
diff --git a/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java b/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java
index 8199070c..b937d4c0 100644
--- a/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java
+++ b/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java
@@ -95,4 +95,11 @@ public class OtherWebController {
         return "other/repair";
     }
 
+    @GetMapping("/remove-blanks")
+    @Hidden
+    public String removeBlanksForm(Model model) {
+        model.addAttribute("currentPage", "remove-blanks");
+        return "other/remove-blanks";
+    }
+
 }
diff --git a/src/main/resources/templates/home.html b/src/main/resources/templates/home.html
index 6e55502d..e143f28b 100644
--- a/src/main/resources/templates/home.html
+++ b/src/main/resources/templates/home.html
@@ -112,6 +112,8 @@ filter: invert(0.2) sepia(2) saturate(50) hue-rotate(190deg);
+
+
diff --git a/src/main/resources/templates/other/remove-blanks.html b/src/main/resources/templates/other/remove-blanks.html new file mode 100644 index 00000000..e6d99d4b --- /dev/null +++ b/src/main/resources/templates/other/remove-blanks.html @@ -0,0 +1,28 @@ + + + + + + + +
+
+
+

+
+
+
+

+ +
+
+ +
+
+
+
+
+
+
+ + \ No newline at end of file