From 9af537c985d59e861ae5b1955c5d75ee4bfe9bb8 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Mon, 1 May 2023 21:53:10 +0100 Subject: [PATCH] ocr remove images --- .../controller/api/other/OCRController.java | 18 +++++++++-- .../controller/web/OtherWebController.java | 9 ++++++ .../software/SPDF/utils/ProcessExecutor.java | 3 +- src/main/resources/messages_ar_AR.properties | 2 ++ src/main/resources/messages_de_DE.properties | 2 ++ src/main/resources/messages_en_GB.properties | 2 ++ src/main/resources/messages_es_ES.properties | 2 ++ src/main/resources/messages_fr_FR.properties | 2 ++ .../templates/other/adjust-contrast.html | 32 +++++++++++++++++++ .../resources/templates/other/ocr-pdf.html | 8 ++++- 10 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 src/main/resources/templates/other/adjust-contrast.html diff --git a/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java b/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java index 3aa1bb37..5c7f2553 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java @@ -47,7 +47,8 @@ public class OCRController { @RequestParam("languages") List selectedLanguages, @RequestParam(name = "sidecar", required = false) Boolean sidecar, @RequestParam(name = "deskew", required = false) Boolean deskew, @RequestParam(name = "clean", required = false) Boolean clean, @RequestParam(name = "clean-final", required = false) Boolean cleanFinal, @RequestParam(name = "ocrType", required = false) String ocrType, - @RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType) + @RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType, + @RequestParam(name = "removeImagesAfter", required = false) Boolean removeImagesAfter) throws IOException, InterruptedException { // --output-type pdfa @@ -114,11 +115,24 @@ public class OCRController { // Run CLI command int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command); + + + + + // Remove images from the OCR processed PDF if the flag is set to true + if (removeImagesAfter != null && removeImagesAfter) { + Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf"); + + List gsCommand = Arrays.asList("gs", "-sDEVICE=pdfwrite", "-dFILTERIMAGE", "-o", tempPdfWithoutImages.toString(), tempOutputFile.toString()); + + int gsReturnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT).runCommandWithOutputHandling(gsCommand); + tempOutputFile = tempPdfWithoutImages; + } // Read the OCR processed PDF file byte[] pdfBytes = Files.readAllBytes(tempOutputFile); - // Clean up the temporary files Files.delete(tempInputFile); + // Return the OCR processed PDF as a response String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf"; diff --git a/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java b/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java index 549bd1fb..a7d17d76 100644 --- a/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java +++ b/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java @@ -72,4 +72,13 @@ public class OtherWebController { model.addAttribute("currentPage", "add-image"); return "other/add-image"; } + + @GetMapping("/adjust-contrast") + @Hidden + public String contrast(Model model) { + model.addAttribute("currentPage", "adjust-contrast"); + return "other/adjust-contrast"; + } + + } diff --git a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java index 34cfdb6b..33823507 100644 --- a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java +++ b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java @@ -13,7 +13,7 @@ import java.util.concurrent.Semaphore; public class ProcessExecutor { public enum Processes { - LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV + LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT } private static final Map instances = new ConcurrentHashMap<>(); @@ -24,6 +24,7 @@ public class ProcessExecutor { case LIBRE_OFFICE -> 1; case OCR_MY_PDF -> 2; case PYTHON_OPENCV -> 8; + case GHOSTSCRIPT -> 16; }; return new ProcessExecutor(semaphoreLimit); }); diff --git a/src/main/resources/messages_ar_AR.properties b/src/main/resources/messages_ar_AR.properties index b922b114..15a3ee2c 100644 --- a/src/main/resources/messages_ar_AR.properties +++ b/src/main/resources/messages_ar_AR.properties @@ -149,6 +149,8 @@ ocr.selectText.7=\u0641\u0631\u0636 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\ ocr.selectText.8=\u0639\u0627\u062F\u064A (\u062E\u0637\u0623 \u0625\u0630\u0627 \u0643\u0627\u0646 PDF \u064A\u062D\u062A\u0648\u064A \u0639\u0644\u0649 \u0646\u0635) ocr.selectText.9=\u0625\u0639\u062F\u0627\u062F\u0627\u062A \u0625\u0636\u0627\u0641\u064A\u0629 ocr.selectText.10=\u0648\u0636\u0639 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641 +ocr.selectText.11 = إزالة الصور بعد التعرف الضوئي على الحروف (يزيل كل الصور ، يكون مفيدًا فقط إذا كان جزءًا من خطوة التحويل) +ocr.selectText.12 = نوع العرض (متقدم) ocr.help=\u064A\u0631\u062C\u0649 \u0642\u0631\u0627\u0621\u0629 \u0647\u0630\u0647 \u0627\u0644\u0648\u062B\u0627\u0626\u0642 \u062D\u0648\u0644 \u0643\u064A\u0641\u064A\u0629 \u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0647\u0630\u0627 \u0644\u0644\u063A\u0627\u062A \u0623\u062E\u0631\u0649 \u0648 / \u0623\u0648 \u0627\u0644\u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0644\u064A\u0633 \u0641\u064A \u0639\u0627\u0645\u0644 \u0627\u0644\u0625\u0631\u0633\u0627\u0621 ocr.credit=\u062A\u0633\u062A\u062E\u062F\u0645 \u0647\u0630\u0647 \u0627\u0644\u062E\u062F\u0645\u0629 OCRmyPDF \u0648 Tesseract \u0644 OCR. ocr.submit=\u0645\u0639\u0627\u0644\u062C\u0629 PDF \u0628\u0627\u0633\u062A\u062E\u062F\u0627\u0645 OCR diff --git a/src/main/resources/messages_de_DE.properties b/src/main/resources/messages_de_DE.properties index 7cd261bc..57430b55 100644 --- a/src/main/resources/messages_de_DE.properties +++ b/src/main/resources/messages_de_DE.properties @@ -142,6 +142,8 @@ ocr.selectText.7=OCR erzwingen, OCR wird jede Seite entfernen und alle ursprüng ocr.selectText.8=Normal (Fehler, wenn PDF Text enthält) ocr.selectText.9=Zusätzliche Einstellungen ocr.selectText.10=OCR-Modus +ocr.selectText.11=Bilder nach OCR entfernen (Entfernt ALLE Bilder, nur sinnvoll, wenn Teil des Konvertierungsschritts) +ocr.selectText.12=Rendertyp (Erweitert) ocr.help=Bitte lesen Sie diese Dokumentation, um zu erfahren, wie Sie dies für andere Sprachen verwenden und/oder nicht in Docker verwenden können ocr.credit=Dieser Dienst verwendet OCRmyPDF und Tesseract für OCR. ocr.submit=PDF mit OCR verarbeiten diff --git a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties index 8230fe8b..26e958ae 100644 --- a/src/main/resources/messages_en_GB.properties +++ b/src/main/resources/messages_en_GB.properties @@ -148,6 +148,8 @@ ocr.selectText.7=Force OCR, will OCR Every page removing all original text eleme ocr.selectText.8=Normal (Will error if PDF contains text) ocr.selectText.9=Additional Settings ocr.selectText.10=OCR Mode +ocr.selectText.11=Remove images after OCR (Removes ALL images, only useful if part of conversion step) +ocr.selectText.12=Render Type (Advanced) ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker ocr.credit=This service uses OCRmyPDF and Tesseract for OCR. ocr.submit=Process PDF with OCR diff --git a/src/main/resources/messages_es_ES.properties b/src/main/resources/messages_es_ES.properties index 8957599b..a8712edb 100644 --- a/src/main/resources/messages_es_ES.properties +++ b/src/main/resources/messages_es_ES.properties @@ -144,6 +144,8 @@ ocr.selectText.7=Fuerza OCR, OCR eliminará en cada página todo el texto origin ocr.selectText.8=Normal (Se producirá un error si el PDF contiene texto) ocr.selectText.9=Ajustes Adicionales ocr.selectText.10=Modo OCR +ocr.selectText.11=Eliminar imágenes después de OCR (Elimina TODAS las imágenes, solo es útil si es parte del paso de conversión) +ocr.selectText.12=Tipo de procesamiento (avanzado) ocr.help=Lea esta documentación sobre cómo usar esto para otros idiomas y/o no usarlo en docker ocr.credit=Este servicio utiliza OCRmyPDF y Tesseract para OCR. ocr.submit=Procesa PDF con OCR diff --git a/src/main/resources/messages_fr_FR.properties b/src/main/resources/messages_fr_FR.properties index 45b45750..ddba2305 100644 --- a/src/main/resources/messages_fr_FR.properties +++ b/src/main/resources/messages_fr_FR.properties @@ -148,6 +148,8 @@ ocr.selectText.7=Forcer l'OCR, OCR chaque page supprimera tous les éléments de ocr.selectText.8=Normal (Erreur si le PDF contient du texte) ocr.selectText.9=Paramètres supplémentaires ocr.selectText.10=Mode ROC +ocr.selectText.11=Supprimer les images après l'OCR (Supprime TOUTES les images, utile uniquement si elles font partie de l'étape de conversion) +ocr.selectText.12=Type de rendu (avancé) ocr.help=Veuillez lire cette documentation pour savoir comment l'utiliser pour d'autres langues et/ou une utilisation non dans docker ocr.credit=Ce service utilise OCRmyPDF et Tesseract pour l'OCR. ocr.submit=Traiter PDF avec OCR diff --git a/src/main/resources/templates/other/adjust-contrast.html b/src/main/resources/templates/other/adjust-contrast.html new file mode 100644 index 00000000..87496d4f --- /dev/null +++ b/src/main/resources/templates/other/adjust-contrast.html @@ -0,0 +1,32 @@ + + + + + + + +
+
+
+

+
+
+
+

+ +
+
+
+ + +
+ +
+
+
+
+
+
+
+ + \ No newline at end of file diff --git a/src/main/resources/templates/other/ocr-pdf.html b/src/main/resources/templates/other/ocr-pdf.html index aa22a332..3e36b5eb 100644 --- a/src/main/resources/templates/other/ocr-pdf.html +++ b/src/main/resources/templates/other/ocr-pdf.html @@ -53,8 +53,14 @@ +
+ + +
+ +
- +