ocr remove images

2024-06-30 22:50:11 +02:00 · 2023-05-01 21:53:10 +01:00 · 2023-05-01 21:53:10 +01:00 · 9af537c985
commit 9af537c985
parent 30c56a0ec9
10 changed files with 76 additions and 4 deletions
--- a/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java
@ -47,7 +47,8 @@ public class OCRController {
            @RequestParam("languages") List<String> selectedLanguages, @RequestParam(name = "sidecar", required = false) Boolean sidecar,
            @RequestParam(name = "deskew", required = false) Boolean deskew, @RequestParam(name = "clean", required = false) Boolean clean,
            @RequestParam(name = "clean-final", required = false) Boolean cleanFinal, @RequestParam(name = "ocrType", required = false) String ocrType,
-            @RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType)
+            @RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType,
+            @RequestParam(name = "removeImagesAfter", required = false) Boolean removeImagesAfter)
            throws IOException, InterruptedException {

        // --output-type pdfa
@ -114,11 +115,24 @@ public class OCRController {
        // Run CLI command
        int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);

+        // Remove images from the OCR processed PDF if the flag is set to true (a null flag means "not requested")
+        if (Boolean.TRUE.equals(removeImagesAfter)) {
+            Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf");
+            List<String> gsCommand = Arrays.asList("gs", "-sDEVICE=pdfwrite", "-dFILTERIMAGE", "-o", tempPdfWithoutImages.toString(), tempOutputFile.toString());
+            int gsReturnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT).runCommandWithOutputHandling(gsCommand);
+            // The OCR output is only an intermediate from here on; delete it before the variable is re-pointed so it is not orphaned
+            Files.deleteIfExists(tempOutputFile);
+            if (gsReturnCode != 0) {
+                Files.deleteIfExists(tempPdfWithoutImages);
+                throw new IOException("Ghostscript image removal failed with exit code " + gsReturnCode);
+            }
+            tempOutputFile = tempPdfWithoutImages;
+        }
        // Read the OCR processed PDF file
        byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
-
        // Clean up the temporary files
        Files.delete(tempInputFile);
+        
        // Return the OCR processed PDF as a response
        String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";

--- a/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java
+++ b/src/main/java/stirling/software/SPDF/controller/web/OtherWebController.java
@ -72,4 +72,13 @@ public class OtherWebController {
        model.addAttribute("currentPage", "add-image");
        return "other/add-image";
    }
+    
+    /** Renders the adjust-contrast page: tags the model with the current page id and returns the view name. */
+    @GetMapping("/adjust-contrast")
+    @Hidden
+    public String contrast(Model model) {
+        // Same "currentPage" attribute the add-image handler above sets; presumably read by the shared page fragments
+        model.addAttribute("currentPage", "adjust-contrast");
+        return "other/adjust-contrast";
+    }
 }
--- a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
+++ b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
@ -13,7 +13,7 @@ import java.util.concurrent.Semaphore;
 public class ProcessExecutor {

    public enum Processes {
-        LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV
+        LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT
    }

    private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
@ -24,6 +24,7 @@ public class ProcessExecutor {
            case LIBRE_OFFICE -> 1;
            case OCR_MY_PDF -> 2;
            case PYTHON_OPENCV -> 8;
+            case GHOSTSCRIPT -> 16;
            };
            return new ProcessExecutor(semaphoreLimit);
        });
--- a/src/main/resources/messages_ar_AR.properties
+++ b/src/main/resources/messages_ar_AR.properties
@ -149,6 +149,8 @@ ocr.selectText.7=\u0641\u0631\u0636 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\
 ocr.selectText.8=\u0639\u0627\u062F\u064A (\u062E\u0637\u0623 \u0625\u0630\u0627 \u0643\u0627\u0646 PDF \u064A\u062D\u062A\u0648\u064A \u0639\u0644\u0649 \u0646\u0635)
 ocr.selectText.9=\u0625\u0639\u062F\u0627\u062F\u0627\u062A \u0625\u0636\u0627\u0641\u064A\u0629
 ocr.selectText.10=\u0648\u0636\u0639 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641
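+ocr.selectText.11=\u0625\u0632\u0627\u0644\u0629 \u0627\u0644\u0635\u0648\u0631 \u0628\u0639\u062F \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641 (\u064A\u0632\u064A\u0644 \u0643\u0644 \u0627\u0644\u0635\u0648\u0631 \u060C \u064A\u0643\u0648\u0646 \u0645\u0641\u064A\u062F\u064B\u0627 \u0641\u0642\u0637 \u0625\u0630\u0627 \u0643\u0627\u0646 \u062C\u0632\u0621\u064B\u0627 \u0645\u0646 \u062E\u0637\u0648\u0629 \u0627\u0644\u062A\u062D\u0648\u064A\u0644)
+ocr.selectText.12=\u0646\u0648\u0639 \u0627\u0644\u0639\u0631\u0636 (\u0645\u062A\u0642\u062F\u0645)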
 ocr.help=\u064A\u0631\u062C\u0649 \u0642\u0631\u0627\u0621\u0629 \u0647\u0630\u0647 \u0627\u0644\u0648\u062B\u0627\u0626\u0642 \u062D\u0648\u0644 \u0643\u064A\u0641\u064A\u0629 \u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0647\u0630\u0627 \u0644\u0644\u063A\u0627\u062A \u0623\u062E\u0631\u0649 \u0648 / \u0623\u0648 \u0627\u0644\u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0644\u064A\u0633 \u0641\u064A \u0639\u0627\u0645\u0644 \u0627\u0644\u0625\u0631\u0633\u0627\u0621
 ocr.credit=\u062A\u0633\u062A\u062E\u062F\u0645 \u0647\u0630\u0647 \u0627\u0644\u062E\u062F\u0645\u0629 OCRmyPDF \u0648 Tesseract \u0644 OCR.
 ocr.submit=\u0645\u0639\u0627\u0644\u062C\u0629 PDF \u0628\u0627\u0633\u062A\u062E\u062F\u0627\u0645 OCR
--- a/src/main/resources/messages_de_DE.properties
+++ b/src/main/resources/messages_de_DE.properties
@ -142,6 +142,8 @@ ocr.selectText.7=OCR erzwingen, OCR wird jede Seite entfernen und alle ursprüng
 ocr.selectText.8=Normal (Fehler, wenn PDF Text enthält)
 ocr.selectText.9=Zusätzliche Einstellungen
 ocr.selectText.10=OCR-Modus
+ocr.selectText.11=Bilder nach OCR entfernen (Entfernt ALLE Bilder, nur sinnvoll, wenn Teil des Konvertierungsschritts)
+ocr.selectText.12=Rendertyp (Erweitert)
 ocr.help=Bitte lesen Sie diese Dokumentation, um zu erfahren, wie Sie dies für andere Sprachen verwenden und/oder nicht in Docker verwenden können
 ocr.credit=Dieser Dienst verwendet OCRmyPDF und Tesseract für OCR.
 ocr.submit=PDF mit OCR verarbeiten
--- a/src/main/resources/messages_en_GB.properties
+++ b/src/main/resources/messages_en_GB.properties
@ -148,6 +148,8 @@ ocr.selectText.7=Force OCR, will OCR Every page removing all original text eleme
 ocr.selectText.8=Normal (Will error if PDF contains text)
 ocr.selectText.9=Additional Settings
 ocr.selectText.10=OCR Mode
+ocr.selectText.11=Remove images after OCR (Removes ALL images, only useful if part of conversion step)
+ocr.selectText.12=Render Type (Advanced)
 ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
 ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
 ocr.submit=Process PDF with OCR
--- a/src/main/resources/messages_es_ES.properties
+++ b/src/main/resources/messages_es_ES.properties
@ -144,6 +144,8 @@ ocr.selectText.7=Fuerza OCR, OCR eliminará en cada página todo el texto origin
 ocr.selectText.8=Normal (Se producirá un error si el PDF contiene texto)
 ocr.selectText.9=Ajustes Adicionales
 ocr.selectText.10=Modo OCR
+ocr.selectText.11=Eliminar imágenes después de OCR (Elimina TODAS las imágenes, solo es útil si es parte del paso de conversión)
+ocr.selectText.12=Tipo de procesamiento (avanzado)
 ocr.help=Lea esta documentación sobre cómo usar esto para otros idiomas y/o no usarlo en docker
 ocr.credit=Este servicio utiliza OCRmyPDF y Tesseract para OCR.
 ocr.submit=Procesa PDF con OCR
--- a/src/main/resources/messages_fr_FR.properties
+++ b/src/main/resources/messages_fr_FR.properties
@ -148,6 +148,8 @@ ocr.selectText.7=Forcer l'OCR, OCR chaque page supprimera tous les éléments de
 ocr.selectText.8=Normal (Erreur si le PDF contient du texte)
 ocr.selectText.9=Paramètres supplémentaires
 ocr.selectText.10=Mode ROC
+ocr.selectText.11=Supprimer les images après l'OCR (Supprime TOUTES les images, utile uniquement si elles font partie de l'étape de conversion)
+ocr.selectText.12=Type de rendu (avancé)
 ocr.help=Veuillez lire cette documentation pour savoir comment l'utiliser pour d'autres langues et/ou une utilisation non dans docker
 ocr.credit=Ce service utilise OCRmyPDF et Tesseract pour l'OCR.
 ocr.submit=Traiter PDF avec OCR
--- a/src/main/resources/templates/other/adjust-contrast.html
+++ b/src/main/resources/templates/other/adjust-contrast.html
@ -0,0 +1,32 @@
+<!DOCTYPE html>
+<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
+
+<th:block th:insert="~{fragments/common :: head(title=#{extractImages.title})}"></th:block>
+<!--/* NOTE(review): the title, header and submit text reuse the extractImages.* message keys and the "Contrast" label is hard-coded English; */-->
+<!--/* presumably placeholders until adjust-contrast keys exist in the messages_*.properties bundles. Confirm before exposing this page. */-->
+<body>
+  <div id="page-container">
+    <div id="content-wrap">
+      <div th:insert="~{fragments/navbar.html :: navbar}"></div>
+      <br> <br>
+      <div class="container">
+        <div class="row justify-content-center">
+          <div class="col-md-6">
+            <h2 th:text="#{extractImages.header}"></h2>
+
+            <form id="multiPdfForm" th:action="@{adjust-contrast}" method="post" enctype="multipart/form-data">
+              <div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
+                 <div class="form-group">
+                    <label for="contrastRange">Contrast</label>
+                    <input name="contrastRange" type="range" class="form-control-range" id="contrastRange" min="-100" max="100" value="0" step="1">
+                </div>
+              <button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{extractImages.submit}"></button>
+            </form>
+          </div>
+        </div>
+      </div>
+    </div>
+    <div th:insert="~{fragments/footer.html :: footer}"></div>
+  </div>
+</body>
+</html>
--- a/src/main/resources/templates/other/ocr-pdf.html
+++ b/src/main/resources/templates/other/ocr-pdf.html
@ -53,8 +53,14 @@
                                <input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
                                <label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
                            </div>
+                            <div class="form-check">
+                                <input type="checkbox" class="form-check-input" name="removeImagesAfter" id="removeImagesAfter" />
+                                <label class="form-check-label" for="removeImagesAfter" th:text="#{ocr.selectText.11}"></label>
+                            </div>
+                            
+                            
                            <div class="form-group">
-                                <label>Render Type (Advanced)</label> 
+                                <label th:text="#{ocr.selectText.12}"></label> 
                                <select class="form-control" name="ocrRenderType">
                                    <option value="hocr">HOCR (Latin/Roman alphabet only)</option>
                                    <option value="sandwich">Sandwich</option>