Add: Convert PDF to WebP (#1666)

* Add PDF to WebP * add swagger param * back * creates a custom image for Docker from pymupdf * Converting with pdf2image and Pillow instead of pymupdf * webp remove to pdf-to-img * remove mupdf
2024-09-21 04:10:38 +02:00 · 2024-08-20 17:17:54 +02:00 · 2024-08-20 17:17:54 +02:00 · 58618b3a21
commit 58618b3a21
parent 4a4c7faf47
12 changed files with 296 additions and 28 deletions
--- a/.github/labeler-config.yml
+++ b/.github/labeler-config.yml
@ -2,6 +2,7 @@ Translation:
  - changed-files:
    - any-glob-to-any-file: 'src/main/resources/messages_*_*.properties'
    - any-glob-to-any-file: 'scripts/ignore_translation.toml'
+    - any-glob-to-any-file: 'src/main/resources/templates/fragments/languages.html'

 Front End:
  - changed-files:
--- a/4
+++ b/4
@ -39,7 +39,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
        libreoffice \
 # pdftohtml
        poppler-utils \
-# OCR MY PDF (unpaper for descew and other advanced featues)
+# OCR MY PDF (unpaper for deskew and other advanced features)
        ocrmypdf \
        tesseract-ocr-data-eng \
 # CV
@ -48,7 +48,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
        python3 \
        py3-pip && \
 # uno unoconv and HTML
-    pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
+    pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \
    mv /usr/share/tessdata /usr/share/tessdata-original && \
    mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
    fc-cache -f -v && \
--- a/2
+++ b/2
@ -64,7 +64,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
        python3 \
    py3-pip && \
 # uno unoconv and HTML
-    pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
+    pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \
    mv /usr/share/tessdata /usr/share/tessdata-original && \
    mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
    fc-cache -f -v && \
--- a/Endpoint-groups.md
+++ b/Endpoint-groups.md
@ -15,7 +15,7 @@
 | file-to-pdf         |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            |
 | img-to-pdf          |         | ✔️       |          |       |     |        |        |             |          | ✔️    |            |
 | pdf-to-html         |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            |
-| pdf-to-img          |         | ✔️       |          |       |     |        |        |             |          | ✔️    |            |
+| pdf-to-img          |         | ✔️       |          |       |     | ✔️      |        |             |          | ✔️    |            |
 | pdf-to-pdfa         |         | ✔️       |          |       | ✔️   |        |        |             | ✔️        |      |            |
 | pdf-to-markdown     |         | ✔️       |          |       |     |        |        |             |          | ✔️    |            |
 | pdf-to-presentation |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            |
--- a/scripts/png_to_webp.py
+++ b/scripts/png_to_webp.py
@ -0,0 +1,174 @@
+"""
+Author: Ludy87
+Description: This script converts a PDF file to WebP images. It includes functionality to resize images if they exceed specified dimensions and handle conversion of PDF pages to WebP format.
+
+Example
+-------
+To convert a PDF file to WebP images with each page as a separate WebP file:
+    python script.py input.pdf output_directory
+
+To convert an already-rendered image of the pages (e.g. a combined PNG) to a single WebP image:
+    python script.py combined.png output_directory --single
+
+To adjust the DPI resolution for rendering PDF pages:
+    python script.py input.pdf output_directory --dpi 150
+"""
+
+import argparse
+import os
+from pdf2image import convert_from_path
+from PIL import Image
+
+
+def resize_image(input_image_path, output_image_path, max_size=(16383, 16383)):
+    """
+    Resize the image if its dimensions exceed the maximum allowed size and save it as WebP.
+
+    Parameters
+    ----------
+    input_image_path : str
+        Path to the input image file.
+    output_image_path : str
+        Path where the output WebP image will be saved.
+    max_size : tuple of int, optional
+        Maximum allowed dimensions for the image (width, height). Default is (16383, 16383).
+
+    Returns
+    -------
+    None
+    """
+    try:
+        # Open the image
+        image = Image.open(input_image_path)
+        width, height = image.size
+        max_width, max_height = max_size
+
+        # Check if the image dimensions exceed the maximum allowed dimensions
+        if width > max_width or height > max_height:
+            # Calculate the scaling ratio
+            ratio = min(max_width / width, max_height / height)
+            new_width = int(width * ratio)
+            new_height = int(height * ratio)
+
+            # Resize the image
+            resized_image = image.resize((new_width, new_height), Image.LANCZOS)
+            resized_image.save(output_image_path, format="WEBP", quality=100)
+            print(
+                f"The image was successfully resized to ({new_width}, {new_height}) and saved as WebP: {output_image_path}"
+            )
+        else:
+            # If dimensions are within the allowed limits, save the image directly
+            image.save(output_image_path, format="WEBP", quality=100)
+            print(f"The image was successfully saved as WebP: {output_image_path}")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+
+def convert_image_to_webp(input_image, output_file):
+    """
+    Convert an image to WebP format, resizing it if it exceeds the maximum dimensions.
+
+    Parameters
+    ----------
+    input_image : str
+        Path to the input image file.
+    output_file : str
+        Path where the output WebP image will be saved.
+
+    Returns
+    -------
+    None
+    """
+    # Resize the image if it exceeds the maximum dimensions
+    resize_image(input_image, output_file, max_size=(16383, 16383))
+
+
+def pdf_to_webp(pdf_path, output_dir, dpi=300):
+    """
+    Convert each page of a PDF file to WebP images.
+
+    Parameters
+    ----------
+    pdf_path : str
+        Path to the input PDF file.
+    output_dir : str
+        Directory where the WebP images will be saved.
+    dpi : int, optional
+        DPI resolution for rendering PDF pages. Default is 300.
+
+    Returns
+    -------
+    None
+    """
+    # Convert the PDF to a list of images
+    images = convert_from_path(pdf_path, dpi=dpi)
+
+    for page_number, image in enumerate(images):
+        # Define temporary PNG path
+        temp_png_path = os.path.join(output_dir, f"temp_page_{page_number + 1}.png")
+        image.save(temp_png_path, format="PNG")
+
+        # Define the output path for WebP
+        output_path = os.path.join(output_dir, f"page_{page_number + 1}.webp")
+
+        # Convert PNG to WebP
+        convert_image_to_webp(temp_png_path, output_path)
+
+        # Delete the temporary PNG file
+        os.remove(temp_png_path)
+
+
+def main(pdf_image_path, output_dir, dpi=300, single_images_flag=False):
+    """
+    Main function to handle conversion from PDF to WebP images.
+
+    Parameters
+    ----------
+    pdf_image_path : str
+        Path to the input PDF file or image.
+    output_dir : str
+        Directory where the WebP images will be saved.
+    dpi : int, optional
+        DPI resolution for rendering PDF pages. Default is 300.
+    single_images_flag : bool, optional
+        If True, combine all pages into a single WebP image. Default is False.
+
+    Returns
+    -------
+    None
+    """
+    if single_images_flag:
+        # Combine all pages into a single WebP image
+        output_path = os.path.join(output_dir, "combined_image.webp")
+        convert_image_to_webp(pdf_image_path, output_path)
+    else:
+        # Convert each PDF page to a separate WebP image
+        pdf_to_webp(pdf_image_path, output_dir, dpi)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert a PDF file to WebP images.")
+    parser.add_argument("pdf_path", help="The path to the input PDF file.")
+    parser.add_argument(
+        "output_dir", help="The directory where the WebP images should be saved."
+    )
+    parser.add_argument(
+        "--dpi",
+        type=int,
+        default=300,
+        help="The DPI resolution for rendering the PDF pages (default: 300).",
+    )
+    parser.add_argument(
+        "--single",
+        action="store_true",
+        help="Combine all pages into a single WebP image.",
+    )
+    args = parser.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    main(
+        args.pdf_path,
+        args.output_dir,
+        dpi=args.dpi,
+        single_images_flag=args.single,
+    )
--- a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java
+++ b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java
@ -166,6 +166,7 @@ public class EndpointConfiguration {
        addEndpointToGroup("Python", REMOVE_BLANKS);
        addEndpointToGroup("Python", "html-to-pdf");
        addEndpointToGroup("Python", "url-to-pdf");
+        addEndpointToGroup("Python", "pdf-to-img");

        // openCV
        addEndpointToGroup("OpenCV", "extract-image-scans");
--- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java
@ -1,11 +1,23 @@
 package stirling.software.SPDF.controller.api.converters;

+import java.io.ByteArrayOutputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.net.URLConnection;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;

+import org.apache.commons.io.FileUtils;
 import org.apache.pdfbox.rendering.ImageType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.springframework.http.HttpHeaders;
 import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.ModelAttribute;
@ -21,6 +33,8 @@ import io.swagger.v3.oas.annotations.tags.Tag;
 import stirling.software.SPDF.model.api.converters.ConvertToImageRequest;
 import stirling.software.SPDF.model.api.converters.ConvertToPdfRequest;
 import stirling.software.SPDF.utils.PdfUtils;
+import stirling.software.SPDF.utils.ProcessExecutor;
+import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
 import stirling.software.SPDF.utils.WebResponseUtils;

@RestController
@ -60,15 +74,92 @@ public class ConvertImgPDFController {
        result =
                PdfUtils.convertFromPdf(
                        pdfBytes,
-                        imageFormat.toUpperCase(),
+                        imageFormat.equalsIgnoreCase("webp") ? "png" : imageFormat.toUpperCase(),
                        colorTypeResult,
                        singleImage,
                        Integer.valueOf(dpi),
                        filename);
-
        if (result == null || result.length == 0) {
            logger.error("resultant bytes for {} is null, error converting ", filename);
        }
+        if (imageFormat.equalsIgnoreCase("webp")) {
+            // WebP is requested from the renderer as "png" (see the convertFromPdf call);
+            // the WebP encoding itself is delegated to scripts/png_to_webp.py.
+            Path tempFile = null;
+            Path tempPdfPath = null;
+            Path tempOutputDir = null;
+            try {
+                // Write the rendered output to a temp file the script can read
+                tempFile = Files.createTempFile("temp_png", ".png");
+                try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) {
+                    fos.write(result);
+                    fos.flush();
+                }
+
+                // Prefer "python3"; fall back to "python" if launching "python3" fails
+                String pythonVersion = "python3";
+                try {
+                    ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
+                            .runCommandWithOutputHandling(Arrays.asList("python3", "--version"));
+                } catch (IOException e) {
+                    ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
+                            .runCommandWithOutputHandling(Arrays.asList("python", "--version"));
+                    pythonVersion = "python";
+                }
+
+                List<String> command = new ArrayList<>();
+                command.add(pythonVersion);
+                command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion
+
+                // Create a temporary directory for the output WebP files
+                tempOutputDir = Files.createTempDirectory("webp_output");
+                if (singleImage) {
+                    // Single-image mode: the script converts the temp file written above
+                    command.add(tempFile.toString());
+                    command.add(tempOutputDir.toString());
+                    command.add("--single");
+                } else {
+                    // Per-page mode: save the uploaded PDF so the script can render it itself
+                    tempPdfPath = Files.createTempFile("temp_pdf", ".pdf");
+                    file.transferTo(tempPdfPath.toFile());
+                    command.add(tempPdfPath.toString());
+                    command.add(tempOutputDir.toString());
+                }
+                command.add("--dpi");
+                command.add(dpi);
+                ProcessExecutorResult resultProcess =
+                        ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
+                                .runCommandWithOutputHandling(command);
+
+                // Find all WebP files in the output directory. The stream returned by
+                // Files.walk keeps directory handles open until it is closed.
+                List<Path> webpFiles;
+                try (java.util.stream.Stream<Path> outputTree = Files.walk(tempOutputDir)) {
+                    webpFiles =
+                            outputTree
+                                    .filter(path -> path.toString().endsWith(".webp"))
+                                    .collect(Collectors.toList());
+                }
+
+                if (webpFiles.isEmpty()) {
+                    logger.error("No WebP files were created in: {}", tempOutputDir.toString());
+                    throw new IOException(
+                            "No WebP files were created. " + resultProcess.getMessages());
+                }
+
+                if (webpFiles.size() == 1) {
+                    // Return the single WebP file directly
+                    result = Files.readAllBytes(webpFiles.get(0));
+                } else {
+                    // Create a ZIP file containing all WebP images
+                    ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream();
+                    try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) {
+                        for (Path webpFile : webpFiles) {
+                            zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString()));
+                            Files.copy(webpFile, zos);
+                            zos.closeEntry();
+                        }
+                    }
+                    result = zipOutputStream.toByteArray();
+                }
+            } finally {
+                // Best-effort cleanup on success and failure alike. deleteQuietly never
+                // throws, so a cleanup problem cannot mask the original exception.
+                if (tempFile != null) {
+                    FileUtils.deleteQuietly(tempFile.toFile());
+                }
+                if (tempPdfPath != null) {
+                    FileUtils.deleteQuietly(tempPdfPath.toFile());
+                }
+                if (tempOutputDir != null) {
+                    FileUtils.deleteQuietly(tempOutputDir.toFile());
+                }
+            }
+        }
+
        if (singleImage) {
            String docName = filename + "." + imageFormat;
            MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat));
--- a/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java
+++ b/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java
@ -21,14 +21,6 @@ public class ConverterWebController {
        return "convert/book-to-pdf";
    }

-    @ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}")
-    @GetMapping("/pdf-to-book")
-    @Hidden
-    public String convertPdfToBookForm(Model model) {
-        model.addAttribute("currentPage", "pdf-to-book");
-        return "convert/pdf-to-book";
-    }
-
    @GetMapping("/img-to-pdf")
    @Hidden
    public String convertImgToPdfForm(Model model) {
@ -57,13 +49,6 @@ public class ConverterWebController {
        return "convert/url-to-pdf";
    }

-    @GetMapping("/pdf-to-img")
-    @Hidden
-    public String pdfToimgForm(Model model) {
-        model.addAttribute("currentPage", "pdf-to-img");
-        return "convert/pdf-to-img";
-    }
-
    @GetMapping("/file-to-pdf")
    @Hidden
    public String convertToPdfForm(Model model) {
@ -73,6 +58,21 @@ public class ConverterWebController {

    // PDF TO......

+    /**
+     * Handles GET /pdf-to-book: stores the page name under the "currentPage" model attribute
+     * and returns the view name "convert/pdf-to-book".
+     */
+    @ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}")
+    @GetMapping("/pdf-to-book")
+    @Hidden
+    public String convertPdfToBookForm(Model model) {
+        final String page = "pdf-to-book";
+        model.addAttribute("currentPage", page);
+        return "convert/" + page;
+    }
+
+    /**
+     * Handles GET /pdf-to-img: stores the page name under the "currentPage" model attribute
+     * and returns the view name "convert/pdf-to-img".
+     */
+    @GetMapping("/pdf-to-img")
+    @Hidden
+    public String pdfToimgForm(Model model) {
+        final String page = "pdf-to-img";
+        model.addAttribute("currentPage", page);
+        return "convert/" + page;
+    }
+
    @GetMapping("/pdf-to-html")
    @Hidden
    public ModelAndView pdfToHTML() {
--- a/src/main/java/stirling/software/SPDF/model/api/converters/ConvertToImageRequest.java
+++ b/src/main/java/stirling/software/SPDF/model/api/converters/ConvertToImageRequest.java
@ -12,7 +12,7 @@ public class ConvertToImageRequest extends PDFFile {

    @Schema(
            description = "The output image format",
-            allowableValues = {"png", "jpeg", "jpg", "gif"})
+            allowableValues = {"png", "jpeg", "jpg", "gif", "webp"})
    private String imageFormat;

    @Schema(
--- a/src/main/resources/templates/convert/pdf-to-img.html
+++ b/src/main/resources/templates/convert/pdf-to-img.html
@ -28,6 +28,7 @@
                    <option value="gif">GIF</option>
                    <option value="tiff">TIFF</option>
                    <option value="bmp">BMP</option>
+                    <option value="webp">WEBP</option>
                  </select>
                </div>
                <div class="mb-3">