From 3160bd473e3783bb1347d7ac254a0a4522bdee02 Mon Sep 17 00:00:00 2001
From: Carsten  Rose <carsten.rose@math.uzh.ch>
Date: Sun, 15 May 2022 18:03:49 +0200
Subject: [PATCH] #Refs 13722: Detect and fix pdfunite problems "Gen inside
 xref table too large (bigger than INT_MAX)" via pdf2ps, ps2pdf

---
 Documentation/Installation.rst                |   4 +-
 extension/Classes/Core/Constants.php          |   3 +-
 extension/Classes/Core/Report/Download.php    | 193 ++++++++++++++----
 extension/Classes/Core/Store/Config.php       |   2 +
 extension/Tests/Unit/Core/Store/StoreTest.php |   2 +
 extension/ext_conf_template.txt               |   6 +
 6 files changed, 170 insertions(+), 40 deletions(-)

diff --git a/Documentation/Installation.rst b/Documentation/Installation.rst
index 1ecf8164f..d1e10b221 100644
--- a/Documentation/Installation.rst
+++ b/Documentation/Installation.rst
@@ -46,6 +46,7 @@ The following features are only tested / supported on linux hosts:
 * Convert of images to PDF files - command `img2pdf`.
 * PDF decrypt (used for merge with pdfunite) - command `qpdf`.
 * PDF decrypt (used for merge with pdfunite) - command `gs` - in case `qpdf` is not successful.
+* PDF fix (used for merge with pdfunite) - commands `pdf2ps` and `ps2pdf` - in case `pdfunite` reports 'Gen inside xref table too large'.
 * Mime type detection for uploads - command `file`.
 * Split PDF into JPG - command `convert`.
 * Repair PDF - command `pdftocairo`.
@@ -63,7 +64,8 @@ To normalize UTF8 input, *php-intl* package is needed by
 
 * normalizer::normalize()
 
-For the :ref:`download` function, the programs `img2pdf`, `pdfunite`, `qpdf`, `gs` and `file` are necessary to concatenate PDF files.
+For the :ref:`download` function, the programs `img2pdf`, `pdfunite`, `qpdf`, `gs`, `pdf2ps`, `ps2pdf` and `file` are
+necessary to concatenate PDF files.
 
 Preparation for Ubuntu::
 
diff --git a/extension/Classes/Core/Constants.php b/extension/Classes/Core/Constants.php
index 35eed85dd..7b2096e5c 100644
--- a/extension/Classes/Core/Constants.php
+++ b/extension/Classes/Core/Constants.php
@@ -679,7 +679,8 @@ const SYSTEM_CMD_GS = 'cmdGs';
 const SYSTEM_CMD_PDFUNITE = 'cmdPdfunite';
 const SYSTEM_CMD_IMG2PDF = 'cmdImg2pdf';
 const SYSTEM_CMD_HEIF_CONVERT = 'cmdHeifConvert';
-
+const SYSTEM_CMD_PDF2PS = 'cmdPdf2ps';
+const SYSTEM_CMD_PS2PDF = 'cmdPs2pdf';
 // Thumbnail
 const SYSTEM_THUMBNAIL_DIR_SECURE_REL_TO_APP = 'thumbnailDirSecure';
 const SYSTEM_THUMBNAIL_DIR_PUBLIC_REL_TO_APP = 'thumbnailDirPublic';
diff --git a/extension/Classes/Core/Report/Download.php b/extension/Classes/Core/Report/Download.php
index ad78dec55..776369e8b 100644
--- a/extension/Classes/Core/Report/Download.php
+++ b/extension/Classes/Core/Report/Download.php
@@ -74,6 +74,16 @@ class Download {
      */
     private $gs = '';
 
+    /**
+     * @var string Name of command
+     */
+    private $pdf2ps = '';
+
+    /**
+     * @var string Name of command
+     */
+    private $ps2pdf = '';
+
     /**
      * @var string Name of command
      */
@@ -101,6 +111,8 @@ class Download {
 
         $this->qpdf = $this->store->getVar(SYSTEM_CMD_QPDF, STORE_SYSTEM);
         $this->gs = $this->store->getVar(SYSTEM_CMD_GS, STORE_SYSTEM);
+        $this->pdf2ps = $this->store->getVar(SYSTEM_CMD_PDF2PS, STORE_SYSTEM);
+        $this->ps2pdf = $this->store->getVar(SYSTEM_CMD_PS2PDF, STORE_SYSTEM);
         $this->pdfunite = $this->store->getVar(SYSTEM_CMD_PDFUNITE, STORE_SYSTEM);
         $this->img2Pdf = $this->store->getVar(SYSTEM_CMD_IMG2PDF, STORE_SYSTEM);
 
@@ -218,11 +230,17 @@ class Download {
     }
 
     /**
-     * Fires the merge command.
-     * If for any reason the command fails: check if the reason is 'unencrypted files'.
-     * If 'yes': try to decrypt them with qpdf.
-     * After one decrypt, try merge again.
-     * Try to merge and decrypt as long as there are encrypted files.
+     * Do the merge command.
+     * Fixing part is highly customized to pdfunite: parsing of reported problematic files.
+     * If for any reason the command fails: check if the reason
+     * a) is 'encrypted files'. If 'yes': try to decrypt them (first with qpdf, then with gs).
+     *        Try1: qpdf --decrypt '$backup' '$file'
+     *        Try2: gs -sDEVICE=pdfwrite -dNOPAUSE -sOutputFile="$file" -- "$backup"
+     * b) is 'Syntax Error: Gen inside xref table too large (bigger than INT_MAX)': try convert PDF2PS and PS2PDF
+     *       Try1: pdf2ps $file $file.ps; ps2pdf $file.ps $file
+     *
+     * After one attempt to fix, try merge again.
+     * Try to merge, decrypt and fix as long as there are encrypted or damaged files.
      *
      * @param $cmd
      * @param $rcOutput
@@ -236,7 +254,6 @@ class Download {
 
         // Try to merge the PDFs as long as a problematic PDF has been repaired. Check this by comparing the last and the current output.
         while ($last != $rcOutput) {
-
             $last = $rcOutput; // Remember last
 
             // Merge:
@@ -246,50 +263,137 @@ class Download {
                 break; // skip rest if everything is fine
             }
 
-            // Possible output: "Unimplemented Feature: Could not merge encrypted files ('ct.18.06.092-097.pdf')"
-            $line = implode(',', $rcOutput);
-            if (false !== ($line = strstr($line, "Unimplemented Feature: Could not merge encrypted files ("))) {
+            // Possible output on error:
+            //   "Unimplemented Feature: Could not merge encrypted files ('ct.18.06.092-097.pdf')"
+            //   "Could not merge damaged documents ('offerFinal_Light_Cycler_Offerte_2.pdf')"
+            $lineAll = implode(',', $rcOutput);
+            $cmdFailed = "CMD Merge: " . $cmd . "<br>RC: $rc<br>Output: " . implode("<br>", $rcOutput) . '<br>';
+            $fix = false;
+
+            // Searching for line "Unimplemented Feature: Could not merge encrypted files ('file.pdf')"
+            // Skip error message until problematic file is named.
+            $line = strstr($lineAll, "Unimplemented Feature: Could not merge encrypted files (");
+            if (false !== $line) {
 
                 $arr = explode("'", $line, 3);
                 if (!empty($arr[1]) && file_exists($arr[1])) {
                     $file = $arr[1]; // problematic file
 
-                    // Create a backup file: only one per day!
-                    $backup = $file . date('.Y-m-d');
-                    if (!file_exists($backup)) {
-                        HelperFile::copy($file, $backup);
-                    }
+                    // Create a backup file if none already exist
+                    $backup = $this->doBackupFilePerDay($file);
 
                     // Try 1: via 'qpdf --decrypt'
                     $cmdQpdf = $this->qpdf . " --decrypt '$backup' '$file' 2>&1"; // Try to decrypt file
                     exec($cmdQpdf, $outputQpdf, $rcQpdf);
 
-                    if ($rcQpdf != 0) {
+                    $fix = $rcQpdf == 0;
+                    if (!$fix) {
 
                         // Try 2: via 'gs -sDEVICE=pdfwrite'
                         $cmdGs = $this->gs . " -sDEVICE=pdfwrite -dNOPAUSE -sOutputFile=\"$file\" -- \"$backup\" 2>&1";
                         exec($cmdGs, $outputGs, $rcGs);
 
-                        if ($rcGs != 0) {
-                            // qpdf failed: restore origfile in case the $file has been destroyed.
-                            HelperFile::copy($backup, $file);
-                            throw new \DownloadException (json_encode([ERROR_MESSAGE_TO_USER => "Failed to decrypt PDF",
-                                    ERROR_MESSAGE_TO_DEVELOPER => "CMD1: " . $cmdQpdf . "<br>RC: $rcQpdf<br>Output: " . implode("<br>", $outputQpdf) . '<br>' .
-                                        "CMD2: " . $cmdGs . "<br>RC: $rcGs<br>Output: " . implode("<br>", $outputGs)])
-                                , ERROR_DOWNLOAD_MERGE_FAILED);
+                        $fix = $rcGs == 0;
+                        if (!$fix) {
+                            // Both decrypt attempts (qpdf, gs) failed. exec() above stored the gs output in $outputGs.
+                            $msgUser = "Failed to decrypt PDF";
+                            $msgDeveloper = $cmdFailed . "CMD Fix: " . $cmdGs . "<br>RC: $rcGs<br>Output: " . implode("<br>", $outputGs);
+                            $this->revertFixAndThrowException($msgUser, $msgDeveloper, $file, $backup);
+                        }
                     }
                 }
-            } else {
+                // Check for 'Gen inside xref table too large' (Ticket 13722, 13731: pdf2ps >> ps2pdf)
+            }
+
+            $line = strstr($lineAll, "Syntax Error: Gen inside xref table too large (bigger than INT_MAX)");
+            if (!$fix && false !== $line) {
+                // Skip error message until problematic file is named.
+                $line = strstr($line, "Could not merge damaged documents (");
+                $arr = explode("'", $line, 3);
+                if (!empty($arr[1]) && file_exists($arr[1])) {
+                    $file = $arr[1]; // problematic file
+
+                    // Create a backup file if none already exist
+                    $backup = $this->doBackupFilePerDay($file);
+
+                    // PDF2PS - filenames are shell-escaped (spaces, quotes), stderr is captured for the error report.
+                    $cmdFix = $this->pdf2ps . ' ' . escapeshellarg($file) . ' ' . escapeshellarg($file . '.ps') . ' 2>&1';
+                    $rcOutputFix = array(); // exec() appends to an already filled array: start clean
+                    exec($cmdFix, $rcOutputFix, $rcFix);
+                    $fix = $rcFix == 0;
+                    if (!$fix) {
+                        $msgUser = "Failed to merge PDF and also fix try failed on converting source PDF to PS.";
+                        $msgDeveloper = $cmdFailed . "CMD Fix: " . $cmdFix . "<br>RC: $rcFix<br>Output: " . implode("<br>", $rcOutputFix);
+                        $this->revertFixAndThrowException($msgUser, $msgDeveloper, $file, $backup);
+                    }
+
+                    // PS2PDF - convert the temporary PS file back, overwriting the problematic PDF.
+                    $cmdFix = $this->ps2pdf . ' ' . escapeshellarg($file . '.ps') . ' ' . escapeshellarg($file) . ' 2>&1';
+                    $rcOutputFix = array(); // drop the pdf2ps output, report only the ps2pdf output
+                    exec($cmdFix, $rcOutputFix, $rcFix);
+                    $fix = $rcFix == 0;
+                    if (!$fix) {
+                        $msgUser = "Failed to merge PDF and also fix try failed on converting temporary PS to PDF.";
+                        $msgDeveloper = $cmdFailed . "CMD Fix: " . $cmdFix . "<br>RC: $rcFix<br>Output: " . implode("<br>", $rcOutputFix);
+                        $this->revertFixAndThrowException($msgUser, $msgDeveloper, $file, $backup);
+                    }
+
+                    HelperFile::unlink($file . '.ps');
+                }
+            }
+
+            if (!$fix) {
                 throw new \DownloadException (json_encode([ERROR_MESSAGE_TO_USER => "Merge PDF file failed.",
-                        ERROR_MESSAGE_TO_DEVELOPER => "CMD: " . $cmd . "<br>RC: $rc<br>Output: " . implode("<br>", $rcOutput)])
+                        ERROR_MESSAGE_TO_DEVELOPER => $cmdFailed])
                     , ERROR_DOWNLOAD_MERGE_FAILED);
             }
         }
-
         return $rc;
     }
 
+    /**
+     * Restores $file from $backup (if both names are given) and aborts with a DownloadException.
+     * This method never returns normally.
+     *
+     * @param $msgUser       Message reported as ERROR_MESSAGE_TO_USER.
+     * @param $msgDeveloper  Message reported as ERROR_MESSAGE_TO_DEVELOPER.
+     * @param $file          File which might have been destroyed by the failed fix.
+     * @param $backup        Backup copy which is copied back over $file.
+     * @throws \DownloadException
+     * @throws \UserFormException
+     */
+    private
+    function revertFixAndThrowException($msgUser, $msgDeveloper, $file, $backup) {
+        $canRestore = ($file != '' && $backup != '');
+        if ($canRestore) {
+            HelperFile::copy($backup, $file); // fix failed: put the original back, $file might be destroyed
+        }
+        $json = json_encode([ERROR_MESSAGE_TO_USER => $msgUser, ERROR_MESSAGE_TO_DEVELOPER => $msgDeveloper]);
+        throw new \DownloadException ($json, ERROR_DOWNLOAD_MERGE_FAILED);
+    }
+
+    /**
+     * Makes sure a backup of $file exists for the current day and returns the backup filename.
+     * The backup filename is $file with the current date appended ('.Y-m-d').
+     * An existing backup from today is left untouched: the file is copied only if no backup of
+     * today exists yet.
+     *
+     * @param $file
+     * @return string Filename of the backup.
+     * @throws \UserFormException
+     */
+    private
+    function doBackupFilePerDay($file) {
+
+        $suffix = date('.Y-m-d');
+        $backupName = $file . $suffix;
+        // Only one backup per day: never overwrite an existing one.
+        if (file_exists($backupName) === false) {
+            HelperFile::copy($file, $backupName);
+        }
+        return $backupName;
+    }
+
     /**
      * Get the mimetype of $filename and store them in $rcMimetype.
      *
@@ -299,7 +403,8 @@ class Download {
      *
      * @return string possible updated $outputFilename, according the mimetype.
      */
-    private function targetFilenameExtension($pathFileName, $outputFilename, &$rcMimetype) {
+    private
+    function targetFilenameExtension($pathFileName, $outputFilename, &$rcMimetype) {
 
         if ($pathFileName != '' && file_exists($pathFileName)) {
 
@@ -315,7 +420,8 @@ class Download {
      * @param $outputFilename
      * @throws \DownloadException
      */
-    private function outputFile($file, $outputFilename) {
+    private
+    function outputFile($file, $outputFilename) {
 
         $json = '';
         $flagJson = ($this->getOutputFormat() === DOWNLOAD_OUTPUT_FORMAT_JSON);
@@ -369,7 +475,8 @@ class Download {
      * @throws \UserFormException
      * @throws \UserReportException
      */
-    private function getEvaluatedBodytext($uid, array $urlParam) {
+    private
+    function getEvaluatedBodytext($uid, array $urlParam) {
         $bodyTextArr = $this->db->getBodytext($uid);
 
         // Copy $urlParam to STORE_SIP
@@ -402,7 +509,8 @@ class Download {
      * @throws \UserFormException
      * @throws \UserReportException
      */
-    private function getElement($element, $downloadMode, &$rcData) {
+    private
+    function getElement($element, $downloadMode, &$rcData) {
 
         $filename = '';
         $rcArgs = array();
@@ -482,7 +590,8 @@ class Download {
      * @return string ZIP filename - has to be deleted later.
      * @throws \DownloadException
      */
-    private function zipFiles(array $files) {
+    private
+    function zipFiles(array $files) {
 
         $zipFile = HelperFile::tempnam();
         if (false === $zipFile) {
@@ -538,7 +647,8 @@ class Download {
      * @throws \UserFormException
      * @throws \UserReportException
      */
-    private function checkAndExpandSource($param) {
+    private
+    function checkAndExpandSource($param) {
 
         if ($param == '') {
             return '';
@@ -599,7 +709,8 @@ class Download {
      * @throws \UserFormException
      * @throws \UserReportException
      */
-    private function doElements(array $vars, $outputMode) {
+    private
+    function doElements(array $vars, $outputMode) {
 
         $srcFiles = array();
         $filesCleanLater = array();
@@ -733,7 +844,8 @@ class Download {
      * @throws \UserFormException
      * @throws \UserReportException
      */
-    private function doThumbnail($urlParam) {
+    private
+    function doThumbnail($urlParam) {
 
         $thumbnail = new Thumbnail();
         $pathFilenameThumbnail = $thumbnail->process($urlParam, THUMBNAIL_VIA_DOWNLOAD);
@@ -751,7 +863,8 @@ class Download {
      * @throws \UserFormException
      * @throws \UserReportException
      */
-    private function getDirectDownloadSql() {
+    private
+    function getDirectDownloadSql() {
         $scriptName = str_replace('.', '', $this->store->getVar('SCRIPT_NAME', STORE_CLIENT . STORE_EMPTY));
 
         // Example: /var/www/html/qfq/dl.php >> dl.php
@@ -776,7 +889,8 @@ class Download {
      * @throws \UserFormException
      * @throws \UserReportException
      */
-    private function getDirectDownloadModeDetails() {
+    private
+    function getDirectDownloadModeDetails() {
 
         $arr = $this->getDirectDownloadSql();
 
@@ -831,7 +945,8 @@ class Download {
      * @throws \UserFormException
      * @throws \UserReportException
      */
-    public function process($vars, $outputMode = OUTPUT_MODE_DIRECT) {
+    public
+    function process($vars, $outputMode = OUTPUT_MODE_DIRECT) {
 
         if (!is_array($vars)) {
 
@@ -851,14 +966,16 @@ class Download {
     /**
      * @param $outputFormat
      */
-    private function setOutputFormat($outputFormat) {
+    private
+    function setOutputFormat($outputFormat) {
         $this->outputFormat = $outputFormat;
     }
 
     /**
      * @return string - DOWNLOAD_OUTPUT_FORMAT_RAW | DOWNLOAD_OUTPUT_FORMAT_JSON
      */
-    public function getOutputFormat() {
+    public
+    function getOutputFormat() {
         return $this->outputFormat;
     }
 }
diff --git a/extension/Classes/Core/Store/Config.php b/extension/Classes/Core/Store/Config.php
index fce48c6b1..a754cf3de 100644
--- a/extension/Classes/Core/Store/Config.php
+++ b/extension/Classes/Core/Store/Config.php
@@ -484,6 +484,8 @@ class Config {
             SYSTEM_CMD_PDFUNITE => 'pdfunite',
             SYSTEM_CMD_IMG2PDF => 'img2pdf',
             SYSTEM_CMD_HEIF_CONVERT => 'heif-convert',
+            SYSTEM_CMD_PDF2PS => 'pdf2ps',
+            SYSTEM_CMD_PS2PDF => 'ps2pdf',
 
             SYSTEM_THUMBNAIL_DIR_SECURE_REL_TO_APP => Path::APP_TO_SYSTEM_THUMBNAIL_DIR_SECURE_DEFAULT,
             SYSTEM_THUMBNAIL_DIR_PUBLIC_REL_TO_APP => Path::APP_TO_SYSTEM_THUMBNAIL_DIR_PUBLIC_DEFAULT,
diff --git a/extension/Tests/Unit/Core/Store/StoreTest.php b/extension/Tests/Unit/Core/Store/StoreTest.php
index da0ec8e3e..4997a7f96 100644
--- a/extension/Tests/Unit/Core/Store/StoreTest.php
+++ b/extension/Tests/Unit/Core/Store/StoreTest.php
@@ -420,6 +420,8 @@ class StoreTest extends TestCase {
             SYSTEM_CMD_IMG2PDF => 'img2pdf',
             SYSTEM_CMD_HEIF_CONVERT => 'heif-convert',
             SYSTEM_CMD_QFQPDF => '/opt/qfqpdf/qfqpdf',
+            SYSTEM_CMD_PDF2PS => 'pdf2ps',
+            SYSTEM_CMD_PS2PDF => 'ps2pdf',
         ];
 
         $body = json_encode([
diff --git a/extension/ext_conf_template.txt b/extension/ext_conf_template.txt
index 87d84f06c..97e57d9e8 100644
--- a/extension/ext_conf_template.txt
+++ b/extension/ext_conf_template.txt
@@ -52,6 +52,12 @@ cmdImg2pdf = img2pdf
 # cat=config/config; type=string; label=Command 'heif-convert':Default is 'heif-convert'. Will be used to convert images from HEIC/HEIF to PNG.
 cmdHeifConvert = heif-convert
 
+# cat=config/config; type=string; label=Command 'pdf2ps':Default is 'pdf2ps'. Will be used to convert PDF files to PS.
+cmdPdf2ps = pdf2ps
+
+# cat=config/config; type=string; label=Command 'ps2pdf':Default is 'ps2pdf'. Will be used to convert PS files to PDF.
+cmdPs2pdf = ps2pdf
+
 # cat=config/email; type=string; label=Options for SendEMail:Default is empty. General options. Check: http://caspian.dotconf.net/menu/Software/SendEmail. E.g.: 'sendEMail=-o tls=yes'
 sendEMailOptions =
 
-- 
GitLab