From 3160bd473e3783bb1347d7ac254a0a4522bdee02 Mon Sep 17 00:00:00 2001 From: Carsten Rose <carsten.rose@math.uzh.ch> Date: Sun, 15 May 2022 18:03:49 +0200 Subject: [PATCH] #Refs 13722: Detect and fix pdfunite problems "Gen inside xref table too large (bigger than INT_MAX)" via pdf2ps, ps2pdf --- Documentation/Installation.rst | 4 +- extension/Classes/Core/Constants.php | 3 +- extension/Classes/Core/Report/Download.php | 193 ++++++++++++++---- extension/Classes/Core/Store/Config.php | 2 + extension/Tests/Unit/Core/Store/StoreTest.php | 2 + extension/ext_conf_template.txt | 6 + 6 files changed, 170 insertions(+), 40 deletions(-) diff --git a/Documentation/Installation.rst b/Documentation/Installation.rst index 1ecf8164f..d1e10b221 100644 --- a/Documentation/Installation.rst +++ b/Documentation/Installation.rst @@ -46,6 +46,7 @@ The following features are only tested / supported on linux hosts: * Convert of images to PDF files - command `img2pdf`. * PDF decrypt (used for merge with pdfunite) - command `qpdf`. * PDF decrypt (used for merge with pdfunite) - command `gs` - in case `qpdf` is not successful. +* PDF fix (used for merge with pdfunite) - command `pdf2ps` and `ps2pdf` - in case `qpdf` is not successful. * Mime type detection for uploads - command `file`. * Split PDF into JPG - command `convert`. * Repair PDF - command `pdftocairo`. @@ -63,7 +64,8 @@ To normalize UTF8 input, *php-intl* package is needed by * normalizer::normalize() -For the :ref:`download` function, the programs `img2pdf`, `pdfunite`, `qpdf`, `gs` and `file` are necessary to concatenate PDF files. +For the :ref:`download` function, the programs `img2pdf`, `pdfunite`, `qpdf`, `gs`, `pdf2ps`, `ps2pdf` and `file` are +necessary to concatenate PDF files. 
Preparation for Ubuntu:: diff --git a/extension/Classes/Core/Constants.php b/extension/Classes/Core/Constants.php index 35eed85dd..7b2096e5c 100644 --- a/extension/Classes/Core/Constants.php +++ b/extension/Classes/Core/Constants.php @@ -679,7 +679,8 @@ const SYSTEM_CMD_GS = 'cmdGs'; const SYSTEM_CMD_PDFUNITE = 'cmdPdfunite'; const SYSTEM_CMD_IMG2PDF = 'cmdImg2pdf'; const SYSTEM_CMD_HEIF_CONVERT = 'cmdHeifConvert'; - +const SYSTEM_CMD_PDF2PS = 'cmdPdf2ps'; +const SYSTEM_CMD_PS2PDF = 'cmdPs2pdf'; // Thumbnail const SYSTEM_THUMBNAIL_DIR_SECURE_REL_TO_APP = 'thumbnailDirSecure'; const SYSTEM_THUMBNAIL_DIR_PUBLIC_REL_TO_APP = 'thumbnailDirPublic'; diff --git a/extension/Classes/Core/Report/Download.php b/extension/Classes/Core/Report/Download.php index ad78dec55..776369e8b 100644 --- a/extension/Classes/Core/Report/Download.php +++ b/extension/Classes/Core/Report/Download.php @@ -74,6 +74,16 @@ class Download { */ private $gs = ''; + /** + * @var string Name of command + */ + private $pdf2ps = ''; + + /** + * @var string Name of command + */ + private $ps2pdf = ''; + /** * @var string Name of command */ @@ -101,6 +111,8 @@ class Download { $this->qpdf = $this->store->getVar(SYSTEM_CMD_QPDF, STORE_SYSTEM); $this->gs = $this->store->getVar(SYSTEM_CMD_GS, STORE_SYSTEM); + $this->pdf2ps = $this->store->getVar(SYSTEM_CMD_PDF2PS, STORE_SYSTEM); + $this->ps2pdf = $this->store->getVar(SYSTEM_CMD_PS2PDF, STORE_SYSTEM); $this->pdfunite = $this->store->getVar(SYSTEM_CMD_PDFUNITE, STORE_SYSTEM); $this->img2Pdf = $this->store->getVar(SYSTEM_CMD_IMG2PDF, STORE_SYSTEM); @@ -218,11 +230,17 @@ class Download { } /** - * Fires the merge command. - * If for any reason the command fails: check if the reason is 'unencrypted files'. - * If 'yes': try to decrypt them with qpdf. - * After one decrypt, try merge again. - * Try to merge and decrypt as long as there are encrypted files. + * Do the merge command. 
+ * Fixing part is highly customized to pdfunite: parsing of reported problematic files. + * If for any reason the command fails: check if the reason + * a) is 'encrypted files'. If 'yes': try to decrypt them with qpdf. + * Try1: qpdf --decrypt '$backup' '$file' + * Try2: gs -sDEVICE=pdfwrite -dNOPAUSE -sOutputFile="$file" -- "$backup" + * b) is 'Syntax Error: Gen inside xref table too large (bigger than INT_MAX)': try convert PDF2PS and PS2PDF + * Try1: pdf2ps $file $file.ps; ps2pdf $file.ps $file + * + * After one attempt to fix, try merge again. + * Try to merge, decrypt and fix as long as there are encrypted files. * * @param $cmd * @param $rcOutput @@ -236,7 +254,6 @@ class Download { // Try to merge the PDFs as long as a problematic PDF has been repaired. Check this by comparing the last and the current output. while ($last != $rcOutput) { - $last = $rcOutput; // Remember last // Merge: @@ -246,50 +263,137 @@ class Download { break; // skip rest if everything is fine } - // Possible output: "Unimplemented Feature: Could not merge encrypted files ('ct.18.06.092-097.pdf')" - $line = implode(',', $rcOutput); - if (false !== ($line = strstr($line, "Unimplemented Feature: Could not merge encrypted files ("))) { + // Possible output on error: + // "Unimplemented Feature: Could not merge encrypted files ('ct.18.06.092-097.pdf')" + // "Could not merge damaged documents ('offerFinal_Light_Cycler_Offerte_2.pdf')" + $lineAll = implode(',', $rcOutput); + $cmdFailed = "CMD Merge: " . $cmd . "<br>RC: $rc<br>Output: " . implode("<br>", $rcOutput) . '<br>'; + $fix = false; + + // Searching for line "Unimplemented Feature: Could not merge encrypted files ('file.pdf')" + // Skip error message until problematic file is named. 
+ $line = strstr($lineAll, "Unimplemented Feature: Could not merge encrypted files ("); + if (false !== $line) { $arr = explode("'", $line, 3); if (!empty($arr[1]) && file_exists($arr[1])) { $file = $arr[1]; // problematic file - // Create a backup file: only one per day! - $backup = $file . date('.Y-m-d'); - if (!file_exists($backup)) { - HelperFile::copy($file, $backup); - } + // Create a backup file if none already exist + $backup = $this->doBackupFilePerDay($file); // Try 1: via 'qpdf --decrypt' $cmdQpdf = $this->qpdf . " --decrypt '$backup' '$file' 2>&1"; // Try to decrypt file exec($cmdQpdf, $outputQpdf, $rcQpdf); - if ($rcQpdf != 0) { + $fix = $rcQpdf == 0; + if (!$fix) { // Try 2: via 'gs -sDEVICE=pdfwrite' $cmdGs = $this->gs . " -sDEVICE=pdfwrite -dNOPAUSE -sOutputFile=\"$file\" -- \"$backup\" 2>&1"; exec($cmdGs, $outputGs, $rcGs); - if ($rcGs != 0) { - // qpdf failed: restore origfile in case the $file has been destroyed. - HelperFile::copy($backup, $file); - throw new \DownloadException (json_encode([ERROR_MESSAGE_TO_USER => "Failed to decrypt PDF", - ERROR_MESSAGE_TO_DEVELOPER => "CMD1: " . $cmdQpdf . "<br>RC: $rcQpdf<br>Output: " . implode("<br>", $outputQpdf) . '<br>' . - "CMD2: " . $cmdGs . "<br>RC: $rcGs<br>Output: " . implode("<br>", $outputGs)]) - , ERROR_DOWNLOAD_MERGE_FAILED); + $fix = $rcGs == 0; + if (!$fix) { + $line = implode(',', $rcOutputGs); + $msgUser = "Failed to decrypt PDF"; + $msgDeveloper = $cmdFailed . "CMD Fix: " . $cmdGs . "<br>RC: $rcGs<br>Output: " . implode("<br>", $rcOutputGs); + $this->revertFixAndThrowException($msgUser, $msgDeveloper, $file, $backup); } } } - } else { + // Check for 'Gen inside xref table too large' (Ticket 13722, 13731: pdf2ps >> ps2pdf) + } + + $line = strstr($lineAll, "Syntax Error: Gen inside xref table too large (bigger than INT_MAX)"); + if (!$fix && false !== $line) { + // Skip error message until problematic file is named. 
+ $line = strstr($line, "Could not merge damaged documents ("); + $arr = explode("'", $line, 3); + if (!empty($arr[1]) && file_exists($arr[1])) { + $file = $arr[1]; // problematic file + + // Create a backup file if none already exist + $backup = $this->doBackupFilePerDay($file); + + // PDF2PS + $cmdFix = $this->pdf2ps . " $file $file.ps"; + exec($cmdFix, $rcOutputFix, $rcFix); + $fix = $rcFix == 0; + if (!$fix) { + $line = implode(',', $rcOutputFix); + $msgUser = "Failed to merge PDF and also fix try failed on converting source PDF to PS."; + $msgDeveloper = $cmdFailed . "CMD Fix: " . $cmdFix . "<br>RC: $rcFix<br>Output: " . implode("<br>", $rcOutputFix); + $this->revertFixAndThrowException($msgUser, $msgDeveloper, $file, $backup); + } + + // PS2PDF + $cmdFix = $this->ps2pdf . " $file.ps $file"; + exec($cmdFix, $rcOutputFix, $rcFix); + $fix = $rcFix == 0; + if (!$fix) { + $line = implode(',', $rcOutputFix); + $msgUser = "Failed to merge PDF and also fix try failed on converting temporary PS to PDF."; + $msgDeveloper = $cmdFailed . "CMD Fix: " . $cmdFix . "<br>RC: $rcFix<br>Output: " . implode("<br>", $rcOutputFix); + $this->revertFixAndThrowException($msgUser, $msgDeveloper, $file, $backup); + } + + HelperFile::unlink($file . '.ps'); + } + } + + if (!$fix) { throw new \DownloadException (json_encode([ERROR_MESSAGE_TO_USER => "Merge PDF file failed.", - ERROR_MESSAGE_TO_DEVELOPER => "CMD: " . $cmd . "<br>RC: $rc<br>Output: " . implode("<br>", $rcOutput)]) + ERROR_MESSAGE_TO_DEVELOPER => $cmdFailed]) , ERROR_DOWNLOAD_MERGE_FAILED); } } - return $rc; } + /** + * @param $msgUser + * @param $msgDeveloper + * @param $file + * @param $backup + * @return mixed + * @throws \DownloadException + * @throws \UserFormException + */ + private + function revertFixAndThrowException($msgUser, $msgDeveloper, $file, $backup) { + if ($backup != '' && $file != '') { + // command failed: restore origfile in case the $file has been destroyed. 
+ HelperFile::copy($backup, $file); + } + + throw new \DownloadException (json_encode([ERROR_MESSAGE_TO_USER => $msgUser, + ERROR_MESSAGE_TO_DEVELOPER => $msgDeveloper]) + , ERROR_DOWNLOAD_MERGE_FAILED); + } + + /** + * If there is already a Backup file from today: do nothing. + * If there is no Backup file from today: duplicate the named file to the backup filename. + * In general, it's meant that the original filename will be replaced by a fixed one. This should lead to have + * only one copy of the original. + * + * @param $file + * @return string + * @throws \UserFormException + */ + private + function doBackupFilePerDay($file) { + + // Create a backup file: only one per day! + $backup = $file . date('.Y-m-d'); + if (!file_exists($backup)) { + HelperFile::copy($file, $backup); + } + + return ($backup); + } + /** * Get the mimetype of $filename and store them in $rcMimetype. * @@ -299,7 +403,8 @@ class Download { * * @return string possible updated $outputFilename, according the mimetype. 
*/ - private function targetFilenameExtension($pathFileName, $outputFilename, &$rcMimetype) { + private + function targetFilenameExtension($pathFileName, $outputFilename, &$rcMimetype) { if ($pathFileName != '' && file_exists($pathFileName)) { @@ -315,7 +420,8 @@ class Download { * @param $outputFilename * @throws \DownloadException */ - private function outputFile($file, $outputFilename) { + private + function outputFile($file, $outputFilename) { $json = ''; $flagJson = ($this->getOutputFormat() === DOWNLOAD_OUTPUT_FORMAT_JSON); @@ -369,7 +475,8 @@ class Download { * @throws \UserFormException * @throws \UserReportException */ - private function getEvaluatedBodytext($uid, array $urlParam) { + private + function getEvaluatedBodytext($uid, array $urlParam) { $bodyTextArr = $this->db->getBodytext($uid); // Copy $urlParam to STORE_SIP @@ -402,7 +509,8 @@ class Download { * @throws \UserFormException * @throws \UserReportException */ - private function getElement($element, $downloadMode, &$rcData) { + private + function getElement($element, $downloadMode, &$rcData) { $filename = ''; $rcArgs = array(); @@ -482,7 +590,8 @@ class Download { * @return string ZIP filename - has to be deleted later. 
* @throws \DownloadException */ - private function zipFiles(array $files) { + private + function zipFiles(array $files) { $zipFile = HelperFile::tempnam(); if (false === $zipFile) { @@ -538,7 +647,8 @@ class Download { * @throws \UserFormException * @throws \UserReportException */ - private function checkAndExpandSource($param) { + private + function checkAndExpandSource($param) { if ($param == '') { return ''; @@ -599,7 +709,8 @@ class Download { * @throws \UserFormException * @throws \UserReportException */ - private function doElements(array $vars, $outputMode) { + private + function doElements(array $vars, $outputMode) { $srcFiles = array(); $filesCleanLater = array(); @@ -733,7 +844,8 @@ class Download { * @throws \UserFormException * @throws \UserReportException */ - private function doThumbnail($urlParam) { + private + function doThumbnail($urlParam) { $thumbnail = new Thumbnail(); $pathFilenameThumbnail = $thumbnail->process($urlParam, THUMBNAIL_VIA_DOWNLOAD); @@ -751,7 +863,8 @@ class Download { * @throws \UserFormException * @throws \UserReportException */ - private function getDirectDownloadSql() { + private + function getDirectDownloadSql() { $scriptName = str_replace('.', '', $this->store->getVar('SCRIPT_NAME', STORE_CLIENT . 
STORE_EMPTY)); // Example: /var/www/html/qfq/dl.php >> dl.php @@ -776,7 +889,8 @@ class Download { * @throws \UserFormException * @throws \UserReportException */ - private function getDirectDownloadModeDetails() { + private + function getDirectDownloadModeDetails() { $arr = $this->getDirectDownloadSql(); @@ -831,7 +945,8 @@ class Download { * @throws \UserFormException * @throws \UserReportException */ - public function process($vars, $outputMode = OUTPUT_MODE_DIRECT) { + public + function process($vars, $outputMode = OUTPUT_MODE_DIRECT) { if (!is_array($vars)) { @@ -851,14 +966,16 @@ class Download { /** * @param $outputFormat */ - private function setOutputFormat($outputFormat) { + private + function setOutputFormat($outputFormat) { $this->outputFormat = $outputFormat; } /** * @return string - DOWNLOAD_OUTPUT_FORMAT_RAW | DOWNLOAD_OUTPUT_FORMAT_JSON */ - public function getOutputFormat() { + public + function getOutputFormat() { return $this->outputFormat; } } diff --git a/extension/Classes/Core/Store/Config.php b/extension/Classes/Core/Store/Config.php index fce48c6b1..a754cf3de 100644 --- a/extension/Classes/Core/Store/Config.php +++ b/extension/Classes/Core/Store/Config.php @@ -484,6 +484,8 @@ class Config { SYSTEM_CMD_PDFUNITE => 'pdfunite', SYSTEM_CMD_IMG2PDF => 'img2pdf', SYSTEM_CMD_HEIF_CONVERT => 'heif-convert', + SYSTEM_CMD_PDF2PS => 'pdf2ps', + SYSTEM_CMD_PS2PDF => 'ps2pdf', SYSTEM_THUMBNAIL_DIR_SECURE_REL_TO_APP => Path::APP_TO_SYSTEM_THUMBNAIL_DIR_SECURE_DEFAULT, SYSTEM_THUMBNAIL_DIR_PUBLIC_REL_TO_APP => Path::APP_TO_SYSTEM_THUMBNAIL_DIR_PUBLIC_DEFAULT, diff --git a/extension/Tests/Unit/Core/Store/StoreTest.php b/extension/Tests/Unit/Core/Store/StoreTest.php index da0ec8e3e..4997a7f96 100644 --- a/extension/Tests/Unit/Core/Store/StoreTest.php +++ b/extension/Tests/Unit/Core/Store/StoreTest.php @@ -420,6 +420,8 @@ class StoreTest extends TestCase { SYSTEM_CMD_IMG2PDF => 'img2pdf', SYSTEM_CMD_HEIF_CONVERT => 'heif-convert', SYSTEM_CMD_QFQPDF => 
'/opt/qfqpdf/qfqpdf', + SYSTEM_CMD_PDF2PS => 'pdf2ps', + SYSTEM_CMD_PS2PDF => 'ps2pdf', ]; $body = json_encode([ diff --git a/extension/ext_conf_template.txt b/extension/ext_conf_template.txt index 87d84f06c..97e57d9e8 100644 --- a/extension/ext_conf_template.txt +++ b/extension/ext_conf_template.txt @@ -52,6 +52,12 @@ cmdImg2pdf = img2pdf # cat=config/config; type=string; label=Command 'heif-convert':Default is 'heif-convert'. Will be used to convert images from HEIC/HEIF to PNG. cmdHeifConvert = heif-convert +# cat=config/config; type=string; label=Command 'pdf2ps':Default is 'pdf2ps'. Will be used to convert PDF files to PS. +cmdPdf2ps = pdf2ps + +# cat=config/config; type=string; label=Command 'ps2pdf':Default is 'ps2pdf'. Will be used to convert PS files back to PDF. +cmdPs2pdf = ps2pdf + # cat=config/email; type=string; label=Options for SendEMail:Default is empty. General options. Check: http://caspian.dotconf.net/menu/Software/SendEmail. E.g.: 'sendEMail=-o tls=yes' sendEMailOptions = -- GitLab