Преобразование PDF в XML с использованием PHP

Мне нужна помощь для преобразования PDF в XML с помощью PHP. Есть некоторые сайты, которые заявляют об этом. Но они берут за это. Для этого мне нужно написать свой собственный код на PHP. Будучи новичком в PHP, я не знаю, как подойти к этой задаче. Так что, если бы кто-то работал над этим, мне помогли бы с этим.

Любая помощь будет высоко оценен.

Solutions Collecting From Web of "Преобразование PDF в XML с использованием PHP"

PDFX делает преобразование PDF-to-XML и его можно использовать бесплатно. Это может быть полезно в вашем случае, поскольку оно может извлекать вещи, такие как изображения и подписи отдельно.

Пример ввода / вывода можно найти здесь .

Страница использования включает простой пример PHP-клиента .

(Раскрытие информации: Это моя система.)

PDF2HTML преобразуется в HTML или XML (с использованием флага -xml), но результат немного беспорядок. Вы получаете много мелких фрагментов информации о расположении фрагментов текста. Вы не хотите извлекать абзацы или разделы текста. Возможно, вы сможете изолировать изображения с помощью подходящего XPath?

Если вам нужны абзацы или разделы текста, кажется, вам нужно сделать это эвристически. Блог Geert имеет интересный подход:

Изолирование текста выполняется в разных зонах (например, верхний и нижний колонтитулы)

Сбор текста выполняется на одной и той же строке (здесь игнорируются столбцы)

Переводить отступы в иерархию (помогает находить списки, обеспечивает работу с таблицами и столбцами)

Слияние строк для построения абзацев

Вы можете использовать этот класс для pars pdf в строку и чем это работает it =)

<?php class PDF2Text2 { var $multibyte = 4; // var $convertquotes = ENT_QUOTES; // var $showprogress = true; // var $filename = ''; var $decodedtext = ''; function setFilename($filename) { // Reset $this->decodedtext = ''; $this->filename = $filename; } function output($echo = false) { if($echo) echo $this->decodedtext; else return $this->decodedtext; } function setUnicode($input) { // 4 for unicode. But 2 should work in most cases just fine if($input == true) $this->multibyte = 4; else $this->multibyte = 2; } function decodePDF() { // Read the data from pdf file $infile = @file_get_contents($this->filename, FILE_BINARY); if (empty($infile)) return ""; // Get all text data. $transformations = array(); $texts = array(); // Get the list of all objects. preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile . "endobj\r", $objects); $objects = @$objects[1]; // Select objects with streams. for ($i = 0; $i < count($objects); $i++) { $currentObject = $objects[$i]; // Prevent time-out @set_time_limit (); if($this->showprogress) { // echo ". "; flush(); ob_flush(); } // Check if an object includes data stream. if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject . "endstream\r", $stream )) { $stream = ltrim($stream[1]); // Check object parameters and look for text data. $options = $this->getObjectOptions($currentObject); if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])) ) if ( $options["Image"] && $options["Subtype"] ) if (!(empty($options["Length1"]) && empty($options["Subtype"])) ) continue; // Hack, length doesnt always seem to be correct unset($options["Length"]); // So, we have text data. Decode it. $data = $this->getDecodedStream($stream, $options); if (strlen($data)) { if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data . "ET\r", $textContainers)) { $textContainers = @$textContainers[1]; $this->getDirtyTexts($texts, $textContainers); } else $this->getCharTransformations($transformations, $data); } } } // Analyze text blocks taking into account character transformations and return results. $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); } function decodeAsciiHex($input) { $output = ""; $isOdd = true; $isComment = false; for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) { $c = $input[$i]; if($isComment) { if ($c == '\r' || $c == '\n') $isComment = false; continue; } switch($c) { case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break; case '%': $isComment = true; break; default: $code = hexdec($c); if($code === 0 && $c != '0') return ""; if($isOdd) $codeHigh = $code; else $output .= chr($codeHigh * 16 + $code); $isOdd = !$isOdd; break; } } if($input[$i] != '>') return ""; if($isOdd) $output .= chr($codeHigh * 16); return $output; } function decodeAscii85($input) { $output = ""; $isComment = false; $ords = array(); for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) { $c = $input[$i]; if($isComment) { if ($c == '\r' || $c == '\n') $isComment = false; continue; } if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ') continue; if ($c == '%') { $isComment = true; continue; } if ($c == 'z' && $state === 0) { $output .= str_repeat(chr(0), 4); continue; } if ($c < '!' || $c > 'u') return ""; $code = ord($input[$i]) & 0xff; $ords[$state++] = $code - ord('!'); if ($state == 5) { $state = 0; for ($sum = 0, $j = 0; $j < 5; $j++) $sum = $sum * 85 + $ords[$j]; for ($j = 3; $j >= 0; $j--) $output .= chr($sum >> ($j * 8)); } } if ($state === 1) return ""; elseif ($state > 1) { for ($i = 0, $sum = 0; $i < $state; $i++) $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i); for ($i = 0; $i < $state - 1; $i++) { try { if(false == ($o = chr($sum >> ((3 - $i) * 8)))) { throw new Exception('Error'); } $output .= $o; } catch (Exception $e) { /*Dont do anything*/ } } } return $output; } function decodeFlate($data) { return @gzuncompress($data); } function getObjectOptions($object) { $options = array(); if (preg_match("#<<(.*)>>#ismU", $object, $options)) { $options = explode("/", $options[1]); @array_shift($options); $o = array(); for ($j = 0; $j < @count($options); $j++) { $options[$j] = preg_replace("#\s+#", " ", trim($options[$j])); if (strpos($options[$j], " ") !== false) { $parts = explode(" ", $options[$j]); $o[$parts[0]] = $parts[1]; } else $o[$options[$j]] = true; } $options = $o; unset($o); } return $options; } function getDecodedStream($stream, $options) { $data = ""; if (empty($options["Filter"])) $data = $stream; else { $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream); $_stream = substr($stream, 0, $length); foreach ($options as $key => $value) { if ($key == "ASCIIHexDecode") $_stream = $this->decodeAsciiHex($_stream); elseif ($key == "ASCII85Decode") $_stream = $this->decodeAscii85($_stream); elseif ($key == "FlateDecode") $_stream = $this->decodeFlate($_stream); elseif ($key == "Crypt") { // TO DO } } $data = $_stream; } return $data; } function getDirtyTexts(&$texts, $textContainers) { for ($j = 0; $j < count($textContainers); $j++) { if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts)) $texts = array_merge($texts, array(@implode('', $parts[1]))); elseif (preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) $texts = array_merge($texts, array(@implode('', $parts[1]))); elseif (preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) $texts = array_merge($texts, array(@implode('', $parts[1]))); } } function getCharTransformations(&$transformations, $stream) { preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER); preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER); for ($j = 0; $j < count($chars); $j++) { $count = $chars[$j][1]; $current = explode("\n", trim($chars[$j][2])); for ($k = 0; $k < $count && $k < count($current); $k++) { if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map)) $transformations[str_pad($map[1], 4, "0")] = $map[2]; } } for ($j = 0; $j < count($ranges); $j++) { $count = $ranges[$j][1]; $current = explode("\n", trim($ranges[$j][2])); for ($k = 0; $k < $count && $k < count($current); $k++) { if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) { $from = hexdec($map[1]); $to = hexdec($map[2]); $_from = hexdec($map[3]); for ($m = $from, $n = 0; $m <= $to; $m++, $n++) $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n); } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) { $from = hexdec($map[1]); $to = hexdec($map[2]); $parts = preg_split("#\s+#", trim($map[3])); for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n])); } } } } function getTextUsingTransformations($texts, $transformations) { $document = ""; for ($i = 0; $i < count($texts); $i++) { $isHex = false; $isPlain = false; $hex = ""; $plain = ""; for ($j = 0; $j < strlen($texts[$i]); $j++) { $c = $texts[$i][$j]; switch($c) { case "<": $hex = ""; $isHex = true; $isPlain = false; break; case ">": $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO) for ($k = 0; $k < count($hexs); $k++) { $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero if (isset($transformations[$chex])) $chex = $transformations[$chex]; $document .= html_entity_decode("&#x".$chex.";"); } $isHex = false; break; case "(": $plain = ""; $isPlain = true; $isHex = false; break; case ")": $document .= $plain; $isPlain = false; break; case "\\": $c2 = $texts[$i][$j + 1]; if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2; elseif ($c2 == "n") $plain .= '\n'; elseif ($c2 == "r") $plain .= '\r'; elseif ($c2 == "t") $plain .= '\t'; elseif ($c2 == "b") $plain .= '\b'; elseif ($c2 == "f") $plain .= '\f'; elseif ($c2 >= '0' && $c2 <= '9') { $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3)); $j += strlen($oct) - 1; $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes); } $j++; break; default: if ($isHex) $hex .= $c; elseif ($isPlain) $plain .= $c; break; } } $document .= "\n"; } return $document; }}?>