Files
documenter-mono/app/Services/DocxParser.php
2025-10-31 16:48:05 +09:00

645 lines
21 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace App\Services;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\PhpWord;
//use DOMDocument;
use Exception;
class DocxParser
{
public function parse($filePath)
{
try {
$phpWord = IOFactory::load($filePath);
$sections = $phpWord->getSections();
$template = [
'metadata' => [
'title' => basename($filePath, '.docx'),
'created_at' => now()->toISOString(),
'source' => 'docx',
'total_pages' => 0
],
'structure' => []
];
$sectionElements = [];
foreach ($sections as $sectionIndex => $section) {
foreach ($section->getElements() as $elementIndex => $element) {
// $sectionElements[] = $this->parseElement($element);
array_push($sectionElements, $this->parseElement($element, $elementIndex));
}
}
$template['structure']['elements'] = $sectionElements;
// dd($template);
// Извлекаем переменные
$template['variables_config'] = $this->extractVariables($template['structure']);
return $template;
} catch (Exception $e) {
\Log::error("Ошибка парсинга DOCX: " . $e->getMessage(), $e->getTrace());
}
}
private function parseElement($element, $elementIndex = 0)
{
$parsedElements = [];
$elementType = get_class($element);
switch ($elementType) {
case 'PhpOffice\PhpWord\Element\Text':
$parsedElements = $this->parseTextElement($element, $elementIndex++);
break;
case 'PhpOffice\PhpWord\Element\TextRun':
$textRunElements = $this->parseTextRun($element, $elementIndex);
// Получаем стиль параграфа для TextRun
$paragraphStyle = $element->getParagraphStyle();
// Если TextRun содержит только один элемент, добавляем его напрямую
if (count($textRunElements) === 1) {
$textRunElements[0]['style'] = $this->parseParagraphStyle($paragraphStyle);
$isHeading = $this->isTextRunHeading($textRunElements[0]);
$textRunElements[0]['is_heading'] = $isHeading;
$textRunElements[0]['type'] = $isHeading ? 'heading' : 'paragraph';
$parsedElements = $textRunElements[0];
$elementIndex++;
} else {
// Иначе создаем контейнер TextRun
$parsedElements = [
'id' => 'textrun-' . $elementIndex++,
'type' => 'text_run',
'elements' => $textRunElements,
'style' => $this->parseParagraphStyle($paragraphStyle)
// 'formatting' => $this->getTextRunFormatting($element)
];
}
break;
case 'PhpOffice\PhpWord\Element\TextBreak':
$parsedElements = [
'id' => 'break-' . $elementIndex++,
'type' => 'line_break',
'content' => '<br>'
];
break;
case 'PhpOffice\PhpWord\Element\Title':
$parsedElements = $this->parseTitleElement($element, $elementIndex++);
break;
case 'PhpOffice\PhpWord\Element\Table':
$parsedElements = $this->parseTableElement($element, $elementIndex++);
break;
// case 'PhpOffice\PhpWord\Element\Image':
// $parsedElements[] = $this->parseImageElement($element, $elementIndex++);
// break;
default:
// Пропускаем неизвестные элементы
break;
}
return $parsedElements;
}
private function getTextRunFormatting($textRun)
{
$style = $textRun->getParagraphStyle();
return $this->parseStyle($style);
}
private function parseTextElement($element, $index)
{
$text = $element->getText();
$style = $element->getFontStyle();
// Ищем переменные в тексте (например: {{variable}} или [VARIABLE])
$variables = $this->extractVariablesFromText($text);
return [
'id' => 'element-' . $index,
'type' => empty($variables) ? 'paragraph' : 'variable',
'content' => $text,
'formatting' => $this->parseStyle($style),
'variables' => $variables,
'is_inline' => true
];
}
public function isTextRunHeading($element) {
if ($element['formatting'] && $element['style']) {
if ($element['formatting']['bold'] && $element['style']['align'] === 'center') {
return true;
}
}
return false;
}
public function parseParagraphStyle($style)
{
if (!$style) return null;
$paragraphStyle = [];
// Выравнивание
if ($style->getAlignment()) {
$alignment = $style->getAlignment();
$paragraphStyle['align'] = $this->mapAlignment($alignment);
}
// Отступы
if ($style->getIndentation()) {
$indentation = $style->getIndentation();
$paragraphStyle['indent'] = [
'left' => $indentation->getLeft(),
'right' => $indentation->getRight(),
'firstLine' => ($indentation->getFirstLine() / 1440) * 96, //\PhpOffice\PhpWord\Shared\Converter::twip($indentation->getFirstLine())
];
}
// dd($style->getLineHeight());
// Междустрочный интервал
if ($style->getLineHeight()) {
$paragraphStyle['lineHeight'] = $style->getLineHeight();
}
// Интервалы до и после
if ($style->getSpaceBefore()) {
$paragraphStyle['spaceBefore'] = $style->getSpaceBefore();
}
if ($style->getSpaceAfter()) {
$paragraphStyle['spaceAfter'] = $style->getSpaceAfter();
}
// Табуляция
if ($style->getTabs()) {
$paragraphStyle['tabs'] = $style->getTabs();
}
return !empty($paragraphStyle) ? $paragraphStyle : null;
}
private function mapAlignment($alignment)
{
$mapping = [
\PhpOffice\PhpWord\SimpleType\Jc::START => 'left',
\PhpOffice\PhpWord\SimpleType\Jc::END => 'right',
\PhpOffice\PhpWord\SimpleType\Jc::CENTER => 'center',
\PhpOffice\PhpWord\SimpleType\Jc::BOTH => 'justify',
\PhpOffice\PhpWord\SimpleType\Jc::DISTRIBUTE => 'distribute'
];
return $mapping[$alignment] ?? 'left';
}
public function parseTextRun($textRun, $startIndex)
{
$elements = [];
$index = $startIndex;
foreach ($textRun->getElements() as $runElement) {
if (get_class($runElement) === 'PhpOffice\PhpWord\Element\Text') {
$elements[] = $this->parseTextElement($runElement, $index++);
}
}
return $elements;
}
private function parseTitleElement($element, $index)
{
$text = $element->getText();
$level = $element->getDepth() + 1;
return [
'id' => 'heading-' . $index,
'type' => 'heading',
'level' => $level,
'content' => $text,
'formatting' => [
'bold' => true,
'fontSize' => $this->getHeadingSize($level)
]
];
}
private function parseTableElement($table, $index)
{
$rows = [];
$rowIndex = 0;
$tableIndex = $index;
foreach ($table->getRows() as $row) {
$cells = [];
$cellIndex = 0;
foreach ($row->getCells() as $cell) {
$styles = $this->getCellStyles($cell);
$cellElements = [];
foreach ($cell->getElements() as $cellElement) {
$cellElements[] = $this->parseElement($cellElement, $index);
$index++;
}
$cells[] = [
'id' => 'cell-' . $rowIndex . '-' . $cellIndex,
'elements' => $cellElements,
// 'variables' => $this->extra($cellElements),
'style' => $styles,
'width' => ($cell->getWidth() / 1440) * 96
];
$cellIndex++;
}
$rows[] = [
'id' => 'row-' . $rowIndex,
'cells' => $cells
];
$rowIndex++;
}
return [
'id' => 'table-' . $tableIndex,
'type' => 'table',
'rows' => $rows,
'cols' => count($rows[0]['cells'] ?? []),
// 'content' => $this->generateTableHTML($rows)
];
}
private function getCellStyles($cell)
{
$style = $cell->getStyle();
return [
'borderTopSize' => $style->getBorderTopSize(),
'borderTopColor' => $style->getBorderTopColor(),
'borderTopStyle' => $style->getBorderTopStyle(),
'borderLeftSize' => $style->getBorderLeftSize(),
'borderLeftColor' => $style->getBorderLeftColor(),
'borderLeftStyle' => $style->getBorderLeftStyle(),
'borderRightSize' => $style->getBorderRightSize(),
'borderRightColor' => $style->getBorderRightColor(),
'borderRightStyle' => $style->getBorderRightStyle(),
'borderBottomSize' => $style->getBorderBottomSize(),
'borderBottomColor' => $style->getBorderBottomColor(),
'borderBottomStyle' => $style->getBorderBottomStyle(),
];
}
private function parseImageElement($element, $index)
{
return [
'id' => 'image-' . $index,
'type' => 'image',
'src' => $element->getSource(),
'width' => $element->getWidth(),
'height' => $element->getHeight(),
'alt' => $element->getAlt() ?? 'Image'
];
}
private function parseStyle($style)
{
if (!$style) return [];
return [
'bold' => $style->isBold(),
'italic' => $style->isItalic(),
'underline' => $style->getUnderline(),
'fontSize' => $style->getSize(),
'fontColor' => $style->getColor(),
'fontFamily' => $style->getName()
];
}
// Формирование HTML
public function generateParagraphStyle($paragraphStyle)
{
if (empty($paragraphStyle)) {
return '';
}
$styles = [];
// Выравнивание
if (!empty($paragraphStyle['align'])) {
$styles[] = "text-align: {$paragraphStyle['align']}";
}
// Междустрочный интервал
if (!empty($paragraphStyle['lineHeight'])) {
$lineHeight = $paragraphStyle['lineHeight'];
if (is_numeric($lineHeight)) {
$styles[] = "line-height: {$lineHeight}";
} else {
$styles[] = "line-height: {$lineHeight}";
}
}
// Отступы
if (!empty($paragraphStyle['spaceBefore'])) {
$styles[] = "margin-top: {$paragraphStyle['spaceBefore']}pt";
}
if (!empty($paragraphStyle['spaceAfter'])) {
$styles[] = "margin-bottom: {$paragraphStyle['spaceAfter']}pt";
}
// Отступы первой строки
if (!empty($paragraphStyle['indent'])) {
$indent = $paragraphStyle['indent'];
if (!empty($indent['left'])) {
$styles[] = "margin-left: {$indent['left']}pt";
}
if (!empty($indent['right'])) {
$styles[] = "margin-right: {$indent['right']}pt";
}
if (!empty($indent['firstLine'])) {
$styles[] = "text-indent: {$indent['firstLine']}pt";
}
}
// Табуляция
if (!empty($paragraphStyle['tabs'])) {
$tabStops = [];
foreach ($paragraphStyle['tabs'] as $tab) {
if (!empty($tab['position'])) {
$position = $tab['position'];
$type = $tab['type'] ?? 'left';
$tabStops[] = "{$position}pt {$type}";
}
}
if (!empty($tabStops)) {
$styles[] = "tab-stops: " . implode(', ', $tabStops);
}
}
return implode('; ', $styles);
}
public function generateTextRunStyle($formatting)
{
if (empty($formatting)) {
return '';
}
$styles = [];
// Размер шрифта
if (!empty($formatting['fontSize'])) {
$fontSize = $formatting['fontSize'];
if (is_numeric($fontSize)) {
$styles[] = "font-size: {$fontSize}pt";
} else {
$styles[] = "font-size: {$fontSize}";
}
}
// Цвет шрифта
if (!empty($formatting['fontColor'])) {
$styles[] = "color: {$formatting['fontColor']}";
}
// Цвет фона
if (!empty($formatting['backgroundColor'])) {
$styles[] = "background-color: {$formatting['backgroundColor']}";
}
// Шрифт
if (!empty($formatting['fontFamily'])) {
$styles[] = "font-family: '{$formatting['fontFamily']}'";
}
// Жирный текст
if (!empty($formatting['bold']) && $formatting['bold']) {
$styles[] = "font-weight: bold";
}
// Курсив
if (!empty($formatting['italic']) && $formatting['italic']) {
$styles[] = "font-style: italic";
}
// Подчеркивание
if (!empty($formatting['underline']) && $formatting['underline']) {
$styles[] = "text-decoration: underline";
}
// Зачеркивание
if (!empty($formatting['strikethrough']) && $formatting['strikethrough']) {
$styles[] = "text-decoration: line-through";
}
// Надстрочный индекс
if (!empty($formatting['superscript']) && $formatting['superscript']) {
$styles[] = "vertical-align: super";
$styles[] = "font-size: smaller";
}
// Подстрочный индекс
if (!empty($formatting['subscript']) && $formatting['subscript']) {
$styles[] = "vertical-align: sub";
$styles[] = "font-size: smaller";
}
// Тень текста
if (!empty($formatting['shadow']) && $formatting['shadow']) {
$styles[] = "text-shadow: 1px 1px 2px rgba(0,0,0,0.3)";
}
// Трансформация текста
if (!empty($formatting['textTransform'])) {
$styles[] = "text-transform: {$formatting['textTransform']}";
}
// Межбуквенный интервал
if (!empty($formatting['letterSpacing'])) {
$styles[] = "letter-spacing: {$formatting['letterSpacing']}pt";
}
// Межсловный интервал
if (!empty($formatting['wordSpacing'])) {
$styles[] = "word-spacing: {$formatting['wordSpacing']}pt";
}
return implode('; ', $styles);
}
public function replaceVariables($content, $formData)
{
return $content;
if (empty($content)) {
return $content;
}
// Заменяем плейсхолдеры вида {{variable}}
foreach ($formData as $key => $value) {
if (!empty($value)) {
$content = str_replace("{{{$key}}}", $value, $content);
$content = str_replace("[[{$key}]]", $value, $content);
$content = str_replace("__{$key}__", $value, $content);
}
}
// Заменяем span-плейсхолдеры с data-variable
$dom = new DOMDocument();
@$dom->loadHTML('<?xml encoding="UTF-8"><div>' . $content . '</div>');
$xpath = new DOMXPath($dom);
$placeholders = $xpath->query('//*[@data-variable]');
foreach ($placeholders as $placeholder) {
$variableName = $placeholder->getAttribute('data-variable');
$value = $formData[$variableName] ?? '';
if (!empty($value)) {
// Создаем текстовый узел с значением
$textNode = $dom->createTextNode($value);
$placeholder->parentNode->replaceChild($textNode, $placeholder);
}
}
// Извлекаем HTML обратно
$html = $dom->saveHTML();
$html = preg_replace('/^<!DOCTYPE.*?>\n?/', '', $html);
$html = preg_replace('/<\?xml encoding="UTF-8"\?>\n?/', '', $html);
$html = preg_replace('/^<div>/', '', $html);
$html = preg_replace('/<\/div>$/', '', $html);
return trim($html);
}
private function extractVariablesFromText($text)
{
$variables = [];
// Ищем шаблоны переменных: {{variable}}, [VARIABLE], ${variable}, etc.
$patterns = [
'/\{\{(\w+)\}\}/',
'/\[(\w+)\]/',
'/\$(\w+)/',
'/%(\w+)%/'
];
foreach ($patterns as $pattern) {
if (preg_match_all($pattern, $text, $matches)) {
foreach ($matches[1] as $match) {
$variables[] = $match;
}
}
}
return array_unique($variables);
}
public function extractVariables($structure)
{
$variables = [];
foreach ($structure['elements'] as $element) {
if (!empty($element['variables'])) {
foreach ($element['variables'] as $varName) {
if (!isset($variables[$varName])) {
$variables[$varName] = [
'type' => 'text',
'label' => $this->formatVariableLabel($varName),
'default' => ''
];
}
}
}
if (!array_key_exists('type', $element)) dd($element);
// Для таблиц
if ($element['type'] === 'table' && !empty($element['rows'])) {
foreach ($element['rows'] as $row) {
foreach ($row['cells'] as $cell) {
if (!empty($cell['variables'])) {
foreach ($cell['variables'] as $varName) {
if (!isset($variables[$varName])) {
$variables[$varName] = [
'type' => 'text',
'label' => $this->formatVariableLabel($varName),
'default' => ''
];
}
}
}
}
}
}
}
return $variables;
}
private function formatVariableLabel($varName)
{
return ucfirst(str_replace(['_', '-'], ' ', $varName));
}
private function getHeadingSize($level)
{
$sizes = [
1 => '24pt',
2 => '20pt',
3 => '18pt',
4 => '16pt',
5 => '14pt',
6 => '12pt'
];
return $sizes[$level] ?? '16pt';
}
private function generateTableHTML($rows)
{
$html = '<table style="width: 100%; border-collapse: collapse; margin: 1em 0;">';
foreach ($rows as $row) {
$html .= '<tr>';
foreach ($row['cells'] as $cell) {
$html .= '<td style="border: 1px solid #000; padding: 8px;">';
$html .= htmlspecialchars($cell['content']);
$html .= '</td>';
}
$html .= '</tr>';
}
$html .= '</table>';
return $html;
}
// Метод для обработки загруженного файла
public function parseUploadedFile($uploadedFile)
{
$tempPath = $uploadedFile->getRealPath();
$extension = $uploadedFile->getClientOriginalExtension();
if ($extension !== 'docx') {
throw new Exception("Поддерживаются только файлы .docx");
}
return $this->parse($tempPath);
}
}