Skip to content

Commit d40a1cf

Browse files
committed
Xml Reader Rich Text
Fix #4001. Thanks to @SlowFox71, who reported the problem and developed most of the solution. This PR adds Rich Text support to the XML reader. The Xml Spreadsheet stores Rich Text as Html tags that are children of the ss:Data tag and use a specific namespace. These can be parsed into a RichText object using the existing method Helper/Html::toRichTextObject. Two items need special attention. First, for attributes like bold or italic, Excel uses the appropriate Html tag (e.g. `<B>`). However, for an attribute like color, Excel uses `<Font html:Color="#FF0000">`, with a prefix on the Color attribute. PhpSpreadsheet's Html parser cannot cope with the prefix. The parser is changed to strip `html:` from attribute names for the Font tag. Second, the example cited by the user used a `<BR />` to indicate a line break in the data. However, it appears that, at least some of the time, Excel will instead use `&#10;` to indicate a line break. The existing parser reduces one or more whitespace characters in the text to a single space, and so the line break represented by `&#10;` is lost. I am not sure why the existing code does this, but I do know that I am not willing to break it. Instead, I've added an optional boolean parameter `$preserveWhiteSpace` to `toRichTextObject`. If false (default), the existing logic will be used; but if true, substitution for whitespace characters in the text will not happen. The Xml reader passes true.
1 parent 35030fa commit d40a1cf

File tree

3 files changed

+188
-6
lines changed

3 files changed

+188
-6
lines changed

src/PhpSpreadsheet/Helper/Html.php

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,8 @@ class Html
595595

596596
private RichText $richTextObject;
597597

598+
private bool $preserveWhiteSpace = false;
599+
598600
private function initialise(): void
599601
{
600602
$this->face = $this->size = $this->color = null;
@@ -608,7 +610,7 @@ private function initialise(): void
608610
/**
609611
* Parse HTML formatting and return the resulting RichText.
610612
*/
611-
public function toRichTextObject(string $html): RichText
613+
public function toRichTextObject(string $html, bool $preserveWhiteSpace = false): RichText
612614
{
613615
$this->initialise();
614616

@@ -622,7 +624,9 @@ public function toRichTextObject(string $html): RichText
622624
$dom->preserveWhiteSpace = false;
623625

624626
$this->richTextObject = new RichText();
627+
$this->preserveWhiteSpace = $preserveWhiteSpace;
625628
$this->parseElements($dom);
629+
$this->preserveWhiteSpace = false;
626630

627631
// Clean any further spurious whitespace
628632
$this->cleanWhitespace();
@@ -706,6 +710,7 @@ protected function startFontTag(DOMElement $tag): void
706710
if ($attrs !== null) {
707711
foreach ($attrs as $attribute) {
708712
$attributeName = strtolower($attribute->name);
713+
$attributeName = preg_replace('/^html:/', '', $attributeName) ?? $attributeName; // in case from Xml spreadsheet
709714
$attributeValue = $attribute->value;
710715

711716
if ($attributeName == 'color') {
@@ -795,11 +800,15 @@ public function breakTag(): void
795800

796801
private function parseTextNode(DOMText $textNode): void
797802
{
798-
$domText = (string) preg_replace(
799-
'/\s+/u',
800-
' ',
801-
str_replace(["\r", "\n"], ' ', $textNode->nodeValue ?? '')
802-
);
803+
if ($this->preserveWhiteSpace) {
804+
$domText = $textNode->nodeValue ?? '';
805+
} else {
806+
$domText = (string) preg_replace(
807+
'/\s+/u',
808+
' ',
809+
str_replace(["\r", "\n"], ' ', $textNode->nodeValue ?? '')
810+
);
811+
}
803812
$this->stringData .= $domText;
804813
$this->buildTextRun();
805814
}

src/PhpSpreadsheet/Reader/Xml.php

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
99
use PhpOffice\PhpSpreadsheet\Cell\DataType;
1010
use PhpOffice\PhpSpreadsheet\DefinedName;
11+
use PhpOffice\PhpSpreadsheet\Helper\Html as HelperHtml;
1112
use PhpOffice\PhpSpreadsheet\Reader\Security\XmlScanner;
1213
use PhpOffice\PhpSpreadsheet\Reader\Xlsx\Namespaces;
1314
use PhpOffice\PhpSpreadsheet\Reader\Xml\PageSettings;
@@ -426,6 +427,14 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet, boo
426427
*/
427428
case 'String':
428429
$type = DataType::TYPE_STRING;
430+
$rich = $cellData->children('http://www.w3.org/TR/REC-html40');
431+
if ($rich) {
432+
// in case of HTML content we extract the payload
433+
// and convert it into a rich text object
434+
$content = $cellData->asXML() ?: '';
435+
$html = new HelperHtml();
436+
$cellValue = $html->toRichTextObject($content, true);
437+
}
429438

430439
break;
431440
case 'Number':
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpOffice\PhpSpreadsheetTests\Reader\Xml;
6+
7+
use PhpOffice\PhpSpreadsheet\Reader\Xml;
8+
use PhpOffice\PhpSpreadsheet\RichText\RichText;
9+
use PhpOffice\PhpSpreadsheet\RichText\Run;
10+
use PHPUnit\Framework\TestCase;
11+
12+
class XmlRichTextTest extends TestCase
13+
{
14+
public function testBreakTag(): void
15+
{
16+
$xmldata = <<< 'EOT'
17+
<?xml version="1.0"?>
18+
<?mso-application progid="Excel.Sheet"?>
19+
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
20+
xmlns:o="urn:schemas-microsoft-com:office:office"
21+
xmlns:x="urn:schemas-microsoft-com:office:excel"
22+
xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
23+
xmlns:html="http://www.w3.org/TR/REC-html40">
24+
<Worksheet ss:Name="Test">
25+
<ss:Table>
26+
<ss:Row>
27+
<ss:Cell>
28+
<ss:Data ss:Type="String" xmlns="http://www.w3.org/TR/REC-html40"><I>italic</I><B>bold</B><BR />second line</ss:Data>
29+
</ss:Cell>
30+
</ss:Row>
31+
</ss:Table>
32+
</Worksheet>
33+
</Workbook>
34+
EOT;
35+
$reader = new Xml();
36+
$spreadsheet = $reader->loadSpreadsheetFromString($xmldata);
37+
self::assertEquals(1, $spreadsheet->getSheetCount());
38+
39+
$sheet = $spreadsheet->getActiveSheet();
40+
self::assertEquals('Test', $sheet->getTitle());
41+
$richText = $sheet->getCell('A1')->getValue();
42+
self::assertInstanceOf(RichText::class, $richText);
43+
$elements = $richText->getRichTextElements();
44+
self::assertCount(3, $elements);
45+
$run = $elements[0];
46+
self::assertInstanceOf(Run::class, $run);
47+
self::assertSame('italic', $run->getText());
48+
self::assertNotNull($run->getFont());
49+
self::assertTrue($run->getFont()->getItalic());
50+
self::assertFalse($run->getFont()->getBold());
51+
52+
$run = $elements[1];
53+
self::assertInstanceOf(Run::class, $run);
54+
self::assertSame('bold', $run->getText());
55+
self::assertNotNull($run->getFont());
56+
self::assertFalse($run->getFont()->getItalic());
57+
self::assertTrue($run->getFont()->getBold());
58+
59+
$run = $elements[2];
60+
self::assertInstanceOf(Run::class, $run);
61+
self::assertSame("\nsecond line", $run->getText());
62+
63+
$spreadsheet->disconnectWorksheets();
64+
}
65+
66+
public function testNewlineAndFontTag(): void
67+
{
68+
$xmldata = <<< 'EOT'
69+
<?xml version="1.0"?>
70+
<?mso-application progid="Excel.Sheet"?>
71+
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
72+
xmlns:o="urn:schemas-microsoft-com:office:office"
73+
xmlns:x="urn:schemas-microsoft-com:office:excel"
74+
xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
75+
xmlns:html="http://www.w3.org/TR/REC-html40">
76+
<DocumentProperties xmlns="urn:schemas-microsoft-com:office:office">
77+
<LastAuthor>Owen Leibman</LastAuthor>
78+
<Created>2024-04-28T06:03:14Z</Created>
79+
<Version>16.00</Version>
80+
</DocumentProperties>
81+
<OfficeDocumentSettings xmlns="urn:schemas-microsoft-com:office:office">
82+
<AllowPNG/>
83+
</OfficeDocumentSettings>
84+
<ExcelWorkbook xmlns="urn:schemas-microsoft-com:office:excel">
85+
<WindowHeight>6510</WindowHeight>
86+
<WindowWidth>19200</WindowWidth>
87+
<WindowTopX>32767</WindowTopX>
88+
<WindowTopY>32767</WindowTopY>
89+
<ProtectStructure>False</ProtectStructure>
90+
<ProtectWindows>False</ProtectWindows>
91+
</ExcelWorkbook>
92+
<Styles>
93+
<Style ss:ID="Default" ss:Name="Normal">
94+
<Alignment ss:Vertical="Bottom"/>
95+
<Borders/>
96+
<Font ss:FontName="Aptos Narrow" x:Family="Swiss" ss:Size="11"
97+
ss:Color="#000000"/>
98+
<Interior/>
99+
<NumberFormat/>
100+
<Protection/>
101+
</Style>
102+
<Style ss:ID="s63">
103+
<Alignment ss:Vertical="Bottom" ss:WrapText="1"/>
104+
<Borders/>
105+
<Font ss:FontName="Aptos Narrow" x:Family="Swiss" ss:Size="11" ss:Italic="1"/>
106+
<Interior/>
107+
<NumberFormat/>
108+
<Protection/>
109+
</Style>
110+
</Styles>
111+
<Worksheet ss:Name="Test">
112+
<Table ss:ExpandedColumnCount="1" ss:ExpandedRowCount="1" x:FullColumns="1"
113+
x:FullRows="1" ss:DefaultRowHeight="14.5">
114+
<Row ss:AutoFitHeight="0" ss:Height="47.5">
115+
<Cell ss:StyleID="s63"><ss:Data ss:Type="String"
116+
xmlns="http://www.w3.org/TR/REC-html40"><I>italic</I><B>bold&#10;</B><Font
117+
html:Color="#FF0000">second</Font><Font> line</Font></ss:Data></Cell>
118+
</Row>
119+
</Table>
120+
<WorksheetOptions xmlns="urn:schemas-microsoft-com:office:excel">
121+
<Unsynced/>
122+
<Selected/>
123+
<ProtectObjects>False</ProtectObjects>
124+
<ProtectScenarios>False</ProtectScenarios>
125+
</WorksheetOptions>
126+
</Worksheet>
127+
</Workbook>
128+
EOT;
129+
$reader = new Xml();
130+
$spreadsheet = $reader->loadSpreadsheetFromString($xmldata);
131+
self::assertEquals(1, $spreadsheet->getSheetCount());
132+
133+
$sheet = $spreadsheet->getActiveSheet();
134+
self::assertEquals('Test', $sheet->getTitle());
135+
$richText = $sheet->getCell('A1')->getValue();
136+
self::assertInstanceOf(RichText::class, $richText);
137+
$elements = $richText->getRichTextElements();
138+
self::assertCount(4, $elements);
139+
$run = $elements[0];
140+
self::assertInstanceOf(Run::class, $run);
141+
self::assertSame('italic', $run->getText());
142+
self::assertNotNull($run->getFont());
143+
self::assertTrue($run->getFont()->getItalic());
144+
self::assertFalse($run->getFont()->getBold());
145+
146+
$run = $elements[1];
147+
self::assertInstanceOf(Run::class, $run);
148+
self::assertSame("bold\n", $run->getText());
149+
self::assertNotNull($run->getFont());
150+
self::assertFalse($run->getFont()->getItalic());
151+
self::assertTrue($run->getFont()->getBold());
152+
153+
$run = $elements[2];
154+
self::assertInstanceOf(Run::class, $run);
155+
self::assertSame('second', $run->getText());
156+
self::assertSame('FF0000', $run->getFont()?->getColor()->getRgb());
157+
158+
$run = $elements[3];
159+
self::assertInstanceOf(Run::class, $run);
160+
self::assertSame(' line', $run->getText());
161+
162+
$spreadsheet->disconnectWorksheets();
163+
}
164+
}

0 commit comments

Comments
 (0)