Skip to content

Commit 62086d0

Browse files
authored
Color and font information for chars, words and boxes (#39)
The information originates in chars and is propagated up to words, lines and boxes whenever all contained elements share the same value.
1 parent 518ead3 commit 62086d0

File tree

6 files changed

+200
-1
lines changed

6 files changed

+200
-1
lines changed

libpdf/models/horizontal_box.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ class Char: # pylint: disable=too-few-public-methods # simplicity is good.
1919
:ivar y1: distance from the bottom of the page to the upper edge of the character
2020
(greater than y0)
2121
:vartype y1: float
22+
:ivar ncolor: non-stroking-color as rgb value
23+
:vartype ncolor: Tuple[float, float, float]
2224
"""
2325

2426
def __init__(
@@ -28,13 +30,17 @@ def __init__(
2830
y0: float | None = None,
2931
x1: float | None = None,
3032
y1: float | None = None,
33+
ncolor: tuple | None = None,
34+
fontname: str | None = None,
3135
):
3236
"""Init with plain char of a character and its rectangular coordinates."""
3337
self.x0 = x0
3438
self.y0 = y0
3539
self.x1 = x1
3640
self.y1 = y1
3741
self.text = text
42+
self.ncolor = ncolor
43+
self.fontname = fontname
3844

3945
def __repr__(self) -> str:
4046
"""Make the text part of the repr for better debugging."""
@@ -65,13 +71,24 @@ def __init__(
6571
self.x1 = x1
6672
self.y1 = y1
6773
self.chars = chars
74+
self.ncolor = None
75+
self.fontname = None
76+
6877
if self.chars:
6978
# Obtain the rectangle coordinates from a list of libpdf text objects
7079
self.x0 = min(text_obj.x0 for text_obj in self.chars)
7180
self.y0 = min(text_obj.y0 for text_obj in self.chars)
7281
self.x1 = max(text_obj.x1 for text_obj in self.chars)
7382
self.y1 = max(text_obj.y1 for text_obj in self.chars)
7483

84+
for n in ["ncolor", "fontname"]:
85+
if all(
86+
getattr(x, n) == getattr(self.chars[0], n)
87+
and getattr(x, n) is not None
88+
for x in self.chars
89+
):
90+
setattr(self, n, getattr(self.chars[0], n))
91+
7592
@property
7693
def text(self) -> str:
7794
"""Return plain text."""
@@ -106,13 +123,24 @@ def __init__(
106123
self.x1 = x1
107124
self.y1 = y1
108125
self.words = words
126+
self.ncolor = None
127+
self.fontname = None
128+
109129
if self.words:
110130
# Obtain the rectangle coordinates from a list of libpdf text objects
111131
self.x0 = min(text_obj.x0 for text_obj in self.words)
112132
self.y0 = min(text_obj.y0 for text_obj in self.words)
113133
self.x1 = max(text_obj.x1 for text_obj in self.words)
114134
self.y1 = max(text_obj.y1 for text_obj in self.words)
115135

136+
for n in ["ncolor", "fontname"]:
137+
if all(
138+
getattr(x, n) == getattr(self.words[0], n)
139+
and getattr(x, n) is not None
140+
for x in self.words
141+
):
142+
setattr(self, n, getattr(self.words[0], n))
143+
116144
@property
117145
def text(self) -> str:
118146
"""Return plain text."""
@@ -147,18 +175,35 @@ def __init__(
147175
self.x1 = x1
148176
self.y1 = y1
149177
self.lines = lines
178+
self.ncolor = None
179+
self.fontname = None
180+
150181
if self.lines:
151182
# Obtain the rectangle coordinates from a list of libpdf text objects.
152183
self.x0 = min(text_obj.x0 for text_obj in self.lines)
153184
self.y0 = min(text_obj.y0 for text_obj in self.lines)
154185
self.x1 = max(text_obj.x1 for text_obj in self.lines)
155186
self.y1 = max(text_obj.y1 for text_obj in self.lines)
156187

188+
_words = [word for line in self.lines for word in line.words]
189+
190+
for n in ["ncolor", "fontname"]:
191+
if all(
192+
getattr(x, n) == getattr(_words[0], n) and getattr(x, n) is not None
193+
for x in _words
194+
):
195+
setattr(self, n, getattr(_words[0], n))
196+
157197
@property
def text(self) -> str:
    """Return the plain text of the box, joining the text of its lines with newlines."""
    line_texts = (line.text for line in self.lines)
    return "\n".join(line_texts)
161201

202+
@property
def words(self) -> list[Word]:
    """
    Return all words of the box as one flat list.

    The words are collected line by line, so the order of ``self.lines`` and the word order
    inside each line are kept. The items are the word objects stored on the lines (not plain
    strings), so attributes such as ``text``, ``ncolor`` and ``fontname`` stay accessible.

    :return: flat list of word objects; empty if the box has no lines
    """
    # ``lines`` (and the ``words`` of a line) may be unset, treat that like an empty list
    return [word for line in self.lines or [] for word in line.words or []]
206+
162207
def __repr__(self) -> str | None:
163208
"""Make the text part of the repr for better debugging."""
164209
if self.lines:

libpdf/utils.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,15 @@ def assemble_to_textlines(
488488
for lt_obj in flatten_lt_objs:
489489
if lt_obj.get_text() != " " and lt_obj.get_text() != "\n":
490490
# instantiate Char
491-
char = Char(lt_obj.get_text(), lt_obj.x0, lt_obj.y0, lt_obj.x1, lt_obj.y1)
491+
char = Char(
492+
lt_obj.get_text(),
493+
lt_obj.x0,
494+
lt_obj.y0,
495+
lt_obj.x1,
496+
lt_obj.y1,
497+
lt_obj.graphicstate.ncolor if hasattr(lt_obj, "graphicstate") else None,
498+
lt_obj.fontname,
499+
)
492500
chars.append(char)
493501

494502
if lt_obj is flatten_lt_objs[-1]:

tests/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@
3434
# test PDF for rect extraction generated by sphinx-simplepdf
3535
PDF_RECTS_EXTRACTION = Path(__file__).parent / "pdf" / "test_rects_extraction.pdf"
3636

37+
# test PDF for color style info
38+
PDF_COLOR_STYLE = Path(__file__).parent / "pdf" / "test_words_color_style.pdf"
39+
3740

3841
@pytest.fixture(scope="session")
3942
def load_full_features_pdf(
26.3 KB
Binary file not shown.
34.3 KB
Binary file not shown.

tests/test_word_colors.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
"""Test catalog extraction."""
2+
3+
import libpdf
4+
from tests.conftest import PDF_COLOR_STYLE
5+
6+
7+
def test_colors_0() -> None:
    """Check that the text box of the chapter "Color in Text and Heading" is red."""
    pdf = libpdf.load(PDF_COLOR_STYLE)
    assert pdf is not None
    chapters = pdf.flattened.chapters
    assert chapters

    for chapter in chapters:
        if chapter.title != "Color in Text and Heading":
            continue
        # all glyphs of the chapter text box share the rgb value for red
        assert chapter.textbox.ncolor == (1, 0, 0)
16+
17+
18+
def test_colors_1() -> None:
    """Check the box color of a blue and of a black paragraph in chapter "HorizontalLine"."""
    pdf = libpdf.load(PDF_COLOR_STYLE)
    assert pdf is not None
    chapters = pdf.flattened.chapters
    assert chapters

    for chapter in chapters:
        if chapter.title != "HorizontalLine":
            continue
        for element in chapter.content:
            # only paragraphs are inspected, other element types are ignored
            if element.type != "paragraph":
                continue
            if "Paragraph text is blue" in element.textbox.text:
                assert element.textbox.ncolor == (0, 0, 1)
            if "This chapter is for" in element.textbox.text:
                assert element.textbox.ncolor == (0, 0, 0)
37+
38+
39+
def test_colors_2() -> None:
    """Check box colors for uniformly colored paragraphs and for paragraphs mixing line colors."""
    pdf = libpdf.load(PDF_COLOR_STYLE)
    assert pdf is not None
    chapters = pdf.flattened.chapters
    assert chapters

    for chapter in chapters:
        if chapter.title == "HorizontalBox":
            # every paragraph of this chapter is expected to be green as a whole
            for element in chapter.content:
                if element.type == "paragraph":
                    assert element.textbox.ncolor == (0, 1, 0)
            continue

        if chapter.title != "UncoloredHorizontalbox":
            continue

        for element in chapter.content:
            if element.type != "paragraph":
                continue
            # the box has no common color, but each of its lines has one
            assert element.textbox.ncolor is None
            for line in element.textbox.lines:
                assert line.ncolor is not None
56+
57+
58+
def test_colors_3() -> None:
    """Check per-word colors in the paragraph containing "This line has no color"."""
    # word text -> colors accepted for that word
    accepted_colors = {
        "has": [(0, 0, 1)],
        "color": [(0, 1, 0), (0, 0, 0)],
        "changes": [(1, 0, 0)],
        "words": [(0, 0, 1)],
    }

    pdf = libpdf.load(PDF_COLOR_STYLE)
    assert pdf is not None
    chapters = pdf.flattened.chapters
    assert chapters

    for chapter in chapters:
        if "Words" not in chapter.title:
            continue
        for element in chapter.content:
            if element.type != "paragraph":
                continue
            if "This line has no color" not in element.textbox.text:
                continue

            # the words differ in color, so the box itself carries none
            assert element.textbox.ncolor is None

            for word in element.textbox.words:
                candidates = accepted_colors.get(word.text)
                if candidates is not None:
                    assert word.ncolor in candidates
82+
83+
84+
def test_colors_4() -> None:
    """Check that the words of the "This words have no color" text are uncolored or black."""
    pdf = libpdf.load(PDF_COLOR_STYLE)
    assert pdf is not None
    chapters = pdf.flattened.chapters
    assert chapters

    for chapter in chapters:
        if "Words" not in chapter.title:
            continue
        for element in chapter.content:
            if "This words have no color" not in element.textbox.text:
                continue

            assert element.textbox.ncolor is None

            for word in element.textbox.words:
                # either no common color at all or plain black
                assert word.ncolor in (None, (0, 0, 0))
98+
99+
100+
def test_colors_5() -> None:
    """Check per-word colors in the text containing "These words are printed"."""
    # word text -> the one color expected for that word
    expected_color = {
        "words": (0, 1, 0),
        "but": (0, 1, 0),
        "printed": (0, 0, 1),
        "background": (1, 0, 0),
    }

    pdf = libpdf.load(PDF_COLOR_STYLE)
    assert pdf is not None
    chapters = pdf.flattened.chapters
    assert chapters

    for chapter in chapters:
        if "Words" not in chapter.title:
            continue
        for element in chapter.content:
            if "These words are printed" not in element.textbox.text:
                continue

            # differently colored words leave the box without a common color
            assert element.textbox.ncolor is None

            for word in element.textbox.words:
                if word.text in expected_color:
                    assert word.ncolor == expected_color[word.text]
119+
120+
121+
def test_colors_6() -> None:
    """
    Test font names of bold and italic words in the "Styled Text" chapter.

    In the paragraph containing "bold text format" only the word "bold" may use a font whose
    name contains "Bold"; in the paragraph containing "italic text format" only the word
    "italic" may use a font whose name contains "Italic".
    """
    objects = libpdf.load(PDF_COLOR_STYLE)
    assert objects is not None
    assert objects.flattened.chapters

    for chapter in objects.flattened.chapters:
        if "Styled Text" in chapter.title:
            for content in chapter.content:
                if "bold text format" in content.textbox.text:
                    for word in content.textbox.words:
                        if word.text == "bold":
                            assert "Bold" in word.fontname
                        else:
                            assert "Bold" not in word.fontname
                elif "italic text format" in content.textbox.text:
                    # Iterate the words of this paragraph. Without the loop, ``word`` would be
                    # a leftover of an earlier paragraph (or undefined) and the italic checks
                    # would never look at the italic text.
                    for word in content.textbox.words:
                        if word.text == "italic":
                            assert "Italic" in word.fontname
                        else:
                            assert "Italic" not in word.fontname
                # Underlined text is not checked: the underline seems to be extracted as rect.

0 commit comments

Comments
 (0)