Skip to content

Commit 469ca75

Browse files
committed
Do not copy the article content. Return a memoryview.
1 parent 8492547 commit 469ca75

File tree

4 files changed

+123
-41
lines changed

4 files changed

+123
-41
lines changed

libzim/__init__.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,3 @@
1616
#
1717
# You should have received a copy of the GNU General Public License
1818
# along with this program. If not, see <http://www.gnu.org/licenses/>.
19-
20-
21-
from libzim_wrapper import Blob
22-
23-
__all__ = ["Blob"]

libzim/libzim_wrapper.pyx

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ cimport libzim.libzim_wrapper as clibzim
2222

2323
from cython.operator import dereference, preincrement
2424
from cpython.ref cimport PyObject
25+
from cpython.buffer cimport PyBUF_WRITABLE
2526

2627
from libc.stdint cimport uint64_t
2728
from libcpp.string cimport string
@@ -30,14 +31,11 @@ from libcpp.memory cimport shared_ptr, make_shared, unique_ptr
3031

3132
import datetime
3233

33-
34-
35-
3634
#########################
3735
# Blob #
3836
#########################
3937

40-
cdef class Blob:
38+
cdef class WritingBlob:
4139
cdef clibzim.Blob* c_blob
4240
cdef bytes ref_content
4341

@@ -52,6 +50,50 @@ cdef class Blob:
5250
if self.c_blob != NULL:
5351
del self.c_blob
5452

53+
cdef Py_ssize_t itemsize = 1
54+
55+
cdef class ReadingBlob:
56+
cdef clibzim.Blob c_blob
57+
cdef Py_ssize_t size
58+
cdef int view_count
59+
60+
cdef __setup(self, clibzim.Blob blob):
61+
"""Assigns an internal pointer to the wrapped C++ article object.
62+
63+
Parameters
64+
----------
65+
*art : Article
66+
Pointer to a C++ (zim::) article object
67+
"""
68+
# Set new internal C zim.ZimArticle article
69+
self.c_blob = blob
70+
self.size = blob.size()
71+
self.view_count = 0
72+
73+
def __dealloc__(self):
74+
if self.view_count:
75+
raise RuntimeError("Blob has views")
76+
77+
def __getbuffer__(self, Py_buffer *buffer, int flags):
78+
if flags&PyBUF_WRITABLE:
79+
raise BufferError("Cannot create writable memoryview on readonly data")
80+
buffer.obj = self
81+
buffer.buf = <void*>self.c_blob.data()
82+
buffer.len = self.size
83+
buffer.readonly = 1
84+
buffer.format = 'c'
85+
buffer.internal = NULL # see References
86+
buffer.itemsize = itemsize
87+
buffer.ndim = 1
88+
buffer.shape = &self.size
89+
buffer.strides = &itemsize
90+
buffer.suboffsets = NULL # for pointer arrays only
91+
92+
self.view_count += 1
93+
94+
def __releasebuffer__(self, Py_buffer *buffer):
95+
self.view_count -= 1
96+
5597

5698
#------ Helper for pure virtual methods --------
5799

@@ -76,7 +118,7 @@ cdef public api:
76118

77119
clibzim.Blob blob_cy_call_fct(object obj, string method, int *error) with gil:
78120
"""Lookup and execute a pure virtual method on ZimArticle returning a Blob"""
79-
cdef Blob blob
121+
cdef WritingBlob blob
80122

81123
func = get_article_method_from_object(obj, method, error)
82124
blob = func()
@@ -198,12 +240,17 @@ cdef class ReadArticle:
198240
Creates a python ZimArticle from a C++ zim.Article article.
199241
"""
200242
cdef clibzim.Article c_article
243+
cdef ReadingBlob _blob
244+
cdef bool _haveBlob
201245

202246
#def __eq__(self, other):
203247
# if isinstance(other, ZimArticle):
204248
# return (self.longurl == other.longurl) and (self.content == other.content) and (self.is_redirect == other.is_redirect)
205249
# return False
206250

251+
def __cinit__(self):
252+
self._haveBlob = False
253+
207254
cdef __setup(self, clibzim.Article art):
208255
"""Assigns an internal pointer to the wrapped C++ article object.
209256
@@ -214,6 +261,7 @@ cdef class ReadArticle:
214261
"""
215262
# Set new internal C zim.ZimArticle article
216263
self.c_article = art
264+
self._blob = None
217265

218266

219267

@@ -248,9 +296,11 @@ cdef class ReadArticle:
248296
@property
249297
def content(self):
250298
"""Get the article's content"""
251-
cdef clibzim.Blob blob = self.c_article.getData(<int> 0)
252-
data = blob.data()[:blob.size()]
253-
return data
299+
if not self._haveBlob:
300+
self._blob = ReadingBlob()
301+
self._blob.__setup(self.c_article.getData(<int> 0))
302+
self._haveBlob = True
303+
return memoryview(self._blob)
254304

255305
@property
256306
def longurl(self):

libzim/writer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from collections import defaultdict
2323

2424
import libzim_wrapper
25-
from libzim_wrapper import Blob
25+
from libzim_wrapper import WritingBlob as Blob
2626

2727
__all__ = ["Article", "Blob", "Creator"]
2828

tests/test_libzim_file_reader.py

Lines changed: 64 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,33 @@
1-
import pytest
1+
import gc
22
from pathlib import Path
33

4-
DATA_DIR = Path(__file__).parent
4+
import pytest
55

66
from libzim.reader import File
77

8+
DATA_DIR = Path(__file__).parent
9+
10+
11+
ZIMFILES = [
12+
{
13+
'filename': str(DATA_DIR/"wikipedia_es_physics_mini.zim"),
14+
'checksum': u"99ea7a5598c6040c4f50b8ac0653b703",
15+
'namespaces': u"-AIMX",
16+
'article_count': 22027,
17+
'main_page_url': u"A/index",
18+
}
19+
]
20+
21+
22+
23+
24+
@pytest.fixture(params=ZIMFILES)
25+
def zimdata(request):
26+
return request.param
27+
828
@pytest.fixture
9-
def reader_data():
10-
return (
11-
File(str(DATA_DIR/"wikipedia_es_physics_mini.zim")),
12-
{
13-
'filename': str(DATA_DIR/"wikipedia_es_physics_mini.zim"),
14-
'checksum': u"99ea7a5598c6040c4f50b8ac0653b703",
15-
'namespaces': u"-AIMX",
16-
'article_count': 22027,
17-
'main_page_url': u"A/index"
18-
}
19-
)
29+
def reader(zimdata):
30+
return File(zimdata['filename'])
2031

2132

2233
@pytest.fixture
@@ -25,45 +36,71 @@ def article_data():
2536
'url': u"A/Albert_Einstein",
2637
'title': u"Albert Einstein",
2738
'mimetype':u"text/html",
28-
'article_id': 663
39+
'article_id': 663,
40+
'size': 17343
2941
}
3042

3143

32-
def test_zim_filename(reader_data):
33-
reader, data = reader_data
34-
for k, v in data.items():
44+
def test_zim_filename(reader, zimdata):
45+
for k, v in zimdata.items():
3546
assert getattr(reader, k) == v
3647

37-
def test_zim_read(reader_data, article_data):
38-
reader, _ = reader_data
48+
def test_zim_read(reader, article_data):
3949
article = reader.get_article(article_data['url'])
4050

4151
assert article.longurl == article_data['url']
4252
assert article.title == article_data['title']
4353
assert article.url == article_data['url'][2:]
4454
assert article.mimetype == article_data['mimetype']
55+
assert isinstance(article.content, memoryview)
56+
assert len(article.content) == article_data['size']
4557

46-
def test_get_article_by_id(reader_data, article_data):
47-
reader, _ = reader_data
58+
def test_content_ref_keep(reader):
59+
"""Get the memoryview on a content and loose the reference on the article.
60+
We try to load a lot of other articles to detect possible use of dandling pointer
61+
"""
62+
content =None
63+
def get_content():
64+
nonlocal content
65+
article = reader.get_article(u"A/Albert_Einstein")
66+
assert isinstance(article.content, memoryview)
67+
content = article.content
68+
get_content() # Now we have a content but no reference to the article.
69+
gc.collect()
70+
# Load a lot of content
71+
for i in range(0, reader.article_count, 2):
72+
article = reader.get_article_by_id(i)
73+
if not article.is_redirect:
74+
c = article.content
75+
# Check everything is ok
76+
assert len(content) == 17343
77+
assert bytes(content[:100]) == b'<!DOCTYPE html>\n<html class="client-js"><head>\n <meta charset="UTF-8">\n <title>Albert Einstein</ti'
78+
79+
def test_get_article_by_id(reader, article_data):
80+
return
4881
article = reader.get_article_by_id(article_data['article_id'])
4982

5083
assert article.longurl == article_data['url']
5184
assert article.title == article_data['title']
5285
assert article.url == article_data['url'][2:]
5386
assert article.mimetype == article_data['mimetype']
5487

55-
def test_namespace_count(reader_data):
56-
reader, _ = reader_data
88+
def test_namespace_count(reader):
5789
namespaces = reader.namespaces
5890
num_articles = sum(reader.get_namespaces_count(ns) for ns in namespaces)
5991
assert reader.article_count == num_articles
6092

61-
def test_suggest(reader_data):
62-
reader, _ = reader_data
93+
def test_suggest(reader):
6394
results = reader.suggest(u"Einstein")
6495
assert u"A/Albert_Einstein" in list(results)
6596

66-
def test_search(reader_data):
67-
reader, _ = reader_data
97+
def test_search(reader):
6898
results = reader.search(u"Einstein")
6999
assert len(list(results)) == 10
100+
101+
102+
def test_get_wrong_article(reader):
103+
with pytest.raises(RuntimeError):
104+
reader.get_article_by_id(reader.article_count + 100)
105+
with pytest.raises(RuntimeError):
106+
reader.get_article("A/I_do_not_exists")

0 commit comments

Comments
 (0)