Skip to content

Commit 0520e93

Browse files
zzamboni and vzhd1701 authored
fix: fix parse errors for huge and empty nodes (#102)
* Fix parse errors for huge and empty nodes

  - Enabled the "huge_tree" option in the XML parser to prevent the "xmlSAX2Characters: huge text node" error.
  - Fixed a "list index out of range" error that happened on some notes with a title but no content.

  Fixes #101.

* refactor: make empty dom check explicit
* test: add big resource note test
* test: add empty note dom test

---------

Co-authored-by: vzhd1701 <vzhd1701@gmail.com>
1 parent: dd29618 · commit: 0520e93

4 files changed

Lines changed: 87 additions & 0 deletions

File tree

enex2notion/enex_parser_xml.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ def iter_process_xml_elements(
2727
recover=True,
2828
strip_cdata=False,
2929
resolve_entities=False,
30+
huge_tree=True,
3031
)
3132

3233
try:

enex2notion/note_parser/note.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,9 @@ def _parse_note_dom(note: EvernoteNote) -> Optional[Tag]:
4444
logger.error(f"Failed to extract DOM from note '{note.title}'")
4545
return None
4646

47+
if len(note_dom.contents) == 0:
48+
return None
49+
4750
return _filter_yinxiang_markdown(note_dom)
4851

4952

tests/test_enex_parser.py

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import base64
12
import datetime
23
import logging
34
from pathlib import Path
@@ -547,6 +548,72 @@ def test_iter_notes_single_with_resource(fs):
547548
assert notes[0].resource_by_md5("000") is None
548549

549550

551+
def test_iter_notes_single_with_huge_resource(fs, caplog):
552+
test_enex_head = b"""<?xml version="1.0" encoding="UTF-8"?>
553+
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
554+
<en-export export-date="20211218T085932Z" application="Evernote" version="10.25.6">
555+
<note>
556+
<title>test1</title>
557+
<created>20211118T085332Z</created>
558+
<updated>20211118T085920Z</updated>
559+
<note-attributes>
560+
</note-attributes>
561+
<content>test</content>
562+
<resource>
563+
<data encoding="base64">
564+
"""
565+
test_enex_tail = b"""
566+
</data>
567+
<mime>image/gif</mime>
568+
<resource-attributes>
569+
<file-name>smallest.gif</file-name>
570+
</resource-attributes>
571+
</resource>
572+
</note>
573+
</en-export>
574+
"""
575+
test_enex_file = fs.create_file("test.enex", contents=test_enex_head)
576+
577+
# 10 MB
578+
big_binary = b"\x00" * 10 * 1024 * 1024
579+
big_binary_hash = "f1c9645dbc14efddc7d8a322685f26eb"
580+
581+
with Path("test.enex").open("ab+") as f:
582+
f.write(base64.b64encode(big_binary))
583+
f.write(test_enex_tail)
584+
585+
with caplog.at_level(logging.WARNING, logger="enex2notion"):
586+
notes_count = count_notes(Path("test.enex"))
587+
588+
notes = list(iter_notes(Path("test.enex")))
589+
590+
expected_resource = EvernoteResource(
591+
data_bin=big_binary,
592+
size=len(big_binary),
593+
md5=big_binary_hash,
594+
mime="image/gif",
595+
file_name="smallest.gif",
596+
)
597+
598+
assert caplog.text == ""
599+
assert notes_count == 1
600+
assert notes == [
601+
EvernoteNote(
602+
title="test1",
603+
created=datetime.datetime(2021, 11, 18, 8, 53, 32, tzinfo=tzutc()),
604+
updated=datetime.datetime(2021, 11, 18, 8, 59, 20, tzinfo=tzutc()),
605+
content="test",
606+
tags=[],
607+
author="",
608+
url="",
609+
is_webclip=False,
610+
resources=[expected_resource],
611+
),
612+
]
613+
assert notes[0].resource_by_md5(big_binary_hash) == expected_resource
614+
assert notes[0].resource_by_md5("000") is None
615+
616+
550617
def test_iter_notes_single_with_noext_resource(fs):
551618
test_enex = """<?xml version="1.0" encoding="UTF-8"?>
552619
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">

tests/test_note_parser.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -654,6 +654,22 @@ def test_linebreaks_inside_root(parse_html):
654654
]
655655

656656

657+
def test_empty_note(parse_rules):
658+
test_note = EvernoteNote(
659+
title="test1",
660+
created=datetime(2021, 11, 18, 0, 0, 0, tzinfo=tzutc()),
661+
updated=datetime(2021, 11, 18, 0, 0, 0, tzinfo=tzutc()),
662+
content="<en-note></en-note>",
663+
tags=[],
664+
author="",
665+
url="",
666+
is_webclip=False,
667+
resources=[],
668+
)
669+
670+
assert parse_note(test_note, parse_rules) == []
671+
672+
657673
def test_yinxiang_markdown(parse_rules):
658674
test_note = EvernoteNote(
659675
title="test1",

0 commit comments

Comments
 (0)