Skip to content

Commit ba979fc

Browse files
committed
Rework mimetypes handling (#75)
- Fix an issue where native (Python and platform) MIME=>ext conversions were not patched. - Allow user config to overwrite the patch. - Patch mimetypes.init() to defer patching and improve performance, and to allow unittest mocks of the user config directory to take effect for the mimetypes module.
1 parent d851b3e commit ba979fc

File tree

3 files changed

+165
-56
lines changed

3 files changed

+165
-56
lines changed

tests/test_mimetypes.py

+63-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
1+
import importlib
12
import os
3+
import tempfile
24
import unittest
35
from unittest import mock
46

7+
from webscrapbook import WSB_USER_DIR
58
from webscrapbook._polyfill import mimetypes
69

10+
from . import TEMP_DIR
11+
712

813
def setUpModule():
914
# mock out user config
@@ -14,15 +19,25 @@ def setUpModule():
1419
for mocking in mockings:
1520
mocking.start()
1621

22+
# Since our mimetypes patch is one-time, we need to reload the modules to
23+
# reapply the patch on the re-initialized mimetypes database.
24+
# This is also required in Python < 3.7.5, in which no default maps exist and
25+
# `mimetypes.init()` cannot recover the default maps.
26+
importlib.reload(mimetypes._mimetypes)
27+
importlib.reload(mimetypes)
28+
1729

1830
def tearDownModule():
1931
# stop mock
2032
for mocking in mockings:
2133
mocking.stop()
2234

35+
importlib.reload(mimetypes._mimetypes)
36+
importlib.reload(mimetypes)
37+
2338

2439
class TestMimetypes(unittest.TestCase):
25-
def test_overridden_mimetypes(self):
40+
def test_patch_ext2type(self):
2641
self.assertEqual(
2742
mimetypes.guess_type('myfile.htz'),
2843
('application/html+zip', None),
@@ -56,6 +71,53 @@ def test_overridden_mimetypes(self):
5671
('image/x-icon', None),
5772
)
5873

74+
def test_patch_type2ext(self):
75+
self.assertEqual(
76+
mimetypes.guess_extension('text/javascript'),
77+
'.js',
78+
)
79+
80+
def test_user_config(self):
81+
"""Test if user config works."""
82+
with tempfile.TemporaryDirectory(prefix='mimetypes-', dir=TEMP_DIR) as tmpdir:
83+
user_config_dir = os.path.normpath(os.path.join(tmpdir, WSB_USER_DIR))
84+
os.makedirs(user_config_dir)
85+
with open(os.path.join(user_config_dir, mimetypes.WSB_USER_MIMETYPES), 'w', encoding='UTF-8') as fh:
86+
# poison with bad/invalid conversions that are unlikely to be used in practice
87+
fh.write("""\
88+
user/.type js
89+
user/.type2 js
90+
text/javascript .userext
91+
text/javascript .userext2 .userext3
92+
""")
93+
94+
try:
95+
with mock.patch('webscrapbook.Config.user_config_dir', return_value=os.devnull):
96+
importlib.reload(mimetypes._mimetypes)
97+
importlib.reload(mimetypes)
98+
99+
# get the default conversion
100+
js_exts = mimetypes.guess_all_extensions('text/javascript')
101+
102+
with mock.patch('webscrapbook.Config.user_config_dir', return_value=user_config_dir):
103+
importlib.reload(mimetypes._mimetypes)
104+
importlib.reload(mimetypes)
105+
106+
# last-win (overwrite built-in)
107+
self.assertEqual(
108+
mimetypes.guess_type('abc.js'),
109+
('user/.type2', None),
110+
)
111+
112+
# first-win (add to last extensions)
113+
self.assertEqual(
114+
mimetypes.guess_all_extensions('text/javascript'),
115+
js_exts + ['..userext', '..userext2', '..userext3'],
116+
)
117+
finally:
118+
importlib.reload(mimetypes._mimetypes)
119+
importlib.reload(mimetypes)
120+
59121

60122
if __name__ == '__main__':
61123
unittest.main()

webscrapbook/_polyfill/mimetypes.py

+96-53
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,103 @@
11
import mimetypes as _mimetypes
22
import os
3-
from mimetypes import *
43

54
from .. import Config
65

7-
__all__ = _mimetypes.__all__
6+
WSB_USER_MIMETYPES = 'mime.types'
7+
8+
9+
def _patch_mimetypes():
10+
patch_types_map = {
11+
# WebScrapBook related
12+
'.htz': 'application/html+zip',
13+
'.maff': 'application/x-maff',
14+
'.wsba': 'application/wsba+zip',
15+
16+
# Some common types
17+
'.md': 'text/markdown',
18+
'.mkd': 'text/markdown',
19+
'.mkdn': 'text/markdown',
20+
'.mdwn': 'text/markdown',
21+
'.mdown': 'text/markdown',
22+
'.markdown': 'text/markdown',
23+
'.rss': 'application/rss+xml',
24+
'.atom': 'application/atom+xml',
25+
'.woff': 'font/woff',
26+
'.woff2': 'font/woff2',
27+
'.webp': 'image/webp',
28+
'.weba': 'audio/weba',
29+
'.webm': 'video/webm',
30+
'.oga': 'audio/ogg',
31+
'.ogv': 'video/ogg',
32+
'.ogx': 'application/ogg', # IANA
33+
'.ogg': 'application/ogg', # MAFF
34+
'.vtt': 'text/vtt',
35+
'.swf': 'application/x-shockwave-flash', # Apache, nginx, etc.
36+
'.jar': 'application/java-archive',
37+
'.class': 'application/java-vm',
38+
'.epub': 'application/epub+zip',
39+
'.7z': 'application/x-7z-compressed',
40+
'.rar': 'application/vnd.rar',
41+
42+
# .js is mapped to application/javascript or application/x-javascript in some OS
43+
# ref: https://www.ietf.org/rfc/rfc9239.txt
44+
# text/javascript is mapped to .es in Debian 12
45+
'.js': 'text/javascript',
46+
47+
# .bmp is mapped to image/x-ms-bmp in Python < 3.11
48+
# ref: https://github.com/python/cpython/issues/86194
49+
'.bmp': 'image/bmp',
50+
51+
# .ico is mapped to image/vnd.microsoft.icon in Python,
52+
# which is not actually used by Microsoft software and causes
53+
# a compatibility issue in IE9.
54+
# ref: https://en.wikipedia.org/wiki/ICO_%28file_format%29#MIME_type
55+
'.ico': 'image/x-icon',
56+
57+
# .zip is mapped to application/x-zip-compressed in Windows
58+
'.zip': 'application/zip',
59+
}
860

61+
def patch_db(db):
62+
# apply the patch
63+
patch_types_map_inv = {}
64+
for ext, type in patch_types_map.items():
65+
db.types_map[True][ext] = type
66+
patch_types_map_inv.setdefault(type, []).append(ext)
67+
for type, exts in patch_types_map_inv.items():
68+
entry = db.types_map_inv[True].setdefault(type, [])
69+
for ext in exts:
70+
try:
71+
entry.remove(ext)
72+
except ValueError:
73+
pass
74+
entry[0:0] = exts
975

10-
# add custom user MIME types mapping
11-
_mimetypes.knownfiles += [os.path.join(Config.user_config_dir(), 'mime.types')]
12-
13-
# WebScrapBook related
14-
_mimetypes.add_type('application/html+zip', '.htz')
15-
_mimetypes.add_type('application/x-maff', '.maff')
16-
_mimetypes.add_type('application/wsba+zip', '.wsba')
17-
18-
# Some common types
19-
_mimetypes.add_type('text/markdown', '.md')
20-
_mimetypes.add_type('text/markdown', '.mkd')
21-
_mimetypes.add_type('text/markdown', '.mkdn')
22-
_mimetypes.add_type('text/markdown', '.mdwn')
23-
_mimetypes.add_type('text/markdown', '.mdown')
24-
_mimetypes.add_type('text/markdown', '.markdown')
25-
_mimetypes.add_type('application/rss+xml', '.rss')
26-
_mimetypes.add_type('application/atom+xml', '.atom')
27-
_mimetypes.add_type('font/woff', '.woff')
28-
_mimetypes.add_type('font/woff2', '.woff2')
29-
_mimetypes.add_type('image/webp', '.webp')
30-
_mimetypes.add_type('audio/weba', '.weba')
31-
_mimetypes.add_type('video/webm', '.webm')
32-
_mimetypes.add_type('audio/ogg', '.oga')
33-
_mimetypes.add_type('video/ogg', '.ogv')
34-
_mimetypes.add_type('application/ogg', '.ogx') # IANA
35-
_mimetypes.add_type('application/ogg', '.ogg') # MAFF
36-
_mimetypes.add_type('text/vtt', '.vtt')
37-
_mimetypes.add_type('application/x-shockwave-flash', '.swf') # Apache, nginx, etc.
38-
_mimetypes.add_type('application/java-archive', '.jar')
39-
_mimetypes.add_type('application/java-vm', '.class')
40-
_mimetypes.add_type('application/epub+zip', '.epub')
41-
_mimetypes.add_type('application/x-7z-compressed', '.7z')
42-
_mimetypes.add_type('application/vnd.rar', '.rar')
43-
44-
# .js is mapped to application/javascript or application/x-javascript in some OS
45-
# ref: https://www.ietf.org/rfc/rfc9239.txt
46-
# text/javascript is mapped to .es in Debian 12
47-
_mimetypes.add_type('text/javascript', '.js')
48-
49-
# .bmp is mapped to image/x-ms-bmp in Python < 3.11
50-
# ref: https://github.com/python/cpython/issues/86194
51-
_mimetypes.add_type('image/bmp', '.bmp')
52-
53-
# .ico is mapped to image/vnd.microsoft.icon in Python,
54-
# which is not actually used by Microsoft softwares and causes
55-
# a compatibility issue in IE9.
56-
# ref: https://en.wikipedia.org/wiki/ICO_%28file_format%29#MIME_type
57-
_mimetypes.add_type('image/x-icon', '.ico')
58-
59-
# .zip is mapped to application/x-zip-compressed in Windows
60-
_mimetypes.add_type('application/zip', '.zip')
76+
# load user mappings
77+
for file in (os.path.join(Config.user_config_dir(), WSB_USER_MIMETYPES),):
78+
if os.path.isfile(file):
79+
db.read(file)
80+
81+
if _mimetypes.inited:
82+
patch_db(_mimetypes._db)
83+
else:
84+
# patch init
85+
patched = False
86+
_init = _mimetypes.init
87+
88+
def init(files=None):
89+
nonlocal patched
90+
_init(files)
91+
if not patched:
92+
patch_db(_mimetypes._db)
93+
patched = True
94+
95+
_mimetypes.init = init
96+
97+
98+
_patch_mimetypes()
99+
100+
# export all attributes
101+
from mimetypes import * # noqa: E402
102+
103+
__all__ = _mimetypes.__all__

webscrapbook/resources/mimetypes.md

+6-2
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,17 @@ MIME type mappings for WebScrapBook are defined by:
44

55
1. the default mappings of the native Python code
66
2. the system-wide registry
7-
3. the user config file for WebScrapBook
8-
4. the internal mappings of WebScrapBook
7+
3. the internal mappings of WebScrapBook
8+
4. the user config file for WebScrapBook
99

1010
For conflicting definitions, a conversion of file extension to MIME type is
1111
handled in a last-win manner, while a conversion of MIME type to file
1212
extension(s) is handled in a first-win manner.
1313

14+
As an exception, the internal mappings of WebScrapBook overwrite any
15+
conflicting mappings from the preceding sources, to fix known mapping issues
16+
in native Python and platforms.
17+
1418

1519
## System registry
1620

0 commit comments

Comments
 (0)