Skip to content

gh-101178: refactor base64.b85encode to be memory friendly #112248

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 9 additions & 26 deletions Lib/base64.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,27 +298,12 @@ def b16decode(s, casefold=False):

def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
# Helper function for a85encode and b85encode
# chars2 is now unused
if not isinstance(b, bytes_types):
b = memoryview(b).tobytes()

padding = (-len(b)) % 4
if padding:
b = b + b'\0' * padding
words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)

chunks = [b'z' if foldnuls and not word else
b'y' if foldspaces and word == 0x20202020 else
(chars2[word // 614125] +
chars2[word // 85 % 7225] +
chars[word % 85])
for word in words]

if padding and not pad:
if chunks[-1] == b'z':
chunks[-1] = chars[0] * 5
chunks[-1] = chunks[-1][:-padding]

return b''.join(chunks)
return binascii._b2a_base85(b, chars=chars, pad=pad,
foldnuls=foldnuls, foldspaces=foldspaces)

def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
"""Encode bytes-like object b using Ascii85 and return a bytes object.
Expand All @@ -337,14 +322,13 @@ def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
adobe controls whether the encoded byte sequence is framed with <~ and ~>,
which is used by the Adobe implementation.
"""
global _a85chars, _a85chars2
global _a85chars
# Delay the initialization of tables to not waste memory
# if the function is never called
if _a85chars2 is None:
if _a85chars is None:
_a85chars = [bytes((i,)) for i in range(33, 118)]
_a85chars2 = [(a + b) for a in _a85chars for b in _a85chars]

result = _85encode(b, _a85chars, _a85chars2, pad, True, foldspaces)
result = _85encode(b, b''.join(_a85chars), None, pad, True, foldspaces)

if adobe:
result = _A85START + result
Expand Down Expand Up @@ -445,13 +429,12 @@ def b85encode(b, pad=False):
If pad is true, the input is padded with b'\\0' so its length is a multiple of
4 bytes before encoding.
"""
global _b85chars, _b85chars2
global _b85chars
# Delay the initialization of tables to not waste memory
# if the function is never called
if _b85chars2 is None:
if _b85chars is None:
_b85chars = [bytes((i,)) for i in _b85alphabet]
_b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
return _85encode(b, _b85chars, _b85chars2, pad)
return _85encode(b, _b85alphabet, None, pad)

def b85decode(b):
"""Decode the base85-encoded bytes-like object or ASCII string b
Expand Down
1 change: 1 addition & 0 deletions Lib/test/test_base64.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,7 @@ def test_b85encode(self):
b"""0123456789!@#0^&*();:<>,. []{}""":
b"""VPa!sWoBn+X=-b1ZEkOHadLBXb#`}nd3r%YLqtVJM@UIZOH55pPf$@("""
b"""Q&d$}S6EqEFflSSG&MFiI5{CeBQRbjDkv#CIy^osE+AW7dwl""",
b"paddu\xc7": b'aA9O*b;k',
b'no padding..': b'Zf_uPVPs@!Zf7no',
b'zero compression\x00\x00\x00\x00': b'dS!BNAY*TBaB^jHb7^mG00000',
b'zero compression\x00\x00\x00': b'dS!BNAY*TBaB^jHb7^mG0000',
Expand Down
95 changes: 95 additions & 0 deletions Modules/binascii.c
Original file line number Diff line number Diff line change
Expand Up @@ -1239,13 +1239,108 @@ binascii_b2a_qp_impl(PyObject *module, Py_buffer *data, int quotetabs,
return rv;
}

/*[clinic input]
binascii._b2a_base85

data: Py_buffer
chars: Py_buffer
pad: bool = False
foldnuls: bool = False
foldspaces: bool = False

Utility method used by the base64 module to encode a85/b85 data

data: bytes
chars: 85 bytes conversion table
pad: use NULL-padded input if necessary
foldnuls: replace NULL chunks by 'z'
foldspaces: replace space-only chunks by 'y'

[clinic start generated code]*/

static PyObject *
binascii__b2a_base85_impl(PyObject *module, Py_buffer *data,
                          Py_buffer *chars, int pad, int foldnuls,
                          int foldspaces)
/*[clinic end generated code: output=cefe84c300ad7314 input=3c8faf77b992dcc2]*/
{
    /* Encode the bytes of `data` as base85 text.
     *
     * data:       input buffer, consumed in big-endian groups of 4 bytes;
     *             a short final group is completed with zero bytes.
     * chars:      the 85-byte alphabet; digit value d is emitted as chars[d].
     * pad:        if true, keep all characters of a short final group;
     *             otherwise drop one trailing character per missing byte.
     * foldnuls:   if true, emit a single 'z' for an all-zero group.
     * foldspaces: if true, emit a single 'y' for a group of four spaces.
     *
     * Returns a new bytes object, or NULL with an exception set
     * (ValueError if `chars` is not 85 bytes long, MemoryError if the
     * output would be too large or cannot be allocated).
     */
    if (chars->len != 85) {
        PyErr_SetString(PyExc_ValueError,
                        "chars must be exactly 85 bytes long");
        return NULL;
    }

    const size_t bin_len = (size_t)data->len;

    // Each (possibly partial) 4-byte group yields at most 5 characters.
    // Check the worst-case size before it is narrowed to Py_ssize_t.
    const size_t groups = bin_len / 4 + (bin_len % 4 != 0);
    if (groups > (size_t)PY_SSIZE_T_MAX / 5) {
        return PyErr_NoMemory();
    }

    // Allocate up to the maximum encoded length; adjusted at the end
    const size_t ascii_len = groups * 5;

    _PyBytesWriter writer;
    _PyBytesWriter_Init(&writer);

    unsigned char *ascii_data = _PyBytesWriter_Alloc(&writer,
                                                     (Py_ssize_t)ascii_len);
    if (ascii_data == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    const unsigned char *table = chars->buf;
    const unsigned char *bin_data = data->buf;

    size_t i = 0;
    // Number of bytes missing from the final group (0 when it is full)
    size_t padding = 0;

    // Conversion largely inspired by the git base85 implementation
    while (i < bin_len) {
        size_t chunk_len = bin_len - i;
        if (chunk_len > 4) {
            chunk_len = 4;
        }
        padding = 4 - chunk_len;

        // Translate the group to a big-endian 32-bit integer.  The cast
        // matters: without it the byte is promoted to (signed) int and
        // shifting a value >= 0x80 by 24 bits is undefined behaviour.
        uint32_t value = 0;
        int shift = 24;
        for (size_t k = 0; k < chunk_len; k++, shift -= 8) {
            value |= (uint32_t)bin_data[i + k] << shift;
        }
        i += chunk_len;

        // A short final group may only be folded to 'z' when padding is
        // kept; otherwise it must be written out in full so that it can
        // be truncated below.
        const int can_fold_nul = (padding == 0 || pad);

        // Handle NULL-only and space-only groups (specific to Ascii85)
        if (foldnuls && value == 0 && can_fold_nul) {
            *ascii_data++ = 'z';
        }
        else if (foldspaces && value == 0x20202020) {
            *ascii_data++ = 'y';
        }
        else {
            // Emit the five base-85 digits, most significant first
            for (int j = 4; j >= 0; j--) {
                ascii_data[j] = table[value % 85];
                value /= 85;
            }
            ascii_data += 5;
        }
    }

    // Without padding, drop one character per missing input byte
    if (!pad) {
        ascii_data -= padding;
    }

    return _PyBytesWriter_Finish(&writer, ascii_data);
}

/* List of functions defined in the module */

static struct PyMethodDef binascii_module_methods[] = {
BINASCII_A2B_UU_METHODDEF
BINASCII_B2A_UU_METHODDEF
BINASCII_A2B_BASE64_METHODDEF
BINASCII_B2A_BASE64_METHODDEF
BINASCII__B2A_BASE85_METHODDEF
BINASCII_A2B_HEX_METHODDEF
BINASCII_B2A_HEX_METHODDEF
BINASCII_HEXLIFY_METHODDEF
Expand Down
112 changes: 111 additions & 1 deletion Modules/clinic/binascii.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading