Skip to content

Support for lz4 compression #163 #168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ jobs:
steps:
- name: Install optional tools macOS
if: runner.os == 'macOS' && matrix.optional-deps
run: brew install pigz pbzip2 isa-l zstd
run: brew install pigz pbzip2 isa-l zstd lz4
- name: Install optional tools Linux
if: runner.os == 'Linux' && matrix.optional-deps
run: sudo apt-get install pigz pbzip2 isal zstd
run: sudo apt-get install pigz pbzip2 isal zstd lz4
- name: Remove xz
if: runner.os == 'Linux' && !matrix.optional-deps
run: while which xz; do sudo rm $(which xz); done
Expand Down
10 changes: 7 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Supported compression formats are:
- gzip (``.gz``)
- bzip2 (``.bz2``)
- xz (``.xz``)
- lz4 (``.lz4``)
- Zstandard (``.zst``) (optional)

``xopen`` is compatible with Python versions 3.8 and later.
Expand Down Expand Up @@ -73,7 +74,7 @@ The function opens the file using a function suitable for the detected
file format and returns an open file-like object.

When writing, the file format is chosen based on the file name extension:
``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overriden with ``format``.
``.gz``, ``.bz2``, ``.xz``, ``.zst``, ``.lz4``. This can be overridden with ``format``.
If the extension is not recognized, no compression is used.

When reading and a file name extension is available, the format is detected
Expand Down Expand Up @@ -101,13 +102,13 @@ preferred locale encoding.
**compresslevel**:
The compression level for writing to gzip, xz, Zstandard and lz4 files.
If set to None, a default depending on the format is used:
gzip: 1, xz: 6, Zstandard: 3.
gzip: 1, xz: 6, Zstandard: 3, lz4: 0.

This parameter is ignored for other compression formats.

**format**:
Override the autodetection of the input or output format.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``, ``"lz4"``.

**threads**:
Set the number of additional threads spawned for compression or decompression.
Expand Down Expand Up @@ -140,6 +141,9 @@ built-in support for multithreaded compression.

For bz2 files, `pbzip2 (parallel bzip2) <http://compression.great-site.net/pbzip2/>`_ is used.

For lz4 files, the `python lz4 <https://python-lz4.readthedocs.io/en/stable/index.html>`_
package is used.

``xopen`` falls back to Python’s built-in functions
(``gzip.open``, ``lzma.open``, ``bz2.open``)
if none of the other methods can be used.
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ requires-python = ">=3.8"
dynamic = ["version"]
dependencies = [
'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"'
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'lz4>4.3.1'
]

[project.urls]
Expand Down
56 changes: 50 additions & 6 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
XOPEN_DEFAULT_BZ2_COMPRESSION = 9
XOPEN_DEFAULT_XZ_COMPRESSION = 6
XOPEN_DEFAULT_ZST_COMPRESSION = 3
XOPEN_DEFAULT_LZ4_COMPRESSION = 0

igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
Expand Down Expand Up @@ -70,6 +71,11 @@
except ImportError:
zstandard = None # type: ignore

try:
import lz4.frame # type: ignore
except ImportError:
lz4 = None

try:
import fcntl

Expand Down Expand Up @@ -120,6 +126,7 @@ class _ProgramSettings:
"zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"),
"pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"),
"gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))),
"lz4": _ProgramSettings(("lz4",), tuple(range(0, 17))),
}


Expand Down Expand Up @@ -551,6 +558,36 @@ def _open_zst(
return io.BufferedWriter(f) # mode "ab" and "wb"


def _open_lz4(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open an lz4-compressed file in binary mode ("rb", "ab" or "wb").

    If compresslevel is None, XOPEN_DEFAULT_LZ4_COMPRESSION is used.

    Unless threads is 0, a _PipedCompressionProgram configured with the
    "lz4" program settings is tried first. If creating it raises OSError,
    the lz4 Python module is used as a fallback when it was imported
    successfully; otherwise the OSError is propagated.

    With threads == 0, only the lz4 Python module is used, and ImportError
    is raised if it is not available.
    """
    assert mode in ("rb", "ab", "wb")
    level = XOPEN_DEFAULT_LZ4_COMPRESSION if compresslevel is None else compresslevel

    prefer_subprocess = threads != 0
    if prefer_subprocess:
        settings_key = "lz4"
        try:
            return _PipedCompressionProgram(
                filename,
                mode,
                level,
                threads,
                program_settings=_PROGRAM_SETTINGS[settings_key],
            )
        except OSError:
            # Without the Python module there is nothing to fall back to.
            if lz4 is None:
                raise

    if lz4 is None:
        raise ImportError("lz4 module not available")
    return lz4.frame.LZ4FrameFile(filename, mode, compression_level=level)


def _open_gz(
filename: FileOrPath,
mode: str,
Expand Down Expand Up @@ -683,6 +720,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
elif bs[:4] == b"\x28\xb5\x2f\xfd":
# https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
return "zst"
elif bs[:4] == b"\x04\x22\x4d\x18":
# https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)
return "lz4"

return None
finally:
if closefd:
Expand All @@ -694,7 +735,7 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
Attempt to detect file format from the filename extension.
Return None if no format could be detected.
"""
for ext in ("bz2", "xz", "gz", "zst"):
for ext in ("bz2", "xz", "gz", "zst", "lz4"):
if isinstance(filename, bytes):
if filename.endswith(b"." + ext.encode()):
return ext
Expand All @@ -717,7 +758,7 @@ def _file_or_path_to_binary_stream(
# object is not binary, this will crash at a later point.
return file_or_path, False # type: ignore
raise TypeError(
f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}."
f"Unsupported type for {file_or_path}, {file_or_path.__class__.__name__}."
)


Expand Down Expand Up @@ -797,6 +838,7 @@ def xopen( # noqa: C901
- .bz2 uses bzip2 compression
- .xz uses xz/lzma compression
- .zst uses zstandard compression
- .lz4 uses lz4 compression
- otherwise, no compression is used

When reading, if a file name extension is available, the format is detected
Expand All @@ -808,7 +850,7 @@ def xopen( # noqa: C901
compresslevel is the compression level for writing to gzip, xz and zst files.
This parameter is ignored for the other compression formats.
If set to None, a default depending on the format is used:
gzip: 6, xz: 6, zstd: 3.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to self: didn't we change the gzip level to 1?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we did.

gzip: 1, xz: 6, zstd: 3, lz4: 0.

When threads is None (the default), compressed file formats are read or written
using a pipe to a subprocess running an external tool such as,
Expand All @@ -828,7 +870,7 @@ def xopen( # noqa: C901

format overrides the autodetection of input and output formats. This can be
useful when compressed output needs to be written to a file without an
extension. Possible values are "gz", "xz", "bz2", "zst".
extension. Possible values are "gz", "xz", "bz2", "zst", "lz4".
"""
if mode in ("r", "w", "a"):
mode += "t" # type: ignore
Expand All @@ -844,10 +886,10 @@ def xopen( # noqa: C901
elif _file_is_a_socket_or_pipe(filename):
filename = open(filename, binary_mode) # type: ignore

if format not in (None, "gz", "xz", "bz2", "zst"):
if format not in (None, "gz", "xz", "bz2", "zst", "lz4"):
raise ValueError(
f"Format not supported: {format}. "
f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'."
)
detected_format = format or _detect_format_from_extension(filepath)
if detected_format is None and "r" in mode:
Expand All @@ -861,6 +903,8 @@ def xopen( # noqa: C901
opened_file = _open_bz2(filename, binary_mode, compresslevel, threads)
elif detected_format == "zst":
opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
elif detected_format == "lz4":
opened_file = _open_lz4(filename, binary_mode, compresslevel, threads)
else:
opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode)

Expand Down
Binary file added tests/file.txt.lz4
Binary file not shown.
10 changes: 9 additions & 1 deletion tests/test_piped.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
_ProgramSettings,
)

extensions = ["", ".gz", ".bz2", ".xz", ".zst"]
extensions = ["", ".gz", ".bz2", ".xz", ".zst", ".lz4"]

try:
import fcntl
Expand Down Expand Up @@ -57,16 +57,24 @@ def available_zstd_programs():
return []


def available_lz4_programs():
    """
    Return a one-element list with the "lz4" program settings when an
    ``lz4`` executable can be located via shutil.which, else an empty list.
    """
    lz4_path = shutil.which("lz4")
    return [_PROGRAM_SETTINGS["lz4"]] if lz4_path else []


PIPED_GZIP_PROGRAMS = available_gzip_programs()
PIPED_BZIP2_PROGRAMS = available_bzip2_programs()
PIPED_XZ_PROGRAMS = available_xz_programs()
PIPED_ZST_PROGRAMS = available_zstd_programs()
PIPED_LZ4_PROGRAMS = available_lz4_programs()

ALL_PROGRAMS_WITH_EXTENSION = (
list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"])))
+ list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"])))
+ list(zip(PIPED_XZ_PROGRAMS, cycle([".xz"])))
+ list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"])))
+ list(zip(PIPED_LZ4_PROGRAMS, cycle([".lz4"])))
)


Expand Down
19 changes: 10 additions & 9 deletions tests/test_xopen.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,33 @@
Tests for the xopen.xopen function
"""
import bz2
import subprocess
import sys
import tempfile
from contextlib import contextmanager
import functools
import gzip
import io
import lzma
import os
from pathlib import Path
import shutil
import subprocess
import sys
import tempfile
from contextlib import contextmanager
from pathlib import Path

import lz4.frame
import pytest

from xopen import xopen, _detect_format_from_content
from xopen import _detect_format_from_content, xopen

try:
import zstandard
except ImportError:
zstandard = None


# TODO this is duplicated in test_piped.py
TEST_DIR = Path(__file__).parent
CONTENT_LINES = ["Testing, testing ...\n", "The second line.\n"]
CONTENT = "".join(CONTENT_LINES)
extensions = ["", ".gz", ".bz2", ".xz"]
extensions = ["", ".gz", ".bz2", ".xz", ".lz4"]
if shutil.which("zstd") or zstandard:
extensions += [".zst"]
base = os.path.join(os.path.dirname(__file__), "file.txt")
Expand Down Expand Up @@ -351,6 +351,7 @@ def test_read_no_threads(ext):
".gz": gzip.GzipFile,
".xz": lzma.LZMAFile,
".zst": io.BufferedReader,
".lz4": lz4.frame.LZ4FrameFile,
"": io.BufferedReader,
}
if ext == ".zst" and zstandard is None:
Expand Down Expand Up @@ -381,6 +382,7 @@ def test_write_no_threads(tmp_path, ext):
".bz2": bz2.BZ2File,
".gz": gzip.GzipFile,
".xz": lzma.LZMAFile,
".lz4": lz4.frame.LZ4FrameFile,
"": io.BufferedWriter,
}
if ext == ".zst":
Expand Down Expand Up @@ -599,7 +601,6 @@ def test_xopen_zst_long_window_size(threads):
def test_pass_file_object_for_reading(ext, threads):
if ext == ".zst" and zstandard is None:
return

with open(TEST_DIR / f"file.txt{ext}", "rb") as fh:
with xopen(fh, mode="rb", threads=threads) as f:
assert f.readline() == CONTENT_LINES[0].encode("utf-8")
Expand Down
Loading