diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index be3a90c..ca90ce6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,10 +59,10 @@ jobs: steps: - name: Install optional tools macOS if: runner.os == 'macOS' && matrix.optional-deps - run: brew install pigz pbzip2 isa-l zstd + run: brew install pigz pbzip2 isa-l zstd lz4 - name: Install optional tools Linux if: runner.os == 'Linux' && matrix.optional-deps - run: sudo apt-get install pigz pbzip2 isal zstd + run: sudo apt-get install pigz pbzip2 isal zstd lz4 - name: Remove xz if: runner.os == 'Linux' && !matrix.optional-deps run: while which xz; do sudo rm $(which xz); done diff --git a/README.rst b/README.rst index b1039f0..1a9dc24 100644 --- a/README.rst +++ b/README.rst @@ -26,6 +26,7 @@ Supported compression formats are: - gzip (``.gz``) - bzip2 (``.bz2``) - xz (``.xz``) +- lz4 (``.lz4``) - Zstandard (``.zst``) (optional) ``xopen`` is compatible with Python versions 3.8 and later. @@ -73,7 +74,7 @@ The function opens the file using a function suitable for the detected file format and returns an open file-like object. When writing, the file format is chosen based on the file name extension: -``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overriden with ``format``. +``.gz``, ``.bz2``, ``.xz``, ``.zst``, ``.lz4``. This can be overridden with ``format``. If the extension is not recognized, no compression is used. When reading and a file name extension is available, the format is detected @@ -101,13 +102,13 @@ preferred locale encoding. **compresslevel**: The compression level for writing to gzip, xz and Zstandard files. If set to None, a default depending on the format is used: -gzip: 1, xz: 6, Zstandard: 3. +gzip: 1, xz: 6, Zstandard: 3, lz4: 0. This parameter is ignored for other compression formats. **format**: Override the autodetection of the input or output format. -Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``. 
+Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``, ``"lz4"``. **threads**: Set the number of additional threads spawned for compression or decompression. @@ -140,6 +141,9 @@ built-in support for multithreaded compression. For bz2 files, `pbzip2 (parallel bzip2) `_ is used. +For lz4 files, the `python lz4 <https://python-lz4.readthedocs.io/en/stable/index.html>`_ +package is used. + ``xopen`` falls back to Python’s built-in functions (``gzip.open``, ``lzma.open``, ``bz2.open``) if none of the other methods can be used. diff --git a/pyproject.toml b/pyproject.toml index 5522cd7..42b281b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,8 @@ requires-python = ">=3.8" dynamic = ["version"] dependencies = [ 'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', - 'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"' + 'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', + 'lz4>4.3.1' ] [project.urls] diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 89f5137..472e1b1 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -43,6 +43,7 @@ XOPEN_DEFAULT_BZ2_COMPRESSION = 9 XOPEN_DEFAULT_XZ_COMPRESSION = 6 XOPEN_DEFAULT_ZST_COMPRESSION = 3 +XOPEN_DEFAULT_LZ4_COMPRESSION = 0 igzip: Optional[ModuleType] isal_zlib: Optional[ModuleType] @@ -70,6 +71,11 @@ except ImportError: zstandard = None # type: ignore +try: + import lz4.frame # type: ignore +except ImportError: + lz4 = None + try: import fcntl @@ -120,6 +126,7 @@ class _ProgramSettings: "zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"), "pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"), "gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))), + "lz4": _ProgramSettings(("lz4",), tuple(range(0, 17))), } @@ -551,6 +558,36 @@ def _open_zst( return 
io.BufferedWriter(f) # mode "ab" and "wb" +def _open_lz4( + filename: FileOrPath, + mode: str, + compresslevel: Optional[int], + threads: Optional[int], +): + assert mode in ("rb", "ab", "wb") + if compresslevel is None: + compresslevel = XOPEN_DEFAULT_LZ4_COMPRESSION + + if threads != 0: + try: + return _PipedCompressionProgram( + filename, + mode, + compresslevel, + threads, + program_settings=_PROGRAM_SETTINGS["lz4"], + ) + except OSError: + if lz4 is None: + # No fallback available + raise + + if lz4 is None: + raise ImportError("lz4 module not available") + f = lz4.frame.LZ4FrameFile(filename, mode, compression_level=compresslevel) + return f + + def _open_gz( filename: FileOrPath, mode: str, @@ -683,6 +720,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]: elif bs[:4] == b"\x28\xb5\x2f\xfd": # https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1 return "zst" + elif bs[:4] == b"\x04\x22\x4d\x18": + # https://en.wikipedia.org/wiki/LZ4_(compression_algorithm) + return "lz4" + return None finally: if closefd: @@ -694,7 +735,7 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]: Attempt to detect file format from the filename extension. Return None if no format could be detected. """ - for ext in ("bz2", "xz", "gz", "zst"): + for ext in ("bz2", "xz", "gz", "zst", "lz4"): if isinstance(filename, bytes): if filename.endswith(b"." + ext.encode()): return ext @@ -717,7 +758,7 @@ def _file_or_path_to_binary_stream( # object is not binary, this will crash at a later point. return file_or_path, False # type: ignore raise TypeError( - f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}." + f"Unsupported type for {file_or_path}, {file_or_path.__class__.__name__}." 
) @@ -797,6 +838,7 @@ def xopen( # noqa: C901 - .bz2 uses bzip2 compression - .xz uses xz/lzma compression - .zst uses zstandard compression + - .lz4 uses lz4 compression - otherwise, no compression is used When reading, if a file name extension is available, the format is detected @@ -808,7 +850,7 @@ def xopen( # noqa: C901 compresslevel is the compression level for writing to gzip, xz and zst files. This parameter is ignored for the other compression formats. If set to None, a default depending on the format is used: - gzip: 6, xz: 6, zstd: 3. + gzip: 6, xz: 6, zstd: 3, lz4: 0. When threads is None (the default), compressed file formats are read or written using a pipe to a subprocess running an external tool such as, @@ -828,7 +870,7 @@ def xopen( # noqa: C901 format overrides the autodetection of input and output formats. This can be useful when compressed output needs to be written to a file without an - extension. Possible values are "gz", "xz", "bz2", "zst". + extension. Possible values are "gz", "xz", "bz2", "zst", "lz4". """ if mode in ("r", "w", "a"): mode += "t" # type: ignore @@ -844,10 +886,10 @@ def xopen( # noqa: C901 elif _file_is_a_socket_or_pipe(filename): filename = open(filename, binary_mode) # type: ignore - if format not in (None, "gz", "xz", "bz2", "zst"): + if format not in (None, "gz", "xz", "bz2", "zst", "lz4"): raise ValueError( f"Format not supported: {format}. " - f"Choose one of: 'gz', 'xz', 'bz2', 'zst'" + f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'." 
) detected_format = format or _detect_format_from_extension(filepath) if detected_format is None and "r" in mode: @@ -861,6 +903,8 @@ def xopen( # noqa: C901 opened_file = _open_bz2(filename, binary_mode, compresslevel, threads) elif detected_format == "zst": opened_file = _open_zst(filename, binary_mode, compresslevel, threads) + elif detected_format == "lz4": + opened_file = _open_lz4(filename, binary_mode, compresslevel, threads) else: opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode) diff --git a/tests/file.txt.lz4 b/tests/file.txt.lz4 new file mode 100644 index 0000000..5b2ed80 Binary files /dev/null and b/tests/file.txt.lz4 differ diff --git a/tests/test_piped.py b/tests/test_piped.py index 9f8afbe..eba903f 100644 --- a/tests/test_piped.py +++ b/tests/test_piped.py @@ -18,7 +18,7 @@ _ProgramSettings, ) -extensions = ["", ".gz", ".bz2", ".xz", ".zst"] +extensions = ["", ".gz", ".bz2", ".xz", ".zst", ".lz4"] try: import fcntl @@ -57,16 +57,24 @@ def available_zstd_programs(): return [] +def available_lz4_programs(): + if shutil.which("lz4"): + return [_PROGRAM_SETTINGS["lz4"]] + return [] + + PIPED_GZIP_PROGRAMS = available_gzip_programs() PIPED_BZIP2_PROGRAMS = available_bzip2_programs() PIPED_XZ_PROGRAMS = available_xz_programs() PIPED_ZST_PROGRAMS = available_zstd_programs() +PIPED_LZ4_PROGRAMS = available_lz4_programs() ALL_PROGRAMS_WITH_EXTENSION = ( list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"]))) + list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"]))) + list(zip(PIPED_XZ_PROGRAMS, cycle([".xz"]))) + list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"]))) + + list(zip(PIPED_LZ4_PROGRAMS, cycle([".lz4"]))) ) diff --git a/tests/test_xopen.py b/tests/test_xopen.py index 86234bc..31562df 100644 --- a/tests/test_xopen.py +++ b/tests/test_xopen.py @@ -2,33 +2,33 @@ Tests for the xopen.xopen function """ import bz2 -import subprocess -import sys -import tempfile -from contextlib import contextmanager import functools import gzip import io import lzma import 
os -from pathlib import Path import shutil +import subprocess +import sys +import tempfile +from contextlib import contextmanager +from pathlib import Path +import lz4.frame import pytest -from xopen import xopen, _detect_format_from_content +from xopen import _detect_format_from_content, xopen try: import zstandard except ImportError: zstandard = None - # TODO this is duplicated in test_piped.py TEST_DIR = Path(__file__).parent CONTENT_LINES = ["Testing, testing ...\n", "The second line.\n"] CONTENT = "".join(CONTENT_LINES) -extensions = ["", ".gz", ".bz2", ".xz"] +extensions = ["", ".gz", ".bz2", ".xz", ".lz4"] if shutil.which("zstd") or zstandard: extensions += [".zst"] base = os.path.join(os.path.dirname(__file__), "file.txt") @@ -351,6 +351,7 @@ def test_read_no_threads(ext): ".gz": gzip.GzipFile, ".xz": lzma.LZMAFile, ".zst": io.BufferedReader, + ".lz4": lz4.frame.LZ4FrameFile, "": io.BufferedReader, } if ext == ".zst" and zstandard is None: @@ -381,6 +382,7 @@ def test_write_no_threads(tmp_path, ext): ".bz2": bz2.BZ2File, ".gz": gzip.GzipFile, ".xz": lzma.LZMAFile, + ".lz4": lz4.frame.LZ4FrameFile, "": io.BufferedWriter, } if ext == ".zst": @@ -599,7 +601,6 @@ def test_xopen_zst_long_window_size(threads): def test_pass_file_object_for_reading(ext, threads): if ext == ".zst" and zstandard is None: return - with open(TEST_DIR / f"file.txt{ext}", "rb") as fh: with xopen(fh, mode="rb", threads=threads) as f: assert f.readline() == CONTENT_LINES[0].encode("utf-8")