Commit 143d078

deekay42 and NicolasHug authored

Adding GPU acceleration to encode_jpeg (#8391)

Co-authored-by: Nicolas Hug <[email protected]>
1 parent f96c42f commit 143d078
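
For context: this change extends torchvision.io.encode_jpeg so that a uint8 CUDA tensor, or a list of CUDA tensors, is encoded on the GPU via nvjpeg. The following is a minimal usage sketch based on the tests added in this commit; it assumes a CUDA-enabled torchvision build with nvjpeg support, and "image.jpg" is a hypothetical path to an RGB JPEG:

import torch
from torchvision.io import decode_jpeg, encode_jpeg, read_image

# Single image: a 3xHxW uint8 CUDA tensor is encoded on the GPU.
img = read_image("image.jpg").to("cuda")   # hypothetical input, assumed to be 3-channel
encoded = encode_jpeg(img, quality=75)     # 1-D uint8 tensor holding the JPEG bytes

# Batched ("fused") path: a list of CUDA tensors is encoded in a single call.
encoded_batch = encode_jpeg([img, img.flip(-1)], quality=75)

# Round-trip check, mirroring the tests: re-decode and compare to the original.
roundtrip = decode_jpeg(encoded.cpu())
print((roundtrip.float() - img.cpu().float()).abs().mean())

As the benchmark below illustrates, a single encode_jpeg call over the whole list is the "fused" batched path this commit targets, while per-image calls in a Python loop are the "unfused" baseline.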

File tree

10 files changed: +622 additions, −20 deletions


benchmarks/encoding.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+import os
+import platform
+import statistics
+
+import torch
+import torch.utils.benchmark as benchmark
+import torchvision
+
+
+def print_machine_specs():
+    print("Processor:", platform.processor())
+    print("Platform:", platform.platform())
+    print("Logical CPUs:", os.cpu_count())
+    print(f"\nCUDA device: {torch.cuda.get_device_name()}")
+    print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
+
+
+def get_data():
+    transform = torchvision.transforms.Compose(
+        [
+            torchvision.transforms.PILToTensor(),
+        ]
+    )
+    path = os.path.join(os.getcwd(), "data")
+    testset = torchvision.datasets.Places365(
+        root="./data", download=not os.path.exists(path), transform=transform, split="val"
+    )
+    testloader = torch.utils.data.DataLoader(
+        testset, batch_size=1000, shuffle=False, num_workers=1, collate_fn=lambda batch: [r[0] for r in batch]
+    )
+    return next(iter(testloader))
+
+
+def run_benchmark(batch):
+    results = []
+    for device in ["cpu", "cuda"]:
+        batch_device = [t.to(device=device) for t in batch]
+        for size in [1, 100, 1000]:
+            for num_threads in [1, 12, 24]:
+                for stmt, strat in zip(
+                    [
+                        "[torchvision.io.encode_jpeg(img) for img in batch_input]",
+                        "torchvision.io.encode_jpeg(batch_input)",
+                    ],
+                    ["unfused", "fused"],
+                ):
+                    batch_input = batch_device[:size]
+                    t = benchmark.Timer(
+                        stmt=stmt,
+                        setup="import torchvision",
+                        globals={"batch_input": batch_input},
+                        label="Image Encoding",
+                        sub_label=f"{device.upper()} ({strat}): {stmt}",
+                        description=f"{size} images",
+                        num_threads=num_threads,
+                    )
+                    results.append(t.blocked_autorange())
+    compare = benchmark.Compare(results)
+    compare.print()
+
+
+if __name__ == "__main__":
+    print_machine_specs()
+    batch = get_data()
+    mean_h, mean_w = statistics.mean(t.shape[-2] for t in batch), statistics.mean(t.shape[-1] for t in batch)
+    print(f"\nMean image size: {int(mean_h)}x{int(mean_w)}")
+    run_benchmark(batch)
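
The benchmark above downloads the Places365 validation split, which is large. For a quick local smoke test of the same fused-vs-unfused comparison, a minimal sketch on random uint8 images could look like the following; it assumes a CUDA device and a torchvision build with nvjpeg support, and is not part of this commit:

import torch
import torch.utils.benchmark as benchmark
import torchvision

# Stand-in for the Places365 batch: 100 random RGB images of a fixed size.
batch = [torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8, device="cuda") for _ in range(100)]

results = []
for stmt, strat in [
    ("[torchvision.io.encode_jpeg(img) for img in batch]", "unfused"),
    ("torchvision.io.encode_jpeg(batch)", "fused"),
]:
    timer = benchmark.Timer(
        stmt=stmt,
        setup="import torchvision",
        globals={"batch": batch},
        label="Image Encoding (random data)",
        sub_label=f"CUDA ({strat})",
        description="100 images",
    )
    results.append(timer.blocked_autorange())
benchmark.Compare(results).print()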

test/test_image.py

Lines changed: 196 additions & 1 deletion
@@ -1,3 +1,4 @@
+import concurrent.futures
 import glob
 import io
 import os

@@ -10,7 +11,7 @@
 import requests
 import torch
 import torchvision.transforms.functional as F
-from common_utils import assert_equal, IN_OSS_CI, needs_cuda
+from common_utils import assert_equal, cpu_and_cuda, IN_OSS_CI, needs_cuda
 from PIL import __version__ as PILLOW_VERSION, Image, ImageOps, ImageSequence
 from torchvision.io.image import (
     _read_png_16,

@@ -508,6 +509,200 @@ def test_encode_jpeg(img_path, scripted):
     assert_equal(encoded_jpeg_torch, encoded_jpeg_pil)


+@needs_cuda
+def test_encode_jpeg_cuda_device_param():
+    path = next(path for path in get_images(IMAGE_ROOT, ".jpg") if "cmyk" not in path)
+
+    data = read_image(path)
+
+    current_device = torch.cuda.current_device()
+    current_stream = torch.cuda.current_stream()
+    num_devices = torch.cuda.device_count()
+    devices = ["cuda", torch.device("cuda")] + [torch.device(f"cuda:{i}") for i in range(num_devices)]
+    results = []
+    for device in devices:
+        print(f"python: device: {device}")
+        results.append(encode_jpeg(data.to(device=device)))
+    assert len(results) == len(devices)
+    for result in results:
+        assert torch.all(result.cpu() == results[0].cpu())
+
+    assert current_device == torch.cuda.current_device()
+    assert current_stream == torch.cuda.current_stream()
+
+
+@needs_cuda
+@pytest.mark.parametrize(
+    "img_path",
+    [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(IMAGE_ROOT, ".jpg")],
+)
+@pytest.mark.parametrize("scripted", (False, True))
+@pytest.mark.parametrize("contiguous", (False, True))
+def test_encode_jpeg_cuda(img_path, scripted, contiguous):
+    decoded_image_tv = read_image(img_path)
+    encode_fn = torch.jit.script(encode_jpeg) if scripted else encode_jpeg
+
+    if "cmyk" in img_path:
+        pytest.xfail("Encoding a CMYK jpeg isn't supported")
+    if decoded_image_tv.shape[0] == 1:
+        pytest.xfail("Decoding a grayscale jpeg isn't supported")
+        # For more detail as to why check out: https://github.com/NVIDIA/cuda-samples/issues/23#issuecomment-559283013
+    if contiguous:
+        decoded_image_tv = decoded_image_tv[None].contiguous(memory_format=torch.contiguous_format)[0]
+    else:
+        decoded_image_tv = decoded_image_tv[None].contiguous(memory_format=torch.channels_last)[0]
+    encoded_jpeg_cuda_tv = encode_fn(decoded_image_tv.cuda(), quality=75)
+    decoded_jpeg_cuda_tv = decode_jpeg(encoded_jpeg_cuda_tv.cpu())
+
+    # the actual encoded bytestreams from libnvjpeg and libjpeg-turbo differ for the same quality
+    # instead, we re-decode the encoded image and compare to the original
+    abs_mean_diff = (decoded_jpeg_cuda_tv.float() - decoded_image_tv.float()).abs().mean().item()
+    assert abs_mean_diff < 3
+
+
+@pytest.mark.parametrize("device", cpu_and_cuda())
+@pytest.mark.parametrize("scripted", (True, False))
+@pytest.mark.parametrize("contiguous", (True, False))
+def test_encode_jpegs_batch(scripted, contiguous, device):
+    if device == "cpu" and IS_MACOS:
+        pytest.skip("https://github.com/pytorch/vision/issues/8031")
+    decoded_images_tv = []
+    for jpeg_path in get_images(IMAGE_ROOT, ".jpg"):
+        if "cmyk" in jpeg_path:
+            continue
+        decoded_image = read_image(jpeg_path)
+        if decoded_image.shape[0] == 1:
+            continue
+        if contiguous:
+            decoded_image = decoded_image[None].contiguous(memory_format=torch.contiguous_format)[0]
+        else:
+            decoded_image = decoded_image[None].contiguous(memory_format=torch.channels_last)[0]
+        decoded_images_tv.append(decoded_image)
+
+    encode_fn = torch.jit.script(encode_jpeg) if scripted else encode_jpeg
+
+    decoded_images_tv_device = [img.to(device=device) for img in decoded_images_tv]
+    encoded_jpegs_tv_device = encode_fn(decoded_images_tv_device, quality=75)
+    encoded_jpegs_tv_device = [decode_jpeg(img.cpu()) for img in encoded_jpegs_tv_device]
+
+    for original, encoded_decoded in zip(decoded_images_tv, encoded_jpegs_tv_device):
+        c, h, w = original.shape
+        abs_mean_diff = (original.float() - encoded_decoded.float()).abs().mean().item()
+        assert abs_mean_diff < 3
+
+    # test multithreaded encoding
+    # in the current version we prevent this by using a lock but we still want to test it
+    num_workers = 10
+    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+        futures = [executor.submit(encode_fn, decoded_images_tv_device) for _ in range(num_workers)]
+        encoded_images_threaded = [future.result() for future in futures]
+    assert len(encoded_images_threaded) == num_workers
+    for encoded_images in encoded_images_threaded:
+        assert len(decoded_images_tv_device) == len(encoded_images)
+        for i, (encoded_image_cuda, decoded_image_tv) in enumerate(zip(encoded_images, decoded_images_tv_device)):
+            # make sure all the threads produce identical outputs
+            assert torch.all(encoded_image_cuda == encoded_images_threaded[0][i])
+
+            # make sure the outputs are identical or close enough to baseline
+            decoded_cuda_encoded_image = decode_jpeg(encoded_image_cuda.cpu())
+            assert decoded_cuda_encoded_image.shape == decoded_image_tv.shape
+            assert decoded_cuda_encoded_image.dtype == decoded_image_tv.dtype
+            assert (decoded_cuda_encoded_image.cpu().float() - decoded_image_tv.cpu().float()).abs().mean() < 3
+
+
+@needs_cuda
+def test_single_encode_jpeg_cuda_errors():
+    with pytest.raises(RuntimeError, match="Input tensor dtype should be uint8"):
+        encode_jpeg(torch.empty((3, 100, 100), dtype=torch.float32, device="cuda"))
+
+    with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 5"):
+        encode_jpeg(torch.empty((5, 100, 100), dtype=torch.uint8, device="cuda"))
+
+    with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 1"):
+        encode_jpeg(torch.empty((1, 100, 100), dtype=torch.uint8, device="cuda"))
+
+    with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"):
+        encode_jpeg(torch.empty((1, 3, 100, 100), dtype=torch.uint8, device="cuda"))
+
+    with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"):
+        encode_jpeg(torch.empty((100, 100), dtype=torch.uint8, device="cuda"))
+
+
+@needs_cuda
+def test_batch_encode_jpegs_cuda_errors():
+    with pytest.raises(RuntimeError, match="Input tensor dtype should be uint8"):
+        encode_jpeg(
+            [
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"),
+                torch.empty((3, 100, 100), dtype=torch.float32, device="cuda"),
+            ]
+        )
+
+    with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 5"):
+        encode_jpeg(
+            [
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"),
+                torch.empty((5, 100, 100), dtype=torch.uint8, device="cuda"),
+            ]
+        )
+
+    with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 1"):
+        encode_jpeg(
+            [
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"),
+                torch.empty((1, 100, 100), dtype=torch.uint8, device="cuda"),
+            ]
+        )
+
+    with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"):
+        encode_jpeg(
+            [
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"),
+                torch.empty((1, 3, 100, 100), dtype=torch.uint8, device="cuda"),
+            ]
+        )
+
+    with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"):
+        encode_jpeg(
+            [
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"),
+                torch.empty((100, 100), dtype=torch.uint8, device="cuda"),
+            ]
+        )
+
+    with pytest.raises(RuntimeError, match="Input tensor should be on CPU"):
+        encode_jpeg(
+            [
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cpu"),
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"),
+            ]
+        )
+
+    with pytest.raises(
+        RuntimeError, match="All input tensors must be on the same CUDA device when encoding with nvjpeg"
+    ):
+        encode_jpeg(
+            [
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"),
+                torch.empty((3, 100, 100), dtype=torch.uint8, device="cpu"),
+            ]
+        )
+
+    if torch.cuda.device_count() >= 2:
+        with pytest.raises(
+            RuntimeError, match="All input tensors must be on the same CUDA device when encoding with nvjpeg"
+        ):
+            encode_jpeg(
+                [
+                    torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda:0"),
+                    torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda:1"),
+                ]
+            )
+
+    with pytest.raises(ValueError, match="encode_jpeg requires at least one input tensor when a list is passed"):
+        encode_jpeg([])
+
+
 @pytest.mark.skipif(IS_MACOS, reason="https://github.com/pytorch/vision/issues/8031")
 @pytest.mark.parametrize(
     "img_path",

torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-#include "decode_jpeg_cuda.h"
+#include "encode_decode_jpegs_cuda.h"

 #include <ATen/ATen.h>

torchvision/csrc/io/image/cuda/decode_jpeg_cuda.h renamed to torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h

Lines changed: 5 additions & 0 deletions
@@ -2,6 +2,7 @@

 #include <torch/types.h>
 #include "../image_read_mode.h"
+#include "encode_jpegs_cuda.h"

 namespace vision {
 namespace image {

@@ -11,5 +12,9 @@ C10_EXPORT torch::Tensor decode_jpeg_cuda(
     ImageReadMode mode,
     torch::Device device);

+C10_EXPORT std::vector<torch::Tensor> encode_jpegs_cuda(
+    const std::vector<torch::Tensor>& decoded_images,
+    const int64_t quality);
+
 } // namespace image
 } // namespace vision
