Added support for EXIF orientation transform in read_image for JPEG (#8279)

vfdev-5 · web-flow · commit f3298dc50d45 · 2024-03-05T15:14:11.000+01:00
diff --git a/test/test_image.py b/test/test_image.py
@@ -9,7 +9,7 @@
 import torch
 import torchvision.transforms.functional as F
 from common_utils import assert_equal, needs_cuda
-from PIL import __version__ as PILLOW_VERSION, Image
+from PIL import __version__ as PILLOW_VERSION, Image, ImageOps
 from torchvision.io.image import (
     _read_png_16,
     decode_image,
@@ -100,6 +100,44 @@ def test_decode_jpeg(img_path, pil_mode, mode):
     assert abs_mean_diff < 2
 
 
+@pytest.mark.parametrize("orientation", [1, 2, 3, 4, 5, 6, 7, 8, 0])
+def test_decode_jpeg_with_exif_orientation(tmpdir, orientation):
+    fp = os.path.join(tmpdir, f"exif_oriented_{orientation}.jpg")
+    t = torch.randint(0, 256, size=(3, 256, 257), dtype=torch.uint8)
+    im = F.to_pil_image(t)
+    exif = im.getexif()
+    exif[0x0112] = orientation  # set exif orientation
+    im.save(fp, "JPEG", exif=exif.tobytes())
+
+    data = read_file(fp)
+    output = decode_image(data, apply_exif_orientation=True)
+
+    pimg = Image.open(fp)
+    pimg = ImageOps.exif_transpose(pimg)
+
+    expected = F.pil_to_tensor(pimg)
+    torch.testing.assert_close(expected, output)
+
+
+@pytest.mark.parametrize("size", [65533, 1, 7, 10, 23, 33])
+def test_invalid_exif(tmpdir, size):
+    # Inspired from a PIL test:
+    # https://github.com/python-pillow/Pillow/blob/8f63748e50378424628155994efd7e0739a4d1d1/Tests/test_file_jpeg.py#L299
+    fp = os.path.join(tmpdir, "invalid_exif.jpg")
+    t = torch.randint(0, 256, size=(3, 256, 257), dtype=torch.uint8)
+    im = F.to_pil_image(t)
+    im.save(fp, "JPEG", exif=b"1" * size)
+
+    data = read_file(fp)
+    output = decode_image(data, apply_exif_orientation=True)
+
+    pimg = Image.open(fp)
+    pimg = ImageOps.exif_transpose(pimg)
+
+    expected = F.pil_to_tensor(pimg)
+    torch.testing.assert_close(expected, output)
+
+
 def test_decode_jpeg_errors():
     with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"):
         decode_jpeg(torch.empty((100, 1), dtype=torch.uint8))
diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp
@@ -6,7 +6,10 @@
 namespace vision {
 namespace image {
 
-torch::Tensor decode_image(const torch::Tensor& data, ImageReadMode mode) {
+torch::Tensor decode_image(
+    const torch::Tensor& data,
+    ImageReadMode mode,
+    bool apply_exif_orientation) {
   // Check that tensor is a CPU tensor
   TORCH_CHECK(data.device() == torch::kCPU, "Expected a CPU tensor");
   // Check that the input tensor dtype is uint8
@@ -22,7 +25,7 @@ torch::Tensor decode_image(const torch::Tensor& data, ImageReadMode mode) {
   const uint8_t png_signature[4] = {137, 80, 78, 71}; // == "\211PNG"
 
   if (memcmp(jpeg_signature, datap, 3) == 0) {
-    return decode_jpeg(data, mode);
+    return decode_jpeg(data, mode, apply_exif_orientation);
   } else if (memcmp(png_signature, datap, 4) == 0) {
     return decode_png(data, mode);
   } else {
diff --git a/torchvision/csrc/io/image/cpu/decode_image.h b/torchvision/csrc/io/image/cpu/decode_image.h
@@ -8,7 +8,8 @@ namespace image {
 
 C10_EXPORT torch::Tensor decode_image(
     const torch::Tensor& data,
-    ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED);
+    ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED,
+    bool apply_exif_orientation = false);
 
 } // namespace image
 } // namespace vision
diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp
@@ -1,5 +1,6 @@
 #include "decode_jpeg.h"
 #include "common_jpeg.h"
+#include "exif.h"
 
 namespace vision {
 namespace image {
@@ -12,6 +13,7 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) {
 #else
 
 using namespace detail;
+using namespace exif_private;
 
 namespace {
 
@@ -65,6 +67,8 @@ static void torch_jpeg_set_source_mgr(
   src->len = len;
   src->pub.bytes_in_buffer = len;
   src->pub.next_input_byte = src->data;
+
+  jpeg_save_markers(cinfo, APP1, 0xffff);
 }
 
 inline unsigned char clamped_cmyk_rgb_convert(
@@ -121,7 +125,10 @@ void convert_line_cmyk_to_gray(
 
 } // namespace
 
-torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) {
+torch::Tensor decode_jpeg(
+    const torch::Tensor& data,
+    ImageReadMode mode,
+    bool apply_exif_orientation) {
   C10_LOG_API_USAGE_ONCE(
       "torchvision.csrc.io.image.cpu.decode_jpeg.decode_jpeg");
   // Check that the input tensor dtype is uint8
@@ -191,6 +198,11 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) {
     jpeg_calc_output_dimensions(&cinfo);
   }
 
+  int exif_orientation = -1;
+  if (apply_exif_orientation) {
+    exif_orientation = fetch_exif_orientation(&cinfo);
+  }
+
   jpeg_start_decompress(&cinfo);
 
   int height = cinfo.output_height;
@@ -227,7 +239,12 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) {
 
   jpeg_finish_decompress(&cinfo);
   jpeg_destroy_decompress(&cinfo);
-  return tensor.permute({2, 0, 1});
+  auto output = tensor.permute({2, 0, 1});
+
+  if (apply_exif_orientation) {
+    return exif_orientation_transform(output, exif_orientation);
+  }
+  return output;
 }
 #endif // #if !JPEG_FOUND
 
diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.h b/torchvision/csrc/io/image/cpu/decode_jpeg.h
@@ -8,7 +8,8 @@ namespace image {
 
 C10_EXPORT torch::Tensor decode_jpeg(
     const torch::Tensor& data,
-    ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED);
+    ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED,
+    bool apply_exif_orientation = false);
 
 C10_EXPORT int64_t _jpeg_version();
 C10_EXPORT bool _is_compiled_against_turbo();
diff --git a/torchvision/csrc/io/image/cpu/exif.h b/torchvision/csrc/io/image/cpu/exif.h
@@ -0,0 +1,212 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this
+license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without
+modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright
+notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote
+products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is"
+and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are
+disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any
+direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#pragma once
+// Functions in this module are taken from OpenCV
+// https://github.com/opencv/opencv/blob/097891e311fae1d8354eb092a0fd0171e630d78c/modules/imgcodecs/src/exif.cpp
+
+#include <jpeglib.h>
+#include <torch/types.h>
+#include <vector>
+
+namespace vision {
+namespace image {
+namespace exif_private {
+
+constexpr uint16_t APP1 = 0xe1; // marker code matched against saved markers
+constexpr uint16_t ENDIANNESS_INTEL = 0x49; // 'I': data starts with "II"
+constexpr uint16_t ENDIANNESS_MOTO = 0x4d; // 'M': data starts with "MM"
+constexpr uint16_t REQ_EXIF_TAG_MARK = 0x2a; // tag mark expected at offset 2
+constexpr uint16_t ORIENTATION_EXIF_TAG = 0x0112; // id of the orientation tag
+constexpr uint16_t INCORRECT_TAG = -1; // 0xFFFF: result of out-of-range reads
+
+inline uint16_t get_endianness(const std::vector<unsigned char>& exif_data) {
+  // Maps the leading byte-order mark to ENDIANNESS_INTEL ('I') or
+  // ENDIANNESS_MOTO ('M'); 0 means the mark is missing or inconsistent.
+  const size_t num_bytes = exif_data.size();
+  if (num_bytes == 0 || (num_bytes > 1 && exif_data[0] != exif_data[1])) {
+    return 0;
+  }
+  const unsigned char mark = exif_data[0];
+  if (mark == 'I') {
+    return ENDIANNESS_INTEL;
+  }
+  return mark == 'M' ? ENDIANNESS_MOTO : 0;
+}
+
+inline uint16_t get_uint16(
+    const std::vector<unsigned char>& exif_data,
+    uint16_t endianness,
+    const size_t offset) {
+  // Reads two bytes at `offset`: little-endian for ENDIANNESS_INTEL, else
+  // big-endian. The bounds check avoids `offset + 1`, which can wrap around.
+  if (exif_data.size() < 2 || offset > exif_data.size() - 2) {
+    return INCORRECT_TAG;
+  }
+  const unsigned b0 = exif_data[offset];
+  const unsigned b1 = exif_data[offset + 1];
+  return endianness == ENDIANNESS_INTEL ? b0 | (b1 << 8) : (b0 << 8) | b1;
+}
+
+inline uint32_t get_uint32(
+    const std::vector<unsigned char>& exif_data,
+    uint16_t endianness,
+    const size_t offset) {
+  // Wrap-free bounds check; uint32_t math keeps `<< 24` off an int sign bit.
+  if (exif_data.size() < 4 || offset > exif_data.size() - 4) {
+    return INCORRECT_TAG;
+  }
+  uint32_t value = 0;
+  for (size_t i = 0; i < 4; ++i) {
+    const size_t shift = endianness == ENDIANNESS_INTEL ? 8 * i : 24 - 8 * i;
+    value |= static_cast<uint32_t>(exif_data[offset + i]) << shift;
+  }
+  return value;
+}
+
+// Returns the EXIF orientation found in the first APP1 marker saved on
+// `cinfo`, or -1 when there is no such marker or no orientation tag in it.
+// The value is returned as stored in the file; it is not range-checked here.
+inline int fetch_exif_orientation(j_decompress_ptr cinfo) {
+  // Check for Exif marker APP1 (only the first one is inspected)
+  jpeg_saved_marker_ptr exif_marker = cinfo->marker_list;
+  while (exif_marker != nullptr && exif_marker->marker != APP1) {
+    exif_marker = exif_marker->next;
+  }
+  if (exif_marker == nullptr) {
+    return -1;
+  }
+  // Exif binary structure looks like this
+  // First 6 bytes: [E, x, i, f, 0, 0]
+  // Endianness, 2 bytes : [M, M] or [I, I]
+  // Tag mark, 2 bytes: [0, 0x2a]
+  // Offset, 4 bytes
+  // Num entries, 2 bytes
+  // Tag entries and data, tag has 2 bytes and its data has 10 bytes
+  // For more details:
+  // http://www.media.mit.edu/pia/Research/deepview/exif.html
+
+  // Bytes from Exif size field to the first TIFF header
+  constexpr size_t start_offset = 6;
+  if (exif_marker->data_length <= start_offset) {
+    return -1;
+  }
+  // Here we copy the data into the vector structure
+  // TODO: we can avoid copying the data and read directly from the pointer
+  const std::vector<unsigned char> exif_data_vec(
+      exif_marker->data + start_offset,
+      exif_marker->data + exif_marker->data_length);
+  const auto endianness = get_endianness(exif_data_vec);
+  // Checking whether Tag Mark (0x002A) correspond to one contained in the
+  // Jpeg file
+  if (get_uint16(exif_data_vec, endianness, 2) != REQ_EXIF_TAG_MARK) {
+    return -1;
+  }
+  // The IFD offset comes from the file. Reject it when it lies outside the
+  // buffer so that the offset arithmetic below can never wrap around and
+  // make the loop read entries from an unrelated position.
+  const size_t ifd_offset = get_uint32(exif_data_vec, endianness, 4);
+  if (ifd_offset >= exif_data_vec.size()) {
+    return -1;
+  }
+  const size_t num_entry = get_uint16(exif_data_vec, endianness, ifd_offset);
+  constexpr size_t tiff_field_size = 12;
+  size_t offset = ifd_offset + 2; // go to start of tag fields
+  for (size_t entry = 0; entry < num_entry; entry++) {
+    // Here we just search for orientation tag and parse it
+    const auto tag_num = get_uint16(exif_data_vec, endianness, offset);
+    if (tag_num == INCORRECT_TAG) {
+      break;
+    }
+    if (tag_num == ORIENTATION_EXIF_TAG) {
+      return get_uint16(exif_data_vec, endianness, offset + 8);
+    }
+    offset += tiff_field_size;
+  }
+  return -1;
+}
+
+constexpr uint16_t IMAGE_ORIENTATION_TL = 1; // normal orientation
+constexpr uint16_t IMAGE_ORIENTATION_TR = 2; // needs horizontal flip
+constexpr uint16_t IMAGE_ORIENTATION_BR = 3; // needs 180 rotation
+constexpr uint16_t IMAGE_ORIENTATION_BL = 4; // needs vertical flip
+constexpr uint16_t IMAGE_ORIENTATION_LT =
+    5; // mirrored horizontal & rotate 270 CW
+constexpr uint16_t IMAGE_ORIENTATION_RT = 6; // rotate 90 CW
+constexpr uint16_t IMAGE_ORIENTATION_RB =
+    7; // mirrored horizontal & rotate 90 CW
+constexpr uint16_t IMAGE_ORIENTATION_LB = 8; // needs 270 CW rotation
+
+inline torch::Tensor exif_orientation_transform(
+    const torch::Tensor& image,
+    int orientation) {
+  // Flip/transpose the last two dims per `orientation`; other values: no-op.
+  switch (orientation) {
+    case IMAGE_ORIENTATION_TR:
+      return image.flip(-1);
+    case IMAGE_ORIENTATION_BR:
+      // 180 rotation: flip both horizontally and vertically
+      return image.flip({-2, -1});
+    case IMAGE_ORIENTATION_BL:
+      return image.flip(-2);
+    case IMAGE_ORIENTATION_LT:
+      return image.transpose(-1, -2);
+    case IMAGE_ORIENTATION_RT:
+      return image.transpose(-1, -2).flip(-1);
+    case IMAGE_ORIENTATION_RB:
+      return image.transpose(-1, -2).flip({-2, -1});
+    case IMAGE_ORIENTATION_LB:
+      return image.transpose(-1, -2).flip(-2);
+    default: // IMAGE_ORIENTATION_TL or any unrecognised value
+      return image;
+  }
+}
+
+} // namespace exif_private
+} // namespace image
+} // namespace vision
diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp
@@ -23,11 +23,13 @@ static auto registry =
     torch::RegisterOperators()
         .op("image::decode_png", &decode_png)
         .op("image::encode_png", &encode_png)
-        .op("image::decode_jpeg", &decode_jpeg)
+        .op("image::decode_jpeg(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor",
+            &decode_jpeg)
         .op("image::encode_jpeg", &encode_jpeg)
         .op("image::read_file", &read_file)
         .op("image::write_file", &write_file)
-        .op("image::decode_image", &decode_image)
+        .op("image::decode_image(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor",
+            &decode_image)
         .op("image::decode_jpeg_cuda", &decode_jpeg_cuda)
         .op("image::_jpeg_version", &_jpeg_version)
         .op("image::_is_compiled_against_turbo", &_is_compiled_against_turbo);
diff --git a/torchvision/io/image.py b/torchvision/io/image.py