From 5469049c820ce8307e56e140d96db9342f4c6754 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 23 Apr 2021 05:17:51 -0500
Subject: [PATCH 1/8] document video_sampler.

---
 torchvision/csrc/io/decoder/video_sampler.cpp | 35 ++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp
index 5b9726b7c6c..667f8e027e2 100644
--- a/torchvision/csrc/io/decoder/video_sampler.cpp
+++ b/torchvision/csrc/io/decoder/video_sampler.cpp
@@ -7,13 +7,26 @@
 namespace ffmpeg {
 
 namespace {
+
+// Setup the data pointers and linesizes based on the specified image
+// parameters and the provided array. This sets up "planes" to point to a
+// "buffer"
+// NOTE: this is most likely culprit behind #3534
+// 
+// Args:
+// fmt: desired output video format
+// buffer: source constant image buffer (in different format) that will contain
+// the final image after SWScale planes: destination data pointer to be filled
+// lineSize: target destination linesize (always {0})
 int preparePlanes(
     const VideoFormat& fmt,
     const uint8_t* buffer,
     uint8_t** planes,
     int* lineSize) {
+
   int result;
 
+  // NOTE: 1 at the end of av_fill_arrays is the value used for alignment
   if ((result = av_image_fill_arrays(
            planes,
            lineSize,
@@ -28,6 +41,18 @@ int preparePlanes(
   return result;
 }
 
+// Scale (and crop) the image slice in srcSlice and put the resulting scaled slice 
+// to `planes` buffer, which is mapped to be `out` via preparePlanes as `sws_scale` cannot
+// access buffers directly.
+//
+// Args:
+// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if scale)
+// srcSlice: frame data in YUV420P
+// srcStride: the array containing the strides for each plane of the source
+//            image (from AVFrame->linesize[0])
+// out: destination buffer
+// planes: indirect destination buffer (mapped to "out" via preparePlanes)
+// lines: destination linesize; constant {0}
 int transformImage(
     SwsContext* context,
     const uint8_t* const srcSlice[],
@@ -41,7 +66,7 @@ int transformImage(
   if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) {
     return result;
   }
-
+  // NOTE: srcY stride always 0: this is a parameter of YUV format
   if ((result = sws_scale(
            context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) <
       0) {
@@ -153,6 +178,12 @@ bool VideoSampler::init(const SamplerParameters& params) {
   return scaleContext_ != nullptr;
 }
 
+// Main body of the sample function called from one of the overloads below
+//
+// Args:
+// srcSlice: decoded AVFrame->data perpared buffer
+// srcStride: linesize (usually obtained from AVFrame->linesize)
+// out: return buffer (ByteStorage*)
 int VideoSampler::sample(
     const uint8_t* const srcSlice[],
     int srcStride[],
@@ -221,6 +252,7 @@ int VideoSampler::sample(
   return outImageSize;
 }
 
+// Call from `video_stream.cpp::114` - occurs during file reads
 int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
   if (!frame) {
     return 0; // no flush for videos
@@ -229,6 +261,7 @@ int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
   return sample(frame->data, frame->linesize, out);
 }
 
+// Call from `video_stream.cpp::114` - not sure when this occurs
 int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) {
   if (!in) {
     return 0; // no flush for videos

From 7ed00d24a384289b4314b2feb6372dc6611f0df4 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 23 Apr 2021 08:52:55 -0500
Subject: [PATCH 2/8] minor docs for decoder.cpp

---
 torchvision/csrc/io/decoder/decoder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
index 6c9a3cdf825..f6c8b2b2bbd 100644
--- a/torchvision/csrc/io/decoder/decoder.cpp
+++ b/torchvision/csrc/io/decoder/decoder.cpp
@@ -381,7 +381,7 @@ bool Decoder::init(
     cleanUp();
     return false;
   }
-
+  // not sure what this does?
   onInit();
 
   if (params.startOffset != 0) {

From b8092f72dd7dcb966a8e68679ba29ffa9ef7cbd6 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 29 Apr 2021 09:12:16 -0500
Subject: [PATCH 3/8] descriptive comments for the stream.c

---
 torchvision/csrc/io/decoder/stream.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp
index 4da48647382..d1d8071ad14 100644
--- a/torchvision/csrc/io/decoder/stream.cpp
+++ b/torchvision/csrc/io/decoder/stream.cpp
@@ -24,10 +24,15 @@ Stream::~Stream() {
   }
 }
 
+// look up the proper CODEC querying the function
 AVCodec* Stream::findCodec(AVCodecParameters* params) {
   return avcodec_find_decoder(params->codec_id);
 }
 
+// Allocate memory for the AVCodecContext, which will hold the context for
+// decode/encode process. Then fill this codec context with CODEC parameters
+// defined in stream parameters. Open the codec, and allocate the global frame
+// defined in the header file
 int Stream::openCodec(std::vector<DecoderMetadata>* metadata) {
   AVStream* steam = inputCtx_->streams[format_.stream];
 
@@ -93,6 +98,9 @@ int Stream::openCodec(std::vector<DecoderMetadata>* metadata) {
   return ret;
 }
 
+// send the raw data packet (compressed frame) to the decoder, through the codec
+// context and receive the raw data frame (uncompressed frame) from the
+// decoder, through the same codec context
 int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
   int consumed = 0;
   int result = avcodec_send_packet(codecCtx_, packet);
@@ -134,6 +142,9 @@ int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
   return consumed;
 }
 
+// General decoding function:
+// given the packet, analyse the metadata, and write the
+// metadata and the buffer to the DecoderOutputImage.
 int Stream::decodePacket(
     const AVPacket* packet,
     DecoderOutputMessage* out,
@@ -167,6 +178,9 @@ int Stream::flush(DecoderOutputMessage* out, bool headerOnly) {
   return 1;
 }
 
+// Sets the header and payload via stream::setHeader and copyFrameBytes
+// functions that are defined in type stream subclass (VideoStream, AudioStream,
+// ...)
 int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) {
   if (flush) {
     // only flush of audio frames makes sense

From a7e55546ab5a9de18a5bec27f3ed7a3eb74ceafb Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 29 Apr 2021 10:50:32 -0500
Subject: [PATCH 4/8] descriptive comments for decoder.cpp

---
 torchvision/csrc/io/decoder/decoder.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
index f6c8b2b2bbd..9ef6aaae03f 100644
--- a/torchvision/csrc/io/decoder/decoder.cpp
+++ b/torchvision/csrc/io/decoder/decoder.cpp
@@ -215,6 +215,12 @@ Decoder::~Decoder() {
   cleanUp();
 }
 
+// Initialise the format context that holds information about the container and
+// fill it with minimal information about the format (codecs are not opened
+// here). Function reads in intformation about the streams from the container
+// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is
+// specified, it seeks into the correct frame (note, the seek defined here is
+// "precise" seek.
 bool Decoder::init(
     const DecoderParameters& params,
     DecoderInCallback&& in,
@@ -396,6 +402,8 @@ bool Decoder::init(
   return true;
 }
 
+// open appropriate CODEC for every type of stream and move it to the class
+// variable `streams_` and make sure it is in range for decoding
 bool Decoder::openStreams(std::vector<DecoderMetadata>* metadata) {
   for (int i = 0; i < inputCtx_->nb_streams; i++) {
     // - find the corespondent format at params_.formats set
@@ -478,6 +486,10 @@ void Decoder::cleanUp() {
   seekableBuffer_.shutdown();
 }
 
+// function does actual work, derived class calls it in working thread
+// periodically. On success method returns 0, ENOADATA on EOF, ETIMEDOUT if
+// no frames got decoded in the specified timeout time, and error on
+// unrecoverable error.
 int Decoder::getFrame(size_t workingTimeInMs) {
   if (inRange_.none()) {
     return ENODATA;
@@ -594,11 +606,13 @@ int Decoder::getFrame(size_t workingTimeInMs) {
   return 0;
 }
 
+// find stream by stream index
 Stream* Decoder::findByIndex(int streamIndex) const {
   auto it = streams_.find(streamIndex);
   return it != streams_.end() ? it->second.get() : nullptr;
 }
 
+// find stream by type; note finds only the first stream of a given type
 Stream* Decoder::findByType(const MediaFormat& format) const {
   for (auto& stream : streams_) {
     if (stream.second->getMediaFormat().type == format.type) {
@@ -608,6 +622,8 @@ Stream* Decoder::findByType(const MediaFormat& format) const {
   return nullptr;
 }
 
+// given the stream and packet, decode the frame buffers into the
+// DecoderOutputMessage data structure via stream::decodePacket function.
 int Decoder::processPacket(
     Stream* stream,
     AVPacket* packet,

From 3c17397d6f1d60076ca116b73fc081e18bbb0e39 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Thu, 29 Apr 2021 10:51:00 -0500
Subject: [PATCH 5/8] per-stream descriptive comments

---
 torchvision/csrc/io/decoder/audio_stream.cpp | 3 +++
 torchvision/csrc/io/decoder/video_stream.cpp | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp
index 9d66e589bf3..c61906f4f75 100644
--- a/torchvision/csrc/io/decoder/audio_stream.cpp
+++ b/torchvision/csrc/io/decoder/audio_stream.cpp
@@ -68,6 +68,7 @@ int AudioStream::initFormat() {
       : -1;
 }
 
+// copies audio sample bytes via swr_convert call in audio_sampler.cpp
 int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
   if (!sampler_) {
     sampler_ = std::make_unique<AudioSampler>(codecCtx_);
@@ -95,6 +96,8 @@ int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
             << ", channels: " << format_.format.audio.channels
             << ", format: " << format_.format.audio.format;
   }
+  // calls to a sampler that converts the audio samples and copies them to the
+  // out buffer via ffmpeg::swr_convert
   return sampler_->sample(flush ? nullptr : frame_, out);
 }
 
diff --git a/torchvision/csrc/io/decoder/video_stream.cpp b/torchvision/csrc/io/decoder/video_stream.cpp
index a9e20434fe0..8028ae899e2 100644
--- a/torchvision/csrc/io/decoder/video_stream.cpp
+++ b/torchvision/csrc/io/decoder/video_stream.cpp
@@ -80,6 +80,7 @@ int VideoStream::initFormat() {
       : -1;
 }
 
+// copies frame bytes via sws_scale call in video_sampler.cpp
 int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
   if (!sampler_) {
     sampler_ = std::make_unique<VideoSampler>(SWS_AREA, loggingUuid_);
@@ -110,7 +111,9 @@ int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
             << ", minDimension: " << format_.format.video.minDimension
             << ", crop: " << format_.format.video.cropImage;
   }
-
+  // calls to a sampler that converts the frame from YUV422 to RGB24, and
+  // optionally crops and resizes the frame. Frame bytes are copied from
+  // frame_->data to out buffer
   return sampler_->sample(flush ? nullptr : frame_, out);
 }
 

From f683f73b0e7c2c20f03c72bf12ebd7185c6eae99 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 4 Jun 2021 05:27:55 -0500
Subject: [PATCH 6/8] Fixing CLANG hopefully

---
 torchvision/csrc/io/decoder/video_sampler.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp
index 667f8e027e2..7816d70b706 100644
--- a/torchvision/csrc/io/decoder/video_sampler.cpp
+++ b/torchvision/csrc/io/decoder/video_sampler.cpp
@@ -12,7 +12,7 @@ namespace {
 // parameters and the provided array. This sets up "planes" to point to a
 // "buffer"
 // NOTE: this is most likely culprit behind #3534
-// 
+//
 // Args:
 // fmt: desired output video format
 // buffer: source constant image buffer (in different format) that will contain
@@ -23,7 +23,6 @@ int preparePlanes(
     const uint8_t* buffer,
     uint8_t** planes,
     int* lineSize) {
-
   int result;
 
   // NOTE: 1 at the end of av_fill_arrays is the value used for alignment
@@ -41,14 +40,14 @@ int preparePlanes(
   return result;
 }
 
-// Scale (and crop) the image slice in srcSlice and put the resulting scaled slice 
-// to `planes` buffer, which is mapped to be `out` via preparePlanes as `sws_scale` cannot
-// access buffers directly.
+// Scale (and crop) the image slice in srcSlice and put the resulting scaled
+// slice to `planes` buffer, which is mapped to be `out` via preparePlanes as
+// `sws_scale` cannot access buffers directly.
 //
 // Args:
-// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if scale)
-// srcSlice: frame data in YUV420P
-// srcStride: the array containing the strides for each plane of the source
+// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if
+// scale) srcSlice: frame data in YUV420P srcStride: the array containing the
+// strides for each plane of the source
 //            image (from AVFrame->linesize[0])
 // out: destination buffer
 // planes: indirect destination buffer (mapped to "out" via preparePlanes)

From 2b8bfa7ef19f3556422424a939ddc4a8383d70be Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 13 Aug 2021 22:37:59 +0100
Subject: [PATCH 7/8] addressing prabhat's comments

---
 torchvision/csrc/io/decoder/decoder.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
index b02ec5c21a5..348caf93ff6 100644
--- a/torchvision/csrc/io/decoder/decoder.cpp
+++ b/torchvision/csrc/io/decoder/decoder.cpp
@@ -215,10 +215,10 @@ Decoder::~Decoder() {
 
 // Initialise the format context that holds information about the container and
 // fill it with minimal information about the format (codecs are not opened
-// here). Function reads in intformation about the streams from the container
+// here). Function reads in information about the streams from the container
 // into inputCtx and then passes it to decoder::openStreams. Finally, if seek is
-// specified, it seeks into the correct frame (note, the seek defined here is
-// "precise" seek.
+// specified within the decoder parameters, it seeks into the correct frame
+// (note, the seek defined here is"precise" seek).
 bool Decoder::init(
     const DecoderParameters& params,
     DecoderInCallback&& in,
@@ -385,7 +385,7 @@ bool Decoder::init(
     cleanUp();
     return false;
   }
-  // not sure what this does?
+  // SyncDecoder inherits Decoder which would override onInit.
   onInit();
 
   if (params.startOffset != 0) {
@@ -485,7 +485,7 @@ void Decoder::cleanUp() {
 }
 
 // function does actual work, derived class calls it in working thread
-// periodically. On success method returns 0, ENOADATA on EOF, ETIMEDOUT if
+// periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if
 // no frames got decoded in the specified timeout time, and error on
 // unrecoverable error.
 int Decoder::getFrame(size_t workingTimeInMs) {

From 7b6c05f168a9d1cbbe3480b3446edbbc25084863 Mon Sep 17 00:00:00 2001
From: Bruno Korbar <bjuncek@gmail.com>
Date: Fri, 13 Aug 2021 22:41:25 +0100
Subject: [PATCH 8/8] typo I think

---
 torchvision/csrc/io/decoder/decoder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
index 348caf93ff6..5989183e643 100644
--- a/torchvision/csrc/io/decoder/decoder.cpp
+++ b/torchvision/csrc/io/decoder/decoder.cpp
@@ -218,7 +218,7 @@ Decoder::~Decoder() {
 // here). Function reads in information about the streams from the container
 // into inputCtx and then passes it to decoder::openStreams. Finally, if seek is
 // specified within the decoder parameters, it seeks into the correct frame
-// (note, the seek defined here is"precise" seek).
+// (note, the seek defined here is "precise" seek).
 bool Decoder::init(
     const DecoderParameters& params,
     DecoderInCallback&& in,