From 5469049c820ce8307e56e140d96db9342f4c6754 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 23 Apr 2021 05:17:51 -0500 Subject: [PATCH 1/8] document video_sampler. --- torchvision/csrc/io/decoder/video_sampler.cpp | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp index 5b9726b7c6c..667f8e027e2 100644 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ b/torchvision/csrc/io/decoder/video_sampler.cpp @@ -7,13 +7,26 @@ namespace ffmpeg { namespace { + +// Setup the data pointers and linesizes based on the specified image +// parameters and the provided array. This sets up "planes" to point to a +// "buffer" +// NOTE: this is most likely culprit behind #3534 +// +// Args: +// fmt: desired output video format +// buffer: source constant image buffer (in different format) that will contain +// the final image after SWScale planes: destination data pointer to be filled +// lineSize: target destination linesize (always {0}) int preparePlanes( const VideoFormat& fmt, const uint8_t* buffer, uint8_t** planes, int* lineSize) { + int result; + // NOTE: 1 at the end of av_fill_arrays is the value used for alignment if ((result = av_image_fill_arrays( planes, lineSize, @@ -28,6 +41,18 @@ int preparePlanes( return result; } +// Scale (and crop) the image slice in srcSlice and put the resulting scaled slice +// to `planes` buffer, which is mapped to be `out` via preparePlanes as `sws_scale` cannot +// access buffers directly. +// +// Args: +// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if scale) +// srcSlice: frame data in YUV420P +// srcStride: the array containing the strides for each plane of the source +// image (from AVFrame->linesize[0]) +// out: destination buffer +// planes: indirect destination buffer (mapped to "out" via preparePlanes) +// lines: destination linesize; constant {0} int transformImage( SwsContext* context, const uint8_t* const srcSlice[], @@ -41,7 +66,7 @@ int transformImage( if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) { return result; } - + // NOTE: srcY stride always 0: this is a parameter of YUV format if ((result = sws_scale( context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) < 0) { @@ -153,6 +178,12 @@ bool VideoSampler::init(const SamplerParameters& params) { return scaleContext_ != nullptr; } +// Main body of the sample function called from one of the overloads below +// +// Args: +// srcSlice: decoded AVFrame->data perpared buffer +// srcStride: linesize (usually obtained from AVFrame->linesize) +// out: return buffer (ByteStorage*) int VideoSampler::sample( const uint8_t* const srcSlice[], int srcStride[], @@ -221,6 +252,7 @@ int VideoSampler::sample( return outImageSize; } +// Call from `video_stream.cpp::114` - occurs during file reads int VideoSampler::sample(AVFrame* frame, ByteStorage* out) { if (!frame) { return 0; // no flush for videos @@ -229,6 +261,7 @@ int VideoSampler::sample(AVFrame* frame, ByteStorage* out) { return sample(frame->data, frame->linesize, out); } +// Call from `video_stream.cpp::114` - not sure when this occurs int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) { if (!in) { return 0; // no flush for videos From 7ed00d24a384289b4314b2feb6372dc6611f0df4 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 23 Apr 2021 08:52:55 -0500 Subject: [PATCH 2/8] minor docs for decoder.cpp --- torchvision/csrc/io/decoder/decoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp index 6c9a3cdf825..f6c8b2b2bbd 100644 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ b/torchvision/csrc/io/decoder/decoder.cpp @@ -381,7 +381,7 @@ bool Decoder::init( cleanUp(); return false; } - + // not sure what this does? onInit(); if (params.startOffset != 0) { From b8092f72dd7dcb966a8e68679ba29ffa9ef7cbd6 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 29 Apr 2021 09:12:16 -0500 Subject: [PATCH 3/8] descriptive comments for the stream.c --- torchvision/csrc/io/decoder/stream.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp index 4da48647382..d1d8071ad14 100644 --- a/torchvision/csrc/io/decoder/stream.cpp +++ b/torchvision/csrc/io/decoder/stream.cpp @@ -24,10 +24,15 @@ Stream::~Stream() { } } +// look up the proper CODEC querying the function AVCodec* Stream::findCodec(AVCodecParameters* params) { return avcodec_find_decoder(params->codec_id); } +// Allocate memory for the AVCodecContext, which will hold the context for +// decode/encode process. Then fill this codec context with CODEC parameters +// defined in stream parameters. Open the codec, and allocate the global frame +// defined in the header file int Stream::openCodec(std::vector* metadata) { AVStream* steam = inputCtx_->streams[format_.stream]; @@ -93,6 +98,9 @@ int Stream::openCodec(std::vector* metadata) { return ret; } +// send the raw data packet (compressed frame) to the decoder, through the codec +// context and receive the raw data frame (uncompressed frame) from the +// decoder, through the same codec context int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) { int consumed = 0; int result = avcodec_send_packet(codecCtx_, packet); @@ -134,6 +142,9 @@ int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) { return consumed; } +// General decoding function: +// given the packet, analyse the metadata, and write the +// metadata and the buffer to the DecoderOutputImage. int Stream::decodePacket( const AVPacket* packet, DecoderOutputMessage* out, @@ -167,6 +178,9 @@ int Stream::flush(DecoderOutputMessage* out, bool headerOnly) { return 1; } +// Sets the header and payload via stream::setHeader and copyFrameBytes +// functions that are defined in type stream subclass (VideoStream, AudioStream, +// ...) int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) { if (flush) { // only flush of audio frames makes sense From a7e55546ab5a9de18a5bec27f3ed7a3eb74ceafb Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 29 Apr 2021 10:50:32 -0500 Subject: [PATCH 4/8] descriptive comments for decoder.cpp --- torchvision/csrc/io/decoder/decoder.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp index f6c8b2b2bbd..9ef6aaae03f 100644 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ b/torchvision/csrc/io/decoder/decoder.cpp @@ -215,6 +215,12 @@ Decoder::~Decoder() { cleanUp(); } +// Initialise the format context that holds information about the container and +// fill it with minimal information about the format (codecs are not opened +// here). Function reads in intformation about the streams from the container +// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is +// specified, it seeks into the correct frame (note, the seek defined here is +// "precise" seek. bool Decoder::init( const DecoderParameters& params, DecoderInCallback&& in, @@ -396,6 +402,8 @@ bool Decoder::init( return true; } +// open appropriate CODEC for every type of stream and move it to the class +// variable `streams_` and make sure it is in range for decoding bool Decoder::openStreams(std::vector* metadata) { for (int i = 0; i < inputCtx_->nb_streams; i++) { // - find the corespondent format at params_.formats set @@ -478,6 +486,10 @@ void Decoder::cleanUp() { seekableBuffer_.shutdown(); } +// function does actual work, derived class calls it in working thread +// periodically. On success method returns 0, ENOADATA on EOF, ETIMEDOUT if +// no frames got decoded in the specified timeout time, and error on +// unrecoverable error. int Decoder::getFrame(size_t workingTimeInMs) { if (inRange_.none()) { return ENODATA; @@ -594,11 +606,13 @@ int Decoder::getFrame(size_t workingTimeInMs) { return 0; } +// find stream by stream index Stream* Decoder::findByIndex(int streamIndex) const { auto it = streams_.find(streamIndex); return it != streams_.end() ? it->second.get() : nullptr; } +// find stream by type; note finds only the first stream of a given type Stream* Decoder::findByType(const MediaFormat& format) const { for (auto& stream : streams_) { if (stream.second->getMediaFormat().type == format.type) { @@ -608,6 +622,8 @@ Stream* Decoder::findByType(const MediaFormat& format) const { return nullptr; } +// given the stream and packet, decode the frame buffers into the +// DecoderOutputMessage data structure via stream::decodePacket function. int Decoder::processPacket( Stream* stream, AVPacket* packet, From 3c17397d6f1d60076ca116b73fc081e18bbb0e39 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 29 Apr 2021 10:51:00 -0500 Subject: [PATCH 5/8] per-stream descriptive comments --- torchvision/csrc/io/decoder/audio_stream.cpp | 3 +++ torchvision/csrc/io/decoder/video_stream.cpp | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp index 9d66e589bf3..c61906f4f75 100644 --- a/torchvision/csrc/io/decoder/audio_stream.cpp +++ b/torchvision/csrc/io/decoder/audio_stream.cpp @@ -68,6 +68,7 @@ int AudioStream::initFormat() { : -1; } +// copies audio sample bytes via swr_convert call in audio_sampler.cpp int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { if (!sampler_) { sampler_ = std::make_unique(codecCtx_); @@ -95,6 +96,8 @@ int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { << ", channels: " << format_.format.audio.channels << ", format: " << format_.format.audio.format; } + // calls to a sampler that converts the audio samples and copies them to the + // out buffer via ffmpeg::swr_convert return sampler_->sample(flush ? nullptr : frame_, out); } diff --git a/torchvision/csrc/io/decoder/video_stream.cpp b/torchvision/csrc/io/decoder/video_stream.cpp index a9e20434fe0..8028ae899e2 100644 --- a/torchvision/csrc/io/decoder/video_stream.cpp +++ b/torchvision/csrc/io/decoder/video_stream.cpp @@ -80,6 +80,7 @@ int VideoStream::initFormat() { : -1; } +// copies frame bytes via sws_scale call in video_sampler.cpp int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { if (!sampler_) { sampler_ = std::make_unique(SWS_AREA, loggingUuid_); @@ -110,7 +111,9 @@ int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { << ", minDimension: " << format_.format.video.minDimension << ", crop: " << format_.format.video.cropImage; } - + // calls to a sampler that converts the frame from YUV422 to RGB24, and + // optionally crops and resizes the frame. Frame bytes are copied from + // frame_->data to out buffer return sampler_->sample(flush ? nullptr : frame_, out); } From f683f73b0e7c2c20f03c72bf12ebd7185c6eae99 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 4 Jun 2021 05:27:55 -0500 Subject: [PATCH 6/8] Fixing CLANG hopefully --- torchvision/csrc/io/decoder/video_sampler.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp index 667f8e027e2..7816d70b706 100644 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ b/torchvision/csrc/io/decoder/video_sampler.cpp @@ -12,7 +12,7 @@ namespace { // parameters and the provided array. This sets up "planes" to point to a // "buffer" // NOTE: this is most likely culprit behind #3534 -// +// // Args: // fmt: desired output video format // buffer: source constant image buffer (in different format) that will contain @@ -23,7 +23,6 @@ int preparePlanes( const uint8_t* buffer, uint8_t** planes, int* lineSize) { - int result; // NOTE: 1 at the end of av_fill_arrays is the value used for alignment @@ -41,14 +40,14 @@ int preparePlanes( return result; } -// Scale (and crop) the image slice in srcSlice and put the resulting scaled slice -// to `planes` buffer, which is mapped to be `out` via preparePlanes as `sws_scale` cannot -// access buffers directly. +// Scale (and crop) the image slice in srcSlice and put the resulting scaled +// slice to `planes` buffer, which is mapped to be `out` via preparePlanes as +// `sws_scale` cannot access buffers directly. // // Args: -// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if scale) -// srcSlice: frame data in YUV420P -// srcStride: the array containing the strides for each plane of the source +// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if +// scale) srcSlice: frame data in YUV420P srcStride: the array containing the +// strides for each plane of the source // image (from AVFrame->linesize[0]) // out: destination buffer // planes: indirect destination buffer (mapped to "out" via preparePlanes) From 2b8bfa7ef19f3556422424a939ddc4a8383d70be Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 13 Aug 2021 22:37:59 +0100 Subject: [PATCH 7/8] addressing prabhat's comments --- torchvision/csrc/io/decoder/decoder.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp index b02ec5c21a5..348caf93ff6 100644 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ b/torchvision/csrc/io/decoder/decoder.cpp @@ -215,10 +215,10 @@ Decoder::~Decoder() { // Initialise the format context that holds information about the container and // fill it with minimal information about the format (codecs are not opened -// here). Function reads in intformation about the streams from the container +// here). Function reads in information about the streams from the container // into inputCtx and then passes it to decoder::openStreams. Finally, if seek is -// specified, it seeks into the correct frame (note, the seek defined here is -// "precise" seek. +// specified within the decoder parameters, it seeks into the correct frame +// (note, the seek defined here is"precise" seek). bool Decoder::init( const DecoderParameters& params, DecoderInCallback&& in, @@ -385,7 +385,7 @@ bool Decoder::init( cleanUp(); return false; } - // not sure what this does? + // SyncDecoder inherits Decoder which would override onInit. onInit(); if (params.startOffset != 0) { @@ -485,7 +485,7 @@ void Decoder::cleanUp() { } // function does actual work, derived class calls it in working thread -// periodically. On success method returns 0, ENOADATA on EOF, ETIMEDOUT if +// periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if // no frames got decoded in the specified timeout time, and error on // unrecoverable error. int Decoder::getFrame(size_t workingTimeInMs) { From 7b6c05f168a9d1cbbe3480b3446edbbc25084863 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 13 Aug 2021 22:41:25 +0100 Subject: [PATCH 8/8] typo I think --- torchvision/csrc/io/decoder/decoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp index 348caf93ff6..5989183e643 100644 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ b/torchvision/csrc/io/decoder/decoder.cpp @@ -218,7 +218,7 @@ Decoder::~Decoder() { // here). Function reads in information about the streams from the container // into inputCtx and then passes it to decoder::openStreams. Finally, if seek is // specified within the decoder parameters, it seeks into the correct frame -// (note, the seek defined here is"precise" seek). +// (note, the seek defined here is "precise" seek). bool Decoder::init( const DecoderParameters& params, DecoderInCallback&& in,