[fbsync] Fast seek implementation (#3179)

prabhat00155 · facebook-github-bot · commit 0cb7cf37acc3 · 2021-11-08T08:49:16.000-08:00
Summary:
* modify processPacket to support fast seek

* add fastSeek to ProcessPacket decoder definition

* add fastseek flag to DecoderParametersStruct

* add fastseek flag to the process packet call

* no default params in C++ implementation

* enable flag in C++ implementation

* make order of parameters more normal

* register new seek with python api

* [somewhat broken] test suite for keyframes using pyav

* revert " changes

* add type annotations to init

* Adding tests

* linter

* Flake doesn't show up :|

* Change from unittest to pytest syntax

* add return type

Reviewed By: kazhang

Differential Revision: D32216689

fbshipit-source-id: 695975c2930cb663ea82c83e4bc924a09e124a7d

Co-authored-by: Prabhat Roy <prabhatroy@fb.com>
diff --git a/test/test_videoapi.py b/test/test_videoapi.py
@@ -167,6 +167,43 @@ def test_fate_suite(self):
         assert metadata["subtitles"]["duration"] is not None
         os.remove(video_path)
 
+    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
+    def test_keyframe_reading(self):
+        """Check that keyframe-only seeks return the keyframe pts PyAV reports."""
+        for test_video, config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            # Collect every keyframe timestamp with PyAV; the container is
+            # closed as soon as decoding is done.
+            av_keyframes = []
+            with av.open(full_path) as av_reader:
+                # check for a video stream *before* indexing into the list
+                if av_reader.streams.video:
+                    av_stream = av_reader.streams.video[0]
+                    # reduce the stream to only keyframes
+                    av_stream.codec_context.skip_frame = "NONKEY"
+                    for av_frame in av_reader.decode(av_stream):
+                        av_keyframes.append(float(av_frame.pts * av_frame.time_base))
+
+            if len(av_keyframes) > 1:
+                # seek (keyframes only) to the midpoint of each keyframe pair
+                vr_keyframes = []
+                video_reader = VideoReader(full_path, "video")
+                for i in range(1, len(av_keyframes)):
+                    seek_val = (av_keyframes[i] + av_keyframes[i - 1]) / 2
+                    data = next(video_reader.seek(seek_val, True))
+                    vr_keyframes.append(data["pts"])
+
+                data = next(video_reader.seek(config.duration, True))
+                vr_keyframes.append(data["pts"])
+
+                assert len(av_keyframes) == len(vr_keyframes)
+                # NOTE: this video gets different keyframe with different
+                # loaders (0.333 pyav, 0.666 for us)
+                if test_video != "TrumanShow_wave_f_nm_np1_fr_med_26.avi":
+                    for i in range(len(av_keyframes)):
+                        assert av_keyframes[i] == approx(vr_keyframes[i], rel=0.001)
+
 
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
@@ -552,9 +552,9 @@ int Decoder::getFrame(size_t workingTimeInMs) {
       bool gotFrame = false;
       bool hasMsg = false;
       // packet either got consumed completely or not at all
-      if ((result = processPacket(stream, &avPacket, &gotFrame, &hasMsg)) < 0) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " processPacket failed with code=" << result;
+      if ((result = processPacket(
+               stream, &avPacket, &gotFrame, &hasMsg, params_.fastSeek)) < 0) {
+        LOG(ERROR) << "processPacket failed with code: " << result;
         break;
       }
 
@@ -635,7 +635,8 @@ int Decoder::processPacket(
     Stream* stream,
     AVPacket* packet,
     bool* gotFrame,
-    bool* hasMsg) {
+    bool* hasMsg,
+    bool fastSeek) {
   // decode package
   int result;
   DecoderOutputMessage msg;
@@ -648,7 +649,15 @@ int Decoder::processPacket(
     bool endInRange =
         params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
     inRange_.set(stream->getIndex(), endInRange);
-    if (endInRange && msg.header.pts >= params_.startOffset) {
+    // With fastSeek enabled, return the first frame decoded after the
+    // (potential) seek, even if its pts precedes startOffset.
+    // Otherwise (the default), perform an accurate seek and only return
+    // frames whose pts is at or after startOffset.
+    bool startCondition = true;
+    if (!fastSeek) {
+      startCondition = msg.header.pts >= params_.startOffset;
+    }
+    if (endInRange && startCondition) {
       *hasMsg = true;
       push(std::move(msg));
     }
diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h
@@ -72,7 +72,8 @@ class Decoder : public MediaDecoder {
       Stream* stream,
       AVPacket* packet,
       bool* gotFrame,
-      bool* hasMsg);
+      bool* hasMsg,
+      bool fastSeek = false);
   void flushStreams();
   void cleanUp();
 
diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h
@@ -190,6 +190,8 @@ struct DecoderParameters {
   bool listen{false};
   // don't copy frame body, only header
   bool headerOnly{false};
+  // enable fast seek (seek only to keyframes)
+  bool fastSeek{false};
   // interrupt init method on timeout
   bool preventStaleness{true};
   // seek tolerated accuracy (us)
diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp
@@ -98,6 +98,7 @@ void Video::_getDecoderParams(
     int64_t getPtsOnly,
     std::string stream,
     long stream_id = -1,
+    bool fastSeek = true,
     bool all_streams = false,
     int64_t num_threads = 1,
     double seekFrameMarginUs = 10) {
@@ -106,6 +107,7 @@ void Video::_getDecoderParams(
   params.timeoutMs = decoderTimeoutMs;
   params.startOffset = videoStartUs;
   params.seekAccuracy = seekFrameMarginUs;
+  params.fastSeek = fastSeek;
   params.headerOnly = false;
   params.numThreads = num_threads;
 
@@ -165,6 +167,7 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
       0, // headerOnly
       std::get<0>(current_stream), // stream info - remove that
       long(-1), // stream_id parsed from info above change to -2
+      false, // fastseek: we're using the default param here
       true, // read all streams
       numThreads_ // global number of Threads for decoding
   );
@@ -246,6 +249,7 @@ bool Video::setCurrentStream(std::string stream = "video") {
       std::get<0>(current_stream), // stream
       long(std::get<1>(
           current_stream)), // stream_id parsed from info above change to -2
+      false, // fastseek param set to 0 false by default (changed in seek)
       false, // read all streams
       numThreads_ // global number of threads
   );
@@ -263,14 +267,15 @@ c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
   return streamsMetadata;
 }
 
-void Video::Seek(double ts) {
+void Video::Seek(double ts, bool fastSeek = false) {
   // initialize the class variables used for seeking and retrurn
   _getDecoderParams(
       ts, // video start
       0, // headerOnly
       std::get<0>(current_stream), // stream
       long(std::get<1>(
           current_stream)), // stream_id parsed from info above change to -2
+      fastSeek, // fastseek
       false, // read all streams
       numThreads_ // global number of threads
   );
diff --git a/torchvision/csrc/io/video/video.h b/torchvision/csrc/io/video/video.h
@@ -23,7 +23,7 @@ struct Video : torch::CustomClassHolder {
   std::tuple<std::string, int64_t> getCurrentStream() const;
   c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
   getStreamMetadata() const;
-  void Seek(double ts);
+  void Seek(double ts, bool fastSeek);
   bool setCurrentStream(std::string stream);
   std::tuple<torch::Tensor, double> Next();
 
@@ -39,6 +39,7 @@ struct Video : torch::CustomClassHolder {
       int64_t getPtsOnly,
       std::string stream,
       long stream_id,
+      bool fastSeek,
       bool all_streams,
       int64_t num_threads,
       double seekFrameMarginUs); // this needs to be improved
diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py
@@ -135,19 +135,20 @@ def __next__(self) -> Dict[str, Any]:
     def __iter__(self) -> Iterator["VideoReader"]:
         return self
 
-    def seek(self, time_s: float) -> "VideoReader":
+    def seek(self, time_s: float, keyframes_only: bool = False) -> "VideoReader":
         """Seek within current stream.
 
         Args:
             time_s (float): seek time in seconds
+            keyframes_only (bool): if True, seek only to keyframes (fast seek). Default: False
 
         .. note::
             Current implementation is the so-called precise seek. This
             means following seek, call to :mod:`next()` will return the
             frame with the exact timestamp if it exists or
             the first frame with timestamp larger than ``time_s``.
         """
-        self._c.seek(time_s)
+        self._c.seek(time_s, keyframes_only)
         return self
 
     def get_metadata(self) -> Dict[str, Any]: