Retry "file status" request with exponential backoff if it failed #121


Open · wants to merge 1 commit into master
@@ -123,7 +123,11 @@ object SparkDruidIndexer {
       s => {
         val p = new Path(s)
         val fs = p.getFileSystem(sc.hadoopConfiguration)
-        fs.getFileStatus(p).getLen
+        // You can send 3,500 PUT/COPY/POST/DELETE or 5,500 GET/HEAD requests per second per prefix in an S3 bucket
+        // This block catches "503 Slow Down" error, retries a request and sleeps for up to 6 (=2+4) seconds in total
+        retryWithExponentialBackoff(3) {
+          fs.getFileStatus(p).getLen
+        }
       }
     ).sum
   val startingPartitions = (totalGZSize / (100L << 20)).toInt + 1


That's cool.
I wonder if you have any idea how to retry SparkContext.textFile the same way?

Author:
Let's try these props; not sure if they are supported:

spark.hadoop.fs.s3.maxRetries=3
spark.hadoop.fs.s3.consistent.retryPolicyType=exponential
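
For what it's worth, a minimal driver-side sketch of passing these settings (the spark.hadoop. prefix is Spark's standard way of forwarding keys into the Hadoop Configuration; whether the fs.s3 keys themselves are honored by the S3 filesystem in use is exactly the open question here):

import org.apache.spark.{SparkConf, SparkContext}

// Hedged sketch: forward the (assumed) S3 retry settings through Spark.
// Keys prefixed with "spark.hadoop." are copied into sc.hadoopConfiguration.
val conf = new SparkConf()
  .setAppName("spark-druid-indexer")
  .set("spark.hadoop.fs.s3.maxRetries", "3")
  .set("spark.hadoop.fs.s3.consistent.retryPolicyType", "exponential")
val sc = new SparkContext(conf)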


For example, we could still be hit during write operations:

2021-04-06T13:23:04,926 WARN [task-result-getter-0] org.apache.spark.scheduler.TaskSetManager - Lost task 210.1 in stage 3.0 (TID 12452, 172.19.15.254, executor 122): java.lang.RuntimeException: org.apache.hadoop.fs.s3.S3Exception: org.jets3t.service.S3ServiceException: Service Error Message. -- ResponseCode: 503, ResponseStatus: Slow Down, XML Error Message: SlowDownPlease reduce your request rate.5HRTEN7YXGVNM9EF+LUSwUnkV+XIW3aJPuhz24RwuwZlhCD/8oYfHMoWq0i2qnnTyugAHr7bGb46ahZazUi0E/6z2BY=
    at com.metamx.starfire.batch.shaded.com.google.common.base.Throwables.propagate(Throwables.java:240)
    at com.metamx.hadoop.Utils.retry(Utils.java:30)
    at com.metamx.hadoop.MultipleOutputFormat.getBaseRecordWriter(MultipleOutputFormat.java:103)
    at com.metamx.hadoop.MultipleOutputFormat$2.getStore(MultipleOutputFormat.java:239)
    at com.metamx.hadoop.MultipleOutputFormat$2.write(MultipleOutputFormat.java:205)
    at com.metamx.hadoop.PipesOutputFormat$2.write(PipesOutputFormat.java:192)
    at com.metamx.hadoop.PipesOutputFormat$2.write(PipesOutputFormat.java:185)
    at com.metamx.starfire.spark.sink.MMXDeprecatedOutputFormatWrapper$RecordWriterWrapper.write(MMXDeprecatedOutputFormatWrapper.java:108)
    at org.apache.spark.SparkHadoopWriter.write(SparkHadoopWriter.scala:95)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1212)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)
    at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1341)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1218)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1197)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:99)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.fs.s3.S3Exception: org.jets3t.service.S3ServiceException: Service Error Message. -- ResponseCode: 503, ResponseStatus: Slow Down, XML Error Message: SlowDownPlease reduce your request rate.5HRTEN7YXGVNM9EF+LUSwUnkV+XIW3aJPuhz24RwuwZlhCD/8oYfHMoWq0i2qnnTyugAHr7bGb46ahZazUi0E/6z2BY=
    at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.propagateRootCause(Jets3tNativeFileSystemStore.java:386)
    at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.storeEmptyFile(Jets3tNativeFileSystemStore.java:160)
    at sun.reflect.GeneratedMethodAccessor268.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:190)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:103)
    at org.apache.hadoop.fs.s3native.$Proxy7.storeEmptyFile(Unknown Source)
    at org.apache.hadoop.fs.s3native.NativeS3FileSystem.createParent(NativeS3FileSystem.java:575)
    at org.apache.hadoop.fs.s3native.NativeS3FileSystem.delete(NativeS3FileSystem.java:396)
    at com.metamx.hadoop.MultipleOutputFormat.deleteExistingFiles(MultipleOutputFormat.java:178)
    at com.metamx.hadoop.MultipleOutputFormat.access$000(MultipleOutputFormat.java:30)
    at com.metamx.hadoop.MultipleOutputFormat$1.call(MultipleOutputFormat.java:109)
    at com.metamx.hadoop.Utils.retry(Utils.java:24)
    ... 19 more
Caused by: org.jets3t.service.S3ServiceException: Service Error Message. -- ResponseCode: 503, ResponseStatus: Slow Down, XML Error Message: SlowDownPlease reduce your request rate.5HRTEN7YXGVNM9EF+LUSwUnkV+XIW3aJPuhz24RwuwZlhCD/8oYfHMoWq0i2qnnTyugAHr7bGb46ahZazUi0E/6z2BY=
    at org.jets3t.service.S3Service.putObject(S3Service.java:2358)
    at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.storeEmptyFile(Jets3tNativeFileSystemStore.java:158)
    ... 31 more

Author:
Don't know where this one comes from, but this stack trace also goes through RetryInvocationHandler.java, so the properties I shared might help, if they are supported.
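
One hedged way to at least confirm those keys reach the job's Hadoop configuration (it does not prove the S3 filesystem actually honors them):

// Diagnostic sketch, assuming the spark.hadoop.* props above were passed to the job.
// Configuration.get returns null when a key is absent.
println(sc.hadoopConfiguration.get("fs.s3.maxRetries"))
println(sc.hadoopConfiguration.get("fs.s3.consistent.retryPolicyType"))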

24 changes: 24 additions & 0 deletions src/main/scala/io/druid/indexer/spark/package.scala
@@ -0,0 +1,24 @@
package io.druid.indexer

import scala.util.Failure
import scala.util.Random
import scala.util.Success
import scala.util.Try

package object spark {
  @annotation.tailrec
  def retryWithExponentialBackoff[T](n: Int, sleepMillis: Long)(fn: => T): T = {
    Try { fn } match {
      case Success(x) => x
      case _ if n > 1 => Thread.sleep(sleepMillis); retryWithExponentialBackoff(n - 1, 2 * sleepMillis)(fn)
      case Failure(e) => throw e
    }
  }

  /**
    * Retry on any non-fatal exception with exponential backoff starting from 1000-2000 millis of sleep
    */
  def retryWithExponentialBackoff[T](n: Int)(fn: => T): T = {
    retryWithExponentialBackoff(n, ((1 + Random.nextDouble()) * 1000).toLong)(fn)
  }
}
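
For illustration, a minimal usage sketch of the helper (the RetryExample object is hypothetical; it assumes the package object above is on the classpath). With n = 3 the total sleep is roughly 1-2 seconds after the first failure plus 2-4 seconds after the second, which is the "up to 6 seconds" mentioned in the SparkDruidIndexer comment:

import io.druid.indexer.spark.retryWithExponentialBackoff

object RetryExample extends App {
  // Simulate a call that is throttled twice and then succeeds on the third attempt.
  var attempts = 0
  val len: Long = retryWithExponentialBackoff(3) {
    attempts += 1
    if (attempts < 3) throw new RuntimeException("503 Slow Down")
    42L
  }
  println(s"succeeded after $attempts attempts, len = $len")
  // A failure on the third (last) attempt would be rethrown to the caller instead.
}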