Refactor Cubic/Quad tests to make sure all threads reach barrier() (#40506)

dnfield · web-flow · commit 253076df3d9f · 2023-03-21T23:52:22.000Z
[Impeller] Refactor Cubic/Quad tests to make sure all threads reach barrier()
diff --git a/impeller/compiler/shader_lib/impeller/path.glsl b/impeller/compiler/shader_lib/impeller/path.glsl
@@ -28,9 +28,18 @@ struct CubicData {
   vec2 p2;
 };
 
-struct Position {
-  uint index;
-  uint count;
+struct QuadDecomposition {
+  float a0;
+  float a2;
+  float u0;
+  float u_scale;
+  uint line_count;
+  float steps;
+};
+
+struct PathComponent {
+  uint index;  // Location in buffer
+  uint count;  // Number of points. 4 = cubic, 3 = quad, 2 = line.
 };
 
 /// Solve for point on a quadratic Bezier curve defined by starting point `p1`,
@@ -65,4 +74,92 @@ float Cross(vec2 p1, vec2 p2) {
   return p1.x * p2.y - p1.y * p2.x;
 }
 
+QuadData GenerateQuadraticFromCubic(CubicData cubic,
+                                    uint index,
+                                    float quad_count) {
+  float t0 = index / quad_count;
+  float t1 = (index + 1) / quad_count;
+
+  // calculate the subsegment
+  vec2 sub_p1 = CubicSolve(cubic, t0);
+  vec2 sub_p2 = CubicSolve(cubic, t1);
+  QuadData quad = QuadData(3.0 * (cubic.cp1 - cubic.p1),   //
+                           3.0 * (cubic.cp2 - cubic.cp1),  //
+                           3.0 * (cubic.p2 - cubic.cp2));
+  float sub_scale = (t1 - t0) * (1.0 / 3.0);
+  vec2 sub_cp1 = sub_p1 + sub_scale * QuadraticSolve(quad, t0);
+  vec2 sub_cp2 = sub_p2 - sub_scale * QuadraticSolve(quad, t1);
+
+  vec2 quad_p1x2 = 3.0 * sub_cp1 - sub_p1;
+  vec2 quad_p2x2 = 3.0 * sub_cp2 - sub_p2;
+
+  return QuadData(sub_p1,                           //
+                  ((quad_p1x2 + quad_p2x2) / 4.0),  //
+                  sub_p2);
+}
+
+uint EstimateQuadraticCount(CubicData cubic, float accuracy) {
+  // The maximum error, as a vector from the cubic to the best approximating
+  // quadratic, is proportional to the third derivative, which is constant
+  // across the segment. Thus, the error scales down as the third power of
+  // the number of subdivisions. Our strategy then is to subdivide `t` evenly.
+  //
+  // This is an overestimate of the error because only the component
+  // perpendicular to the first derivative is important. But the simplicity is
+  // appealing.
+
+  // This magic number is the square of 36 / sqrt(3).
+  // See: http://caffeineowl.com/graphics/2d/vectorial/cubic2quad01.html
+  float max_hypot2 = 432.0 * accuracy * accuracy;
+
+  vec2 err_v = (3.0 * cubic.cp2 - cubic.p2) - (3.0 * cubic.cp1 - cubic.p1);
+  float err = dot(err_v, err_v);
+  return uint(max(1., ceil(pow(err * (1.0 / max_hypot2), 1. / 6.0))));
+}
+
+QuadDecomposition DecomposeQuad(QuadData quad, float tolerance) {
+  float sqrt_tolerance = sqrt(tolerance);
+
+  vec2 d01 = quad.cp - quad.p1;
+  vec2 d12 = quad.p2 - quad.cp;
+  vec2 dd = d01 - d12;
+  float c = Cross(quad.p2 - quad.p1, dd);
+  float x0 = dot(d01, dd) * 1. / c;
+  float x2 = dot(d12, dd) * 1. / c;
+  float scale = abs(c / (sqrt(dd.x * dd.x + dd.y * dd.y) * (x2 - x0)));
+
+  float a0 = ApproximateParabolaIntegral(x0);
+  float a2 = ApproximateParabolaIntegral(x2);
+  float val = 0.f;
+  if (isfinite(scale)) {
+    float da = abs(a2 - a0);
+    float sqrt_scale = sqrt(scale);
+    if ((x0 < 0 && x2 < 0) || (x0 >= 0 && x2 >= 0)) {
+      val = da * sqrt_scale;
+    } else {
+      // cusp case
+      float xmin = sqrt_tolerance / sqrt_scale;
+      val = sqrt_tolerance * da / ApproximateParabolaIntegral(xmin);
+    }
+  }
+  float u0 = ApproximateParabolaIntegral(a0);
+  float u2 = ApproximateParabolaIntegral(a2);
+  float u_scale = 1. / (u2 - u0);
+
+  uint line_count = uint(max(1., ceil(0.5 * val / sqrt_tolerance)) + 1.);
+  float steps = 1. / line_count;
+
+  return QuadDecomposition(a0, a2, u0, u_scale, line_count, steps);
+}
+
+vec2 GenerateLineFromQuad(QuadData quad,
+                          uint index,
+                          QuadDecomposition decomposition) {
+  float u = index * decomposition.steps;
+  float a = decomposition.a0 + (decomposition.a2 - decomposition.a0) * u;
+  float t = (ApproximateParabolaIntegral(a) - decomposition.u0) *
+            decomposition.u_scale;
+  return QuadraticSolve(quad, t);
+}
+
 #endif
diff --git a/impeller/fixtures/cubic_to_quads.comp b/impeller/fixtures/cubic_to_quads.comp
@@ -30,54 +30,24 @@ shared uint count_sums[512];
 
 void main() {
   uint ident = gl_GlobalInvocationID.x;
-  if (ident >= cubics.count) {
-    return;
+  CubicData cubic;
+  uint quad_count = 0;
+  if (ident < cubics.count) {
+    cubic = cubics.data[ident];
+    quad_count = EstimateQuadraticCount(cubic, config.accuracy);
+    quad_counts[ident] = quad_count;
   }
 
-  // The maximum error, as a vector from the cubic to the best approximating
-  // quadratic, is proportional to the third derivative, which is constant
-  // across the segment. Thus, the error scales down as the third power of
-  // the number of subdivisions. Our strategy then is to subdivide `t` evenly.
-  //
-  // This is an overestimate of the error because only the component
-  // perpendicular to the first derivative is important. But the simplicity is
-  // appealing.
-
-  // This magic number is the square of 36 / sqrt(3).
-  // See: http://caffeineowl.com/graphics/2d/vectorial/cubic2quad01.html
-  float max_hypot2 = 432.0 * config.accuracy * config.accuracy;
-
-  CubicData cubic = cubics.data[ident];
-
-  vec2 err_v = (3.0 * cubic.cp2 - cubic.p2) - (3.0 * cubic.cp1 - cubic.p1);
-  float err = dot(err_v, err_v);
-  float quad_count = max(1., ceil(pow(err * (1.0 / max_hypot2), 1. / 6.0)));
-
-  quad_counts[ident] = uint(quad_count);
-
   barrier();
+  if (quad_count == 0) {
+    return;
+  }
+
   count_sums[ident] = subgroupInclusiveAdd(quad_counts[ident]);
 
   quads.count = count_sums[cubics.count - 1];
   for (uint i = 0; i < quad_count; i++) {
-    float t0 = i / quad_count;
-    float t1 = (i + 1) / quad_count;
-
-    // calculate the subsegment
-    vec2 sub_p1 = CubicSolve(cubic, t0);
-    vec2 sub_p2 = CubicSolve(cubic, t1);
-    QuadData quad = QuadData(3.0 * (cubic.cp1 - cubic.p1),   //
-                             3.0 * (cubic.cp2 - cubic.cp1),  //
-                             3.0 * (cubic.p2 - cubic.cp2));
-    float sub_scale = (t1 - t0) * (1.0 / 3.0);
-    vec2 sub_cp1 = sub_p1 + sub_scale * QuadraticSolve(quad, t0);
-    vec2 sub_cp2 = sub_p2 - sub_scale * QuadraticSolve(quad, t1);
-
-    vec2 quad_p1x2 = 3.0 * sub_cp1 - sub_p1;
-    vec2 quad_p2x2 = 3.0 * sub_cp2 - sub_p2;
-    uint offset = count_sums[ident] - uint(quad_count);
-    quads.data[offset + i] = QuadData(sub_p1,                           //
-                                      ((quad_p1x2 + quad_p2x2) / 4.0),  //
-                                      sub_p2);
+    uint offset = count_sums[ident] - quad_count;
+    quads.data[offset + i] = GenerateQuadraticFromCubic(cubic, i, quad_count);
   }
 }
diff --git a/impeller/fixtures/quad_polyline.comp b/impeller/fixtures/quad_polyline.comp
@@ -30,60 +30,30 @@ shared uint count_sums[512];
 
 void main() {
   uint ident = gl_GlobalInvocationID.x;
-  if (ident >= quads.count) {
-    return;
+  QuadData quad;
+  QuadDecomposition decomposition;
+  if (ident < quads.count) {
+    quad = quads.data[ident];
+    decomposition = DecomposeQuad(quad, config.tolerance);
+    point_counts[ident] = uint(decomposition.line_count);
   }
 
-  QuadData quad = quads.data[ident];
-  float sqrt_tolerance = sqrt(config.tolerance);
-
-  vec2 d01 = quad.cp - quad.p1;
-  vec2 d12 = quad.p2 - quad.cp;
-  vec2 dd = d01 - d12;
-  float c = Cross(quad.p2 - quad.p1, dd);
-  float x0 = dot(d01, dd) * 1. / c;
-  float x2 = dot(d12, dd) * 1. / c;
-  float scale = abs(c / (sqrt(dd.x * dd.x + dd.y * dd.y) * (x2 - x0)));
+  barrier();
 
-  float a0 = ApproximateParabolaIntegral(x0);
-  float a2 = ApproximateParabolaIntegral(x2);
-  float val = 0.f;
-  if (isfinite(scale)) {
-    float da = abs(a2 - a0);
-    float sqrt_scale = sqrt(scale);
-    if ((x0 < 0 && x2 < 0) || (x0 >= 0 && x2 >= 0)) {
-      val = da * sqrt_scale;
-    } else {
-      // cusp case
-      float xmin = sqrt_tolerance / sqrt_scale;
-      val = sqrt_tolerance * da / ApproximateParabolaIntegral(xmin);
-    }
+  if (decomposition.line_count == 0) {
+    return;
   }
-  float u0 = ApproximateParabolaIntegral(a0);
-  float u2 = ApproximateParabolaIntegral(a2);
-  float u_scale = 1. / (u2 - u0);
-
-  float line_count = max(1., ceil(0.5 * val / sqrt_tolerance)) + 1.;
-  float steps = 1. / line_count;
-
-  point_counts[ident] = uint(line_count);
-
-  barrier();
   count_sums[ident] = subgroupInclusiveAdd(point_counts[ident]);
-  barrier();
 
   polyline.count = count_sums[quads.count - 1] + 1;
   polyline.data[0] = quads.data[0].p1;
 
   // In theory this could be unrolled into a separate shader, but in practice
   // line_count usually pretty low and currently lack benchmark data to show
   // how much it would even help.
-  for (uint i = 1; i < line_count; i += 1) {
-    float u = i * steps;
-    float a = a0 + (a2 - a0) * u;
-    float t = (ApproximateParabolaIntegral(a) - u0) * u_scale;
-    uint offset = count_sums[ident] - uint(line_count);
-    polyline.data[offset + i] = QuadraticSolve(quad, t);
+  for (uint i = 1; i < decomposition.line_count; i += 1) {
+    uint offset = count_sums[ident] - uint(decomposition.line_count);
+    polyline.data[offset + i] = GenerateLineFromQuad(quad, i, decomposition);
   }
   polyline.data[count_sums[ident]] = quad.p2;
 }