|
29 | 29 | *
|
30 | 30 | **************************************************************************************************/
|
31 | 31 | /***************************************
|
32 |
| -* Mixed Precision PVC Gemm Example For int4_t (RowMajor A) x (ColumnMajor B) |
| 32 | +* Mixed Precision BMG Gemm Example For int4_t (RowMajor A) x (ColumnMajor B) |
33 | 33 | *
|
34 |
| -* This example demonstrates how to dispatch a mixed precision GEMM on PVC, with optional dequantization. |
| 34 | +* This example demonstrates how to dispatch a mixed precision GEMM on BMG, with optional dequantization. |
35 | 35 | * The GemmMode enum describes the 3 modes of operation:
|
36 | 36 | *
|
37 | 37 | * Note: due to a bug in the IGC compiler, it's currently necessary to build this example with the following
|
@@ -117,15 +117,15 @@ struct Options {
|
117 | 117 | cmd.get_cmd_line_argument("warmup", warmup, 0);
|
118 | 118 | cmd.get_cmd_line_argument("flush_cache", flush_cache, 0);
|
119 | 119 | cmd.get_cmd_line_argument("cache_cnt", cache_cnt, 3);
|
120 |
| - cmd.get_cmd_line_argument("l3_cache", l3_cache, 192); |
| 120 | + cmd.get_cmd_line_argument("l3_cache", l3_cache, 32); |
121 | 121 | cmd.get_cmd_line_argument("splits", splits, 2);
|
122 | 122 | cmd.get_cmd_line_argument("splitk", splitk, true);
|
123 | 123 | }
|
124 | 124 |
|
125 | 125 | /// Prints the usage statement.
|
126 | 126 | std::ostream & print_usage(std::ostream &out) const {
|
127 | 127 |
|
128 |
| - out << "PVC int4_t StreamK GEMM Mixed Type Example\n\n" |
| 128 | + out << "BMG int4_t StreamK GEMM Mixed Type Example\n\n" |
129 | 129 | << "Options:\n\n"
|
130 | 130 | << " --help If specified, displays this usage statement\n\n"
|
131 | 131 | << " --dp If specified, uses Data Parallel decomposition\n"
|
@@ -507,17 +507,16 @@ return true;
|
507 | 507 |
|
508 | 508 | float total_time = 0.f;
|
509 | 509 | if (options.warmup >= options.iterations) {
|
510 |
| - return cutlass::Status::kErrorInternal; |
| 510 | + return cutlass::Status::kSuccess; |
511 | 511 | }
|
512 | 512 |
|
513 | 513 | double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
|
514 | 514 | double hbm = (sizeof_bits_v<ElementA> * options.m * options.k / 8 +
|
515 | 515 | sizeof_bits_v<ElementB> * options.k * options.n / 8 +
|
516 | 516 | sizeof_bits_v<ElementOutput> * options.m * options.n / 8) * 1e-9;
|
517 | 517 |
|
518 |
| - std::cout << "\nProblem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; |
519 |
| - printf("--l=%d --iterations=%d --flush_cache=%d\n", options.l, options.iterations, options.flush_cache); |
520 |
| - printf("--warmup=%d, --cache_cnt=%d, --l3_cache_size=%d\n\n", options.warmup, options.cache_cnt, l3_cache_size); |
| 518 | + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << ", splitk: " << options.splits << std::endl; |
| 519 | + // printf(" --iterations=%d --flush_cache=%d, --warmup=%d, --l3_cache_size=%dMB\n", options.iterations, options.flush_cache, options.warmup, options.l3_cache); |
521 | 520 |
|
522 | 521 | if (options.iterations > 0) {
|
523 | 522 | for (int i = 0; i < options.iterations; ++i) {
|
@@ -556,12 +555,12 @@ return true;
|
556 | 555 | total_time += ctime;
|
557 | 556 | }
|
558 | 557 |
|
559 |
| - printf("Cutlass GEMM Performance [%d]: [%4.3f]TFlop/s [%4.3f]GB/s (%6.4f)ms\n", i, tflops / ctime, hbm / ctime, ctime*1000); |
| 558 | + // printf("Cutlass GEMM Performance [%d]: [%4.3f]TFlop/s [%4.3f]GB/s (%6.4f)ms\n", i, tflops / ctime, hbm / ctime, ctime*1000); |
560 | 559 | }
|
561 | 560 |
|
562 | 561 | float cute_time = total_time / (options.iterations - options.warmup);
|
563 | 562 |
|
564 |
| - printf("Cutlass GEMM Performance average: [%4.3f]TFlop/s [%4.3f]GB/s (%6.4f)ms\n", tflops / cute_time, hbm / cute_time, cute_time*1000); |
| 563 | + // printf("Cutlass GEMM Performance average: [%4.3f]TFlop/s [%4.3f]GB/s (%6.4f)ms\n", tflops / cute_time, hbm / cute_time, cute_time*1000); |
565 | 564 | }
|
566 | 565 |
|
567 | 566 | return cutlass::Status::kSuccess;
|
|
0 commit comments