-
Notifications
You must be signed in to change notification settings - Fork 12.4k
Benchmark test case for q4_0 matrix multiplication #653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 4 commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
ed5f4fe
Initial version of q4_0 matrix multiplication benchmark
SebastianApel 2877517
Merge branch 'ggerganov:master' into master
SebastianApel 3b7dcc0
Bugfix: Added dependency to ggml.o to benchmark
SebastianApel fd2f59a
Reviewer requests: added parameter for threads, switched to ggml_time…
SebastianApel 6e691af
Reviewer input: removed rtsc, use epsilon for check
SebastianApel 100dc55
Review comment: Removed set_locale
SebastianApel 5833bae
Feature: Param for number of iterations, Bugfix for use of parameter t…
SebastianApel f370a67
Reviewer suggestion: Moved to examples
SebastianApel 56c78d1
Reviewer feedback: Updated clean: and benchmark: sections
SebastianApel d21e188
Merge branch 'master' into master
ggerganov File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,258 @@ | ||
/* | ||
License: MIT License | ||
|
||
Changelog: | ||
- 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel) | ||
|
||
*/ | ||
|
||
#include <assert.h>
#include <locale.h>
#include <math.h>
#include <string.h>

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iterator>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "ggml.h"
|
||
/*
  Returns a raw tick count for coarse interval timing.

  On x86 / x86-64 with a GCC-compatible compiler this executes the RDTSC
  instruction and combines EDX:EAX into one 64-bit value. The inline assembly
  only assembles on those targets, so every other platform falls back to
  std::chrono::steady_clock expressed in nanoseconds.

  The unit of the returned value therefore differs between the two paths;
  only compare values obtained on the same machine within the same run.
*/
uint64_t rdtsc() {
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
#else
    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
    return static_cast<uint64_t>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count());
#endif
}
|
||
float tensor_sum_elements(struct ggml_tensor * tensor) { | ||
float sum = 0; | ||
if (tensor->type==6) { | ||
for (int j = 0; j < tensor->ne[1]; j++) { | ||
for (int k = 0; k < tensor->ne[0]; k++) { | ||
sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; | ||
} | ||
} | ||
} | ||
return sum; | ||
} | ||
|
||
|
||
/* | ||
The following types are reported as "UNKNOWN" by TENSOR_TYPE_AS_STR: | ||
GGML_TYPE_I8, | ||
GGML_TYPE_I16, | ||
GGML_TYPE_I32, | ||
GGML_TYPE_COUNT, | ||
*/ | ||
|
||
// Maps a ggml type value to a short display string ("FP32", "FP16", "Q4_0",
// "Q4_1"); every other value yields "UNKNOWN".
// The argument and the whole expansion are parenthesized so the macro can be
// used inside larger expressions and with arbitrary argument expressions.
// Note: TYPE may be evaluated up to four times.
#define TENSOR_TYPE_AS_STR(TYPE) ((TYPE) == GGML_TYPE_F32 ? "FP32" : (TYPE) == GGML_TYPE_F16 ? "FP16" : (TYPE) == GGML_TYPE_Q4_0 ? "Q4_0" : (TYPE) == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN")
|
||
// Prints one line describing TENSOR to stdout: the expression text, its type
// (number and display name), the first three entries of ne and nb, and the
// element sum reported by tensor_sum_elements().
// The body is wrapped in do { } while (0) so that `TENSOR_DUMP(x);` is a
// single statement, and every field is cast to the exact type its format
// specifier expects, so the call stays well-defined whatever integer widths
// the tensor struct uses.
// Note: TENSOR is evaluated several times; pass a side-effect-free expression.
#define TENSOR_DUMP(TENSOR) \
    do { \
        printf("%15s: type = %i (%5s) ne = %5lld x %5lld x %5lld, nb = (%5zu, %5zu, %5zu) - ", #TENSOR, \
            (int) (TENSOR)->type, TENSOR_TYPE_AS_STR((TENSOR)->type), \
            (long long) (TENSOR)->ne[0], (long long) (TENSOR)->ne[1], (long long) (TENSOR)->ne[2], \
            (size_t) (TENSOR)->nb[0], (size_t) (TENSOR)->nb[1], (size_t) (TENSOR)->nb[2]); \
        const float tensor_dump_sum = tensor_sum_elements(TENSOR); \
        printf("Sum of tensor %s is %6.2f\n", #TENSOR, tensor_dump_sum); \
    } while (0)
|
||
// Writes the command-line help text to stderr.
//   argv      - argument vector; argv[0] is shown as the program name
//   n_threads - value reported as the default for the --threads option
// The first parameter (argc) is accepted for call-site symmetry but unused.
void print_usage(int /*argc*/, char ** argv, const int n_threads) {
    FILE * const out = stderr;
    const char * const program_name = argv[0];

    fprintf(out, "usage: %s [options]\n", program_name);
    fputs("\n", out);
    fputs("options:\n", out);
    fputs(" -h, --help show this help message and exit\n", out);
    fprintf(out, " -t N, --threads N number of threads to use during computation (default: %d)\n", n_threads);
    fputs("\n", out);
}
|
||
int main(int argc, char ** argv) { | ||
|
||
int n_threads = 1; | ||
|
||
bool invalid_param = false; | ||
std::string arg; | ||
for (int i = 1; i < argc; i++) { | ||
arg = argv[i]; | ||
|
||
if (arg == "-t" || arg == "--threads") { | ||
if (++i >= argc) { | ||
invalid_param = true; | ||
break; | ||
} | ||
n_threads = std::stoi(argv[i]); | ||
} else if (arg == "-h" || arg == "--help") { | ||
print_usage(argc, argv, n_threads); | ||
exit(0); | ||
} | ||
if (invalid_param) { | ||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); | ||
print_usage(argc, argv, n_threads); | ||
exit(1); | ||
} | ||
} | ||
|
||
|
||
// create the ggml context | ||
printf("Starting Test\n"); | ||
|
||
|
||
|
||
struct ggml_context * ctx; | ||
//const int sizex = 4096; | ||
//const int sizey = 11008; | ||
|
||
#undef VERBOSE_DEBUGGING | ||
#ifndef VERBOSE_DEBUGGING | ||
const int sizey = 4096; | ||
const int sizex = 11008; | ||
const int sizez = 128; | ||
#else | ||
/* Working - let's increase size */ | ||
const int sizey = 1; | ||
const int sizex = (8*32); | ||
const int sizez = 1; | ||
|
||
/*const int sizey = 1; | ||
const int sizex = 3*(8*32); | ||
const int sizez = 1;*/ | ||
#endif | ||
|
||
//printf("Memsize required = %i\n", sizex*sizex); | ||
ggml_type wtype = GGML_TYPE_F32; | ||
|
||
size_t ctx_size = 0; | ||
ctx_size += sizex*sizey*ggml_type_sizef(wtype); | ||
ctx_size += sizex*sizey*ggml_type_sizef(wtype); | ||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); | ||
ctx_size += sizex*sizeof(float); | ||
ctx_size += 1024*1024*100; | ||
|
||
printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024)); | ||
|
||
struct ggml_init_params params = { | ||
/*.mem_size =*/ ctx_size, | ||
/*.mem_buffer =*/ NULL, | ||
/* no_alloc =*/ 0 | ||
}; | ||
|
||
ctx = ggml_init(params); | ||
if (!ctx) { | ||
fprintf(stderr, "%s: ggml_init() failed\n", __func__); | ||
return false; | ||
} | ||
|
||
|
||
printf("Creating new tensors\n"); | ||
// printf("Creating new tensor m1\n"); | ||
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); | ||
ggml_set_f32(m11, 1.0f); | ||
|
||
// printf("Creating new tensor m1\n"); | ||
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); | ||
ggml_set_f32(m12, 1.5f); | ||
|
||
// printf("Creating new tensor m2\n"); | ||
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); | ||
ggml_set_f32(m2, 2.0f); | ||
|
||
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); | ||
// printf("Creating new tensor m11xm2\n"); | ||
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); | ||
|
||
// printf("Creating compute graph\n"); | ||
struct ggml_cgraph gf = ggml_build_forward(m11xm2); | ||
|
||
gf.n_threads=n_threads; | ||
printf("cgraph->n_threads=%i\n",gf.n_threads); | ||
|
||
TENSOR_DUMP(m11); | ||
TENSOR_DUMP(m2); | ||
|
||
ggml_graph_compute(ctx, &gf); | ||
|
||
TENSOR_DUMP(gf.nodes[0]); | ||
|
||
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); | ||
|
||
int32_t nelements = sizex*sizey; | ||
int32_t ne[2] = { sizex, sizey }; | ||
|
||
std::vector<int64_t> hist_cur(1 << 4, 0); | ||
|
||
// Set up a the benchmark matrices | ||
// printf("Creating new tensor q11 & Running quantize\n"); | ||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); | ||
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); | ||
|
||
// Set up a the compute graph | ||
// printf("Creating new tensor q31\n"); | ||
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); | ||
|
||
// printf("Creating compute graph\n"); | ||
struct ggml_cgraph gf31 = ggml_build_forward(q31); | ||
gf31.n_threads=n_threads; | ||
|
||
// Set up a second graph computation to make sure we override the CPU cache lines | ||
// printf("Creating new tensor q12 & Running quantize\n"); | ||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); | ||
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data()); | ||
|
||
// printf("Creating new tensor q32\n"); | ||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); | ||
|
||
//printf("Creating compute graph\n"); | ||
struct ggml_cgraph gf32 = ggml_build_forward(q32); | ||
gf32.n_threads=1; | ||
printf("cgraph->n_threads=%i\n",gf31.n_threads); | ||
|
||
const int dimx = sizex; | ||
const int dimy = sizey; | ||
const int dimz = sizez; | ||
long long int flops_per_dot_product = dimy + dimy; | ||
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; | ||
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); | ||
|
||
|
||
// We cannot use the F32 result, because it will not be exactly the same (due to quantization) | ||
// float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); | ||
float sum_of_F32_reference = 11611395072.00f; | ||
slaren marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n"); | ||
printf("==============================================================================================\n"); | ||
|
||
setlocale(LC_ALL,"de_DE_UTF8"); | ||
slaren marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
for (int i=0;i<10;i++) { | ||
|
||
long long int start = ggml_time_us(); | ||
//printf("Running ggml_graph_compute\n"); | ||
ggml_graph_compute(ctx, &gf31); | ||
long long int stop = ggml_time_us(); | ||
long long int usec = stop-start; | ||
float sec = usec/1000000; | ||
float flops_per_usec = (1.0f*flops_per_matrix)/usec; | ||
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n", | ||
i, | ||
gf31.n_threads, | ||
sizex, sizey, sizez, flops_per_matrix, | ||
usec,flops_per_usec); | ||
|
||
#ifdef VERBOSE_DEBUGGING | ||
TENSOR_DUMP("res",gf31.nodes[0]) | ||
#endif | ||
|
||
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]); | ||
if (sum_of_Q4_result != sum_of_F32_reference) { | ||
printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f\n", | ||
sum_of_F32_reference, | ||
sum_of_Q4_result | ||
); | ||
exit(0); | ||
} | ||
|
||
// Running a different graph computation to make sure we override the CPU cache lines | ||
ggml_graph_compute(ctx, &gf32); | ||
|
||
} | ||
|
||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.