@@ -50,49 +50,49 @@ static const size_t MB = 1024*1024;
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   512ull * MB },
         { MODEL_13B,  512ull * MB },
         { MODEL_30B,  512ull * MB },
         { MODEL_65B, 1024ull * MB },
     };
-    return _MEM_REQ_SCRATCH0;
+    return k_sizes;
 }
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   512ull * MB },
         { MODEL_13B,  512ull * MB },
         { MODEL_30B,  512ull * MB },
         { MODEL_65B, 1024ull * MB },
     };
-    return _MEM_REQ_SCRATCH1;
+    return k_sizes;
 }
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,  1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
         { MODEL_65B, 5120ull * MB },
     };
-    return _MEM_REQ_KV_SELF;
+    return k_sizes;
 }
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
-    return _MEM_REQ_EVAL;
+    return k_sizes;
 }
 
 // default hparams (LLaMA 7B)
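Each of these helpers returns a reference to a function-local static map, so the local no longer needs to mirror the function name and can share the generic name k_sizes. A minimal sketch of that pattern, with hypothetical names (e_model_example, scratch_sizes):

    // A function-local static is constructed on first call, so callers get a
    // lazily initialized, long-lived table without a namespace-scope definition.
    #include <cstddef>
    #include <map>

    enum e_model_example { MODEL_SMALL, MODEL_LARGE };

    static const std::map<e_model_example, size_t> & scratch_sizes() {
        static std::map<e_model_example, size_t> k_sizes = {
            { MODEL_SMALL,  512ull * 1024 * 1024 },
            { MODEL_LARGE, 1024ull * 1024 * 1024 },
        };
        return k_sizes; // the reference stays valid for the program's lifetime
    }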
@@ -586,12 +586,12 @@ struct llama_model_loader {
     std::unique_ptr<llama_mmap> mapping;
 
     llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw format("llama.cpp: hparams inconsistent between files");
@@ -638,7 +638,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -667,7 +667,7 @@ struct llama_model_loader {
         return tensor;
     }
 
-    void done_getting_tensors() {
+    void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::string("llama.cpp: file contained more tensors than expected");
         }
@@ -934,7 +934,8 @@ static void llama_model_load_internal(
 
     auto & ctx = model.ctx;
 
-    size_t ctx_size, mmapped_size;
+    size_t ctx_size;
+    size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
     fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
 
@@ -1074,7 +1075,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
    const auto & hparams = model.hparams;
 
-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1318,7 +1319,7 @@ static bool llama_eval_internal(
     }
 
     // extract embeddings
-    if (lctx.embedding.size()) {
+    if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
@@ -1369,6 +1370,8 @@ struct llama_sp_symbol {
     size_t n;
 };
 
+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
     struct comparator {
         bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1401,7 +1404,7 @@ struct llama_tokenizer {
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
-            symbols_.emplace_back(std::move(sym));
+            symbols_.emplace_back(sym);
         }
 
         // seed the work queue with all possible 2-character tokens.
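The static_assert added above and the dropped std::move go together: llama_sp_symbol is a plain aggregate, so moving it is the same operation as copying it, and the std::move only obscured that. A small sketch of the reasoning, using an assumed look-alike struct (sp_symbol_like is illustrative, not the real definition):

    #include <cstddef>
    #include <type_traits>

    struct sp_symbol_like {
        int          prev;
        int          next;
        const char * text;
        size_t       n;
    };

    // For a trivially copyable type, a "move" degenerates to a copy, so
    // emplace_back(sym) and emplace_back(std::move(sym)) produce identical code.
    static_assert(std::is_trivially_copyable<sp_symbol_like>::value,
                  "copy and move are equivalent for this type");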
@@ -1492,7 +1495,7 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
-    if (text.size() == 0) {
+    if (text.empty()) {
         return output;
     }
 
@@ -1728,7 +1731,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
     const int64_t t_start_sample_us = ggml_time_us();
 
     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
         if (token_iter == last_tokens + last_tokens_size) {
             continue;
         }
@@ -1872,7 +1875,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     const int64_t t_start_sample_us = ggml_time_us();
 
     // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit < b.logit;
     });
 
@@ -1925,7 +1928,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
 
@@ -1979,7 +1982,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (tensor.type == GGML_TYPE_F16) {
             f32_conv_buf.resize(nelements * sizeof(float));
             f32_data = (float *) f32_conv_buf.addr;
-            auto f16_data = (const ggml_fp16_t *) tensor.data;
+            const auto * f16_data = (const ggml_fp16_t *) tensor.data;
             for (size_t i = 0; i < nelements; i++) {
                 f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
             }
@@ -2010,21 +2013,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                    size_t first = counter; counter += chunk_size;
                    if (first >= nelements) {
                        if (!local_hist.empty()) {
-                           for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                           for (int j=0; j<int(local_hist.size()); ++j) {
+                               hist_cur[j] += local_hist[j];
+                           }
                            new_size += local_size;
                        }
                        break;
                    }
                    lock.unlock();
                    size_t last = std::min(nelements, first + chunk_size);
-                   if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                   if (local_hist.empty()) {
+                       local_hist.resize(hist_cur.size(), 0);
+                   }
                    local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                }
            };
-           if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
-           for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+           if ((int) workers.size() < nthread_use - 1) {
+               workers.resize(nthread_use - 1);
+           }
+           for (int it = 0; it < nthread_use - 1; ++it) {
+               workers[it] = std::thread(compute);
+           }
            compute();
-           for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+           for (int it = 0; it < nthread_use - 1; ++it) {
+               workers[it].join();
+           }
        }
 
        printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -2222,7 +2235,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
 
-        size_t ctx_size, mmapped_size;
+        size_t ctx_size;
+        size_t mmapped_size;
         model_loader->calc_sizes(&ctx_size, &mmapped_size);
         base_buf.resize(ctx_size);
 
@@ -2261,8 +2275,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
         }
 
-        std::string name(length, 0);
-        fin.read(&name[0], length);
+        std::string name;
+        {
+            char buf[1024];
+            fin.read(buf, length);
+            name = std::string(buf, length);
+        }
 
         // check for lora suffix and get the type of tensor
         const std::string lora_suffix = ".lora";
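The tensor name in a LoRA adapter record is stored as raw bytes whose length is read from the file a few lines earlier (not shown in this hunk). In isolation, the new read looks roughly like the sketch below (hypothetical helper and stream; it keeps the patch's assumption that names fit in a 1024-byte buffer):

    #include <cstdint>
    #include <fstream>
    #include <string>

    static std::string read_name(std::ifstream & fin, uint32_t length) {
        char buf[1024];            // assumes length <= sizeof(buf), as the patch does
        fin.read(buf, length);
        return std::string(buf, length);
    }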
@@ -2277,7 +2295,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         base_name.erase(pos);
         // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
 
-        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+        if (model_tensors.find(base_name) == model_tensors.end()) {
             fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
             return 1;
         }
@@ -2379,8 +2397,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.clear();
 
             n_tensors++;
-            if (n_tensors % 4 == 0)
+            if (n_tensors % 4 == 0) {
                 fprintf(stderr, ".");
+            }
         }
     }
 
@@ -2409,7 +2428,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)
 
 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
     if (seed < 0) {
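Parenthesizing the macro body matters because macro expansion is textual and operator precedence is applied only afterwards. A worked example (the RNG_STATE_* names are illustrative only):

    // Unparenthesized, the modulo binds to 64 alone; parenthesized, it applies
    // to the whole constant.
    #define RNG_STATE_UNPAREN 64*1024
    #define RNG_STATE_PAREN   (64*1024)

    static_assert(100000 % RNG_STATE_UNPAREN == 32768,
                  "expands to (100000 % 64) * 1024");
    static_assert(100000 % RNG_STATE_PAREN == 34464,
                  "expands to 100000 % (64*1024)");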
@@ -2668,7 +2687,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
         const uint32_t magic = file.read_u32();
         const uint32_t version = file.read_u32();
 
-        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
             fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
             return false;
         }