ggml-org
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
Lines changed: 113 additions & 38 deletions
diff --git a/src/llama-batch.h b/src/llama-batch.h
Lines changed: 18 additions & 10 deletions
@@ -18,6 +18,8 @@ llama_batch_allocr::llama_batch_allocr() {
     for (auto & cur : seq_cpl) {
         cur.resize(LLAMA_MAX_SEQ);
     }
+
+    seq_idx.resize(LLAMA_MAX_SEQ, -1);
 }
 
 bool llama_batch_allocr::init(
@@ -137,22 +139,23 @@ bool llama_batch_allocr::init(
     // compute stats
     //
 
+    this->n_embd = n_embd;
+
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         n_outputs += batch.logits[i] != 0;
     }
 
-    this->n_embd = n_embd;
-
     // determine coupled sequences
     // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
+        const llama_seq_id s0 = batch.seq_id[i][0];
+
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-            seq_pos[batch.seq_id[i][s]].insert(batch.pos[i]);
+            const llama_seq_id s1 = batch.seq_id[i][s];
 
-            if (s > 0) {
-                const llama_seq_id s0 = batch.seq_id[i][0];
-                const llama_seq_id s1 = batch.seq_id[i][s];
+            seq_pos[s1].insert(batch.pos[i]);
 
+            if (s > 0) {
                 // mark that sequence s1 is coupled to s0
                 seq_cpl[s1][s0] = true;
 
@@ -162,14 +165,28 @@ bool llama_batch_allocr::init(
         }
     }
 
-    for (int32_t i = 0; i < batch.n_tokens; ++i) {
-        seq_set_t cur;
-        for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-            cur.set(batch.seq_id[i][s]);
+    {
+        seq_set_t seq_set_unq;
+
+        for (int32_t i = 0; i < batch.n_tokens; ++i) {
+            seq_set_t cur;
+            for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
+                const llama_seq_id s0 = batch.seq_id[i][s];
+
+                cur.set(s0);
+                seq_set_unq.set(s0);
+            }
+
+            seq_set.push_back(cur);
+            seq_set_map[cur].push_back(i);
         }
 
-        seq_set.push_back(cur);
-        seq_set_map[cur].push_back(i);
+        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+            if (seq_set_unq.test(s)) {
+                seq_idx[s] = seq_id_unq.size();
+                seq_id_unq.push_back(s);
+            }
+        }
     }
 
     if (debug > 0) {
@@ -180,11 +197,14 @@ bool llama_batch_allocr::init(
             /*.n_tokens     =*/ (uint32_t) batch.n_tokens,
             /*.n_seq_tokens =*/ (uint32_t) 1,
             /*.n_seqs       =*/ (uint32_t) batch.n_tokens,
+            /*.n_seqs_unq   =*/ (uint32_t) this->seq_id_unq.size(),
             /*.token        =*/ batch.token,
             /*.embd         =*/ batch.embd,
             /*.pos          =*/ batch.pos,
             /*.n_seq_id     =*/ batch.n_seq_id,
             /*.seq_id       =*/ batch.seq_id,
+            /*.seq_id_unq   =*/ this->seq_id_unq.data(),
+            /*.seq_idx      =*/ this->seq_idx.data(),
             /*.output       =*/ batch.logits,
         };
 
@@ -270,32 +290,44 @@ bool llama_batch_allocr::init(
     return true;
 }
 
-llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_tokens) {
+llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs) {
+    const uint32_t n_tokens = n_seq_tokens*n_seqs;
+
     clear();
     split_reset();
 
     ubatches.emplace_back();
 
     auto & ubatch = ubatches.back();
 
-    ubatch.token   .resize(n_tokens);
-    ubatch.embd    .clear();
-    ubatch.pos     .resize(n_tokens);
-    ubatch.n_seq_id.resize(n_tokens);
-    ubatch.seq_id  .resize(n_tokens);
-    ubatch.output  .resize(n_tokens);
+    ubatch.token     .resize(n_tokens);
+    ubatch.embd      .clear();
+    ubatch.pos       .resize(n_tokens);
+    ubatch.n_seq_id  .resize(n_tokens);
+    ubatch.seq_id    .resize(n_tokens);
+    ubatch.seq_id_unq.resize(0);
+    ubatch.seq_idx   .resize(LLAMA_MAX_SEQ, -1);
+    ubatch.output    .resize(n_tokens);
+
+    for (uint32_t s = 0; s < n_seqs; ++s) {
+        ubatch.seq_idx[s] = s;
+        ubatch.seq_id_unq.push_back(s);
+    }
 
     llama_ubatch res {
         /*.equal_seqs   =*/ true,
         /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_tokens,
-        /*.n_seqs       =*/ 1,
+        /*.n_seq_tokens =*/ n_seq_tokens,
+        /*.n_seqs       =*/ n_seqs,
+        /*.n_seqs_unq   =*/ n_seqs,
 
         /*.token        =*/ ubatch.token.data(),
         /*.embd         =*/ nullptr,
         /*.pos          =*/ ubatch.pos.data(),
         /*.n_seq_id     =*/ ubatch.n_seq_id.data(),
         /*.seq_id       =*/ ubatch.seq_id.data(),
+        /*.seq_id_unq   =*/ ubatch.seq_id_unq.data(),
+        /*.seq_idx      =*/ ubatch.seq_idx.data(),
         /*.output       =*/ ubatch.output.data(),
     };
 
@@ -489,10 +521,11 @@ void llama_batch_allocr::clear() {
 
     batch = {};
 
-    pos     .clear();
-    n_seq_id.clear();
-    seq_id  .clear();
-    output  .clear();
+    pos       .clear();
+    n_seq_id  .clear();
+    seq_id    .clear();
+    seq_id_unq.clear();
+    output    .clear();
 
     for (auto & cur : seq_pos) {
         cur.clear();
@@ -516,12 +549,16 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     auto & ubatch = ubatches.back();
 
-    ubatch.token   .resize(n_tokens);
-    ubatch.embd    .resize((int64_t) n_tokens*n_embd);
-    ubatch.pos     .resize(n_tokens);
-    ubatch.n_seq_id.resize(n_tokens);
-    ubatch.seq_id  .resize(n_tokens);
-    ubatch.output  .resize(n_tokens);
+    ubatch.token     .resize(n_tokens);
+    ubatch.embd      .resize((int64_t) n_tokens*n_embd);
+    ubatch.pos       .resize(n_tokens);
+    ubatch.n_seq_id  .resize(n_tokens);
+    ubatch.seq_id    .resize(n_tokens);
+    ubatch.seq_id_unq.resize(0);
+    ubatch.seq_idx   .resize(LLAMA_MAX_SEQ, -1);
+    ubatch.output    .resize(n_tokens);
+
+    seq_set_t seq_set_unq;
 
     for (size_t i = 0; i < idxs.size(); ++i) {
         if (batch.token) {
@@ -537,22 +574,36 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         ubatch.seq_id[i]   = batch.seq_id[idxs[i]];
         ubatch.output[i]   = batch.logits[idxs[i]];
 
+        for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
+            seq_set_unq.set(ubatch.seq_id[i][s]);
+        }
+
         if (ubatch.output[i]) {
             out_ids.push_back(idxs[i]);
         }
     }
 
+    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        if (seq_set_unq.test(s)) {
+            ubatch.seq_idx[s] = ubatch.seq_id_unq.size();
+            ubatch.seq_id_unq.push_back(s);
+        }
+    }
+
     llama_ubatch res {
         /*.equal_seqs   =*/ equal_seqs,
         /*.n_tokens     =*/ n_tokens,
         /*.n_seq_tokens =*/ n_tokens/n_seqs,
         /*.n_seqs       =*/ n_seqs,
+        /*.n_seqs_unq   =*/ (uint32_t) ubatch.seq_id_unq.size(),
 
         /*.token        =*/ batch.token ? ubatch.token.data() : nullptr,
         /*.embd         =*/ batch.embd ? ubatch.embd.data() : nullptr,
         /*.pos          =*/ ubatch.pos.data(),
         /*.n_seq_id     =*/ ubatch.n_seq_id.data(),
         /*.seq_id       =*/ ubatch.seq_id.data(),
+        /*.seq_id_unq   =*/ ubatch.seq_id_unq.data(),
+        /*.seq_idx      =*/ ubatch.seq_idx.data(),
         /*.output       =*/ ubatch.output.data(),
     };
 
@@ -571,14 +622,38 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
         LLAMA_LOG_DEBUG("%s:   n_tokens     = %d\n", __func__, ubatch.n_tokens);
         LLAMA_LOG_DEBUG("%s:   n_seq_tokens = %d\n", __func__, ubatch.n_seq_tokens);
         LLAMA_LOG_DEBUG("%s:   n_seqs       = %d\n", __func__, ubatch.n_seqs);
+        LLAMA_LOG_DEBUG("%s:   n_seqs_unq   = %d\n", __func__, ubatch.n_seqs_unq);
+
+        std::stringstream ss_seq_id_unq;
+        std::stringstream ss_seq_idx;
+
+        ss_seq_id_unq << "[ ";
+        ss_seq_idx << "[";
+
+        for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+            ss_seq_id_unq << ubatch.seq_id_unq[s] << " ";
+        }
+
+        for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+            if (ubatch.seq_idx[s] >= 0) {
+                ss_seq_idx << ubatch.seq_idx[s]%10;
+            } else {
+                ss_seq_idx << ".";
+            }
+        }
+
+        ss_seq_id_unq << "]";
+        ss_seq_idx    << "]";
 
-        LLAMA_LOG_DEBUG("%s:   token     = %p\n", __func__, (void *) ubatch.token);
-        LLAMA_LOG_DEBUG("%s:   embd      = %p\n", __func__, (void *) ubatch.embd);
-        LLAMA_LOG_DEBUG("%s:   pos       = %p\n", __func__, (void *) ubatch.pos);
-        LLAMA_LOG_DEBUG("%s:   n_seq_id  = %p\n", __func__, (void *) ubatch.n_seq_id);
-        LLAMA_LOG_DEBUG("%s:   seq_id    = %p\n", __func__, (void *) ubatch.seq_id);
-        LLAMA_LOG_DEBUG("%s:   output    = %p\n", __func__, (void *) ubatch.output);
-        LLAMA_LOG_DEBUG("%s:   n_outputs = %d\n", __func__, n_outputs);
+        LLAMA_LOG_DEBUG("%s:   token      = %p\n", __func__, (void *) ubatch.token);
+        LLAMA_LOG_DEBUG("%s:   embd       = %p\n", __func__, (void *) ubatch.embd);
+        LLAMA_LOG_DEBUG("%s:   pos        = %p\n", __func__, (void *) ubatch.pos);
+        LLAMA_LOG_DEBUG("%s:   n_seq_id   = %p\n", __func__, (void *) ubatch.n_seq_id);
+        LLAMA_LOG_DEBUG("%s:   seq_id     = %p\n", __func__, (void *) ubatch.seq_id);
+        LLAMA_LOG_DEBUG("%s:   seq_id_unq = %s\n", __func__, ss_seq_id_unq.str().c_str());
+        LLAMA_LOG_DEBUG("%s:   seq_idx    = %s\n", __func__, ss_seq_idx.str().c_str());
+        LLAMA_LOG_DEBUG("%s:   output     = %p\n", __func__, (void *) ubatch.output);
+        LLAMA_LOG_DEBUG("%s:   n_outputs  = %d\n", __func__, n_outputs);
 
         if (debug > 1) {
             int seq_id_max = 0;
 
@@ -15,15 +15,18 @@ struct llama_ubatch {
     // TODO: whole_seqs for embeddings?
 
     uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
-    uint32_t n_seq_tokens; // tokens per sequence
-    uint32_t n_seqs;
-
-    llama_token  *  token;    // [n_tokens]
-    float        *  embd;     // [n_embd, n_tokens]
-    llama_pos    *  pos;      // [n_tokens]
-    int32_t      *  n_seq_id; // [n_tokens]
-    llama_seq_id ** seq_id;   // [n_tokens]
-    int8_t       *  output;   // [n_tokens]
+    uint32_t n_seq_tokens; // tokens per sequence set
+    uint32_t n_seqs;       // sequence sets in the ubatch
+    uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
+
+    llama_token  *  token;      // [n_tokens]
+    float        *  embd;       // [n_embd, n_tokens]
+    llama_pos    *  pos;        // [n_tokens]
+    int32_t      *  n_seq_id;   // [n_tokens]
+    llama_seq_id ** seq_id;     // [n_tokens]
+    llama_seq_id *  seq_id_unq; // [n_seqs_unq]
+    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]
+    int8_t       *  output;     // [n_tokens]
 };
 
 // a helper for sanitizing, fulfilling and splitting a batch
@@ -61,7 +64,8 @@ class llama_batch_allocr {
     // sequence-wise split - each ubatch contains a single sequence
     llama_ubatch split_seq(uint32_t n_ubatch);
 
-    llama_ubatch ubatch_reserve(uint32_t n_tokens);
+    llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);
+
 private:
     void clear();
 
@@ -82,6 +86,8 @@ class llama_batch_allocr {
     std::vector<llama_pos>      pos;
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id *> seq_id;
+    std::vector<llama_seq_id>   seq_id_unq;
+    std::vector<int32_t>        seq_idx;
     std::vector<int8_t>         output;
 
     using pos_set_t = std::set<llama_pos>;
@@ -108,6 +114,8 @@ class llama_batch_allocr {
         std::vector<llama_pos>      pos;
         std::vector<int32_t>        n_seq_id;
         std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id>   seq_id_unq;
+        std::vector<int32_t>        seq_idx;
         std::vector<int8_t>         output;
     };