@@ -467,7 +467,11 @@ def update_cache_blocks(self, task, block_size, num_computed_tokens):
467467 block_tables = task .block_tables
468468
469469 last_node , num_cached_tokens = self .cache_info [req_id ]
470- input_ids = task .prompt_token_ids + task .output_token_ids
470+ if isinstance (task .prompt_token_ids , np .ndarray ):
471+ prompt_token_ids = task .prompt_token_ids .tolist ()
472+ else :
473+ prompt_token_ids = task .prompt_token_ids
474+ input_ids = prompt_token_ids + task .output_token_ids
471475 can_cache_computed_tokens = num_computed_tokens - num_computed_tokens % block_size
472476 left_input_ids = input_ids [num_cached_tokens :can_cache_computed_tokens ]
473477 gpu_extra_block_ids = block_tables [num_cached_tokens // block_size :]
@@ -517,7 +521,11 @@ def request_match_blocks(self, task, block_size, *args):
517521 hit_info ["gpu_cache_blocks" ] = 0
518522 hit_info ["cpu_cache_blocks" ] = 0
519523 self .metrics .req_count += 1
520- input_ids = task .prompt_token_ids + task .output_token_ids
524+ if isinstance (task .prompt_token_ids , np .ndarray ):
525+ prompt_token_ids = task .prompt_token_ids .tolist ()
526+ else :
527+ prompt_token_ids = task .prompt_token_ids
528+ input_ids = prompt_token_ids + task .output_token_ids
521529 req_id = task .request_id
522530 logger .info (f"request_match_blocks: start to allocate blocks for req_id { req_id } " )
523531 input_token_num = len (input_ids )
0 commit comments