-
Notifications
You must be signed in to change notification settings - Fork 670
Description
Hey,
I'm trying to use "meta-llama/Llama-Guard-4-12B" from Huggingface and I'm encountering the following error:
"Llama4ForCausalLM has no _prepare_4d_causal_attention_mask_with_cache_position method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub."
Code:
# Load the Llama-Guard-4 text-safety model and its tokenizer from the Hub.
# NOTE(review): `device_guard` and the imports (AutoTokenizer,
# Llama4ForConditionalGeneration, torch) are defined elsewhere — not visible here.
lg_small_text_model_id = "meta-llama/Llama-Guard-4-12B"
lg_small_text_tokenizer = AutoTokenizer.from_pretrained(lg_small_text_model_id)
# bfloat16 halves memory vs fp32; the model is moved to `device_guard` eagerly.
lg_small_text_model = Llama4ForConditionalGeneration.from_pretrained(lg_small_text_model_id, torch_dtype=torch.bfloat16).to(device_guard)
def llama_guard_text_test(tokenizer, model, prompt, categories: dict[str, str] | None = None, excluded_category_keys: list[str] | None = None, device: str = "cuda:2"):
    """Run a Llama-Guard safety check on a chat transcript.

    Args:
        tokenizer: HF tokenizer whose chat template accepts Llama-Guard
            kwargs (``categories``, ``excluded_category_keys``).
        model: the Llama-Guard generation model.
        prompt: chat messages (list of role/content dicts) to classify.
        categories: optional custom category map; when ``None`` the
            template's default categories are used.
        excluded_category_keys: category keys to drop from the check.
        device: device the input ids are moved to (default ``"cuda:2"``,
            preserving the original behavior; pass e.g. ``model.device``).

    Returns:
        Tuple ``(input_prompt, response)`` — the rendered prompt text and
        the decoded model verdict (special tokens kept in both).
    """
    # Avoid the mutable-default-argument pitfall: a shared [] would leak
    # state between calls.
    if excluded_category_keys is None:
        excluded_category_keys = []

    # Build the template kwargs once instead of duplicating the call;
    # `categories` is only forwarded when explicitly provided.
    template_kwargs = {
        "return_tensors": "pt",
        "excluded_category_keys": excluded_category_keys,
    }
    if categories is not None:
        template_kwargs["categories"] = categories
    input_ids = tokenizer.apply_chat_template(prompt, **template_kwargs).to(device)

    input_prompt = tokenizer.decode(input_ids[0], skip_special_tokens=False)
    prompt_len = input_ids.shape[1]

    # Greedy decoding: the verdict ("safe"/"unsafe" + category) is short,
    # so 20 new tokens suffice.
    output = model.generate(
        input_ids=input_ids,
        max_new_tokens=20,
        output_scores=True,
        return_dict_in_generate=True,
        do_sample=False,
        pad_token_id=0,
    )

    # Strip the prompt tokens; decode only what the model generated.
    generated_tokens = output.sequences[:, prompt_len:]
    response = tokenizer.decode(
        generated_tokens[0], skip_special_tokens=False
    )
    return input_prompt, response
# Two-turn transcript in the HF chat format expected by apply_chat_template.
# NOTE(review): `prompt` and `model_answer` are not defined in this snippet —
# presumably set earlier in the reporter's script; verify before running.
conversation = [
{
"role": "user",
"content": prompt,
},
{
"role": "assistant",
"content": model_answer,
},
]
output = llama_guard_text_test_with_confidence(lg_small_text_tokenizer, lg_small_text_model, conversation)