
Commit 6b55046

[docs] fix not-working example code in perf_infer_gpu_one.md (#36087)
* bug fix
* update memory limit
1 parent 14ca7f1 commit 6b55046


docs/source/en/perf_infer_gpu_one.md

Lines changed: 11 additions & 14 deletions
@@ -357,7 +357,7 @@ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
 model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda")

 input_text = "Hello my dog is cute and"
-inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
+inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

 with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
     outputs = model.generate(**inputs)
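
For reference, a minimal end-to-end version of the corrected snippet might look like the sketch below. The imports (including `sdpa_kernel` and `SDPBackend` from `torch.nn.attention`) come from earlier on the doc page and are not part of this hunk; a CUDA GPU and a recent PyTorch release are assumed.

```py
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda")

input_text = "Hello my dog is cute and"
# move the tokenized inputs to wherever the model actually lives instead of hard-coding "cuda"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# restrict PyTorch's scaled dot-product attention to the FlashAttention backend for this block
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
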
@@ -431,14 +431,14 @@ To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `d
 ```py
 from transformers import AutoModelForCausalLM

-model_name = "bigscience/bloom-2b5"
+model_name = "bigscience/bloom-1b7"
 model_4bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True)
 ```

-To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU:
+To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 2GB of memory to the first GPU and 5GB of memory to the second GPU:

 ```py
-max_memory_mapping = {0: "600MB", 1: "1GB"}
+max_memory_mapping = {0: "2GB", 1: "5GB"}
 model_name = "bigscience/bloom-3b"
 model_4bit = AutoModelForCausalLM.from_pretrained(
     model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping
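
As a rough sketch of how the updated per-GPU limits could be exercised and checked on a two-GPU machine (assuming `bitsandbytes` and `accelerate` are installed; the `BitsAndBytesConfig` form is used here in place of the bare `load_in_4bit` flag):

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# per-GPU caps consulted by the "auto" device map when placing the quantized weights
max_memory_mapping = {0: "2GB", 1: "5GB"}

model_4bit = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-3b",
    torch_dtype="auto",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    max_memory=max_memory_mapping,
)

# show which GPU each block of the model was assigned to
print(model_4bit.hf_device_map)
```
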
@@ -458,7 +458,7 @@ To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `d
 ```py
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig

-model_name = "bigscience/bloom-2b5"
+model_name = "bigscience/bloom-1b7"
 model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```

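A quick way to sanity-check the swap to the smaller `bigscience/bloom-1b7` checkpoint is to print the 8-bit model's memory footprint; this sketch assumes `bitsandbytes` is installed:

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_8bit = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-1b7",
    torch_dtype="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# parameters + buffers in bytes; should come out well below the fp32 size of the checkpoint
print(f"{model_8bit.get_memory_footprint() / 1e9:.2f} GB")
```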

@@ -467,20 +467,20 @@ If you're loading a model in 8-bit for text generation, you should use the [`~tr
 ```py
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

-model_name = "bigscience/bloom-2b5"
+model_name = "bigscience/bloom-1b7"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True))

 prompt = "Hello, my llama is cute"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-generated_ids = model.generate(**inputs)
+inputs = tokenizer(prompt, return_tensors="pt").to(model_8bit.device)
+generated_ids = model_8bit.generate(**inputs)
 outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 ```

-To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:
+To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 2GB of memory to the first GPU and 5GB of memory to the second GPU:

 ```py
-max_memory_mapping = {0: "1GB", 1: "2GB"}
+max_memory_mapping = {0: "2GB", 1: "5GB"}
 model_name = "bigscience/bloom-3b"
 model_8bit = AutoModelForCausalLM.from_pretrained(
     model_name, torch_dtype="auto", device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
@@ -545,11 +545,8 @@ quantization_config = BitsAndBytesConfig(
 tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
 model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config)

-# enable BetterTransformer
-model = model.to_bettertransformer()
-
 input_text = "Hello my dog is cute and"
-inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
+inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

 # enable FlashAttention
 with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
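
Putting the post-commit version of this last example together end to end, a self-contained sketch might read as follows. The concrete `BitsAndBytesConfig` fields are defined earlier on the doc page and outside this hunk, so the ones below are illustrative assumptions:

```py
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# illustrative 4-bit config; the real values come from the section above this hunk
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config
)

input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# enable FlashAttention via PyTorch SDPA; the removed to_bettertransformer() call is
# not needed here because recent transformers releases use SDPA attention by default
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```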
