
Commit df38515

LLaMA-Adapter support
1 parent e3574a1 commit df38515

3 files changed: +79, -46 lines

README.md

Lines changed: 4 additions & 3 deletions

@@ -1,9 +1,10 @@
-# LLaMa MPS fork
+# LLaMa MPS fork (llama-adapter branch)

-This is a fork of https://github.com/markasoftware/llama-cpu which is a fork of https://github.com/facebookresearch/llama. The goal of this fork is to use GPU acceleration on Apple M1/M2 devices.
+This is a fork of https://github.com/markasoftware/llama-cpu which is a fork of https://github.com/facebookresearch/llama. The goal of this fork is to use GPU acceleration on Apple M1/M2 devices.
+This branch provides support for [LLaMA-Adapter](https://github.com/ZrrSkywalker/LLaMA-Adapter).

 Please check the original repos for installation instructions. After you're done, run this
-`torchrun example.py --ckpt_dir ../7B --tokenizer_path ../tokenizer.model --max_batch_size=1` with the correct paths to the models. You might need to set the environment variable PYTORCH_ENABLE_MPS_FALLBACK=1.
+`torchrun example.py --ckpt_dir ../7B --tokenizer_path ../tokenizer.model --max_batch_size=1 --adapter_path ../llama_adapter_len10_layer30_release.pth` with the correct paths to the models. You might need to set the environment variable PYTORCH_ENABLE_MPS_FALLBACK=1.

 This fork is experimental, currently at the stage which allows running a full non-quantized model with MPS.
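For reference, PYTORCH_ENABLE_MPS_FALLBACK=1 tells PyTorch to run any operator that has no MPS kernel on the CPU instead of raising an error. It is normally exported in the shell before launching torchrun; the minimal sketch below is not part of the commit and only shows the Python equivalent, assuming the variable is set before torch is imported.

```python
# Minimal sketch (not from the repo): the fallback flag must be in the
# environment before torch is imported; it routes unsupported MPS ops to CPU.
import os
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

import torch
print(torch.backends.mps.is_available())  # True on an Apple Silicon build of PyTorch
```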

example.py

Lines changed: 44 additions & 33 deletions
@@ -33,6 +33,7 @@ def setup_model_parallel() -> Tuple[int, int]:
 def load(
     ckpt_dir: str,
     tokenizer_path: str,
+    adapter_path: str,
     local_rank: int,
     world_size: int,
     max_seq_len: int,
@@ -45,19 +46,26 @@ def load(
     ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
     ckpt_path = checkpoints[local_rank]
     print("Loading")
-    checkpoint = torch.load(ckpt_path, map_location="cpu")
+    checkpoint = torch.load(ckpt_path, torch.device("cpu"))
+    if adapter_path:
+        adapter_checkpoint = torch.load(adapter_path, torch.device("cpu"))
     with open(Path(ckpt_dir) / "params.json", "r") as f:
         params = json.loads(f.read())

     model_args: ModelArgs = ModelArgs(
         max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
     )
+    if adapter_path:
+        model_args.adapter_layer = int(adapter_checkpoint['adapter_query.weight'].shape[0] / model_args.adapter_len)
     tokenizer = Tokenizer(model_path=tokenizer_path)
     model_args.vocab_size = tokenizer.n_words
     torch.set_default_tensor_type(torch.HalfTensor)
     model = Transformer(model_args)
-    torch.set_default_tensor_type(torch.FloatTensor)
     model.load_state_dict(checkpoint, strict=False)
+    if adapter_path:
+        model.load_state_dict(adapter_checkpoint, strict=False)
+        del adapter_checkpoint
+    del checkpoint
     model = model.to('mps')
     generator = LLaMA(model, tokenizer)
     print(f"Loaded in {time.time() - start_time:.2f} seconds")
@@ -67,6 +75,7 @@ def load(
 def main(
     ckpt_dir: str,
     tokenizer_path: str,
+    adapter_path: str = None,
     temperature: float = 0.8,
     top_p: float = 0.95,
     max_seq_len: int = 512,
@@ -77,47 +86,49 @@ def main(
         sys.stdout = open(os.devnull, "w")

     generator = load(
-        ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
+        ckpt_dir, tokenizer_path, adapter_path, local_rank, world_size, max_seq_len, max_batch_size
     )

-    prompts = [
-        # For these prompts, the expected answer is the natural continuation of the prompt
-        "I believe the meaning of life is",
-        "Simply put, the theory of relativity states that ",
-        "Building a website can be done in 10 simple steps:\n",
-        # Few shot prompts: https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api
-        """Tweet: "I hate it when my phone battery dies."
-Sentiment: Negative
-###
-Tweet: "My day has been 👍"
-Sentiment: Positive
-###
-Tweet: "This is the link to the article"
-Sentiment: Neutral
-###
-Tweet: "This new music video was incredibile"
-Sentiment:""",
-        """Translate English to French:
-
-sea otter => loutre de mer
-
-peppermint => menthe poivrée
-
-plush girafe => girafe peluche
-
-cheese =>""",
+    PROMPT_DICT = {
+        "prompt_input": (
+            "Below is an instruction that describes a task, paired with an input that provides further context. "
+            "Write a response that appropriately completes the request.\n\n"
+            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
+        ),
+        "prompt_no_input": (
+            "Below is an instruction that describes a task. "
+            "Write a response that appropriately completes the request.\n\n"
+            "### Instruction:\n{instruction}\n\n### Response:"
+        ),
+    }
+
+    instructs = [
+        "Tell me about alpacas.",
+        "Tell me about the president of Mexico in 2019.",
+        "Tell me about the king of France in 2019.",
+        "List all Canadian provinces in alphabetical order.",
+        "Write a Python program that prints the first 10 Fibonacci numbers.",
+        "Write a program that prints the numbers from 1 to 100. But for multiples of three print 'Fizz' instead of the number and for the multiples of five print 'Buzz'. For numbers which are multiples of both three and five print 'FizzBuzz'.",
+        "Tell me five words that rhyme with 'shock'.",
+        "Translate the sentence 'I have no mouth but I must scream' into Spanish.",
+        "Count up from 1 to 500."
     ]
+    prompts = [PROMPT_DICT['prompt_no_input'].format_map({'instruction': x, 'input': ''}) for x in instructs]
     # results = generator.generate(
     #     prompts, max_gen_len=256, temperature=temperature, top_p=top_p
     # )
-    results = [generator.generate(
-        [prompt], max_gen_len=32, temperature=temperature, top_p=top_p
-    ) for prompt in prompts]
+    gen_start_time = time.time()
+
+    with torch.inference_mode(mode=True):
+        results = [generator.generate(
+            [prompt], max_gen_len=32, temperature=temperature, top_p=top_p
+        ) for prompt in prompts]

     for result in results:
-        print("\n==================================\n")
         print(result)
         print("\n==================================\n")
+    print(f"Generated in {time.time() - gen_start_time:.2f} seconds")


 if __name__ == "__main__":
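The new prompts follow the Alpaca-style instruction template defined in PROMPT_DICT above. A minimal sketch, reusing the same template string, of what prompt_no_input produces for one instruction:

```python
# Minimal sketch: formatting a single instruction with the Alpaca-style
# template (same text as PROMPT_DICT['prompt_no_input'] in the diff above).
prompt_no_input = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:"
)

print(prompt_no_input.format_map({"instruction": "Tell me about alpacas.", "input": ""}))
# Below is an instruction that describes a task. Write a response that
# appropriately completes the request.
#
# ### Instruction:
# Tell me about alpacas.
#
# ### Response:
```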

llama/model.py

Lines changed: 31 additions & 10 deletions
@@ -28,7 +28,9 @@ class ModelArgs:

     max_batch_size: int = 32
     max_seq_len: int = 2048
-
+
+    adapter_len: int = 10
+    adapter_layer: int = 8

 class RMSNorm(torch.nn.Module):
     def __init__(self, dim: int, eps: float = 1e-6):
@@ -65,10 +67,8 @@ def apply_rotary_emb(
     xk: torch.Tensor,
     freqs_cis: torch.Tensor,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-    xq = xq.to('cpu')
-    xk = xk.to('cpu')
-    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
-    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2).to('cpu'))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2).to('cpu'))
     freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
     xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
     xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
@@ -117,8 +117,9 @@ def __init__(self, args: ModelArgs):
         self.cache_v = torch.zeros(
             (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim)
         ).to('mps')  # .cuda()
+        self.gate = torch.nn.Parameter(torch.zeros(1))

-    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
+    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], adapter=None):
         bsz, seqlen, _ = x.shape
         xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

@@ -137,6 +138,13 @@ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask
         keys = self.cache_k[:bsz, : start_pos + seqlen]
         values = self.cache_v[:bsz, : start_pos + seqlen]

+        if adapter is not None:
+            adapter_len = adapter.shape[1]
+            adapter_k = self.wk(adapter).view(1, adapter_len, self.n_local_heads, self.head_dim).repeat(bsz, 1, 1, 1)
+            adapter_v = self.wv(adapter).view(1, adapter_len, self.n_local_heads, self.head_dim).repeat(bsz, 1, 1, 1)
+            adapter_k = adapter_k.transpose(1, 2)
+            adapter_v = adapter_v.transpose(1, 2)
+
         xq = xq.transpose(1, 2)
         keys = keys.transpose(1, 2)
         values = values.transpose(1, 2)
@@ -145,6 +153,10 @@ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask
             scores = scores + mask  # (bs, n_local_heads, slen, cache_len + slen)
         scores = F.softmax(scores.float(), dim=-1).type_as(xq)
         output = torch.matmul(scores, values)  # (bs, n_local_heads, slen, head_dim)
+        if adapter is not None:
+            adapter_scores = torch.matmul(xq, adapter_k.transpose(2, 3)) / math.sqrt(self.head_dim)
+            adapter_scores = self.gate * F.softmax(adapter_scores.float(), dim=-1).type_as(xq)
+            output = output + torch.matmul(adapter_scores, adapter_v)
         output = output.transpose(
             1, 2
         ).contiguous().view(bsz, seqlen, -1)
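These two hunks implement LLaMA-Adapter's zero-init attention: the adapter prompt is projected through the layer's existing wk/wv into extra keys and values, scored against the same queries, and added to the normal attention output scaled by self.gate, which starts at zero so an untrained adapter leaves the base model's output unchanged. Below is a minimal standalone sketch with toy dimensions (not the repo's code) of the same computation.

```python
# Minimal standalone sketch of gated adapter attention with toy shapes.
import math
import torch
import torch.nn.functional as F

bsz, n_heads, seqlen, head_dim, adapter_len = 1, 2, 5, 8, 10

xq = torch.randn(bsz, n_heads, seqlen, head_dim)              # queries from the input tokens
values = torch.randn(bsz, n_heads, seqlen, head_dim)
scores = torch.randn(bsz, n_heads, seqlen, seqlen).softmax(dim=-1)
output = torch.matmul(scores, values)                          # ordinary self-attention output

adapter_k = torch.randn(bsz, n_heads, adapter_len, head_dim)   # keys/values from the adapter prompt
adapter_v = torch.randn(bsz, n_heads, adapter_len, head_dim)
gate = torch.zeros(1)                                          # learned gate, zero-initialised

adapter_scores = torch.matmul(xq, adapter_k.transpose(2, 3)) / math.sqrt(head_dim)
adapter_scores = gate * F.softmax(adapter_scores.float(), dim=-1)
output = output + torch.matmul(adapter_scores, adapter_v)      # unchanged while gate == 0
```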
@@ -191,8 +203,8 @@ def __init__(self, layer_id: int, args: ModelArgs):
         self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
         self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

-    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
-        h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask)
+    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], adapter=None):
+        h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask, adapter)
         out = h + self.feed_forward.forward(self.ffn_norm(h))
         return out

@@ -221,20 +233,29 @@ def __init__(self, params: ModelArgs):
             self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
         )

+        self.adapter_query = nn.Embedding(params.adapter_len * params.adapter_layer, params.dim)
+        self.adapter_len = params.adapter_len
+        self.adapter_layer = params.adapter_layer
+
     @torch.inference_mode()
     def forward(self, tokens: torch.Tensor, start_pos: int):
         _bsz, seqlen = tokens.shape
         h = self.tok_embeddings(tokens)
         #self.freqs_cis = self.freqs_cis.float().to(h.device)
         freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
+        prompt = self.adapter_query.weight.reshape(self.params.adapter_layer, self.params.adapter_len, self.params.dim).unsqueeze(1)

         mask = None
         if seqlen > 1:
             mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=torch.device('cpu'))
             mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h)

-        for layer in self.layers:
-            h = layer(h, start_pos, freqs_cis, (mask.to('mps') if mask is not None else mask))
+        for layer in self.layers[: -1 * self.params.adapter_layer]:
+            h = layer(h, start_pos, freqs_cis, (mask.to('mps') if mask is not None else None))
+        layer_index = 0
+        for layer in self.layers[-1 * self.params.adapter_layer:]:
+            h = layer(h, start_pos, freqs_cis, (mask.to('mps') if mask is not None else None), prompt[layer_index])
+            layer_index = layer_index + 1
         h = self.norm(h)
         output = self.output(h[:, -1, :])  # only compute last logits
         return output.float()
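The adapter prompts for all adapted layers live in one nn.Embedding table; forward() reshapes it into an (adapter_layer, 1, adapter_len, dim) tensor and hands one slice to each of the last adapter_layer blocks, while earlier blocks run unchanged. A minimal sketch with toy sizes (not the repo's code) of that reshape and slicing:

```python
# Minimal sketch: splitting one embedding table into per-layer adapter
# prompts and applying them only to the last adapter_layer blocks.
import torch
import torch.nn as nn

dim, adapter_len, adapter_layer, n_layers = 16, 10, 3, 8

adapter_query = nn.Embedding(adapter_len * adapter_layer, dim)
# (adapter_layer, 1, adapter_len, dim): one prompt block per adapted layer,
# with a broadcastable batch dimension of 1.
prompt = adapter_query.weight.reshape(adapter_layer, adapter_len, dim).unsqueeze(1)

layers = list(range(n_layers))                 # stand-ins for TransformerBlock modules
for layer in layers[: -adapter_layer]:
    pass                                       # early layers: plain forward, no adapter
for i, layer in enumerate(layers[-adapter_layer:]):
    per_layer_prompt = prompt[i]               # shape (1, adapter_len, dim)
    # late layers: forward(..., adapter=per_layer_prompt)
```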
