Diff of /src/local_llm.py [000000] .. [014e6e]

--- /dev/null
+++ b/src/local_llm.py
@@ -0,0 +1,249 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+from typing import Optional
+import fire
+from llama import Llama
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, set_seed
+import torch
+
+def api_preload(
+    ckpt_dir: str,
+    tokenizer_path: str,
+    max_seq_len: int = 512,
+    max_batch_size: int = 8,
+):
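+    """Load a Code Llama checkpoint with the reference llama implementation."""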
+    print(">> start loading model")
+
+    generator = Llama.build(
+        ckpt_dir=ckpt_dir,
+        tokenizer_path=tokenizer_path,
+        max_seq_len=max_seq_len,
+        max_batch_size=max_batch_size,
+        model_parallel_size=1
+    )
+    print(">> model loaded")
+    return generator
+
+def api_generator(instructions,
+                  generator,
+                  temperature: float = 0.2,
+                  top_p: float = 0.95,
+                  max_gen_len: Optional[int] = None,):
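+    """Run chat completion on a batch of instruction dialogs."""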
+    results = generator.chat_completion(
+        instructions,  # type: ignore
+        max_gen_len=max_gen_len,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    return results
+
+def api_preload_hf(
+    ckpt_dir: str,
+    tokenizer_path: Optional[str] = None,
+    max_seq_len: int = 512,
+    max_batch_size: int = 8,
+):
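+    """Load a Hugging Face causal LM and wrap it in a text-generation pipeline."""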
+    print(">> start loading model")
+    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
+    model = AutoModelForCausalLM.from_pretrained(ckpt_dir)
+    model.to('cuda')
+    generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
+    print(">> model loaded")
+    return generator
+
+def api_generator_hf(instructions,
+                  generator):
+    # The text-generation pipeline takes a plain prompt string, so pass the
+    # content of the first user message of the first dialog.
+    results = generator(instructions[0][0]['content'], renormalize_logits=True, do_sample=True, use_cache=True, max_new_tokens=10)
+    return results
+
+def main(
+    ckpt_dir: str,
+    tokenizer_path: str,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    max_seq_len: int = 512,
+    max_batch_size: int = 8,
+    max_gen_len: Optional[int] = None,
+):
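+    """CLI entry point: load a Code Llama checkpoint and run a few example chat completions."""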
+    generator = Llama.build(
+        ckpt_dir=ckpt_dir,
+        tokenizer_path=tokenizer_path,
+        max_seq_len=max_seq_len,
+        max_batch_size=max_batch_size,
+    )
+
+    instructions = [
+        [
+            {
+                "role": "user",
+                "content": "In Bash, how do I list all text files in the current directory (excluding subdirectories) that have been modified in the last month?",
+            }
+        ],
+        [
+            {
+                "role": "user",
+                "content": "What is the difference between inorder and preorder traversal? Give an example in Python.",
+            }
+        ],
+        [
+            {
+                "role": "system",
+                "content": "Provide answers in JavaScript",
+            },
+            {
+                "role": "user",
+                "content": "Write a function that computes the set of sums of all contiguous sublists of a given list.",
+            }
+        ],
+    ]
+    results = generator.chat_completion(
+        instructions,  # type: ignore
+        max_gen_len=max_gen_len,
+        temperature=temperature,
+        top_p=top_p,
+    )
+
+    for instruction, result in zip(instructions, results):
+        for msg in instruction:
+            print(f"{msg['role'].capitalize()}: {msg['content']}\n")
+        print(
+            f"> {result['generation']['role'].capitalize()}: {result['generation']['content']}"
+        )
+        print("\n==================================\n")
+
+def test1():
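+    # Smoke test for the reference llama loader and generator.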
+    ckpt_dir = 'codellama-main/CodeLlama-7b-Instruct/'
+    tokenizer_path = 'codellama-main/CodeLlama-7b-Instruct/tokenizer.model'
+
+    generator = api_preload(ckpt_dir=ckpt_dir,
+                            tokenizer_path=tokenizer_path)
+    instructions = [
+        [
+            {
+                "role": "user",
+                "content": "What is the difference between inorder and preorder traversal? Give an example in Python.",
+            }
+        ],
+    ]
+    results = api_generator(instructions=instructions, generator=generator)
+    for instruction, result in zip(instructions, results):
+        for msg in instruction:
+            print(f"{msg['role'].capitalize()}: {msg['content']}\n")
+        print(
+            f"> {result['generation']['role'].capitalize()}: {result['generation']['content']}"
+        )
+        print("\n==================================\n")
+
+def test2():
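+    # Smoke test for the Hugging Face pipeline path.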
+    ckpt_dir = 'codellama-main/CodeLlama-7b-Instruct-hf/'
+    tokenizer_path = 'codellama-main/CodeLlama-7b-Instruct-hf/'
+
+    generator = api_preload_hf(ckpt_dir=ckpt_dir)
+    instructions = [
+        [
+            {
+                "role": "user",
+                "content": "What is the difference between inorder and preorder traversal? Give an example in Python.",
+            }
+        ],
+    ]
+    results = api_generator_hf(instructions=instructions, generator=generator)
+    print(results)
+    for instruction, result in zip(instructions, results):
+        for msg in instruction:
+            print(f"{msg['role'].capitalize()}: {msg['content']}\n")
+        print(
+            f"> Assistant: {result['generated_text']}"
+        )
+        print("\n==================================\n")
+
+def api_preload_deepseek(
+    ckpt_dir: str,
+    tokenizer_path: Optional[str] = None,
+    cpu: bool = False
+):
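+    """Load a DeepSeek-Coder checkpoint and tokenizer via Hugging Face transformers."""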
+    print(">> start loading model")
+    # Fall back to the checkpoint directory when no tokenizer path is given.
+    tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer_path or ckpt_dir,
+        trust_remote_code=True)
+    if cpu:
+        generator = AutoModelForCausalLM.from_pretrained(
+            ckpt_dir,
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16)
+    else:
+        if '67b' in ckpt_dir:
+            generator = AutoModelForCausalLM.from_pretrained(
+                ckpt_dir,
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16,
+                load_in_8bit=True)
+        else:
+            generator = AutoModelForCausalLM.from_pretrained(
+                ckpt_dir,
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16).cuda()
+    print(">> model loaded")
+    return tokenizer, generator
+
+def api_generator_deepseek(instructions,
+                  tokenizer,
+                  generator,
+                  max_new_tokens = 512,
+                  top_k = 50,
+                  top_p = 0.95):
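+    """Generate a reply for the first dialog and return it in the same result format as api_generator."""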
+    messages = instructions[0]
+    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(generator.device)
+    # tokenizer.eos_token_id is the id of <|EOT|> token
+    outputs = generator.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False, top_k=top_k, top_p=top_p, num_return_sequences=1,
+                             eos_token_id=tokenizer.eos_token_id)
+    content = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
+    results = [
+        {
+            'generation': {
+                'role': 'AI',
+                'content': content
+            }
+        }
+    ]
+    return results
+
+def test3():
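+    # Direct DeepSeek-Coder example without the api_* wrappers.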
+    ckpt_dir = 'deepseek/deepseek-coder-7b-instruct-v1.5'
+    tokenizer_path = 'deepseek/deepseek-coder-7b-instruct-v1.5'
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(ckpt_dir, trust_remote_code=True,
+                                                 torch_dtype=torch.bfloat16).cuda()
+    messages = [
+        {'role': 'user', 'content': "write a quick sort algorithm in python."}
+    ]
+    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
+    # tokenizer.eos_token_id is the id of <|EOT|> token
+    outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1,
+                             eos_token_id=tokenizer.eos_token_id)
+    print(tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True))
+
+
+if __name__ == "__main__":
+    import os
+    import torch.distributed as dist
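+    # Single-process distributed setup used by the fairscale-based Llama.build path (see test1).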
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = '5678'
+    dist.init_process_group(backend='nccl', init_method='env://', rank=0, world_size=1)
+    test3()
+