Diff of /src/local_llm.py [000000] .. [014e6e]


b/src/local_llm.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
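"""Thin wrappers for loading and querying local code LLMs.

Three back ends are wrapped below: Code Llama through Meta's `llama` package
(`api_preload` / `api_generator`), a Hugging Face `transformers` pipeline for
the -hf checkpoints (`api_preload_hf` / `api_generator_hf`), and DeepSeek Coder
through `transformers` (`api_preload_deepseek` / `api_generator_deepseek`).
The `main` and `test*` functions are ad-hoc smoke tests.
"""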

from typing import Optional
import fire
from llama import Llama
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, set_seed
import torch

def api_preload(
    ckpt_dir: str,
    tokenizer_path: str,
    max_seq_len: int = 512,
    max_batch_size: int = 8,
):
    """Load a Code Llama checkpoint via Meta's `llama` package (single model-parallel rank)."""
    print(">> start loading model")

    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
        model_parallel_size=1,
    )
    print(">> model loaded")
    return generator

def api_generator(instructions,
                  generator,
                  temperature: float = 0.2,
                  top_p: float = 0.95,
                  max_gen_len: Optional[int] = None):
    """Run chat completion on a list of dialogs and return the raw results."""
    results = generator.chat_completion(
        instructions,  # type: ignore
        max_gen_len=max_gen_len,
        temperature=temperature,
        top_p=top_p,
    )
    return results

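# Example usage for the Llama path (a sketch mirroring `test1` / `main` below):
# `instructions` is a list of dialogs, each dialog a list of {"role", "content"}
# messages, and each result carries result['generation']['role' / 'content'].
#
#   generator = api_preload(ckpt_dir, tokenizer_path)
#   results = api_generator([[{"role": "user", "content": "Hello"}]], generator)
#   print(results[0]['generation']['content'])
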
def api_preload_hf(
    ckpt_dir: str,
    tokenizer_path: Optional[str] = None,
    max_seq_len: int = 512,
    max_batch_size: int = 8,
):
    """Load a Hugging Face -hf checkpoint as a text-generation pipeline.

    The tokenizer is loaded from ``ckpt_dir``; ``tokenizer_path``, ``max_seq_len`` and
    ``max_batch_size`` are accepted for interface parity but not used here.
    """
    print(">> start loading model")
    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(ckpt_dir)
    model.to('cuda')
    generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
    print(">> model loaded")
    return generator

def api_generator_hf(instructions,
                     generator):
    """Generate with the HF pipeline for the first message of the first dialog."""
    raw = generator(instructions[0][0]['content'], renormalize_logits=True, do_sample=True, use_cache=True, max_new_tokens=10)
    # The pipeline returns [{'generated_text': ...}]; re-wrap it so callers (e.g. test2)
    # can read result['generation']['content'] like the other api_generator_* helpers.
    results = [{'generation': {'role': 'assistant', 'content': raw[0]['generated_text']}}]
    return results

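# Example usage for the Hugging Face path (a sketch mirroring `test2` below):
#
#   generator = api_preload_hf('codellama-main/CodeLlama-7b-Instruct-hf/')
#   results = api_generator_hf([[{"role": "user", "content": "Hello"}]], generator)
#   print(results[0]['generation']['content'])
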
def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.2,
    top_p: float = 0.95,
    max_seq_len: int = 512,
    max_batch_size: int = 8,
    max_gen_len: Optional[int] = None,
):
    """Run three sample dialogs through a Code Llama checkpoint and print the completions."""
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )

    instructions = [
        [
            {
                "role": "user",
                "content": "In Bash, how do I list all text files in the current directory (excluding subdirectories) that have been modified in the last month?",
            }
        ],
        [
            {
                "role": "user",
                "content": "What is the difference between inorder and preorder traversal? Give an example in Python.",
            }
        ],
        [
            {
                "role": "system",
                "content": "Provide answers in JavaScript",
            },
            {
                "role": "user",
                "content": "Write a function that computes the set of sums of all contiguous sublists of a given list.",
            }
        ],
    ]
    results = generator.chat_completion(
        instructions,  # type: ignore
        max_gen_len=max_gen_len,
        temperature=temperature,
        top_p=top_p,
    )

    for instruction, result in zip(instructions, results):
        for msg in instruction:
            print(f"{msg['role'].capitalize()}: {msg['content']}\n")
        print(
            f"> {result['generation']['role'].capitalize()}: {result['generation']['content']}"
        )
        print("\n==================================\n")

def test1():
    ckpt_dir = 'codellama-main/CodeLlama-7b-Instruct/'
    tokenizer_path = 'codellama-main/CodeLlama-7b-Instruct/tokenizer.model'

    generator = api_preload(ckpt_dir=ckpt_dir,
                            tokenizer_path=tokenizer_path)
    instructions = [
        [
            {
                "role": "user",
                "content": "What is the difference between inorder and preorder traversal? Give an example in Python.",
            }
        ],
    ]
    results = api_generator(instructions=instructions, generator=generator)
    for instruction, result in zip(instructions, results):
        for msg in instruction:
            print(f"{msg['role'].capitalize()}: {msg['content']}\n")
        print(
            f"> {result['generation']['role'].capitalize()}: {result['generation']['content']}"
        )
        print("\n==================================\n")

def test2():
    ckpt_dir = 'codellama-main/CodeLlama-7b-Instruct-hf/'
    tokenizer_path = 'codellama-main/CodeLlama-7b-Instruct-hf/'

    generator = api_preload_hf(ckpt_dir=ckpt_dir)
    instructions = [
        [
            {
                "role": "user",
                "content": "What is the difference between inorder and preorder traversal? Give an example in Python.",
            }
        ],
    ]
    results = api_generator_hf(instructions=instructions, generator=generator)
    print(results)
    for instruction, result in zip(instructions, results):
        for msg in instruction:
            print(f"{msg['role'].capitalize()}: {msg['content']}\n")
        print(
            f"> {result['generation']['role'].capitalize()}: {result['generation']['content']}"
        )
        print("\n==================================\n")

def api_preload_deepseek(
    ckpt_dir: str,
    tokenizer_path: Optional[str] = None,
    cpu: bool = False,
):
    """Load a DeepSeek Coder checkpoint and its tokenizer via transformers."""
    print(">> start loading model")
    # Fall back to the checkpoint directory when no separate tokenizer path is given.
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path or ckpt_dir,
        trust_remote_code=True)
    if cpu:
        generator = AutoModelForCausalLM.from_pretrained(
            ckpt_dir,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16)
    else:
        if '67b' in ckpt_dir:
            # Load the 67B variant in 8-bit to reduce GPU memory.
            generator = AutoModelForCausalLM.from_pretrained(
                ckpt_dir,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                load_in_8bit=True)
        else:
            generator = AutoModelForCausalLM.from_pretrained(
                ckpt_dir,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16).cuda()
    print(">> model loaded")
    return tokenizer, generator

def api_generator_deepseek(instructions,
                           tokenizer,
                           generator,
                           max_new_tokens=512,
                           top_k=50,
                           top_p=0.95):
    """Generate a reply for the first dialog in `instructions` with a DeepSeek model."""
    messages = instructions[0]
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(generator.device)
    # tokenizer.eos_token_id is the id of <|EOT|> token
    outputs = generator.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False, top_k=top_k, top_p=top_p, num_return_sequences=1,
                                 eos_token_id=tokenizer.eos_token_id)
    content = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    results = [
        {
            'generation': {
                'role': 'AI',
                'content': content,
            }
        }
    ]
    return results

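# Example usage for the DeepSeek path (a sketch mirroring `test3` below):
#
#   tokenizer, model = api_preload_deepseek('deepseek/deepseek-coder-7b-instruct-v1.5')
#   results = api_generator_deepseek([[{'role': 'user', 'content': 'Hello'}]], tokenizer, model)
#   print(results[0]['generation']['content'])
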
def test3():
    ckpt_dir = 'deepseek/deepseek-coder-7b-instruct-v1.5'
    tokenizer_path = 'deepseek/deepseek-coder-7b-instruct-v1.5'

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(ckpt_dir, trust_remote_code=True,
                                                 torch_dtype=torch.bfloat16).cuda()
    messages = [
        {'role': 'user', 'content': "write a quick sort algorithm in python."}
    ]
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    # tokenizer.eos_token_id is the id of <|EOT|> token
    outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1,
                             eos_token_id=tokenizer.eos_token_id)
    print(tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True))


if __name__ == "__main__":
    import os
    import torch.distributed as dist
    # Single-process torch.distributed setup so the Llama-based helpers can run
    # without torchrun; test3() uses the transformers path and does not need it.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '5678'
    dist.init_process_group(backend='nccl', init_method='env://', rank=0, world_size=1)
    test3()