import os

# Restrict the script to a single GPU; set this before CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
import transformers

from wiki_reader import wiki_prompter

# Populated by load_model() and shared by the chat loop below.
model = None
tokenizer = None
generator = None
def load_model(model_name, eight_bit=0, device_map="auto"):
    """Load the ChatDoctor LLaMA weights and expose them through module globals."""
    global model, tokenizer, generator

    print("Loading " + model_name + "...")

    if device_map == "zero":
        device_map = "balanced_low_0"

    gpu_count = torch.cuda.device_count()
    print("gpu_count", gpu_count)

    # Note: older transformers releases spell these LLaMATokenizer / LLaMAForCausalLM;
    # current releases use LlamaTokenizer / LlamaForCausalLM instead.
    tokenizer = transformers.LLaMATokenizer.from_pretrained(model_name)
    model = transformers.LLaMAForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        load_in_8bit=False,  # the eight_bit argument is currently unused
        cache_dir="cache",
        # Multi-GPU sharding options left disabled in the original script:
        #   device_map=device_map,
        #   max_memory={i: "14GB" for i in range(gpu_count)},
    ).cuda()

    generator = model.generate
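
# Illustrative sketch (not part of the original script): wiki_prompter from
# wiki_reader handles prompt construction and decoding, but a direct call through
# the globals set by load_model() would look roughly like this. The prompt
# template and generation settings here are assumptions, not the project's own.
def direct_prompt(question, max_new_tokens=256):
    prompt = "Patient: " + question + "\nChatDoctor: "
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = generator(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
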
load_model("chatDoctor100k/")
First_chat = "ChatDoctor: I am ChatDoctor, what medical questions do you have?"
print(First_chat)
def go():
    """Run one chat turn: read a patient message, generate and print the reply."""
    invitation = "ChatDoctor: "
    human_invitation = "Patient: "

    # Read the patient's question from stdin.
    msg = input(human_invitation)
    print("")

    # wiki_prompter (from wiki_reader) handles prompt construction and generation.
    response = wiki_prompter(generator, tokenizer, msg)

    print()
    print(invitation + response)
    print()


# Simple REPL: keep answering questions until the process is interrupted.
while True:
    go()