Commit 31f9cfa
pseudotensor committed
Parent(s): 1265a5f

Update with h2oGPT hash dba6431da758fe9d822c9659f144ee64ea80f111

Files changed:
- generate.py +42 -24
- stopping.py +2 -2
- utils.py +1 -1
generate.py
CHANGED
@@ -6,6 +6,7 @@ import typing
 from threading import Thread
 
 import filelock
+import psutil
 
 from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial
 
@@ -135,7 +136,19 @@ def main(
     api_open = bool(int(os.getenv('API_OPEN', api_open)))
     allow_api = bool(int(os.getenv('ALLOW_API', allow_api)))
 
-    n_gpus = torch.cuda.device_count()
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
+    if n_gpus == 0:
+        gpu_id = None
+        load_8bit = False
+        load_half = False
+        infer_devices = False
+        torch.backends.cudnn.benchmark = True
+        torch.backends.cudnn.enabled = False
+        torch.set_default_dtype(torch.float32)
+        if psutil.virtual_memory().available < 94*1024**3:
+            # 12B uses ~94GB
+            # 6.9B uses ~47GB
+            base_model = 'h2oai/h2ogpt-oig-oasst1-512-6.9b'
 
     # get defaults
     model_lower = base_model.lower()
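Two details are worth noting in this hunk. First, `torch.cuda.is_available` appears without parentheses, so the condition tests the bound method itself (always truthy) rather than calling it; the CPU fallback still works only because torch.cuda.device_count() returns 0 on machines without CUDA. A minimal sketch of the guard as presumably intended, with the call added:

    import torch

    # is_available() must be called; the bare attribute is always truthy
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0

Second, the psutil check downgrades to the 6.9B model when less than 94 GB of RAM is free: per the in-line comments, the 12B model uses about 94 GB and the 6.9B about 47 GB on CPU.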
@@ -210,7 +223,7 @@ def main(
         eval_filename = os.path.join(scoring_path, eval_filename)
 
     # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
-    context_class = NullContext() if n_gpus > 1 else torch.device("cuda")
+    context_class = NullContext() if n_gpus > 1 or n_gpus == 0 else torch.device("cuda")
 
     with context_class:
         # ensure was set right above before examples generated
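NullContext (imported from utils) serves as a no-op stand-in whenever there is no single CUDA device to set as the default, i.e. on multi-GPU or now CPU-only runs. Its definition is not shown in this commit; a minimal sketch, assuming it mirrors Python's contextlib.nullcontext:

    class NullContext:
        # no-op context manager: enter and exit do nothing
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            return False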
@@ -340,7 +353,7 @@ def get_device():
     if torch.cuda.is_available():
         device = "cuda"
     else:
-        raise RuntimeError("only cuda supported")
+        device = "cpu"
 
     return device
@@ -381,16 +394,21 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
     device_map.update(device_map_model)
     print('device_map: %s' % device_map, flush=True)
 
-    if gpu_id >= 0:
-        # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
-        # So avoid for now, just put on first GPU, unless score_model, put on last
-        if reward_type:
-            device_map = {'': n_gpus - 1}
-        else:
-            device_map = {'': min(n_gpus - 1, gpu_id)}
-    if gpu_id == -1:
-        device_map = {'': 'cuda'}
-
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
+
+    if n_gpus > 0:
+        if gpu_id >= 0:
+            # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
+            # So avoid for now, just put on first GPU, unless score_model, put on last
+            if reward_type:
+                device_map = {'': n_gpus - 1}
+            else:
+                device_map = {'': min(n_gpus - 1, gpu_id)}
+        if gpu_id == -1:
+            device_map = {'': 'cuda'}
+    else:
+        device_map = {'': 'cpu'}
+        model_kwargs['load_in_8bit'] = False
 
     load_in_8bit = model_kwargs.get('load_in_8bit', False)
     model_kwargs['device_map'] = device_map
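In the Accelerate-style device maps that Transformers accepts, the empty-string key addresses the root module, so {'': 0} pins the whole model to GPU 0 and {'': 'cpu'} keeps it on the CPU; the reward (score) model is steered to the last GPU to keep it off the main model's device. A hedged illustration of the semantics (the model id is an example, not from this commit):

    from transformers import AutoModelForCausalLM

    # '' matches the root module, so the entire model lands on one device
    model = AutoModelForCausalLM.from_pretrained(
        'gpt2',                  # illustrative model id
        device_map={'': 'cpu'},  # or {'': 0} to pin to the first GPU
    )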
@@ -483,24 +501,24 @@ def get_model(
         model = model_loader(tokenizer,
                              model=base_model,
                              device=0 if device == "cuda" else -1,
-                             torch_dtype=torch.float16)
+                             torch_dtype=torch.float16 if device == 'cuda' else torch.float32)
     else:
-        assert device == "cuda"
+        assert device in ["cuda", "cpu"], "Unsupported device %s" % device
         model_kwargs = dict(local_files_only=local_files_only,
-                            torch_dtype=torch.float16,
+                            torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                             resume_download=resume_download,
                             use_auth_token=use_auth_token)
         if 'mbart-' not in base_model.lower():
             model_kwargs.update(dict(load_in_8bit=load_8bit,
-                                     device_map={"": 0} if load_8bit else "auto",
+                                     device_map={"": 0} if load_8bit and device == 'cuda' else "auto",
                                      ))
         if 'OpenAssistant/reward-model'.lower() in base_model.lower():
             # could put on other GPUs
-            model_kwargs['device_map'] = {"": 0}
+            model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
             model_kwargs.pop('torch_dtype', None)
 
         if not lora_weights:
-            with torch.device("cuda"):
+            with torch.device(device):
                 if infer_devices:
                     model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
                                                gpu_id=gpu_id, use_auth_token=use_auth_token)
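The repeated `torch.float16 if device == 'cuda' else torch.float32` expression exists because many PyTorch CPU kernels are missing or very slow in half precision, so CPU inference stays in float32. The pattern this commit applies throughout, distilled:

    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # half precision only where GPU kernels support it
    dtype = torch.float16 if device == 'cuda' else torch.float32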
@@ -521,14 +539,14 @@ def get_model(
             model = PeftModel.from_pretrained(
                 model,
                 lora_weights,
-                torch_dtype=torch.float16,
+                torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                 local_files_only=local_files_only,
                 resume_download=resume_download,
                 use_auth_token=use_auth_token,
-                device_map={"": 0},  # seems to be required
+                device_map={"": 0} if device == 'cuda' else {"": 'cpu'},  # seems to be required
             )
         else:
-            with torch.device("cuda"):
+            with torch.device(device):
                 model = model_loader.from_pretrained(
                     base_model,
                     **model_kwargs
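PeftModel.from_pretrained layers the LoRA adapter weights on top of the already-loaded base model, and the device_map above keeps the adapter co-located with it. A hedged minimal call (the adapter id is illustrative, not from this commit):

    from peft import PeftModel

    # 'model' is the base model loaded above; adapter id is an example
    model = PeftModel.from_pretrained(model, 'user/some-lora-adapter')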
@@ -536,7 +554,7 @@ def get_model(
                 model = PeftModel.from_pretrained(
                     model,
                     lora_weights,
-                    torch_dtype=torch.float16,
+                    torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                     local_files_only=local_files_only,
                     resume_download=resume_download,
                     use_auth_token=use_auth_token,
@@ -751,7 +769,7 @@ def evaluate(
             # handle fake \n added
             stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
             # build stopper
-            stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
+            stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters, device=device)])
         else:
             stopping_criteria = StoppingCriteriaList()
 
stopping.py
CHANGED
@@ -9,11 +9,11 @@ from transformers import StoppingCriteria
 
 class StoppingCriteriaSub(StoppingCriteria):
 
-    def __init__(self, stops=[], encounters=[]):
+    def __init__(self, stops=[], encounters=[], device="cuda"):
         super().__init__()
         assert len(stops) % len(encounters) == 0, "Number of stops and encounters must match"
         self.encounters = encounters
-        self.stops = [stop.to("cuda") for stop in stops]
+        self.stops = [stop.to(device) for stop in stops]
         self.num_stops = [0] * len(stops)
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
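With device threaded through, the stop-word tensors are moved to the same device as the input_ids that generate() hands to the criteria, instead of being hardwired to CUDA. A hedged usage sketch (the token ids are made up for illustration):

    import torch
    from transformers import StoppingCriteriaList
    from stopping import StoppingCriteriaSub

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    stop_words_ids = [torch.tensor([50256])]  # example stop sequence
    stopping_criteria = StoppingCriteriaList([
        StoppingCriteriaSub(stops=stop_words_ids, encounters=[1], device=device)
    ])
    # then: model.generate(..., stopping_criteria=stopping_criteria)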
utils.py
CHANGED
@@ -46,7 +46,7 @@ def flatten_list(lis):
 
 def clear_torch_cache():
     import torch
-    if torch.cuda.is_available:
+    if torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
     gc.collect()
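The added parentheses are what actually unblock CPU machines: the bare torch.cuda.is_available is a bound method and always truthy, so the old guard called torch.cuda.empty_cache() even without CUDA. The difference is easy to demonstrate:

    import torch

    print(bool(torch.cuda.is_available))    # always True (truthy method object)
    print(bool(torch.cuda.is_available())) # True only if CUDA is usable

The same uncalled attribute still appears in the two new n_gpus lines in generate.py above, where it happens to be harmless only because torch.cuda.device_count() already returns 0 on machines without a GPU.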