Spaces: Running

pseudotensor committed
Commit 5b1d132 • 1 Parent(s): 6dd6b04

Update with h2oGPT hash 2cf0e36c0a86f41add0929b2c9217bfe480ffb58

Files changed:
- generate.py +67 -18
- gradio_runner.py +13 -11
- utils.py +2 -0
generate.py CHANGED

@@ -36,11 +36,11 @@ eval_extra_columns = ['prompt', 'response', 'score']
 def main(
         load_8bit: bool = False,
         load_half: bool = True,
-        infer_devices: bool = True,
+        infer_devices: bool = True,
         base_model: str = '',
         tokenizer_base_model: str = '',
         lora_weights: str = "",
-        gpu_id: int = 0,
+        gpu_id: int = 0,
 
         prompt_type: Union[int, str] = None,
         # input to generation
@@ -61,7 +61,7 @@ def main(
         share: bool = True,
         local_files_only: bool = False,
         resume_download: bool = True,
-        use_auth_token: Union[str, bool] = False,
+        use_auth_token: Union[str, bool] = False,
 
         src_lang: str = "English",
         tgt_lang: str = "Russian",
@@ -69,20 +69,18 @@ def main(
         gradio: bool = True,
         gradio_avoid_processing_markdown: bool = False,
         chat: bool = True,
-        chat_history: int = 4096,
-        chat_context: bool = False,
+        chat_history: int = 4096,
+        chat_context: bool = False,
         stream_output: bool = True,
         show_examples: bool = None,
         verbose: bool = False,
         h2ocolors: bool = True,
         height: int = 400,
         show_lora: bool = True,
-        # set to True to load --base_model after client logs in,
-        # to be able to free GPU memory when model is swapped
         login_mode_if_model0: bool = False,
         block_gradio_exit: bool = True,
         concurrency_count: int = 1,
-        api_open: bool = False,
+        api_open: bool = False,
         allow_api: bool = True,
         input_lines: int = 1,
 
@@ -98,9 +96,64 @@ def main(
         eval_sharegpt_prompts_only: int = 0,
         eval_sharegpt_prompts_only_seed: int = 1234,
         eval_sharegpt_as_output: bool = False,
-
-        hard_stop_list: typing.List[str] = [],
 ):
+    """
+
+    :param load_8bit: load model in 8-bit using bitsandbytes
+    :param load_half: load model in float16
+    :param infer_devices: whether to control devices with gpu_id. If False, then spread across GPUs
+    :param base_model: model HF-type name
+    :param tokenizer_base_model: tokenizer HF-type name
+    :param lora_weights: LORA weights path/HF link
+    :param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
+    :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
+    :param temperature: generation temperature
+    :param top_p: generation top_p
+    :param top_k: generation top_k
+    :param num_beams: generation number of beams
+    :param repetition_penalty: generation repetition penalty
+    :param num_return_sequences: generation number of sequences (1 forced for chat)
+    :param do_sample: generation sample
+    :param max_new_tokens: generation max new tokens
+    :param min_new_tokens: generation min tokens
+    :param early_stopping: generation early stopping
+    :param max_time: maximum time to allow for generation
+    :param debug: enable debug mode
+    :param save_dir: directory chat data is saved to
+    :param share: whether to share the gradio app with sharable URL
+    :param local_files_only: whether to only use local files instead of doing to HF for models
+    :param resume_download: whether to resume downloads from HF for models
+    :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
+    :param src_lang: source languages to include if doing translation (None = all)
+    :param tgt_lang: target languages to include if doing translation (None = all)
+    :param gradio: whether to enable gradio, or to enable benchmark mode
+    :param gradio_avoid_processing_markdown:
+    :param chat: whether to enable chat mode with chat history
+    :param chat_history: maximum character length of chat context/history
+    :param chat_context: whether to use extra helpful context if human_bot
+    :param stream_output: whether to stream output from generate
+    :param show_examples: whether to show clickable examples in gradio
+    :param verbose: whether to show verbose prints
+    :param h2ocolors: whether to use H2O.ai theme
+    :param height: height of chat window
+    :param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
+    :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
+    :param block_gradio_exit: whether to block gradio exit (used for testing)
+    :param concurrency_count: gradio concurrency count (1 is optimal for LLMs)
+    :param api_open: If False, don't let API calls skip gradio queue
+    :param allow_api: whether to allow API calls at all to gradio server
+    :param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
+    :param sanitize_user_prompt: whether to remove profanity from user input
+    :param sanitize_bot_response: whether to remove profanity and repeat lines from bot output
+    :param extra_model_options: extra models to show in list in gradio
+    :param extra_lora_options: extra LORA to show in list in gradio
+    :param score_model: which model to score responses (None means no scoring)
+    :param auto_score: whether to automatically score responses
+    :param eval_sharegpt_prompts_only: for no gradio benchmark, if using ShareGPT prompts for eval
+    :param eval_sharegpt_prompts_only_seed: for no gradio benchmark, if seed for ShareGPT sampling
+    :param eval_sharegpt_as_output: for no gradio benchmark, whether to test ShareGPT output itself
+    :return:
+    """
     is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
     is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
     is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
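The docstring added above documents every keyword argument of main(); since generate.py ends with fire.Fire(main) (see the last hunk below), each of these arguments is also exposed as a --flag on the command line. As a minimal, hypothetical sketch (not part of this commit), a programmatic call with a few of the documented options would look like:

# Hypothetical usage sketch: equivalent in spirit to
#   python generate.py --base_model='EleutherAI/gpt-j-6B' --load_8bit=True --api_open=False --share=False
from generate import main  # assumes generate.py is on the import path

main(base_model='EleutherAI/gpt-j-6B',  # HF model name, per :param base_model
     load_8bit=True,                    # 8-bit load via bitsandbytes
     api_open=False,                    # API calls cannot skip the gradio queue
     share=False)                       # no public gradio share link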
@@ -652,7 +705,6 @@ def evaluate(
         debug=False,
         concurrency_count=None,
         save_dir=None,
-        hard_stop_list=None,
         sanitize_bot_response=True,
         model_state0=None,
         is_low_mem=None,
@@ -714,10 +766,6 @@ def evaluate(
     prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
     prompt = prompter.generate_prompt(data_point)
 
-    if hard_stop_list is None:
-        # acts like undo on user entry and bot response
-        hard_stop_list = []
-
     if isinstance(tokenizer, str):
         # pipeline
         if tokenizer == "summarization":
@@ -1219,7 +1267,9 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
 
 
 if __name__ == "__main__":
-
+    """
+    Examples:
+
     WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
     python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
     python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
@@ -1245,6 +1295,5 @@ if __name__ == "__main__":
     python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
 
     python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
-
-    """, flush=True)
+    """
     fire.Fire(main)
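The api_open, allow_api and concurrency_count options documented in the new docstring are the kind of settings that usually end up on gradio's queue and launch calls. The sketch below only illustrates that wiring under an assumed gradio 3.x API; it is not the actual gradio_runner.py code:

import gradio as gr

# Illustration only: how queue/API options like the ones above typically reach a gradio 3.x app.
with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    response = gr.Textbox(label="Response")
    prompt.submit(lambda s: s.upper(), inputs=prompt, outputs=response, api_name="submit")

demo.queue(concurrency_count=1,  # one generation at a time, as the docstring recommends for LLMs
           api_open=False)       # API callers cannot bypass the queue
demo.launch(show_api=True)       # roughly what allow_api toggles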
gradio_runner.py CHANGED

@@ -48,16 +48,8 @@ def go_gradio(**kwargs):
                        Hash: {get_githash()}
                        """
     else:
-        description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)"
-
-        description += "If this host is busy, try [gpt.h2o.ai 20B](https://gpt.h2o.ai) and [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) and [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
-        description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
-        if kwargs['load_8bit']:
-            description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
-        description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
-        if 'h2ogpt-research' in kwargs['base_model']:
-            description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
-        description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
+        description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)<br>"
+        description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
 
     if kwargs['verbose']:
         task_info_md = f"""
@@ -371,6 +363,16 @@ def go_gradio(**kwargs):
                 with gr.Row():
                     s3up_btn = gr.Button("S3UP")
                     s3up_text = gr.Textbox(label='S3UP result', interactive=False)
+            with gr.TabItem("Disclaimers"):
+                description = ""
+                description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
+                if kwargs['load_8bit']:
+                    description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
+                description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
+                if 'h2ogpt-research' in kwargs['base_model']:
+                    description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
+                description += """<i><li>By using h2oGPT, you accept our <a href="https://github.com/h2oai/h2ogpt/blob/main/tos.md">Terms of Service</a></i></li></ul></p>"""
+                gr.Markdown(value=description, show_label=False, interactive=False)
 
     # Get flagged data
     zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
@@ -910,7 +912,7 @@ def go_gradio(**kwargs):
 
 
     input_args_list = ['model_state']
-    inputs_kwargs_list = ['debug', 'save_dir', '
+    inputs_kwargs_list = ['debug', 'save_dir', 'sanitize_bot_response', 'model_state0', 'is_low_mem',
                           'raise_generate_gpu_exceptions', 'chat_context', 'concurrency_count', 'lora_weights']
 
 
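The change above moves the disclaimer HTML out of the landing-page description and into a dedicated tab. A self-contained sketch of the same gr.TabItem + gr.Markdown pattern (illustrative only, not the full go_gradio layout):

import gradio as gr

# Sketch of the pattern used above: static disclaimer markup rendered inside its own tab.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Chat"):
            gr.Textbox(label="Prompt")
        with gr.TabItem("Disclaimers"):
            description = "<p><b>DISCLAIMERS:</b><ul>"
            description += "<li>The model may produce objectionable content. Use at own risk.</li>"
            description += "<li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</li>"
            description += "</ul></p>"
            gr.Markdown(value=description)

if __name__ == "__main__":
    demo.launch()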
utils.py CHANGED

@@ -96,6 +96,8 @@ def system_info():
     for k, v in gpu_memory_frac_dict.items():
         system[f'GPU_M/%s' % k] = v
 
+    system['hash'] = get_githash()
+
     return system
 
 
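The utils.py change stamps the system_info() report with the running git hash. A simplified sketch of the resulting behavior (the function and parameter names below are invented for illustration; the real system_info() gathers much more):

# Simplified sketch, not the real utils.system_info(): collect per-GPU memory
# fractions, then record the current git hash alongside them.
def system_info_sketch(gpu_memory_frac_dict, get_githash):
    system = {}
    for k, v in gpu_memory_frac_dict.items():
        system['GPU_M/%s' % k] = v
    system['hash'] = get_githash()  # the line added by this commit
    return system

# Example output: {'GPU_M/0': 0.42, 'hash': '2cf0e36c0a86f41add0929b2c9217bfe480ffb58'}
print(system_info_sketch({'0': 0.42}, lambda: '2cf0e36c0a86f41add0929b2c9217bfe480ffb58'))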