Spaces: Running

pseudotensor committed
Commit 5b1d132 • 1 Parent(s): 6dd6b04

Update with h2oGPT hash 2cf0e36c0a86f41add0929b2c9217bfe480ffb58

Files changed:
- generate.py +67 -18
- gradio_runner.py +13 -11
- utils.py +2 -0
generate.py CHANGED

@@ -36,11 +36,11 @@ eval_extra_columns = ['prompt', 'response', 'score']
 def main(
         load_8bit: bool = False,
         load_half: bool = True,
-        infer_devices: bool = True,
+        infer_devices: bool = True,
         base_model: str = '',
         tokenizer_base_model: str = '',
         lora_weights: str = "",
-        gpu_id: int = 0,
+        gpu_id: int = 0,
 
         prompt_type: Union[int, str] = None,
         # input to generation
@@ -61,7 +61,7 @@ def main(
         share: bool = True,
         local_files_only: bool = False,
         resume_download: bool = True,
-        use_auth_token: Union[str, bool] = False,
+        use_auth_token: Union[str, bool] = False,
 
         src_lang: str = "English",
         tgt_lang: str = "Russian",
@@ -69,20 +69,18 @@ def main(
         gradio: bool = True,
         gradio_avoid_processing_markdown: bool = False,
         chat: bool = True,
-        chat_history: int = 4096,
-        chat_context: bool = False,
+        chat_history: int = 4096,
+        chat_context: bool = False,
         stream_output: bool = True,
         show_examples: bool = None,
         verbose: bool = False,
         h2ocolors: bool = True,
         height: int = 400,
         show_lora: bool = True,
-        # set to True to load --base_model after client logs in,
-        # to be able to free GPU memory when model is swapped
         login_mode_if_model0: bool = False,
         block_gradio_exit: bool = True,
         concurrency_count: int = 1,
-        api_open: bool = False,
+        api_open: bool = False,
         allow_api: bool = True,
         input_lines: int = 1,
 
@@ -98,9 +96,64 @@ def main(
         eval_sharegpt_prompts_only: int = 0,
         eval_sharegpt_prompts_only_seed: int = 1234,
         eval_sharegpt_as_output: bool = False,
-
-        hard_stop_list: typing.List[str] = [],
 ):
+    """
+
+    :param load_8bit: load model in 8-bit using bitsandbytes
+    :param load_half: load model in float16
+    :param infer_devices: whether to control devices with gpu_id. If False, then spread across GPUs
+    :param base_model: model HF-type name
+    :param tokenizer_base_model: tokenizer HF-type name
+    :param lora_weights: LORA weights path/HF link
+    :param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
+    :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
+    :param temperature: generation temperature
+    :param top_p: generation top_p
+    :param top_k: generation top_k
+    :param num_beams: generation number of beams
+    :param repetition_penalty: generation repetition penalty
+    :param num_return_sequences: generation number of sequences (1 forced for chat)
+    :param do_sample: generation sample
+    :param max_new_tokens: generation max new tokens
+    :param min_new_tokens: generation min tokens
+    :param early_stopping: generation early stopping
+    :param max_time: maximum time to allow for generation
+    :param debug: enable debug mode
+    :param save_dir: directory chat data is saved to
+    :param share: whether to share the gradio app with sharable URL
+    :param local_files_only: whether to only use local files instead of doing to HF for models
+    :param resume_download: whether to resume downloads from HF for models
+    :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
+    :param src_lang: source languages to include if doing translation (None = all)
+    :param tgt_lang: target languages to include if doing translation (None = all)
+    :param gradio: whether to enable gradio, or to enable benchmark mode
+    :param gradio_avoid_processing_markdown:
+    :param chat: whether to enable chat mode with chat history
+    :param chat_history: maximum character length of chat context/history
+    :param chat_context: whether to use extra helpful context if human_bot
+    :param stream_output: whether to stream output from generate
+    :param show_examples: whether to show clickable examples in gradio
+    :param verbose: whether to show verbose prints
+    :param h2ocolors: whether to use H2O.ai theme
+    :param height: height of chat window
+    :param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
+    :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
+    :param block_gradio_exit: whether to block gradio exit (used for testing)
+    :param concurrency_count: gradio concurrency count (1 is optimal for LLMs)
+    :param api_open: If False, don't let API calls skip gradio queue
+    :param allow_api: whether to allow API calls at all to gradio server
+    :param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
+    :param sanitize_user_prompt: whether to remove profanity from user input
+    :param sanitize_bot_response: whether to remove profanity and repeat lines from bot output
+    :param extra_model_options: extra models to show in list in gradio
+    :param extra_lora_options: extra LORA to show in list in gradio
+    :param score_model: which model to score responses (None means no scoring)
+    :param auto_score: whether to automatically score responses
+    :param eval_sharegpt_prompts_only: for no gradio benchmark, if using ShareGPT prompts for eval
+    :param eval_sharegpt_prompts_only_seed: for no gradio benchmark, if seed for ShareGPT sampling
+    :param eval_sharegpt_as_output: for no gradio benchmark, whether to test ShareGPT output itself
+    :return:
+    """
     is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
     is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
     is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
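The docstring added above documents every keyword argument of main(); since generate.py ends with fire.Fire(main) (see the last hunk below), each of these arguments is also exposed as a --flag on the command line. As a minimal, hypothetical sketch (not part of this commit), a programmatic call with a few of the documented options would look like:

# Hypothetical usage sketch: equivalent in spirit to
#   python generate.py --base_model='EleutherAI/gpt-j-6B' --load_8bit=True --api_open=False --share=False
from generate import main  # assumes generate.py is on the import path

main(base_model='EleutherAI/gpt-j-6B',  # HF model name, per :param base_model
     load_8bit=True,                    # 8-bit load via bitsandbytes
     api_open=False,                    # API calls cannot skip the gradio queue
     share=False)                       # no public gradio share link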
@@ -652,7 +705,6 @@ def evaluate(
         debug=False,
         concurrency_count=None,
         save_dir=None,
-        hard_stop_list=None,
         sanitize_bot_response=True,
         model_state0=None,
         is_low_mem=None,
@@ -714,10 +766,6 @@ def evaluate(
     prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
     prompt = prompter.generate_prompt(data_point)
 
-    if hard_stop_list is None:
-        # acts like undo on user entry and bot response
-        hard_stop_list = []
-
     if isinstance(tokenizer, str):
         # pipeline
         if tokenizer == "summarization":
@@ -1219,7 +1267,9 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
 
 
 if __name__ == "__main__":
-
+    """
+    Examples:
+
     WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
     python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
     python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
@@ -1245,6 +1295,5 @@ if __name__ == "__main__":
     python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
 
     python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
-
-    """, flush=True)
+    """
     fire.Fire(main)
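The api_open, allow_api and concurrency_count options documented in the new docstring are the kind of settings that usually end up on gradio's queue and launch calls. The sketch below only illustrates that wiring under an assumed gradio 3.x API; it is not the actual gradio_runner.py code:

import gradio as gr

# Illustration only: how queue/API options like the ones above typically reach a gradio 3.x app.
with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    response = gr.Textbox(label="Response")
    prompt.submit(lambda s: s.upper(), inputs=prompt, outputs=response, api_name="submit")

demo.queue(concurrency_count=1,  # one generation at a time, as the docstring recommends for LLMs
           api_open=False)       # API callers cannot bypass the queue
demo.launch(show_api=True)       # roughly what allow_api toggles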
gradio_runner.py CHANGED

@@ -48,16 +48,8 @@ def go_gradio(**kwargs):
                        Hash: {get_githash()}
                        """
     else:
-        description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)"
-
-        description += "If this host is busy, try [gpt.h2o.ai 20B](https://gpt.h2o.ai) and [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) and [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
-        description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
-        if kwargs['load_8bit']:
-            description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
-        description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
-        if 'h2ogpt-research' in kwargs['base_model']:
-            description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
-        description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
+        description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)<br>"
+        description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
 
     if kwargs['verbose']:
         task_info_md = f"""
@@ -371,6 +363,16 @@ def go_gradio(**kwargs):
                 with gr.Row():
                     s3up_btn = gr.Button("S3UP")
                     s3up_text = gr.Textbox(label='S3UP result', interactive=False)
+            with gr.TabItem("Disclaimers"):
+                description = ""
+                description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
+                if kwargs['load_8bit']:
+                    description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
+                description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
+                if 'h2ogpt-research' in kwargs['base_model']:
+                    description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
+                description += """<i><li>By using h2oGPT, you accept our <a href="https://github.com/h2oai/h2ogpt/blob/main/tos.md">Terms of Service</a></i></li></ul></p>"""
+                gr.Markdown(value=description, show_label=False, interactive=False)
 
     # Get flagged data
     zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
@@ -910,7 +912,7 @@ def go_gradio(**kwargs):
 
 
     input_args_list = ['model_state']
-    inputs_kwargs_list = ['debug', 'save_dir', '
+    inputs_kwargs_list = ['debug', 'save_dir', 'sanitize_bot_response', 'model_state0', 'is_low_mem',
                           'raise_generate_gpu_exceptions', 'chat_context', 'concurrency_count', 'lora_weights']
 
 
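The change above moves the disclaimer HTML out of the landing-page description and into a dedicated tab. A self-contained sketch of the same gr.TabItem + gr.Markdown pattern (illustrative only, not the full go_gradio layout):

import gradio as gr

# Sketch of the pattern used above: static disclaimer markup rendered inside its own tab.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Chat"):
            gr.Textbox(label="Prompt")
        with gr.TabItem("Disclaimers"):
            description = "<p><b>DISCLAIMERS:</b><ul>"
            description += "<li>The model may produce objectionable content. Use at own risk.</li>"
            description += "<li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</li>"
            description += "</ul></p>"
            gr.Markdown(value=description)

if __name__ == "__main__":
    demo.launch()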
utils.py CHANGED

@@ -96,6 +96,8 @@ def system_info():
     for k, v in gpu_memory_frac_dict.items():
         system[f'GPU_M/%s' % k] = v
 
+    system['hash'] = get_githash()
+
     return system
 
 
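The utils.py change stamps the system_info() report with the running git hash. A simplified sketch of the resulting behavior (the function and parameter names below are invented for illustration; the real system_info() gathers much more):

# Simplified sketch, not the real utils.system_info(): collect per-GPU memory
# fractions, then record the current git hash alongside them.
def system_info_sketch(gpu_memory_frac_dict, get_githash):
    system = {}
    for k, v in gpu_memory_frac_dict.items():
        system['GPU_M/%s' % k] = v
    system['hash'] = get_githash()  # the line added by this commit
    return system

# Example output: {'GPU_M/0': 0.42, 'hash': '2cf0e36c0a86f41add0929b2c9217bfe480ffb58'}
print(system_info_sketch({'0': 0.42}, lambda: '2cf0e36c0a86f41add0929b2c9217bfe480ffb58'))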