eswardivi committed on
Commit
00f3401
·
verified ·
1 Parent(s): 51153f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -10
app.py CHANGED
@@ -10,14 +10,22 @@ from threading import Thread
10
  import spaces
11
  import time
12
  import subprocess
13
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 
 
 
 
14
 
15
  token = os.environ["HF_TOKEN"]
16
 
17
 
18
  model = AutoModelForCausalLM.from_pretrained(
19
- use_cache=False,attn_implementation="flash_attention_2",
20
- "microsoft/Phi-3-mini-128k-instruct", token=token,trust_remote_code=True
 
 
 
21
  )
22
  tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
23
  terminators = [
@@ -36,7 +44,7 @@ model = model.to(device)
36
 
37
 
38
  @spaces.GPU(duration=60)
39
- def chat(message, history, temperature,do_sample, max_tokens):
40
  chat = []
41
  for item in history:
42
  chat.append({"role": "user", "content": item[0]})
@@ -56,10 +64,10 @@ def chat(message, history, temperature,do_sample, max_tokens):
56
  temperature=temperature,
57
  eos_token_id=terminators,
58
  )
59
-
60
  if temperature == 0:
61
- generate_kwargs['do_sample'] = False
62
-
63
  t = Thread(target=model.generate, kwargs=generate_kwargs)
64
  t.start()
65
 
@@ -68,7 +76,7 @@ def chat(message, history, temperature,do_sample, max_tokens):
68
  partial_text += new_text
69
  yield partial_text
70
 
71
- yield partial_text
72
 
73
 
74
  demo = gr.ChatInterface(
@@ -82,7 +90,7 @@ demo = gr.ChatInterface(
82
  gr.Slider(
83
  minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
84
  ),
85
- gr.Checkbox(label="Sampling",value=True),
86
  gr.Slider(
87
  minimum=128,
88
  maximum=4096,
@@ -94,6 +102,6 @@ demo = gr.ChatInterface(
94
  ],
95
  stop_btn="Stop Generation",
96
  title="Chat With LLMs",
97
- description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)"
98
  )
99
  demo.launch()
 
10
  import spaces
11
  import time
12
  import subprocess
13
+
14
+ subprocess.run(
15
+ "pip install flash-attn --no-build-isolation",
16
+ env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
17
+ shell=True,
18
+ )
19
 
20
  token = os.environ["HF_TOKEN"]
21
 
22
 
23
  model = AutoModelForCausalLM.from_pretrained(
24
+ "microsoft/Phi-3-mini-128k-instruct",
25
+ use_cache=False,
26
+ attn_implementation="flash_attention_2",
27
+ token=token,
28
+ trust_remote_code=True,
29
  )
30
  tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
31
  terminators = [
 
44
 
45
 
46
  @spaces.GPU(duration=60)
47
+ def chat(message, history, temperature, do_sample, max_tokens):
48
  chat = []
49
  for item in history:
50
  chat.append({"role": "user", "content": item[0]})
 
64
  temperature=temperature,
65
  eos_token_id=terminators,
66
  )
67
+
68
  if temperature == 0:
69
+ generate_kwargs["do_sample"] = False
70
+
71
  t = Thread(target=model.generate, kwargs=generate_kwargs)
72
  t.start()
73
 
 
76
  partial_text += new_text
77
  yield partial_text
78
 
79
+ yield partial_text
80
 
81
 
82
  demo = gr.ChatInterface(
 
90
  gr.Slider(
91
  minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
92
  ),
93
+ gr.Checkbox(label="Sampling", value=True),
94
  gr.Slider(
95
  minimum=128,
96
  maximum=4096,
 
102
  ],
103
  stop_btn="Stop Generation",
104
  title="Chat With LLMs",
105
+ description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
106
  )
107
  demo.launch()