eswardivi committed on
Commit
c7f7d96
·
verified ·
1 Parent(s): 229588b

Removed 4-bit quantization

Browse files
Files changed (1) hide show
  1. app.py +4 -17
app.py CHANGED
@@ -4,7 +4,6 @@ from transformers import (
4
  AutoModelForCausalLM,
5
  AutoTokenizer,
6
  TextIteratorStreamer,
7
- BitsAndBytesConfig,
8
  )
9
  import os
10
  from threading import Thread
@@ -13,12 +12,9 @@ import time
13
 
14
  token = os.environ["HF_TOKEN"]
15
 
16
- quantization_config = BitsAndBytesConfig(
17
- load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
18
- )
19
 
20
  model = AutoModelForCausalLM.from_pretrained(
21
- "microsoft/Phi-3-mini-128k-instruct", quantization_config=quantization_config, token=token,trust_remote_code=True
22
  )
23
  tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
24
  terminators = [
@@ -32,13 +28,12 @@ else:
32
  device = torch.device("cpu")
33
  print("Using CPU")
34
 
35
- # model = model.to(device)
36
  # Dispatch Errors
37
 
38
 
39
  @spaces.GPU(duration=60)
40
  def chat(message, history, temperature,do_sample, max_tokens):
41
- start_time = time.time()
42
  chat = []
43
  for item in history:
44
  chat.append({"role": "user", "content": item[0]})
@@ -66,19 +61,11 @@ def chat(message, history, temperature,do_sample, max_tokens):
66
  t.start()
67
 
68
  partial_text = ""
69
- first_token_time = None
70
  for new_text in streamer:
71
- if not first_token_time:
72
- first_token_time = time.time() - start_time
73
  partial_text += new_text
74
  yield partial_text
75
 
76
- total_time = time.time() - start_time
77
- tokens = len(tok.tokenize(partial_text))
78
- tokens_per_second = tokens / total_time if total_time > 0 else 0
79
-
80
- timing_info = f"\n\nTime taken to first token: {first_token_time:.2f} seconds\nTokens per second: {tokens_per_second:.2f}"
81
- yield partial_text + timing_info
82
 
83
 
84
  demo = gr.ChatInterface(
@@ -104,6 +91,6 @@ demo = gr.ChatInterface(
104
  ],
105
  stop_btn="Stop Generation",
106
  title="Chat With LLMs",
107
- description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.com/microsoft/Phi-3-mini-128k-instruct) in 4bit"
108
  )
109
  demo.launch()
 
4
  AutoModelForCausalLM,
5
  AutoTokenizer,
6
  TextIteratorStreamer,
 
7
  )
8
  import os
9
  from threading import Thread
 
12
 
13
  token = os.environ["HF_TOKEN"]
14
 
 
 
 
15
 
16
  model = AutoModelForCausalLM.from_pretrained(
17
+ "microsoft/Phi-3-mini-128k-instruct", token=token,trust_remote_code=True
18
  )
19
  tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
20
  terminators = [
 
28
  device = torch.device("cpu")
29
  print("Using CPU")
30
 
31
+ model = model.to(device)
32
  # Dispatch Errors
33
 
34
 
35
  @spaces.GPU(duration=60)
36
  def chat(message, history, temperature,do_sample, max_tokens):
 
37
  chat = []
38
  for item in history:
39
  chat.append({"role": "user", "content": item[0]})
 
61
  t.start()
62
 
63
  partial_text = ""
 
64
  for new_text in streamer:
 
 
65
  partial_text += new_text
66
  yield partial_text
67
 
68
+ yield partial_text
 
 
 
 
 
69
 
70
 
71
  demo = gr.ChatInterface(
 
91
  ],
92
  stop_btn="Stop Generation",
93
  title="Chat With LLMs",
94
+ description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.com/microsoft/Phi-3-mini-128k-instruct)"
95
  )
96
  demo.launch()