Hjgugugjhuhjggg committed on
Commit 5ffd361 · verified · 1 Parent(s): ee7ce1d

Update apghp.py

Files changed (1)
  1. apghp.py +142 -141
apghp.py CHANGED
@@ -1,26 +1,34 @@
- from fastapi import FastAPI, HTTPException
- from pydantic import BaseModel
  from llama_cpp import Llama
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from tqdm import tqdm
- import uvicorn
  from dotenv import load_dotenv
- from difflib import SequenceMatcher
- import re
- import spaces  # Import the spaces library

- # Load environment variables
  load_dotenv()

- # Initialize the FastAPI application
  app = FastAPI()

- # Global dictionary for storing the models
- global_data = {
-     'models': []
- }
-
- # Model configuration
  model_configs = [
      {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
      {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
@@ -43,146 +51,139 @@ model_configs = [
      {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"}
  ]

- # Class for managing models
  class ModelManager:
      def __init__(self):
-         self.models = []
-         self.loaded = False  # Track whether the models are already loaded
-
-     def load_model(self, model_config):
-         print(f"Loading model: {model_config['name']}...")
-         return {"model": Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename']), "name": model_config['name']}
-
-     def load_all_models(self):
-         if self.loaded:  # If the models are already loaded, do not reload them
-             print("Models are already loaded. No need to load them again.")
-             return self.models
-
-         print("Starting to load models...")
-         with ThreadPoolExecutor() as executor:  # No limit on the number of workers
-             futures = [executor.submit(self.load_model, config) for config in model_configs]
-             models = []
-             for future in tqdm(as_completed(futures), total=len(model_configs), desc="Loading models", unit="model"):
                  try:
-                     model = future.result()
-                     models.append(model)
-                     print(f"Model loaded successfully: {model['name']}")
                  except Exception as e:
-                     print(f"Error loading model: {e}")
-
-         self.models = models
-         self.loaded = True  # Mark as loaded
-         print("All models have been loaded.")
-         return self.models
-
- # Instantiate ModelManager
- model_manager = ModelManager()

- # Load the models at application startup, only the first time
- global_data['models'] = model_manager.load_all_models()

- # Global model for the chat request
  class ChatRequest(BaseModel):
      message: str
-     top_k: int = 50
-     top_p: float = 0.95
-     temperature: float = 0.7

- # Function to generate chat responses
- @spaces.GPU(duration=0)  # Annotation to use the GPU with duration 0
- def generate_chat_response(request, model_data):
      try:
-         user_input = normalize_input(request.message)
-         llm = model_data['model']
-         response = llm.create_chat_completion(
-             messages=[{"role": "user", "content": user_input}],
-             top_k=request.top_k,
-             top_p=request.top_p,
-             temperature=request.temperature
-         )
-         reply = response['choices'][0]['message']['content']
-         return {"response": reply, "literal": user_input, "model_name": model_data['name']}
      except Exception as e:
-         return {"response": f"Error: {str(e)}", "literal": user_input, "model_name": model_data['name']}
-
- def normalize_input(input_text):
-     return input_text.strip()
-
- def remove_duplicates(text):
-     text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
-     text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
-     text = text.replace('[/INST]', '')
-     lines = text.split('\n')
-     unique_lines = list(dict.fromkeys(lines))
-     return '\n'.join(unique_lines).strip()
-
- def remove_repetitive_responses(responses):
-     seen = set()
-     unique_responses = []
-     for response in responses:
-         normalized_response = remove_duplicates(response['response'])
-         if normalized_response not in seen:
-             seen.add(normalized_response)
-             unique_responses.append(response)
-     return unique_responses
-
- def select_best_response(responses):
-     print("Filtering responses...")
-     responses = remove_repetitive_responses(responses)
-     responses = [remove_duplicates(response['response']) for response in responses]
-     unique_responses = list(set(responses))
-     coherent_responses = filter_by_coherence(unique_responses)
-     best_response = filter_by_similarity(coherent_responses)
-     return best_response
-
- def filter_by_coherence(responses):
-     print("Sorting responses by coherence...")
-     responses.sort(key=len, reverse=True)
-     return responses
-
- def filter_by_similarity(responses):
-     print("Filtering responses by similarity...")
-     responses.sort(key=len, reverse=True)
-     best_response = responses[0]
-     for i in range(1, len(responses)):
-         ratio = SequenceMatcher(None, best_response, responses[i]).ratio()
-         if ratio < 0.9:
-             best_response = responses[i]
-             break
-     return best_response
-
- def worker_function(model_data, request):
-     print(f"Generating a response with model: {model_data['name']}...")
-     response = generate_chat_response(request, model_data)
-     return response
-
- @app.post("/generate_chat")
- async def generate_chat(request: ChatRequest):
-     if not request.message.strip():
-         raise HTTPException(status_code=400, detail="The message cannot be empty.")
-
-     print(f"Processing request: {request.message}")
-
-     responses = []
-     num_models = len(global_data['models'])
-
-     with ThreadPoolExecutor() as executor:  # No concurrency limit is set
-         futures = [executor.submit(worker_function, model_data, request) for model_data in global_data['models']]
-         for future in tqdm(as_completed(futures), total=num_models, desc="Generating responses", unit="model"):
              try:
-                 response = future.result()
-                 responses.append(response)
-             except Exception as exc:
-                 print(f"Error while generating a response: {exc}")

-     best_response = select_best_response(responses)
-
-     print(f"Best response selected: {best_response}")

-     return {
-         "best_response": best_response,
-         "all_responses": responses
-     }

  if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=7860)

+ import os
+ import gc
+ import io
  from llama_cpp import Llama
  from concurrent.futures import ThreadPoolExecutor, as_completed
+ from fastapi import FastAPI, Request, HTTPException
+ from fastapi.responses import JSONResponse
  from tqdm import tqdm
  from dotenv import load_dotenv
+ from pydantic import BaseModel
+ from huggingface_hub import hf_hub_download, login
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import nltk
+ import uvicorn
+ import psutil
+ import torch
+ import time
+
+ nltk.download('punkt')
+ nltk.download('stopwords')

  load_dotenv()

  app = FastAPI()
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+ if HUGGINGFACE_TOKEN:
+     login(token=HUGGINGFACE_TOKEN)

  model_configs = [
      {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
      {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
      {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"}
  ]

+ global_data = {'model_configs': model_configs, 'training_data': io.StringIO()}
+
  class ModelManager:
      def __init__(self):
+         self.models = {}
+         self.load_models()
+
+     def load_models(self):
+         for config in tqdm(global_data['model_configs'], desc="Loading models"):
+             model_name = config['name']
+             if model_name not in self.models:
                  try:
+                     # Download the GGUF file from the Hub (cached locally) and load it with
+                     # llama.cpp, offloading all layers to the GPU when one is available.
+                     model_path = hf_hub_download(repo_id=config['repo_id'], filename=config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
+                     model = Llama(model_path=model_path, n_ctx=512, n_gpu_layers=-1)
+                     self.models[model_name] = model
+                     print(f"Model '{model_name}' loaded successfully.")
                  except Exception as e:
+                     print(f"Error loading model {model_name}: {e}")
+                     self.models[model_name] = None
+                 finally:
+                     gc.collect()

+     def get_model(self, model_name: str):
+         return self.models.get(model_name)
+
+
+ model_manager = ModelManager()

  class ChatRequest(BaseModel):
      message: str

+ def generate_model_response(model, inputs: str) -> str:
      try:
+         if model:
+             response = model(inputs, max_tokens=150)
+             return response['choices'][0]['text'].strip()
+         else:
+             return "Model not loaded"
      except Exception as e:
+         return f"Error: Could not generate a response. Details: {e}"
+
+ async def process_message(message: str) -> dict:
+     inputs = message.strip()
+     responses = {}
+
+     with ThreadPoolExecutor(max_workers=min(len(global_data['model_configs']), 4)) as executor:
+         # Map each future back to its model name so results are attributed correctly
+         # regardless of completion order.
+         future_to_name = {
+             executor.submit(generate_model_response, model_manager.get_model(config['name']), inputs): config['name']
+             for config in global_data['model_configs'] if model_manager.get_model(config['name'])
+         }
+         for future in tqdm(as_completed(future_to_name), total=len(future_to_name), desc="Generating responses"):
+             model_name = future_to_name[future]
              try:
+                 responses[model_name] = future.result()
+             except Exception as e:
+                 responses[model_name] = f"Error processing {model_name}: {e}"
+
+     if not responses:
+         raise HTTPException(status_code=503, detail="No models are loaded")
+
+     # Rank the responses by TF-IDF cosine similarity to the original message and keep the closest one.
+     stop_words = list(stopwords.words('english'))
+     vectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stop_words)
+     reference_text = message
+     response_texts = list(responses.values())
+     tfidf_matrix = vectorizer.fit_transform([reference_text] + response_texts)
+     similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
+     best_response_index = similarities.argmax()
+     best_response_model = list(responses.keys())[best_response_index]
+     best_response_text = response_texts[best_response_index]
+
+     return {"best_response": {"model": best_response_model, "text": best_response_text}, "all_responses": responses}
+
+
+ @app.post("/generate_multimodel")
+ async def api_generate_multimodel(request: Request):
+     try:
+         data = await request.json()
+         message = data.get("message")
+         if not message:
+             raise HTTPException(status_code=400, detail="Missing message")
+         response = await process_message(message)
+         return JSONResponse(response)
+     except HTTPException as e:
+         raise e
+     except Exception as e:
+         return JSONResponse({"error": str(e)}, status_code=500)
+
+
+ async def startup():
+     pass
+
+ async def shutdown():
+     gc.collect()
+
+ app.add_event_handler("startup", startup)
+ app.add_event_handler("shutdown", shutdown)
+
+ def release_resources():
+     try:
+         torch.cuda.empty_cache()
+         gc.collect()
+     except Exception as e:
+         print(f"Failed to release resources: {e}")
+
+ def resource_manager():
+     MAX_RAM_PERCENT = 20
+     MAX_CPU_PERCENT = 20
+     MAX_GPU_PERCENT = 20
+     MAX_RAM_MB = 2048
+
+     while True:
+         try:
+             virtual_mem = psutil.virtual_memory()
+             current_ram_percent = virtual_mem.percent
+             current_ram_mb = virtual_mem.used / (1024 * 1024)
+
+             if current_ram_percent > MAX_RAM_PERCENT or current_ram_mb > MAX_RAM_MB:
+                 release_resources()
+
+             current_cpu_percent = psutil.cpu_percent()
+             if current_cpu_percent > MAX_CPU_PERCENT:
+                 # Lower the process priority (raise the nice value) when CPU usage is high.
+                 psutil.Process(os.getpid()).nice(10)
+
+             if torch.cuda.is_available():
+                 gpu = torch.cuda.current_device()
+                 total_mem = torch.cuda.get_device_properties(gpu).total_memory
+                 gpu_mem = torch.cuda.memory_allocated(gpu) / total_mem * 100

+                 if gpu_mem > MAX_GPU_PERCENT:
+                     release_resources()

+         except Exception as e:
+             print(f"Error in resource manager: {e}")
+         time.sleep(5)  # avoid spinning at 100% CPU

  if __name__ == "__main__":
+     import threading
+     resource_thread = threading.Thread(target=resource_manager)
+     resource_thread.daemon = True
+     resource_thread.start()
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run(app, host="0.0.0.0", port=port)
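
For reference, a minimal sketch of how the new /generate_multimodel endpoint could be exercised once the app is running. This is an illustration only: it assumes the server listens on the default port 7860, that the third-party requests package is installed, and the prompt text is arbitrary.

import requests  # assumed to be installed separately

# Hypothetical local smoke test for the /generate_multimodel endpoint.
resp = requests.post(
    "http://localhost:7860/generate_multimodel",
    json={"message": "Hello there, how are you?"},
)
resp.raise_for_status()
data = resp.json()
print(data["best_response"])               # {"model": ..., "text": ...}
print(list(data["all_responses"].keys()))  # names of the models that answered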