Reality123b committed (verified)
Commit ac4d454 · 1 Parent(s): 55aa708

Update app.py

Files changed (1): app.py (+28 -48)
app.py CHANGED
@@ -13,8 +13,6 @@ import networkx as nx
 from collections import Counter
 import json
 from datetime import datetime
-from transformers import AutoProcessor, AutoModelForVision2Seq
-from transformers.image_utils import load_image
 
 @dataclass
 class ChatMessage:
@@ -34,9 +32,11 @@ class XylariaChat:
             model="mistralai/Mistral-Nemo-Instruct-2407",
             token=self.hf_token
         )
+
+        self.image_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
+        self.image_api_headers = {"Authorization": f"Bearer {self.hf_token}"}
 
         self.image_gen_api_url = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"
-        self.image_api_headers = {"Authorization": f"Bearer {self.hf_token}"}
 
         self.conversation_history = []
         self.persistent_memory = []
@@ -97,13 +97,6 @@ class XylariaChat:
 
         self.chat_history_file = "chat_history.json"
 
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.vlm_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
-        self.vlm_model = AutoModelForVision2Seq.from_pretrained(
-            "HuggingFaceTB/SmolVLM-Instruct",
-            torch_dtype=torch.bfloat16,
-            _attn_implementation="flash_attention_2" if self.device == "cuda" else "eager",
-        ).to(self.device)
 
     def update_internal_state(self, emotion_deltas, cognitive_load_deltas, introspection_delta, engagement_delta):
         for emotion, delta in emotion_deltas.items():
@@ -408,44 +401,34 @@ class XylariaChat:
             print(f"Error resetting API client: {e}")
 
         return None
-
-    def caption_image_vlm(self, image, user_input):
+
+    def caption_image(self, image):
         try:
-
-            if isinstance(image, str) and image.startswith('http'):
-                image = load_image(image)
-            elif isinstance(image, str) and os.path.isfile(image):
-                image = Image.open(image)
-            elif isinstance(image, str) and image.startswith('data:image'):
-                image = Image.open(base64.b64decode(image.split(',')[1]))
+            if isinstance(image, str) and os.path.isfile(image):
+                with open(image, "rb") as f:
+                    data = f.read()
+            elif isinstance(image, str):
+                if image.startswith('data:image'):
+                    image = image.split(',')[1]
+                data = base64.b64decode(image)
             else:
-                image = Image.fromarray(image)
-
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image"},
-                        {"type": "text", "text": user_input}
-                    ]
-                },
-            ]
-
-            prompt = self.vlm_processor.apply_chat_template(messages, add_generation_prompt=True)
-            inputs = self.vlm_processor(text=prompt, images=[image], return_tensors="pt")
-            inputs = inputs.to(self.device)
-
-            generated_ids = self.vlm_model.generate(**inputs, max_new_tokens=500)
-            generated_texts = self.vlm_processor.batch_decode(
-                generated_ids,
-                skip_special_tokens=True,
+                data = image.read()
+
+            response = requests.post(
+                self.image_api_url,
+                headers=self.image_api_headers,
+                data=data
             )
-
-            return generated_texts[0].split("Assistant: ")[-1]
 
-        except Exception as e:
-            return f"Error captioning image with VLM: {str(e)}"
+            if response.status_code == 200:
+                caption = response.json()[0].get('generated_text', 'No caption generated')
+                return caption
+            else:
+                return f"Error captioning image: {response.status_code} - {response.text}"
 
+        except Exception as e:
+            return f"Error processing image: {str(e)}"
+
     def generate_image(self, prompt):
         try:
             payload = {"inputs": prompt}
@@ -501,11 +484,8 @@ class XylariaChat:
             messages.append(msg)
 
         if image:
-            image_caption = self.caption_image_vlm(image, user_input)
-            messages.append(ChatMessage(
-                role="user",
-                content=image_caption
-            ).to_dict())
+            image_caption = self.caption_image(image)
+            user_input = f"description of an image: {image_caption}\n\nUser's message about it: {user_input}"
 
         messages.append(ChatMessage(
             role="user",
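
The net effect of the commit: instead of loading SmolVLM locally, caption_image() now posts raw image bytes to the hosted BLIP endpoint, and the chat path folds the returned caption into user_input. A minimal standalone sketch of the same request pattern, assuming a valid token in an HF_TOKEN environment variable and a sample file name (both illustrative, not taken from the diff):

import os
import requests

API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
HEADERS = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}

def caption_bytes(data: bytes) -> str:
    # The serverless Inference API takes the raw image bytes as the POST body;
    # for BLIP it answers with a JSON list like [{"generated_text": "..."}].
    response = requests.post(API_URL, headers=HEADERS, data=data)
    if response.status_code != 200:
        raise RuntimeError(f"Captioning failed: {response.status_code} - {response.text}")
    return response.json()[0].get("generated_text", "No caption generated")

# Hypothetical usage, mirroring the os.path.isfile branch of caption_image():
with open("photo.jpg", "rb") as f:
    print(caption_bytes(f.read()))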
 
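One subtle branch in the new caption_image() is the data-URI case: an image can arrive as a string like "data:image/png;base64,<payload>", where only the text after the first comma is base64 data. An isolated sketch of that decoding step (the function name is illustrative):

import base64

def data_uri_to_bytes(uri: str) -> bytes:
    # Strip the "data:image/...;base64," metadata prefix, then decode.
    if uri.startswith("data:image"):
        uri = uri.split(",", 1)[1]
    return base64.b64decode(uri)

# Example: data_uri_to_bytes("data:image/png;base64,aGVsbG8=") == b"hello"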
 
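For comparison, the generate_image() path visible in the context lines sends a JSON payload rather than raw bytes. A hedged sketch of that call, assuming the serverless Inference API's usual behavior of returning the generated image as raw bytes for text-to-image models (names are illustrative; only the endpoint URL and the {"inputs": prompt} payload appear in the diff):

import os
import requests

GEN_URL = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"
HEADERS = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}

def generate_image_bytes(prompt: str) -> bytes:
    # Text-to-image models take {"inputs": prompt} and, on success,
    # respond with the encoded image file (e.g. PNG/JPEG bytes).
    response = requests.post(GEN_URL, headers=HEADERS, json={"inputs": prompt})
    response.raise_for_status()
    return response.content

with open("generated.png", "wb") as f:
    f.write(generate_image_bytes("a watercolor fox"))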