Spaces:

1aurent
/

cogvlm_captionner

Running on Zero

App Files Files Community

1aurent committed on Jul 24

Commit

432150c

•

1 Parent(s): 001c97f

add app

Browse files

Files changed (3) hide show

README.md +2 -2
app.py +83 -0
requirements.txt +7 -0

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
-title: Cogvlm Captionner
-emoji: 🦀
 colorFrom: gray
 colorTo: red
 sdk: gradio

 ---
+title: CogVLMv1 Captionner
+emoji: ⚙️
 colorFrom: gray
 colorTo: red
 sdk: gradio

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+# type: ignore
+from typing import Any
+import gradio as gr
+import spaces
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, LlamaTokenizer
+# Keyword arguments forwarded to `model.generate` when the caller supplies none:
+# sampling is disabled and at most 256 new tokens are produced.
+DEFAULT_PARAMS: dict[str, Any] = {
+    "do_sample": False,
+    "max_new_tokens": 256,
+}
+# Default captioning prompt. Every sentence except the last carries its own
+# trailing space, so the pieces are joined with an empty separator.
+DEFAULT_QUERY = "".join(
+    [
+        "Provide a factual description of this image in up to two paragraphs. ",
+        "Include details on objects, background, scenery, interactions, gestures, poses, and any visible text content. ",
+        "Specify the number of repeated objects. ",
+        "Describe the dominant colors, color contrasts, textures, and materials. ",
+        "Mention the composition, including the arrangement of elements and focus points. ",
+        "Note the camera angle or perspective, and provide any identifiable contextual information. ",
+        "Include details on the style, lighting, and shadows. ",
+        "Avoid subjective interpretations or speculation.",
+    ]
+)
+# Precision used for the model weights and for the image tensors fed to it.
+DTYPE = torch.bfloat16
+# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# The tokenizer is loaded from the Vicuna checkpoint, while the model weights
+# come from the CogVLM chat checkpoint.
+tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
+# `trust_remote_code=True` lets transformers use the modelling code shipped in
+# the checkpoint repository. The loaded model is moved to DEVICE right away.
+model = AutoModelForCausalLM.from_pretrained(
+    "THUDM/cogvlm-chat-hf",
+    low_cpu_mem_usage=True,
+    trust_remote_code=True,
+    torch_dtype=DTYPE,
+).to(device=DEVICE)
+@spaces.GPU
+@torch.no_grad()
+def generate_caption(
+    image: Image.Image,
+    query: str = DEFAULT_QUERY,
+    params: dict[str, Any] = DEFAULT_PARAMS,
+) -> str:
+    """Generate a caption for ``image`` by prompting the model with ``query``.
+
+    Args:
+        image: Picture to describe. It is converted to RGB before being
+            handed to the model's input builder.
+        query: Prompt sent to the model together with the image.
+        params: Extra keyword arguments forwarded to ``model.generate``.
+
+    Returns:
+        The decoded continuation only (prompt tokens are dropped), without
+        special tokens, with every occurrence of the phrase
+        "This image showcases" removed and surrounding whitespace stripped.
+
+    Raises:
+        gr.Error: If no image was provided, e.g. the button was clicked
+            before uploading a picture.
+    """
+    # The image component yields None when nothing was uploaded; fail with a
+    # message the UI can display instead of an opaque error further down.
+    if image is None:
+        raise gr.Error("Please upload an image before generating a caption.")
+    conversation = model.build_conversation_input_ids(
+        tokenizer=tokenizer,
+        query=query,
+        history=[],
+        # Normalise palette / alpha / grayscale inputs to three channels.
+        images=[image.convert("RGB")],
+    )
+    # The builder returns unbatched tensors: add a batch dimension and move
+    # everything to the model's device (the image also to the model's dtype).
+    inputs = {
+        "input_ids": conversation["input_ids"].unsqueeze(0).to(device=DEVICE),
+        "token_type_ids": conversation["token_type_ids"].unsqueeze(0).to(device=DEVICE),
+        "attention_mask": conversation["attention_mask"].unsqueeze(0).to(device=DEVICE),
+        "images": [[conversation["images"][0].to(device=DEVICE, dtype=DTYPE)]],
+    }
+    outputs = model.generate(**inputs, **params)
+    # Drop the leading prompt positions so only newly generated tokens remain.
+    prompt_length = inputs["input_ids"].shape[1]
+    new_tokens = outputs[0, prompt_length:]
+    # skip_special_tokens keeps markers such as the end-of-sequence token out
+    # of the text shown to the user.
+    result = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    result = result.replace("This image showcases", "").strip()
+    return result
+# Two-column page: image, prompt and button on the left, caption on the right.
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(type="pil")
+            input_query = gr.Textbox(
+                value=DEFAULT_QUERY,
+                label="Prompt",
+                lines=5,
+            )
+            run_button = gr.Button(value="Generate Caption")
+        with gr.Column():
+            output_caption = gr.Textbox(
+                show_copy_button=True,
+                label="Generated Caption",
+            )
+    # Only the image and the prompt are wired in, so `generate_caption` runs
+    # with its default generation parameters.
+    run_button.click(
+        generate_caption,
+        [input_image, input_query],
+        output_caption,
+    )
+demo.launch(share=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+transformers==4.41.2
+xformers==0.0.27
+sentencepiece==0.2.0
+bitsandbytes==0.43.1
+einops==0.8.0
+torchvision==0.18.1
+accelerate==0.31.0