ZhengPeng7 committed
Commit 4e60c70
1 Parent(s): f6b7155

Add inference endpoint feature in HF model page.
Files changed (4):
  1. README.md +46 -1
  2. birefnet.py +30 -27
  3. handler.py +132 -0
  4. requirements.txt +18 -0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-library_name: birefnet
+library_name: BiRefNet
 tags:
 - background-removal
 - mask-generation
@@ -141,6 +141,51 @@ plt.show()
 
 ```
 
+### 2. Use the inference endpoint locally:
+> You may need to click *Deploy* and set up the endpoint yourself, which incurs some cost.
+```
+import requests
+import base64
+from io import BytesIO
+from PIL import Image
+
+
+YOUR_HF_TOKEN = 'xxx'
+API_URL = "xxx"
+headers = {
+    "Authorization": "Bearer {}".format(YOUR_HF_TOKEN)
+}
+
+def base64_to_bytes(base64_string):
+    # Remove the data URI prefix if present
+    if "data:image" in base64_string:
+        base64_string = base64_string.split(",")[1]
+
+    # Decode the Base64 string into bytes
+    image_bytes = base64.b64decode(base64_string)
+    return image_bytes
+
+def bytes_to_image(image_bytes):
+    # Wrap the raw bytes in a BytesIO stream
+    image_stream = BytesIO(image_bytes)
+
+    # Open the image using Pillow (PIL)
+    image = Image.open(image_stream)
+    return image
+
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+output = query({
+    "inputs": "https://hips.hearstapps.com/hmg-prod/images/gettyimages-1229892983-square.jpg",
+    "parameters": {}
+})
+
+# Decode the base64 response into a PIL image.
+output_image = bytes_to_image(base64_to_bytes(output))
+output_image
+```
+
 
 > This BiRefNet for standard dichotomous image segmentation (DIS) is trained on **DIS-TR** and validated on **DIS-TEs and DIS-VD**.
 
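> A possible next step, shown as a sketch rather than part of this commit: since the endpoint returns a base64-encoded RGBA image, you can persist the decoded cutout with Pillow. This assumes `output` is the value returned by `query()` in the snippet above.

```
# Sketch only: save the decoded RGBA cutout from the endpoint response.
# Assumes `output` holds the base64 string returned by query() above.
output_image = bytes_to_image(base64_to_bytes(output))
output_image.save('subject_no_bg.png')  # PNG preserves the alpha channel
```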
birefnet.py CHANGED
@@ -7,7 +7,7 @@ import math
 class Config():
     def __init__(self) -> None:
         # PATH settings
-        self.sys_home_dir = os.path.expanduser('~')  # Make up your file system as: SYS_HOME_DIR/codes/dis/BiRefNet, SYS_HOME_DIR/datasets/dis/xx, SYS_HOME_DIR/weights/xx
+        self.sys_home_dir = os.path.expanduser('~')  # Make up your file system as: SYS_HOME_DIR/codes/dis/BiRefNet, SYS_HOME_DIR/datasets/dis/xx, SYS_HOME_DIR/weights/xx
 
         # TASK settings
         self.task = ['DIS5K', 'COD', 'HRSOD', 'DIS5K+HRSOD+HRS10K', 'P3M-10k'][0]
@@ -615,6 +615,7 @@ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
 
 # config = Config()
 
+
 class Mlp(nn.Module):
     """ Multilayer perceptron."""
 
@@ -739,7 +740,8 @@ class WindowAttention(nn.Module):
         attn = (q @ k.transpose(-2, -1))
 
         relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
+        )  # Wh*Ww, Wh*Ww, nH
         relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
         attn = attn + relative_position_bias.unsqueeze(0)
 
@@ -974,8 +976,9 @@ class BasicLayer(nn.Module):
         """
 
         # calculate attention mask for SW-MSA
-        Hp = int(np.ceil(H / self.window_size)) * self.window_size
-        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        # Turn ints into torch.tensor for compatibility with torch.compile in PyTorch 2.5.
+        Hp = torch.ceil(torch.tensor(H) / self.window_size).to(torch.int64) * self.window_size
+        Wp = torch.ceil(torch.tensor(W) / self.window_size).to(torch.int64) * self.window_size
         img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
         h_slices = (slice(0, -self.window_size),
                     slice(-self.window_size, -self.shift_size),
@@ -1961,6 +1964,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from kornia.filters import laplacian
 from transformers import PreTrainedModel
+from einops import rearrange
 
 # from config import Config
 # from dataset import class_labels_TR_sorted
@@ -1974,13 +1978,24 @@ from transformers import PreTrainedModel
 from .BiRefNet_config import BiRefNetConfig
 
 
+def image2patches(image, grid_h=2, grid_w=2, patch_ref=None, transformation='b c (hg h) (wg w) -> (b hg wg) c h w'):
+    if patch_ref is not None:
+        grid_h, grid_w = image.shape[-2] // patch_ref.shape[-2], image.shape[-1] // patch_ref.shape[-1]
+    patches = rearrange(image, transformation, hg=grid_h, wg=grid_w)
+    return patches
+
+def patches2image(patches, grid_h=2, grid_w=2, patch_ref=None, transformation='(b hg wg) c h w -> b c (hg h) (wg w)'):
+    if patch_ref is not None:
+        grid_h, grid_w = patch_ref.shape[-2] // patches[0].shape[-2], patch_ref.shape[-1] // patches[0].shape[-1]
+    image = rearrange(patches, transformation, hg=grid_h, wg=grid_w)
+    return image
+
 class BiRefNet(
     PreTrainedModel
 ):
     config_class = BiRefNetConfig
     def __init__(self, bb_pretrained=True, config=BiRefNetConfig()):
-        super(BiRefNet, self).__init__(config)
-        bb_pretrained = config.bb_pretrained
+        super(BiRefNet, self).__init__()
         self.config = Config()
         self.epoch = 1
         self.bb = build_backbone(self.config.bb, pretrained=bb_pretrained)
@@ -2124,18 +2139,6 @@ class Decoder(nn.Module):
         self.gdt_convs_attn_3 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
         self.gdt_convs_attn_2 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
 
-    def get_patches_batch(self, x, p):
-        _size_h, _size_w = p.shape[2:]
-        patches_batch = []
-        for idx in range(x.shape[0]):
-            columns_x = torch.split(x[idx], split_size_or_sections=_size_w, dim=-1)
-            patches_x = []
-            for column_x in columns_x:
-                patches_x += [p.unsqueeze(0) for p in torch.split(column_x, split_size_or_sections=_size_h, dim=-2)]
-            patch_sample = torch.cat(patches_x, dim=1)
-            patches_batch.append(patch_sample)
-        return torch.cat(patches_batch, dim=0)
-
     def forward(self, features):
         if self.training and self.config.out_ref:
             outs_gdt_pred = []
@@ -2146,10 +2149,10 @@ class Decoder(nn.Module):
         outs = []
 
         if self.config.dec_ipt:
-            patches_batch = self.get_patches_batch(x, x4) if self.split else x
+            patches_batch = image2patches(x, patch_ref=x4, transformation='b c (hg h) (wg w) -> b (c hg wg) h w') if self.split else x
             x4 = torch.cat((x4, self.ipt_blk5(F.interpolate(patches_batch, size=x4.shape[2:], mode='bilinear', align_corners=True))), 1)
         p4 = self.decoder_block4(x4)
-        m4 = self.conv_ms_spvn_4(p4) if self.config.ms_supervision else None
+        m4 = self.conv_ms_spvn_4(p4) if self.config.ms_supervision and self.training else None
         if self.config.out_ref:
             p4_gdt = self.gdt_convs_4(p4)
             if self.training:
@@ -2167,10 +2170,10 @@ class Decoder(nn.Module):
         _p3 = _p4 + self.lateral_block4(x3)
 
         if self.config.dec_ipt:
-            patches_batch = self.get_patches_batch(x, _p3) if self.split else x
+            patches_batch = image2patches(x, patch_ref=_p3, transformation='b c (hg h) (wg w) -> b (c hg wg) h w') if self.split else x
            _p3 = torch.cat((_p3, self.ipt_blk4(F.interpolate(patches_batch, size=x3.shape[2:], mode='bilinear', align_corners=True))), 1)
         p3 = self.decoder_block3(_p3)
-        m3 = self.conv_ms_spvn_3(p3) if self.config.ms_supervision else None
+        m3 = self.conv_ms_spvn_3(p3) if self.config.ms_supervision and self.training else None
         if self.config.out_ref:
             p3_gdt = self.gdt_convs_3(p3)
             if self.training:
@@ -2193,10 +2196,10 @@ class Decoder(nn.Module):
         _p2 = _p3 + self.lateral_block3(x2)
 
         if self.config.dec_ipt:
-            patches_batch = self.get_patches_batch(x, _p2) if self.split else x
+            patches_batch = image2patches(x, patch_ref=_p2, transformation='b c (hg h) (wg w) -> b (c hg wg) h w') if self.split else x
             _p2 = torch.cat((_p2, self.ipt_blk3(F.interpolate(patches_batch, size=x2.shape[2:], mode='bilinear', align_corners=True))), 1)
         p2 = self.decoder_block2(_p2)
-        m2 = self.conv_ms_spvn_2(p2) if self.config.ms_supervision else None
+        m2 = self.conv_ms_spvn_2(p2) if self.config.ms_supervision and self.training else None
         if self.config.out_ref:
             p2_gdt = self.gdt_convs_2(p2)
             if self.training:
@@ -2214,17 +2217,17 @@ class Decoder(nn.Module):
         _p1 = _p2 + self.lateral_block2(x1)
 
         if self.config.dec_ipt:
-            patches_batch = self.get_patches_batch(x, _p1) if self.split else x
+            patches_batch = image2patches(x, patch_ref=_p1, transformation='b c (hg h) (wg w) -> b (c hg wg) h w') if self.split else x
             _p1 = torch.cat((_p1, self.ipt_blk2(F.interpolate(patches_batch, size=x1.shape[2:], mode='bilinear', align_corners=True))), 1)
         _p1 = self.decoder_block1(_p1)
         _p1 = F.interpolate(_p1, size=x.shape[2:], mode='bilinear', align_corners=True)
 
         if self.config.dec_ipt:
-            patches_batch = self.get_patches_batch(x, _p1) if self.split else x
+            patches_batch = image2patches(x, patch_ref=_p1, transformation='b c (hg h) (wg w) -> b (c hg wg) h w') if self.split else x
             _p1 = torch.cat((_p1, self.ipt_blk1(F.interpolate(patches_batch, size=x.shape[2:], mode='bilinear', align_corners=True))), 1)
         p1_out = self.conv_out1(_p1)
 
-        if self.config.ms_supervision:
+        if self.config.ms_supervision and self.training:
            outs.append(m4)
            outs.append(m3)
            outs.append(m2)
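> A minimal sanity check, as a sketch (assuming a recent PyTorch install, not part of the commit): the tensor-based rounding introduced in `BasicLayer` above is numerically identical to the old `int(np.ceil(...))` arithmetic; it only swaps Python ints for 0-dim tensors so `torch.compile` can trace the padding math.

```
import torch

# The padded window sizes match the previous integer math exactly.
H, W, window_size = 50, 70, 7
Hp = torch.ceil(torch.tensor(H) / window_size).to(torch.int64) * window_size
Wp = torch.ceil(torch.tensor(W) / window_size).to(torch.int64) * window_size
print(int(Hp), int(Wp))  # 56 70, same as int(np.ceil(H / 7)) * 7 etc.
```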
 
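> To make the `einops` refactor concrete, here is a small standalone sketch (my illustration, not part of the commit; assumes `torch` and `einops` are installed) of the channel-stacking transformation that `Decoder.forward` now passes to `image2patches`:

```
import torch
from einops import rearrange

x = torch.randn(1, 3, 64, 64)    # stand-in for the input image
ref = torch.randn(1, 8, 32, 32)  # stand-in for a decoder feature map

# Grid inferred from the reference, as image2patches(patch_ref=...) does.
grid_h, grid_w = x.shape[-2] // ref.shape[-2], x.shape[-1] // ref.shape[-1]

# Same transformation string as in Decoder.forward: patches are stacked
# along the channel axis instead of the batch axis.
patches = rearrange(x, 'b c (hg h) (wg w) -> b (c hg wg) h w', hg=grid_h, wg=grid_w)
print(patches.shape)  # torch.Size([1, 12, 32, 32])
```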
handler.py ADDED
@@ -0,0 +1,132 @@
+# These HF deployment codes refer to https://huggingface.co/not-lain/BiRefNet/raw/main/handler.py.
+from typing import Dict, List, Any, Tuple
+import os
+import requests
+from io import BytesIO
+import cv2
+import numpy as np
+from PIL import Image
+import torch
+from torchvision import transforms
+from transformers import AutoModelForImageSegmentation
+
+torch.set_float32_matmul_precision(["high", "highest"][0])
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+### image_proc.py
+def refine_foreground(image, mask, r=90):
+    if mask.size != image.size:
+        mask = mask.resize(image.size)
+    image = np.array(image) / 255.0
+    mask = np.array(mask) / 255.0
+    estimated_foreground = FB_blur_fusion_foreground_estimator_2(image, mask, r=r)
+    image_masked = Image.fromarray((estimated_foreground * 255.0).astype(np.uint8))
+    return image_masked
+
+
+def FB_blur_fusion_foreground_estimator_2(image, alpha, r=90):
+    # Thanks to the source: https://github.com/Photoroom/fast-foreground-estimation
+    alpha = alpha[:, :, None]
+    F, blur_B = FB_blur_fusion_foreground_estimator(image, image, image, alpha, r)
+    return FB_blur_fusion_foreground_estimator(image, F, blur_B, alpha, r=6)[0]
+
+
+def FB_blur_fusion_foreground_estimator(image, F, B, alpha, r=90):
+    if isinstance(image, Image.Image):
+        image = np.array(image) / 255.0
+    blurred_alpha = cv2.blur(alpha, (r, r))[:, :, None]
+
+    blurred_FA = cv2.blur(F * alpha, (r, r))
+    blurred_F = blurred_FA / (blurred_alpha + 1e-5)
+
+    blurred_B1A = cv2.blur(B * (1 - alpha), (r, r))
+    blurred_B = blurred_B1A / ((1 - blurred_alpha) + 1e-5)
+    F = blurred_F + alpha * \
+        (image - alpha * blurred_F - (1 - alpha) * blurred_B)
+    F = np.clip(F, 0, 1)
+    return F, blurred_B
+
+
+class ImagePreprocessor():
+    def __init__(self, resolution: Tuple[int, int] = (1024, 1024)) -> None:
+        self.transform_image = transforms.Compose([
+            transforms.Resize(resolution),
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ])
+
+    def proc(self, image: Image.Image) -> torch.Tensor:
+        image = self.transform_image(image)
+        return image
+
+usage_to_weights_file = {
+    'General': 'BiRefNet',
+    'General-Lite': 'BiRefNet_lite',
+    'General-Lite-2K': 'BiRefNet_lite-2K',
+    'General-reso_512': 'BiRefNet-reso_512',
+    'Matting': 'BiRefNet-matting',
+    'Portrait': 'BiRefNet-portrait',
+    'DIS': 'BiRefNet-DIS5K',
+    'HRSOD': 'BiRefNet-HRSOD',
+    'COD': 'BiRefNet-COD',
+    'DIS-TR_TEs': 'BiRefNet-DIS5K-TR_TEs',
+    'General-legacy': 'BiRefNet-legacy'
+}
+
+# Choose the version of BiRefNet here.
+usage = 'General'
+
+# Set resolution
+if usage in ['General-Lite-2K']:
+    resolution = (2560, 1440)
+elif usage in ['General-reso_512']:
+    resolution = (512, 512)
+else:
+    resolution = (1024, 1024)
+
+
+class EndpointHandler():
+    def __init__(self, path=''):
+        self.birefnet = AutoModelForImageSegmentation.from_pretrained(
+            '/'.join(('zhengpeng7', usage_to_weights_file[usage])), trust_remote_code=True
+        )
+        self.birefnet.to(device)
+        self.birefnet.eval()
+
+    def __call__(self, data: Dict[str, Any]):
+        """
+        data args:
+            inputs (:obj:`str`)
+            date (:obj:`str`)
+        Return:
+            A :obj:`list` | `dict`: will be serialized and returned
+        """
+        print('data["inputs"] = ', data["inputs"])
+        image_src = data["inputs"]
+        if isinstance(image_src, str):
+            if os.path.isfile(image_src):
+                image_ori = Image.open(image_src)
+            else:
+                response = requests.get(image_src)
+                image_data = BytesIO(response.content)
+                image_ori = Image.open(image_data)
+        else:
+            image_ori = Image.fromarray(image_src)
+
+        image = image_ori.convert('RGB')
+        # Preprocess the image
+        image_preprocessor = ImagePreprocessor(resolution=tuple(resolution))
+        image_proc = image_preprocessor.proc(image)
+        image_proc = image_proc.unsqueeze(0)
+
+        # Prediction
+        with torch.no_grad():
+            preds = self.birefnet(image_proc.to(device))[-1].sigmoid().cpu()
+        pred = preds[0].squeeze()
+
+        # Show Results
+        pred_pil = transforms.ToPILImage()(pred)
+        image_masked = refine_foreground(image, pred_pil)
+        image_masked.putalpha(pred_pil.resize(image.size))
+        return image_masked
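> A quick local smoke test for the new handler, as a sketch (not part of the commit; assumes the packages in requirements.txt are installed and the chosen weights can be downloaded from the Hub):

```
# Sketch only: exercise handler.py locally, run from the repo root.
from handler import EndpointHandler

handler = EndpointHandler(path='.')
result = handler({"inputs": "https://hips.hearstapps.com/hmg-prod/images/gettyimages-1229892983-square.jpg"})
result.save('test_output.png')  # RGBA result with the background removed
```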
requirements.txt ADDED
@@ -0,0 +1,18 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.0.1
+--extra-index-url https://download.pytorch.org/whl/cu118
+torchvision==0.15.2
+numpy<2
+opencv-python
+timm
+scipy
+scikit-image
+kornia
+einops
+
+tqdm
+prettytable
+
+transformers
+huggingface-hub>0.25
+accelerate