Upload 2 files

- app.py +3 -3
- hfsearch.py +134 -46
app.py
CHANGED
@@ -17,10 +17,10 @@ with gr.Blocks(theme="NoCrypt/miku", fill_width=True, css=CSS) as demo:
     with gr.Tab("Normal Search"):
         with gr.Group():
             with gr.Row(equal_height=True):
-                repo_types = gr.CheckboxGroup(label="Repo type", choices=["model", "dataset", "space"], value=["model", "dataset", "space"])
+                repo_types = gr.CheckboxGroup(label="Repo type", choices=["model", "dataset", "space", "collection"], value=["model", "dataset", "space"])
+                filter_str = gr.Textbox(label="Filter", info="String(s) to filter repos", value="")
             with gr.Accordion("Advanced", open=False):
                 with gr.Row(equal_height=True):
-                    filter_str = gr.Textbox(label="Filter", info="String(s) to filter repos", value="")
                     search_str = gr.Textbox(label="Search", info="A string that will be contained in the returned repo ids", placeholder="bert", value="", lines=1)
                     author = gr.Textbox(label="Author", info="The author (user or organization)", value="", lines=1)
                     with gr.Column():
@@ -79,7 +79,7 @@ with gr.Blocks(theme="NoCrypt/miku", fill_width=True, css=CSS) as demo:
             #rec_repo_id = gr.Textbox(label="Repo ID", info="Input your favorite repo", value="")
             rec_repo_id = HuggingfaceHubSearch(label="Repo ID", placeholder="Input your favorite Repo ID", search_type=["model", "dataset", "space"],
                                                sumbit_on_select=False)
-            rec_repo_types = gr.CheckboxGroup(label="Repo type", choices=["model", "dataset", "space"], value=["model", "dataset", "space"])
+            rec_repo_types = gr.CheckboxGroup(label="Repo type", choices=["model", "dataset", "space", "collection"], value=["model", "dataset", "space", "collection"])
             with gr.Row(equal_height=True):
                 rec_sort = gr.Radio(label="Sort", choices=["last_modified", "likes", "downloads", "downloads_all_time", "trending_score"], value="likes")
                 rec_limit = gr.Number(label="Limit", value=20, step=1, minimum=1, maximum=1000)
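On the UI side this change only adds an extra checkbox choice; the selected repo types reach the callback as a plain list of strings. A minimal, hypothetical sketch of that wiring (run_search and the handler names are stand-ins, not the app's actual callbacks):

import gradio as gr

# Hypothetical stand-in for the real search backend; it only echoes the
# selected repo types to show that CheckboxGroup delivers a list[str].
def run_search(repo_types: list[str], filter_str: str) -> str:
    return f"types={repo_types}, filter={filter_str!r}"

with gr.Blocks() as demo:
    repo_types = gr.CheckboxGroup(label="Repo type",
                                  choices=["model", "dataset", "space", "collection"],
                                  value=["model", "dataset", "space"])
    filter_str = gr.Textbox(label="Filter", value="")
    result = gr.Textbox(label="Result")
    gr.Button("Search").click(run_search, inputs=[repo_types, filter_str], outputs=result)

if __name__ == "__main__":
    demo.launch()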
hfsearch.py
CHANGED
@@ -1,6 +1,7 @@
 import spaces
 import gradio as gr
-from huggingface_hub import HfApi, ModelInfo, DatasetInfo, SpaceInfo
+from huggingface_hub import HfApi, ModelInfo, DatasetInfo, SpaceInfo, Collection
+from huggingface_hub.hf_api import PaperInfo
 from typing import Union
 import gc
 import pandas as pd
@@ -16,8 +17,11 @@ def dummy_gpu():
 RESULT_ITEMS = {
     "Type": [1, "str", True],
     "ID": [2, "markdown", True, "40%"],
-    "
-    "
+    "User": [4, "str", False],
+    "Name": [5, "str", False],
+    "URL": [6, "str", False],
+    "Status": [7, "markdown", True],
+    "Gated": [8, "str", True],
     "Likes": [10, "number", True],
     "DLs": [12, "number", True],
     "AllDLs": [13, "number", False],
@@ -30,6 +34,14 @@ RESULT_ITEMS = {
     "NFAA": [40, "str", False],
 }
 
+SORT_PARAM_TO_ITEM = {
+    "last_modified": "LastMod.",
+    "likes": "Likes",
+    "downloads": "DLs",
+    "downloads_all_time": "AllDLs",
+    "trending_score": "Trending",
+}
+
 try:
     with open("tags.json", encoding="utf-8") as f:
         TAGS = json.load(f)
@@ -122,7 +134,6 @@ def get_repo_collections(repo_id: str, repo_type: str="model", limit=10):
         for c in cols:
             col = api.get_collection(collection_slug=c.slug)
             for i in col.items:
-                if i.item_type == "paper": continue
                 id = i.item_id
                 cols_dict[id] = cols_dict.get(id, 1) + 1
                 types_dict[id] = i.item_type
@@ -145,7 +156,6 @@ def get_users_collections(users: list[str], limit=10):
         for c in cols:
             col = api.get_collection(collection_slug=c.slug)
             for i in col.items:
-                if i.item_type == "paper": continue
                 id = i.item_id
                 cols_dict[id] = cols_dict.get(id, 1) + 1
                 types_dict[id] = i.item_type
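With the paper skip removed, collection items of every item_type now reach the counting dicts. A small sketch of what those items expose (the owner used to pick a collection is just an example, not taken from the app):

from huggingface_hub import HfApi

api = HfApi()
# Any public collection works here; the owner is only an example.
cols = list(api.list_collections(owner="huggingface", limit=1))
if cols:
    col = api.get_collection(collection_slug=cols[0].slug)
    types_dict = {}
    for i in col.items:
        # item_type is e.g. "model", "dataset", "space", or "paper"
        types_dict[i.item_id] = i.item_type
    print(types_dict)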
@@ -176,6 +186,42 @@ def get_ref_repos(repo_id: str):
     counts_list = list(refs.values())
     return refs_list, types_list, counts_list
 
+def get_collections_by_repo(repo_id: str, repo_type: str="model", limit=100):
+    try:
+        api = HfApi()
+        if repo_type == "dataset": item = f"datasets/{repo_id}"
+        elif repo_type == "space": item = f"spaces/{repo_id}"
+        else: item = f"models/{repo_id}"
+        cols = api.list_collections(item=item, sort="upvotes", limit=limit)
+        return [c for c in cols]
+    except Exception as e:
+        print(e)
+        raise Exception(e)
+
+def get_collections_by_users(users: list[str], limit=100):
+    try:
+        api = HfApi()
+        cols_list = []
+        for user in users[0:6]:
+            cols = api.list_collections(owner=user, sort="upvotes", limit=limit)
+            for col in cols:
+                cols_list.append(col)
+        return cols_list
+    except Exception as e:
+        print(e)
+        raise Exception(e)
+
+def get_ref_collections(repo_id: str, limit=10):
+    try:
+        repo_type = get_repo_type(repo_id)
+        likers = get_repo_likers(repo_id, repo_type)[0:10]
+        cols = get_collections_by_repo(repo_id, repo_type, limit) + get_collections_by_users(likers, limit)
+        cols = list({k.slug: k for k in cols}.values())
+        return cols
+    except Exception as e:
+        print(e)
+        raise Exception(e)
+
 def str_to_list(s: str):
     try:
         m = re.split("\n", s)
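A usage sketch of the calls behind the three new helpers, assuming an arbitrary public repo and owner (both identifiers below are illustrative, not taken from the app): HfApi.list_collections is filtered either by item= or by owner=, and get_ref_collections then merges both result sets and de-duplicates by slug.

from huggingface_hub import HfApi

api = HfApi()

# Collections that contain a given repo (mirrors get_collections_by_repo);
# the repo id is an arbitrary public example.
by_repo = list(api.list_collections(item="models/black-forest-labs/FLUX.1-dev",
                                    sort="upvotes", limit=10))

# Collections owned by a given user (mirrors get_collections_by_users).
by_owner = list(api.list_collections(owner="huggingface", sort="upvotes", limit=10))

# get_ref_collections merges both lists and de-duplicates by slug.
merged = list({c.slug: c for c in by_repo + by_owner}.values())
for c in merged[:5]:
    print(c.slug, "|", c.title, "|", c.upvotes)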
@@ -263,25 +309,47 @@ class HFSearchResult():
         if isinstance(i, ModelInfo): type = "model"
         elif isinstance(i, DatasetInfo): type = "dataset"
         elif isinstance(i, SpaceInfo): type = "space"
+        elif isinstance(i, PaperInfo): type = "paper"
+        elif isinstance(i, Collection): type = "collection"
         else: return
         self._set(type, "Type")
-        if i.
-        if i.
-        if i.
-        if
-        self._set(i.
+        if type in ["space", "model", "dataset"]:
+            self._set(i.id, "ID")
+            self._set(i.id.split("/")[0], "User")
+            self._set(i.id.split("/")[1], "Name")
+            if type == "dataset": self._set(f"https://hf.co/datasets/{i.id}", "URL")
+            elif type == "space": self._set(f"https://hf.co/spaces/{i.id}", "URL")
+            else: self._set(f"https://hf.co/{i.id}", "URL")
+            if i.likes is not None: self._set(i.likes, "Likes")
+            if i.last_modified is not None: self._set(date_to_str(i.last_modified), "LastMod.")
+            if i.trending_score is not None: self._set(int(i.trending_score), "Trending")
+            if i.tags is not None: self._set("True" if "not-for-all-audiences" in i.tags else "False", "NFAA")
+            if type in ["model", "dataset"]:
+                if i.gated is not None: self._set(i.gated if i.gated else "off", "Gated")
+                if i.downloads is not None: self._set(i.downloads, "DLs")
+                if i.downloads_all_time is not None: self._set(i.downloads_all_time, "AllDLs")
+            if type == "model":
+                if i.inference is not None: self._set(i.inference, "Status")
+                if i.library_name is not None: self._set(i.library_name, "Library")
+                if i.pipeline_tag is not None: self._set(i.pipeline_tag, "Pipeline")
+            if type == "space":
+                if i.runtime is not None:
+                    self._set(i.runtime.hardware, "Hardware")
+                    self._set(i.runtime.stage, "Stage")
+        elif type == "paper": # https://github.com/huggingface/huggingface_hub/blob/v0.27.0/src/huggingface_hub/hf_api.py#L1428
+            self._set(i.id, "ID")
+            self._set(f"https://hf.co/papers/{i.id}", "URL")
+            if i.submitted_by is not None: self._set(i.submitted_by, "User")
+            if i.title is not None: self._set(i.title, "Name")
+            if i.submitted_at is not None: self._set(date_to_str(i.submitted_at), "LastMod.")
+            if i.upvotes is not None: self._set(i.upvotes, "Likes")
+        elif type == "collection":
+            self._set(i.slug, "ID")
+            if i.owner is not None: self._set(i.owner["name"], "User")
+            if i.title is not None: self._set(i.title, "Name")
+            if i.last_updated is not None: self._set(date_to_str(i.last_updated), "LastMod.")
+            if i.upvotes is not None: self._set(i.upvotes, "Likes")
+            if i.url is not None: self._set(i.url, "URL")
         self._next()
 
     def search(self, repo_types: list, sort: str, sort_method: str, filter_str: str, search_str: str, author: str, tags: str, infer: str, gated: str, appr: list[str],
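The branches above amount to "read whichever attributes the hub object actually has". A standalone illustration of the same isinstance dispatch, reduced to a plain dict instead of the class's _set/_next bookkeeping (to_row is a made-up name for this sketch, not part of the module):

from huggingface_hub import HfApi, ModelInfo, Collection

def to_row(obj) -> dict:
    # Reduced illustration of HFSearchResult.add_item: map a hub object to a
    # few of the RESULT_ITEMS columns, depending on its concrete type.
    if isinstance(obj, ModelInfo):
        return {"Type": "model", "ID": obj.id, "Likes": obj.likes,
                "DLs": obj.downloads, "URL": f"https://hf.co/{obj.id}"}
    if isinstance(obj, Collection):
        return {"Type": "collection", "ID": obj.slug, "User": obj.owner["name"],
                "Name": obj.title, "Likes": obj.upvotes, "URL": obj.url}
    return {}

api = HfApi()
print(to_row(next(iter(api.list_models(limit=1)))))
print(to_row(next(iter(api.list_collections(limit=1)))))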
@@ -294,13 +362,22 @@ class HFSearchResult():
             mkwargs = {}
             dkwargs = {}
             skwargs = {}
+            ckwargs = {}
+            pkwargs = {}
+            if filter_str:
+                kwargs["filter"] = str_to_list(filter_str)
+                ckwargs["item"] = str_to_list(filter_str)
+                pkwargs["query"] = str_to_list(filter_str)
             if search_str: kwargs["search"] = search_str
-            if author:
+            if author:
+                kwargs["author"] = author
+                ckwargs["owner"] = author
             if tags and is_valid_arg(tags):
                 mkwargs["tags"] = str_to_list(tags)
                 dkwargs["tags"] = str_to_list(tags)
-            if limit > 0:
+            if limit > 0:
+                kwargs["limit"] = limit
+                ckwargs["limit"] = 100 if limit > 100 else limit
             if sort_method == "descending order": kwargs["direction"] = -1
             if gated == "gated":
                 mkwargs["gated"] = True
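For a concrete feel of the fan-out, here is a minimal sketch of the dicts this block builds for one example filter/author/limit (values are illustrative; the module's str_to_list splits the textbox on newlines, approximated here with str.split):

# Illustrative only: mirrors the kwargs fan-out above for one example query.
filter_str, author, limit = "sdxl", "example-user", 500

kwargs, ckwargs, pkwargs = {}, {}, {}
if filter_str:
    kwargs["filter"] = filter_str.split("\n")   # rough stand-in for str_to_list()
    ckwargs["item"] = filter_str.split("\n")
    pkwargs["query"] = filter_str.split("\n")
if author:
    kwargs["author"] = author
    ckwargs["owner"] = author
if limit > 0:
    kwargs["limit"] = limit
    ckwargs["limit"] = 100 if limit > 100 else limit  # collection listing is capped at 100 here

print(kwargs)   # {'filter': ['sdxl'], 'author': 'example-user', 'limit': 500}
print(ckwargs)  # {'item': ['sdxl'], 'owner': 'example-user', 'limit': 100}
print(pkwargs)  # {'query': ['sdxl']}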
@@ -332,11 +409,15 @@ class HFSearchResult():
                     if len(hardware) > 0 and space.runtime.stage == "RUNNING" and space.runtime.hardware not in hardware: continue
                     if len(stage) > 0 and space.runtime.stage not in stage: continue
                     self.add_item(space)
-            if
+            if "paper" in repo_types:
+                papers = api.list_papers(**pkwargs)
+                for paper in papers:
+                    self.add_item(paper)
+            if "collection" in repo_types:
+                cols = api.list_collections(**ckwargs)
+                for col in cols:
+                    self.add_item(col)
+            self.sort(sort)
         except Exception as e:
             raise Exception(f"Search error: {e}") from e
 
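api.list_papers and api.list_collections return iterables of PaperInfo and Collection objects, which is what lets add_item dispatch on isinstance. A small sketch for the paper path (the query string is an example):

from huggingface_hub import HfApi

api = HfApi()
# Mirrors api.list_papers(**pkwargs) with an example free-text query.
for paper in list(api.list_papers(query="stable diffusion"))[:3]:
    print(paper.id, "|", paper.title, "|", paper.upvotes)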
@@ -345,19 +426,20 @@ class HFSearchResult():
             self.reset()
             self.show_labels = show_labels.copy()
             api = HfApi()
+            if "model" in repo_types or "dataset" in repo_types or "space" in repo_types or "paper" in repo_types:
+                repos, types, counts = get_ref_repos(repo_id)
+                i = 0
+                for r, t in zip(repos, types):
+                    if i + 1 > limit: break
+                    i += 1
+                    if t not in repo_types: continue
+                    info = api.repo_info(repo_id=r, repo_type=t)
+                    if info: self.add_item(info)
+            if "collection" in repo_types:
+                cols = get_ref_collections(repo_id, limit)
+                for col in cols:
+                    self.add_item(col)
+            self.sort(sort)
         except Exception as e:
             raise Exception(f"Search error: {e}") from e
 
@@ -410,9 +492,9 @@ class HFSearchResult():
             return sdf
 
         def id_to_md(df: pd.DataFrame):
-            if df["Type"] == "
-            elif df["Type"] == "
-            else: return f'[{df["ID"]}](
+            if df["Type"] == "collection": return f'[{df["User"]}: {df["Name"]}]({df["URL"]})'
+            elif df["Type"] == "paper": return f'[{df["Name"]} (arxiv:{df["ID"]})]({df["URL"]})'
+            else: return f'[{df["ID"]}]({df["URL"]})'
 
         def format_md_df(df: pd.DataFrame):
             df["ID"] = df.apply(id_to_md, axis=1)
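Applied row-wise via df.apply(..., axis=1), the helper turns each record into a markdown link. A quick check against a toy DataFrame (all values below are made up):

import pandas as pd

def id_to_md(df: pd.DataFrame):
    # Same branching as above; df is a single row when used with apply(axis=1).
    if df["Type"] == "collection": return f'[{df["User"]}: {df["Name"]}]({df["URL"]})'
    elif df["Type"] == "paper": return f'[{df["Name"]} (arxiv:{df["ID"]})]({df["URL"]})'
    else: return f'[{df["ID"]}]({df["URL"]})'

rows = pd.DataFrame([
    {"Type": "model", "ID": "org/model-x", "User": "org", "Name": "model-x",
     "URL": "https://hf.co/org/model-x"},
    {"Type": "collection", "ID": "org/cool-stuff-123", "User": "org",
     "Name": "Cool stuff", "URL": "https://hf.co/collections/org/cool-stuff-123"},
])
print(rows.apply(id_to_md, axis=1).tolist())
# ['[org/model-x](https://hf.co/org/model-x)',
#  '[org: Cool stuff](https://hf.co/collections/org/cool-stuff-123)']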
@@ -460,6 +542,12 @@ class HFSearchResult():
 
     def sort(self, key="Likes"):
         if len(self.item_list) == 0: raise Exception("No item found.")
+        if key in SORT_PARAM_TO_ITEM.keys(): key = SORT_PARAM_TO_ITEM[key]
+        types = set()
+        for i in self.item_list:
+            if "Type" in i.keys(): types.add(i["Type"])
+        if "paper" in types: return
+        if key in ["DLs", "AllDLs"] and ("space" in types or "collection" in types): key = "Likes"
         if not key in self.labels.get()[0]: key = "Likes"
         self.item_list, self.item_hide_flags, self.item_info_list = zip(*sorted(zip(self.item_list, self.item_hide_flags, self.item_info_list), key=lambda x: x[0][key], reverse=True))
 
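A standalone sketch of the new key handling, separate from the class: hub sort parameters are remapped through SORT_PARAM_TO_ITEM, results containing papers are left in their original order, and download-based keys fall back to Likes when spaces or collections are mixed in (pick_sort_key is an illustrative name, not part of the module):

SORT_PARAM_TO_ITEM = {
    "last_modified": "LastMod.",
    "likes": "Likes",
    "downloads": "DLs",
    "downloads_all_time": "AllDLs",
    "trending_score": "Trending",
}

def pick_sort_key(key: str, items: list[dict]):
    # Mirrors the key selection in HFSearchResult.sort(); None means
    # "leave the current order untouched" (the paper case above).
    if key in SORT_PARAM_TO_ITEM:
        key = SORT_PARAM_TO_ITEM[key]
    types = {i["Type"] for i in items if "Type" in i}
    if "paper" in types:
        return None
    if key in ["DLs", "AllDLs"] and ("space" in types or "collection" in types):
        key = "Likes"
    return key

items = [{"Type": "model", "Likes": 5, "DLs": 100},
         {"Type": "collection", "Likes": 9}]
key = pick_sort_key("downloads", items)   # -> "Likes": collections have no download counts
if key is not None:
    items = sorted(items, key=lambda x: x[key], reverse=True)
print(key, items)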