HowardZhangdqs committed
Commit c7478e3 · 1 Parent(s): a038fae

feat: main feature

Files changed (7)
  1. .gitattributes +0 -35
  2. .gitignore +2 -0
  3. app.py +182 -0
  4. css/interface.css +12 -0
  5. fetch_paper.py +85 -0
  6. parser.py +133 -0
  7. sorter.py +32 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ .gradio/
app.py ADDED
@@ -0,0 +1,182 @@
+ import gradio as gr
+ from datetime import datetime
+ from fetch_paper import fetch_papers_with_daterange
+ from sorter import sort_by_upvotes
+
+
+ def format_author(author):
+     """Format a single author as an HTML link."""
+     if not author:
+         return ""
+     hidden_status = " (hidden)" if author.hidden else ""
+     if author.name:
+         return f"<a href='https://scholar.google.com/citations?view_op=search_authors&mauthors={author.name.replace(' ', '+')}'>{author.name}</a>{hidden_status}"
+     return f"Anonymous author{hidden_status}"
+
+
+ def format_paper_info(article):
+     """Build the HTML block that presents a single paper."""
+     if not article.paper:
+         return "Paper information missing"
+
+     info = []
+     # Title
+     info.append(f"<h2>{article.title or 'Untitled paper'}</h2>")
+
+     # Thumbnail
+     if article.thumbnail:
+         info.append(f"<p><img src='{article.thumbnail}' style='max-width: 30em; width: 100%; margin: auto'/></p>")
+
+     # Basic information
+     info.append(f"<p><strong>Paper ID</strong>: <a href='https://huggingface.co/papers/{article.paper.id}'>{article.paper.id or 'unknown'}</a></p>")
+     info.append(f"<p><strong>Published</strong>: {article.paper.publishedAt.strftime('%Y-%m-%d %H:%M') if article.paper.publishedAt else 'unknown'}</p>")
+
+     # Authors
+     authors = ", ".join([format_author(a) for a in article.paper.authors]) if article.paper.authors else "Author information unavailable"
+     info.append(f"<p><strong>Authors</strong>: {authors}</p>")
+
+     # Abstract
+     if article.paper.summary:
+         summary = article.paper.summary.replace('{{', '{').replace('}}', '}').replace('\n', ' ')
+         info.append(f"<h3>Abstract</h3><p>{summary}</p>")
+
+     # Discussion information
+     info.append(f"<p><strong>Upvotes</strong>: {article.paper.upvotes or 0}<span style='margin-left: .5rem'></span>")
+     info.append(f"<strong>Comments</strong>: {article.numComments or 0}</p>")
+     if article.paper.discussionId:
+         info.append(f"<a href='https://huggingface.co/papers/{article.paper.id}/discussion/{article.paper.discussionId}'>Join the discussion</a>")
+
+     # Submitter information
+     if article.submittedBy:
+         submitter = article.submittedBy
+         info.append("<hr><p><strong>Submitted by</strong>: ")
+         info.append(
+             f"<span><img src='{submitter.avatarUrl}' class='author' /></span>{submitter.fullname} (<a href='https://huggingface.co/{submitter.name}'>@{submitter.name}</a>) ")
+         info.append(f"Followers: {submitter.followerCount or 0}</p>")
+
+     return "".join(info)
+
+
+ def generate_table_html(papers):
+     """Build the results table with clickable titles."""
+     html = ['<table class="paper-table"><tr><th>Title</th><th>👍 Upvotes</th><th>💬 Comments</th><th>📅 Date</th></tr>']
+
+     for article in papers:
+         title = article.title or "Untitled"
+         upvotes = article.paper.upvotes or 0
+         comments = article.numComments or 0
+         date = article.paper.publishedAt.strftime("%Y-%m-%d") if article.paper.publishedAt else "unknown"
+         paper_id = article.paper.id
+
+         row = f"""
+         <tr>
+             <td><a class="paper-title" href="javascript:void(0)" onclick="showDetail('{paper_id}')">{title}</a></td>
+             <td>{upvotes}</td>
+             <td>{comments}</td>
+             <td>{date}</td>
+         </tr>
+         """
+         html.append(row)
+
+     html.append("</table>")
+     return "".join(html)
+
+
+ def build_html(papers):
+     # Render every paper into a hidden <div>; the div id is derived from the paper id
+     # so the showDetail() script below can toggle the matching one visible.
+     html = ""
+     for article in papers:
+         article_html = format_paper_info(article)
+         html += f"<div id='smartflow-paper-{article.paper.id.replace('.', '-')}' style='display: none'>{article_html}</div>"
+     return html
+
+
+ def query_papers(start_date_str, end_date_str):
+     """Handle a date-range query."""
+     try:
+         start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
+         end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
+         papers = fetch_papers_with_daterange(start_date, end_date)
+         papers = sort_by_upvotes(papers)
+         return generate_table_html(papers), build_html(papers)
+     except Exception as e:
+         print(f"Query failed: {e}")
+         error_html = "<p>⚠️ Query failed, please check the date format (YYYY-MM-DD)</p>"
+         return error_html, error_html
+
+
+ def show_detail(paper_id, papers):
+     """Show the details of a paper."""
+     if not papers:
+         return "Please run a query first"
+
+     return build_html(papers)
+
+
+ # CSS styles (could be moved into a separate file)
+ custom_css = """
+ .paper-table { width: 100%; border-collapse: collapse; }
+ .paper-table td { padding: 12px; border-bottom: 1px solid #ddd; }
+ .paper-table th { font-weight: bold; background: #f9f9f920; }
+ .paper-table tr:hover { background: #f9f9f920; }
+ .paper-title { color: #1a73e8; cursor: pointer; text-decoration: none !important; }
+ .paper-title:hover { text-decoration: underline !important; }
+ .paper-table td:nth-child(2), .paper-table td:nth-child(3), .paper-table td:nth-child(4) { text-align: center; }
+ .paper-table th:nth-child(2), .paper-table th:nth-child(3), .paper-table th:nth-child(4) { text-align: center; }
+ .detail-area { margin-top: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }
+ """
+
+ custom_js = """
+ function showDetail(paperId) {
+     // Hide all other smartflow-paper-* nodes
+     var siblings = document.querySelectorAll(`div[id^='smartflow-paper-']:not(#smartflow-paper-${paperId.replace('.', '-')})`);
+     siblings.forEach(sibling => sibling.style.display = 'none');
+
+     // Show the selected node
+     var paper = document.getElementById(`smartflow-paper-${paperId.replace('.', '-')}`);
+     if (paper) {
+         paper.style.display = 'block';
+     }
+ }
+ """
+
+
+ def create_interface():
+     """Create the interface layout."""
+     with gr.Blocks(title="Hugging Face Daily Paper", css=custom_css, head=f"<script>{custom_js}</script>") as app:
+
+         # Main page
+         gr.Markdown("# 📚 Hugging Face Daily Paper")
+
+         # Query controls
+         with gr.Row():
+             start_date = gr.Textbox(label="Start date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
+             end_date = gr.Textbox(label="End date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
+             query_btn = gr.Button("🔍 Search", variant="primary")
+
+         # Results area
+         with gr.Column(visible=True):
+             results_html = gr.HTML(label="Results")
+
+         # Paper detail area
+         with gr.Column(visible=True, elem_classes="detail-area"):
+             gr.Markdown("## Paper details")
+             detail_html = gr.HTML(elem_id="detail-html")
+
+         # Event wiring
+         query_btn.click(
+             fn=query_papers,
+             inputs=[start_date, end_date],
+             outputs=[results_html, detail_html]
+         )
+
+     return app
+
+
+ if __name__ == "__main__":
+     gr.close_all()
+     app = create_interface()
+     app.launch(
+         # server_name="localhost",
+         # server_port=7860,
+         # share=True
+     )
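The query path can be exercised without launching the Gradio UI. A minimal smoke-test sketch (not part of the commit), assuming the repository root is the working directory; the output file name is only for illustration:

# smoke_test.py — hypothetical helper for checking query_papers() end to end
from app import query_papers

# Fetch, sort by upvotes, and render papers for a fixed date range
table_html, detail_html = query_papers("2024-01-28", "2024-01-30")

# Write the rendered fragments to disk so they can be inspected in a browser
with open("preview.html", "w", encoding="utf-8") as f:
    f.write(table_html + detail_html)
print("Wrote preview.html")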
css/interface.css ADDED
@@ -0,0 +1,12 @@
+ a.author {
+     text-decoration: underline;
+     color: #000;
+ }
+
+ img.author {
+     height: 1.5rem;
+     border: 1px solid #000;
+     vertical-align: middle;
+     border-radius: 50%;
+     display: inline;
+ }
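app.py references the author class on the submitter avatar image but passes only its inline custom_css string to gr.Blocks, so this stylesheet is not loaded by the app as committed. A minimal sketch of one possible way to include it (an assumption, not what the commit does):

import gradio as gr
from app import custom_css, custom_js  # the inline styles and script defined in app.py

# Append the avatar/author rules from css/interface.css to the inline CSS
with open("css/interface.css", "r", encoding="utf-8") as f:
    combined_css = custom_css + "\n" + f.read()

with gr.Blocks(title="Hugging Face Daily Paper", css=combined_css,
               head=f"<script>{custom_js}</script>") as demo:
    gr.Markdown("# 📚 Hugging Face Daily Paper")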
fetch_paper.py ADDED
@@ -0,0 +1,85 @@
+ from parser import parse_article
+ import os
+ import requests
+ import datetime
+ import hashlib
+
+
+ API_URL = "https://huggingface.co/api/daily_papers"
+
+ cache = {}
+
+
+ def make_request(url: str):
+     # Use a hash of the URL as the cache key
+     url_hash = hashlib.md5(url.encode()).hexdigest()
+
+     # Return the cached response if we have one
+     if url_hash in cache:
+         print(f"Cache hit for URL: {url}")
+         return cache[url_hash]
+
+     http_proxy = os.getenv("HF_HTTP_PROXY")
+     https_proxy = os.getenv("HF_HTTPS_PROXY")
+     proxies = {
+         "http": http_proxy,
+         "https": https_proxy
+     } if http_proxy or https_proxy else None
+
+     attempts = 0
+     while attempts < 3:
+         try:
+             response = requests.get(url, proxies=proxies)
+             response.raise_for_status()
+             data = response.json()
+
+             # Cache the response
+             cache[url_hash] = data
+
+             return data
+         except requests.RequestException as e:
+             attempts += 1
+             print(f"Attempt {attempts} failed: {e}")
+     # All attempts failed: return an empty list so callers can keep going
+     return []
+
+
+ def fetch_papers():
+     data = make_request(API_URL)
+     return [parse_article(item) for item in data]
+
+
+ def fetch_papers_with_date(date: datetime.datetime):
+     formatted_date = date.strftime("%Y-%m-%d")
+     data = make_request(API_URL + "?date=" + formatted_date)
+     return [parse_article(item) for item in data]
+
+
+ def fetch_papers_with_daterange(start_date: datetime.datetime, end_date: datetime.datetime):
+     # Each day is served independently, so just walk the date range
+     articles = []
+     current_date = start_date
+     while current_date <= end_date:
+         print(current_date)
+         articles.extend(fetch_papers_with_date(current_date))
+         print(f"Total articles: {len(articles)}")
+         current_date += datetime.timedelta(days=1)
+
+     # Deduplicate by paper id
+     unique_articles = {}
+     for article in articles:
+         if article.paper.id not in unique_articles:
+             unique_articles[article.paper.id] = article
+
+     return list(unique_articles.values())
+
+
+ if __name__ == "__main__":
+     from rich import print
+     start_date = datetime.datetime(2024, 1, 28)
+     end_date = datetime.datetime(2024, 1, 30)
+     articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
+     print(f"Total articles: {len(articles)}")
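A minimal sketch of fetching a single day and hitting the in-memory cache on the second call; the HF_HTTP_PROXY / HF_HTTPS_PROXY variables are optional and only read if set:

import datetime
from fetch_paper import fetch_papers_with_date

day = datetime.datetime(2024, 1, 29)

# First call goes to the daily_papers API; the JSON response is cached by URL hash
articles = fetch_papers_with_date(day)
print(len(articles), "articles on", day.date())

# Second call with the same date prints "Cache hit for URL: ..." and reuses the cached data
articles_again = fetch_papers_with_date(day)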
parser.py ADDED
@@ -0,0 +1,133 @@
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import List, Optional, Any, Dict
+
+ # Data classes with Optional fields and defaults, so partial API responses still parse
+
+
+ @dataclass
+ class Author:
+     _id: Optional[str] = None
+     name: Optional[str] = None
+     hidden: Optional[bool] = None
+
+
+ @dataclass
+ class Paper:
+     id: Optional[str] = None
+     authors: Optional[List[Author]] = None
+     publishedAt: Optional[datetime] = None
+     title: Optional[str] = None
+     summary: Optional[str] = None
+     upvotes: Optional[int] = None
+     discussionId: Optional[str] = None
+
+
+ @dataclass
+ class SubmittedBy:
+     _id: Optional[str] = None
+     avatarUrl: Optional[str] = None
+     fullname: Optional[str] = None
+     name: Optional[str] = None
+     type: Optional[str] = None
+     isPro: Optional[bool] = None
+     isHf: Optional[bool] = None
+     isMod: Optional[bool] = None
+     followerCount: Optional[int] = None
+
+
+ @dataclass
+ class Article:
+     paper: Optional[Paper] = None
+     publishedAt: Optional[datetime] = None
+     title: Optional[str] = None
+     thumbnail: Optional[str] = None
+     numComments: Optional[int] = None
+     submittedBy: Optional[SubmittedBy] = None
+     isAuthorParticipating: Optional[bool] = None
+
+
+ def safe_get(data: Dict, *keys: str) -> Any:
+     """Safely read a nested dictionary value."""
+     for key in keys:
+         data = data.get(key, {}) if isinstance(data, dict) else None
+     return data if data != {} else None
+
+
+ def parse_article(data: Dict[str, Any]) -> Article:
+     """Fault-tolerant parser for one daily_papers API item."""
+
+     def parse_datetime(dt_str: Optional[str]) -> Optional[datetime]:
+         """Safely parse an ISO 8601 timestamp."""
+         if not dt_str:
+             return None
+         try:
+             if dt_str.endswith('Z'):
+                 dt_str = dt_str[:-1] + '+00:00'
+             return datetime.fromisoformat(dt_str)
+         except ValueError:
+             return None
+
+     # Parse the author list
+     authors = []
+     for author_data in safe_get(data, "paper", "authors") or []:
+         authors.append(Author(
+             _id=author_data.get("_id"),
+             name=author_data.get("name"),
+             hidden=author_data.get("hidden")
+         ))
+
+     # Parse the paper
+     paper = Paper(
+         id=safe_get(data, "paper", "id"),
+         authors=authors,
+         publishedAt=parse_datetime(safe_get(data, "paper", "publishedAt")),
+         title=safe_get(data, "paper", "title"),
+         summary=safe_get(data, "paper", "summary"),
+         upvotes=safe_get(data, "paper", "upvotes"),
+         discussionId=safe_get(data, "paper", "discussionId")
+     ) if safe_get(data, "paper") else None
+
+     # Parse the submitter
+     submitted_by_data = safe_get(data, "submittedBy")
+     submitted_by = SubmittedBy(
+         _id=submitted_by_data.get("_id"),
+         avatarUrl=submitted_by_data.get("avatarUrl"),
+         fullname=submitted_by_data.get("fullname"),
+         name=submitted_by_data.get("name"),
+         type=submitted_by_data.get("type"),
+         isPro=submitted_by_data.get("isPro"),
+         isHf=submitted_by_data.get("isHf"),
+         isMod=submitted_by_data.get("isMod"),
+         followerCount=submitted_by_data.get("followerCount")
+     ) if submitted_by_data else None
+
+     # Assemble the final object
+     return Article(
+         paper=paper,
+         publishedAt=parse_datetime(data.get("publishedAt")),
+         title=data.get("title"),
+         thumbnail=data.get("thumbnail"),
+         numComments=data.get("numComments"),
+         submittedBy=submitted_by,
+         isAuthorParticipating=data.get("isAuthorParticipating")
+     )
+
+
+ # Usage example
+ if __name__ == "__main__":
+     import json
+     from rich import print
+
+     # Assumes a raw API response has been saved to article.json
+     with open("article.json") as f:
+         raw_data = json.load(f)
+
+     articles = []
+
+     for raw_article in raw_data:
+         article = parse_article(raw_article)
+         articles.append(article)
+
+     print(articles[0])
+     print(len(articles))
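Because every field is Optional, parse_article degrades gracefully on partial input. A minimal sketch with an inline, hypothetical API item (the real payload comes from the daily_papers endpoint):

from parser import parse_article

# Hypothetical, deliberately incomplete item: no summary, no thumbnail, no submitter
raw = {
    "paper": {
        "id": "2401.12345",
        "title": "An Example Paper",
        "upvotes": 3,
        "authors": [{"name": "Ada Lovelace", "hidden": False}],
        "publishedAt": "2024-01-29T10:00:00Z",
    },
    "title": "An Example Paper",
    "numComments": 0,
}

article = parse_article(raw)
print(article.paper.id, article.paper.upvotes)      # 2401.12345 3
print(article.paper.summary, article.submittedBy)   # None None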
sorter.py ADDED
@@ -0,0 +1,32 @@
+ from typing import List
+
+ from parser import Article
+
+
+ def sort_by_date(articles: List[Article]) -> List[Article]:
+     return sorted(articles, key=lambda x: x.publishedAt, reverse=True)
+
+
+ def sort_by_upvotes(articles: List[Article]) -> List[Article]:
+     # Missing upvote counts are treated as 0 so None values cannot break the sort
+     return sorted(articles, key=lambda x: x.paper.upvotes or 0, reverse=True)
+
+
+ def sort_by_comments(articles: List[Article]) -> List[Article]:
+     # Missing comment counts are treated as 0
+     return sorted(articles, key=lambda x: x.numComments or 0, reverse=True)
+
+
+ if __name__ == "__main__":
+     from fetch_paper import fetch_papers
+     from rich import print
+
+     articles = fetch_papers()
+
+     print("Latest paper:")
+     articles = sort_by_date(articles)
+     print(articles[0])
+
+     print("Most upvoted paper:")
+     articles = sort_by_upvotes(articles)
+     print(articles[0])
+
+     print("Most commented paper:")
+     articles = sort_by_comments(articles)
+     print(articles[0])
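A minimal sketch of how app.py combines these pieces: fetch a date range, sort by upvotes, and print a short top list (the date range is arbitrary):

import datetime
from fetch_paper import fetch_papers_with_daterange
from sorter import sort_by_upvotes

articles = fetch_papers_with_daterange(
    datetime.datetime(2024, 1, 28), datetime.datetime(2024, 1, 30))

# Highest-upvoted papers first, mirroring what query_papers() does in app.py
for article in sort_by_upvotes(articles)[:5]:
    print(article.paper.upvotes, "-", article.title)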