Commit c7478e3
Parent(s): a038fae

feat: main feature

Files changed:
- .gitattributes +0 -35
- .gitignore +2 -0
- app.py +182 -0
- css/interface.css +12 -0
- fetch_paper.py +85 -0
- parser.py +133 -0
- sorter.py +32 -0
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,2 @@
+__pycache__/
+.gradio/
app.py
ADDED
@@ -0,0 +1,182 @@
+import gradio as gr
+from datetime import datetime
+from fetch_paper import fetch_papers_with_daterange
+from sorter import sort_by_upvotes
+
+
+def format_author(author):
+    """Format a single author as an HTML link."""
+    if not author:
+        return ""
+    hidden_status = " (hidden)" if author.hidden else ""
+    if author.name:
+        return f"<a href='https://scholar.google.com/citations?view_op=search_authors&mauthors={author.name.replace(' ', '+')}'>{author.name}</a>{hidden_status}"
+    return f"Anonymous author{hidden_status}"
+
+
+def format_paper_info(article):
+    """Build the HTML detail view for one paper."""
+    if not article.paper:
+        return "Paper information is missing"
+
+    info = []
+    # Title
+    info.append(f"<h2>{article.title or 'Untitled paper'}</h2>")
+
+    # Thumbnail
+    if article.thumbnail:
+        info.append(f"<p><img src='{article.thumbnail}' style='max-width: 30em; width: 100%; margin: auto'/></p>")
+
+    # Basic information
+    info.append(f"<p><strong>Paper ID</strong>: <a href='https://huggingface.co/papers/{article.paper.id}'>{article.paper.id or 'unknown'}</a></p>")
+    info.append(f"<p><strong>Published</strong>: {article.paper.publishedAt.strftime('%Y-%m-%d %H:%M') if article.paper.publishedAt else 'unknown'}</p>")
+
+    # Authors
+    authors = ", ".join([format_author(a) for a in article.paper.authors]) if article.paper.authors else "Author information unavailable"
+    info.append(f"<p><strong>Authors</strong>: {authors}</p>")
+
+    # Summary
+    if article.paper.summary:
+        summary = article.paper.summary.replace('{{', '{').replace('}}', '}').replace('\n', ' ')
+        info.append(f"<h3>Summary</h3><p>{summary}</p>")
+
+    # Discussion stats
+    info.append(f"<p><strong>Upvotes</strong>: {article.paper.upvotes or 0}<span style='margin-left: .5rem'></span>")
+    info.append(f"<strong>Comments</strong>: {article.numComments or 0}</p>")
+    if article.paper.discussionId:
+        info.append(f"<p><a href='https://huggingface.co/papers/{article.paper.id}/discussion/{article.paper.discussionId}'>Join the discussion</a></p>")
+
+    # Submitter
+    if article.submittedBy:
+        submitter = article.submittedBy
+        info.append("<hr><p><strong>Submitted by</strong>: ")
+        info.append(
+            f"<span><img src='{submitter.avatarUrl}' class='author' /></span>{submitter.fullname} (<a href='https://huggingface.co/{submitter.name}'>@{submitter.name}</a>) ")
+        info.append(f"Followers: {submitter.followerCount or 0}</p>")
+
+    return "".join(info)
+
+
+def generate_table_html(papers):
+    """Build the results table with clickable titles."""
+    html = ['<table class="paper-table"><tr><th>Title</th><th>👍 Upvotes</th><th>💬 Comments</th><th>📅 Date</th></tr>']
+
+    for article in papers:
+        title = article.title or "Untitled"
+        upvotes = article.paper.upvotes or 0
+        comments = article.numComments or 0
+        date = article.paper.publishedAt.strftime("%Y-%m-%d") if article.paper.publishedAt else "unknown"
+        paper_id = article.paper.id
+
+        row = f"""
+        <tr>
+            <td><a class="paper-title" href="javascript:void(0)" onclick="showDetail('{paper_id}')">{title}</a></td>
+            <td>{upvotes}</td>
+            <td>{comments}</td>
+            <td>{date}</td>
+        </tr>
+        """
+        html.append(row)
+
+    html.append("</table>")
+    return "".join(html)
+
+
+def build_html(papers):
+    # Render all papers into one HTML string; each paper sits in its own
+    # initially hidden div whose id is derived from the paper's id.
+    html = ""
+    for article in papers:
+        article_html = format_paper_info(article)
+        html += f"<div id='smartflow-paper-{article.paper.id.replace('.', '-')}' style='display: none'>{article_html}</div>"
+    return html
+
+
+def query_papers(start_date_str, end_date_str):
+    """Handle a date-range query."""
+    try:
+        start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
+        end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
+        papers = fetch_papers_with_daterange(start_date, end_date)
+        papers = sort_by_upvotes(papers)
+        return generate_table_html(papers), build_html(papers)
+    except Exception as e:
+        print(f"Query failed: {e}")
+        return "<p>⚠️ Query failed; please check the date format (YYYY-MM-DD)</p>", "<p>⚠️ Query failed; please check the date format (YYYY-MM-DD)</p>"
+
+
+def show_detail(paper_id, papers):
+    """Show the details of one paper."""
+    if not papers:
+        return "Please run a query first"
+
+    return build_html(papers)
+
+
+# CSS styles (could live in a separate file)
+custom_css = """
+.paper-table { width: 100%; border-collapse: collapse; }
+.paper-table td { padding: 12px; border-bottom: 1px solid #ddd; }
+.paper-table th { font-weight: bold; background: #f9f9f920; }
+.paper-table tr:hover { background: #f9f9f920; }
+.paper-title { color: #1a73e8; cursor: pointer; text-decoration: none !important; }
+.paper-title:hover { text-decoration: underline !important; }
+.paper-table td:nth-child(2), .paper-table td:nth-child(3), .paper-table td:nth-child(4) { text-align: center; }
+.paper-table th:nth-child(2), .paper-table th:nth-child(3), .paper-table th:nth-child(4) { text-align: center; }
+.detail-area { margin-top: 20px; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }
+"""
+
+custom_js = """
+function showDetail(paperId) {
+    // Hide every other paper div (all ids start with 'smartflow-paper-')
+    var siblings = document.querySelectorAll(`div[id^='smartflow-paper-']:not(#smartflow-paper-${paperId.replaceAll('.', '-')})`);
+    siblings.forEach(sibling => sibling.style.display = 'none');
+
+    // Show the selected paper's div
+    var paper = document.getElementById(`smartflow-paper-${paperId.replaceAll('.', '-')}`);
+    if (paper) {
+        paper.style.display = 'block';
+    }
+}
+"""
+
+
+def create_interface():
+    """Create the interface layout."""
+    with gr.Blocks(title="Hugging Face Daily Paper", css=custom_css, head=f"<script>{custom_js}</script>") as app:
+
+        # Main view
+        gr.Markdown("# 📚 Hugging Face Daily Paper")
+
+        # Query controls
+        with gr.Row():
+            start_date = gr.Textbox(label="Start date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
+            end_date = gr.Textbox(label="End date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
+            query_btn = gr.Button("🔍 Search", variant="primary")
+
+        # Results area
+        with gr.Column(visible=True):
+            results_html = gr.HTML(label="Results")
+
+        # Paper detail area
+        with gr.Column(visible=True, elem_classes="detail-area"):
+            gr.Markdown("## Paper details")
+            detail_html = gr.HTML(elem_id="detail-html")
+
+        # Event wiring
+        query_btn.click(
+            fn=query_papers,
+            inputs=[start_date, end_date],
+            outputs=[results_html, detail_html]
+        )
+
+    return app
+
+
+if __name__ == "__main__":
+    gr.close_all()
+    app = create_interface()
+    app.launch(
+        # server_name="localhost",
+        # server_port=7860,
+        # share=True
+    )
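A quick way to sanity-check the two HTML builders above without launching the UI (a sketch, not part of the commit; it assumes the dataclasses from parser.py below, and that gradio is installed, since importing app pulls it in):

from datetime import datetime

from parser import Article, Paper, Author
from app import generate_table_html, build_html

# Stub article with made-up values covering the fields the builders read
stub = Article(
    title="Example Paper",
    numComments=3,
    paper=Paper(
        id="2401.12345",
        title="Example Paper",
        upvotes=42,
        publishedAt=datetime(2024, 1, 28),
        authors=[Author(name="Jane Doe", hidden=False)],
    ),
)

print(generate_table_html([stub]))  # one table row with a clickable title
print(build_html([stub]))           # hidden <div id='smartflow-paper-2401-12345'> detail view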
css/interface.css
ADDED
@@ -0,0 +1,12 @@
+a.author {
+    text-decoration: underline;
+    color: #000;
+}
+
+img.author {
+    height: 1.5rem;
+    border: 1px solid #000;
+    vertical-align: middle;
+    border-radius: 50%;
+    display: inline;
+}
fetch_paper.py
ADDED
@@ -0,0 +1,85 @@
+from parser import parse_article
+import os
+import requests
+import datetime
+import hashlib
+
+
+API_URL = "https://huggingface.co/api/daily_papers"
+
+cache = {}
+
+
+def make_request(url: str):
+    # Create a hash of the URL to use as the cache key
+    url_hash = hashlib.md5(url.encode()).hexdigest()
+
+    # Check if the response is already cached
+    if url_hash in cache:
+        print(f"Cache hit for URL: {url}")
+        return cache[url_hash]
+
+    http_proxy = os.getenv("HF_HTTP_PROXY")
+    https_proxy = os.getenv("HF_HTTPS_PROXY")
+    proxies = {
+        "http": http_proxy,
+        "https": https_proxy
+    } if http_proxy or https_proxy else None
+
+    attempts = 0
+    while attempts < 3:
+        try:
+            response = requests.get(url, proxies=proxies)
+            response.raise_for_status()
+            data = response.json()
+
+            # Cache the response
+            cache[url_hash] = data
+
+            return data
+        except requests.RequestException as e:
+            attempts += 1
+            print(f"Attempt {attempts} failed: {e}")
+            if attempts == 3:
+                return []
+
+
+def fetch_papers():
+    data = make_request(API_URL)
+    return [parse_article(item) for item in data]
+
+
+def fetch_papers_with_date(date: datetime.datetime):
+    formatted_date = date.strftime("%Y-%m-%d")
+    data = make_request(API_URL + "?date=" + formatted_date)
+    return [parse_article(item) for item in data]
+
+
+def fetch_papers_with_daterange(start_date: datetime.datetime, end_date: datetime.datetime):
+    # Each day's data is independent, so we just walk the date range day by day
+    articles = []
+    current_date = start_date
+    while current_date <= end_date:
+        print(current_date)
+        articles.extend(fetch_papers_with_date(current_date))
+        print(f"Total articles: {len(articles)}")
+        current_date += datetime.timedelta(days=1)
+
+    # Deduplicate articles by paper.id
+    unique_articles = {}
+    for article in articles:
+        if article.paper.id not in unique_articles:
+            unique_articles[article.paper.id] = article
+
+    return list(unique_articles.values())
+
+
+if __name__ == "__main__":
+    from rich import print
+    start_date = datetime.datetime(2024, 1, 28)
+    end_date = datetime.datetime(2024, 1, 30)
+    articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
+    print(f"Total articles: {len(articles)}")
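Usage sketch for the fetcher (hedged: it needs network access, and the proxy address below is a made-up example; the HF_HTTP_PROXY / HF_HTTPS_PROXY variables are read on every request, so setting them at runtime works):

import os
import datetime

import fetch_paper

# Hypothetical proxy endpoint; drop this line to connect directly.
os.environ["HF_HTTPS_PROXY"] = "http://127.0.0.1:7890"

articles = fetch_paper.fetch_papers_with_date(datetime.datetime(2024, 1, 29))
print(f"{len(articles)} papers fetched for 2024-01-29")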
parser.py
ADDED
@@ -0,0 +1,133 @@
+from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Optional, Any, Dict
+
+# Data classes (Optional fields with defaults, so partial API payloads still parse)
+
+
+@dataclass
+class Author:
+    _id: Optional[str] = None
+    name: Optional[str] = None
+    hidden: Optional[bool] = None
+
+
+@dataclass
+class Paper:
+    id: Optional[str] = None
+    authors: Optional[List[Author]] = None
+    publishedAt: Optional[datetime] = None
+    title: Optional[str] = None
+    summary: Optional[str] = None
+    upvotes: Optional[int] = None
+    discussionId: Optional[str] = None
+
+
+@dataclass
+class SubmittedBy:
+    _id: Optional[str] = None
+    avatarUrl: Optional[str] = None
+    fullname: Optional[str] = None
+    name: Optional[str] = None
+    type: Optional[str] = None
+    isPro: Optional[bool] = None
+    isHf: Optional[bool] = None
+    isMod: Optional[bool] = None
+    followerCount: Optional[int] = None
+
+
+@dataclass
+class Article:
+    paper: Optional[Paper] = None
+    publishedAt: Optional[datetime] = None
+    title: Optional[str] = None
+    thumbnail: Optional[str] = None
+    numComments: Optional[int] = None
+    submittedBy: Optional[SubmittedBy] = None
+    isAuthorParticipating: Optional[bool] = None
+
+
+def safe_get(data: Dict, *keys: str) -> Any:
+    """Safely read a nested dictionary value, returning None if any key is missing."""
+    for key in keys:
+        data = data.get(key, {}) if isinstance(data, dict) else None
+    return data if data != {} else None
+
+
+def parse_article(data: Dict[str, Any]) -> Article:
+    """Fault-tolerant parser for one daily-papers entry."""
+
+    def parse_datetime(dt_str: Optional[str]) -> Optional[datetime]:
+        """Safely parse an ISO 8601 timestamp (with optional trailing 'Z')."""
+        if not dt_str:
+            return None
+        try:
+            if dt_str.endswith('Z'):
+                dt_str = dt_str[:-1] + '+00:00'
+            return datetime.fromisoformat(dt_str)
+        except ValueError:
+            return None
+
+    # Parse the author list
+    authors = []
+    for author_data in safe_get(data, "paper", "authors") or []:
+        authors.append(Author(
+            _id=author_data.get("_id"),
+            name=author_data.get("name"),
+            hidden=author_data.get("hidden")
+        ))
+
+    # Parse the paper
+    paper = Paper(
+        id=safe_get(data, "paper", "id"),
+        authors=authors,
+        publishedAt=parse_datetime(safe_get(data, "paper", "publishedAt")),
+        title=safe_get(data, "paper", "title"),
+        summary=safe_get(data, "paper", "summary"),
+        upvotes=safe_get(data, "paper", "upvotes"),
+        discussionId=safe_get(data, "paper", "discussionId")
+    ) if safe_get(data, "paper") else None
+
+    # Parse the submitter (the whole expression is already conditional on
+    # submitted_by_data, so plain .get() calls suffice for each field)
+    submitted_by_data = safe_get(data, "submittedBy")
+    submitted_by = SubmittedBy(
+        _id=submitted_by_data.get("_id"),
+        avatarUrl=submitted_by_data.get("avatarUrl"),
+        fullname=submitted_by_data.get("fullname"),
+        name=submitted_by_data.get("name"),
+        type=submitted_by_data.get("type"),
+        isPro=submitted_by_data.get("isPro"),
+        isHf=submitted_by_data.get("isHf"),
+        isMod=submitted_by_data.get("isMod"),
+        followerCount=submitted_by_data.get("followerCount")
+    ) if submitted_by_data else None
+
+    # Assemble the final object
+    return Article(
+        paper=paper,
+        publishedAt=parse_datetime(data.get("publishedAt")),
+        title=data.get("title"),
+        thumbnail=data.get("thumbnail"),
+        numComments=data.get("numComments"),
+        submittedBy=submitted_by,
+        isAuthorParticipating=data.get("isAuthorParticipating")
+    )
+
+
+# Usage example
+if __name__ == "__main__":
+    import json
+    from rich import print
+
+    # Assumes the raw API response has been saved to article.json
+    with open("article.json") as f:
+        raw_data = json.load(f)
+
+    articles = []
+
+    for raw_article in raw_data:
+        article = parse_article(raw_article)
+        articles.append(article)
+
+    print(articles[0])
+    print(len(articles))
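A minimal, self-contained check of parse_article on a handcrafted payload (the field names mirror the daily_papers response handled above; the values are invented):

from parser import parse_article

raw = {
    "paper": {
        "id": "2401.00001",
        "title": "Toy Paper",
        "upvotes": 7,
        "publishedAt": "2024-01-29T08:00:00.000Z",
        "authors": [{"name": "A. Author", "hidden": False}],
    },
    "title": "Toy Paper",
    "numComments": 2,
}

article = parse_article(raw)
print(article.paper.id, article.paper.upvotes)  # 2401.00001 7
print(article.paper.publishedAt.isoformat())    # 2024-01-29T08:00:00+00:00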
sorter.py
ADDED
@@ -0,0 +1,32 @@
+from datetime import datetime, timezone
+
+
+def sort_by_date(articles):
+    # publishedAt may be None; missing dates fall back to the oldest
+    # possible (UTC-aware) value so they sort last with reverse=True
+    fallback = datetime.min.replace(tzinfo=timezone.utc)
+    return sorted(articles, key=lambda x: x.publishedAt or fallback, reverse=True)
+
+
+def sort_by_upvotes(articles):
+    return sorted(articles, key=lambda x: x.paper.upvotes or 0, reverse=True)
+
+
+def sort_by_comments(articles):
+    return sorted(articles, key=lambda x: x.numComments or 0, reverse=True)
+
+
+if __name__ == "__main__":
+    from fetch_paper import fetch_papers
+    from rich import print
+
+    articles = fetch_papers()
+
+    print("Latest paper:")
+    articles = sort_by_date(articles)
+    print(articles[0])
+
+    print("Most upvoted paper:")
+    articles = sort_by_upvotes(articles)
+    print(articles[0])
+
+    print("Most commented paper:")
+    articles = sort_by_comments(articles)
+    print(articles[0])