# HowardZhangdqs's picture
# feat: main feature
# c7478e3
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional, Any, Dict
# Revised data classes (Optional annotations and default values added)
@dataclass
class Author:
    """One entry of a paper's author list.

    Every field defaults to ``None`` so an instance can be created from a
    record in which any of the keys is absent.
    """

    # Taken from the "_id" key of the raw author record.
    _id: Optional[str] = None
    # Taken from the "name" key of the raw author record.
    name: Optional[str] = None
    # Taken from the "hidden" key of the raw author record.
    hidden: Optional[bool] = None
@dataclass
class Paper:
    """The paper attached to an article.

    Every field defaults to ``None`` so an instance can be created from a
    record in which any of the keys is absent.
    """

    id: Optional[str] = None
    # Annotated Optional because the default is None, matching the other
    # fields; the runtime default itself is unchanged.
    authors: Optional[List[Author]] = None
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    summary: Optional[str] = None
    upvotes: Optional[int] = None
    discussionId: Optional[str] = None
@dataclass
class SubmittedBy:
    """The account recorded under an article's "submittedBy" key.

    Every field defaults to ``None`` so an instance can be created from a
    record in which any of the keys is absent. Field names mirror the keys
    of the raw record.
    """

    # Identity and display fields.
    _id: Optional[str] = None
    avatarUrl: Optional[str] = None
    fullname: Optional[str] = None
    name: Optional[str] = None
    type: Optional[str] = None

    # Boolean account flags, copied as-is from the raw record.
    isPro: Optional[bool] = None
    isHf: Optional[bool] = None
    isMod: Optional[bool] = None

    followerCount: Optional[int] = None
@dataclass
class Article:
    """A top-level article record.

    Every field defaults to ``None`` so an instance can be created from a
    record in which any of the keys is absent.
    """

    # Nested paper object, or None when the record carries no paper.
    paper: Optional[Paper] = None

    # Scalar fields copied from the top level of the raw record.
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    thumbnail: Optional[str] = None
    numComments: Optional[int] = None

    # Nested submitter object, or None when the record carries no submitter.
    submittedBy: Optional[SubmittedBy] = None
    isAuthorParticipating: Optional[bool] = None
def safe_get(data: Dict, *keys: str) -> Any:
    """Look up a value in nested dicts without raising on missing levels.

    Args:
        data: The outermost mapping to start from.
        *keys: Keys to follow, one nesting level per key.

    Returns:
        The value reached after following every key. ``None`` is returned
        when a key is missing, when an intermediate value is not a dict, or
        when the value reached is an empty dict.
    """
    current = data
    for key in keys:
        if isinstance(current, dict):
            # A missing key yields {}, so deeper lookups keep working and
            # the final empty-dict check turns the result into None.
            current = current.get(key, {})
        else:
            current = None
    if current == {}:
        return None
    return current
def _parse_datetime(dt_str: Any) -> Optional[datetime]:
    """Parse an ISO 8601 timestamp string, returning None when unusable.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing because
    ``datetime.fromisoformat`` only accepts the ``Z`` suffix from
    Python 3.11 onward.
    """
    # Non-string values (numbers, dicts, None) and empty strings cannot be
    # parsed; report them as missing instead of raising AttributeError.
    if not isinstance(dt_str, str) or not dt_str:
        return None
    if dt_str.endswith("Z"):
        dt_str = dt_str[:-1] + "+00:00"
    try:
        return datetime.fromisoformat(dt_str)
    except ValueError:
        return None


def parse_article(data: Dict[str, Any]) -> Article:
    """Build an ``Article`` from one raw record, tolerating bad fields.

    Missing keys become ``None``. Malformed nested values are skipped or
    mapped to ``None`` instead of raising: an ``authors`` value that is not
    a list, author entries that are not dicts, a ``submittedBy`` value that
    is not a dict, and timestamps that are not strings or not valid
    ISO 8601.

    Args:
        data: Raw article record. It must itself be a dict; the top-level
            fields are read with ``data.get``.

    Returns:
        The parsed ``Article``. ``paper`` is ``None`` when the "paper"
        entry is missing or falsy; ``submittedBy`` is ``None`` when the
        "submittedBy" entry is missing, empty, or not a dict.
    """
    # Parse the paper (including its author list).
    paper_data = safe_get(data, "paper")
    paper = None
    if paper_data:
        raw_authors = safe_get(paper_data, "authors")
        if not isinstance(raw_authors, (list, tuple)):
            raw_authors = []
        authors = [
            Author(
                _id=entry.get("_id"),
                name=entry.get("name"),
                hidden=entry.get("hidden"),
            )
            for entry in raw_authors
            # Entries without dict shape carry no usable fields; skip them.
            if isinstance(entry, dict)
        ]
        paper = Paper(
            id=safe_get(paper_data, "id"),
            authors=authors,
            publishedAt=_parse_datetime(safe_get(paper_data, "publishedAt")),
            title=safe_get(paper_data, "title"),
            summary=safe_get(paper_data, "summary"),
            upvotes=safe_get(paper_data, "upvotes"),
            discussionId=safe_get(paper_data, "discussionId"),
        )

    # Parse the submitter. safe_get already maps an empty dict to None, so
    # the isinstance check also covers the "present but empty" case.
    submitter_data = safe_get(data, "submittedBy")
    submitted_by = None
    if isinstance(submitter_data, dict):
        submitted_by = SubmittedBy(
            _id=submitter_data.get("_id"),
            avatarUrl=submitter_data.get("avatarUrl"),
            fullname=submitter_data.get("fullname"),
            name=submitter_data.get("name"),
            type=submitter_data.get("type"),
            isPro=submitter_data.get("isPro"),
            isHf=submitter_data.get("isHf"),
            isMod=submitter_data.get("isMod"),
            followerCount=submitter_data.get("followerCount"),
        )

    # Assemble the final object.
    return Article(
        paper=paper,
        publishedAt=_parse_datetime(data.get("publishedAt")),
        title=data.get("title"),
        thumbnail=data.get("thumbnail"),
        numComments=data.get("numComments"),
        submittedBy=submitted_by,
        isAuthorParticipating=data.get("isAuthorParticipating"),
    )
# Usage example
if __name__ == "__main__":
    import json

    from rich import print

    # The raw data is assumed to be stored in article.json as a JSON array
    # of article records. The encoding is given explicitly so decoding does
    # not depend on the platform's locale default.
    with open("article.json", encoding="utf-8") as f:
        raw_data = json.load(f)

    articles = [parse_article(raw_article) for raw_article in raw_data]

    # Guard the preview so an empty input list does not raise IndexError.
    if articles:
        print(articles[0])
    print(len(articles))