|
from dataclasses import dataclass |
|
from datetime import datetime |
|
from typing import List, Optional, Any, Dict |
|
|
|
|
|
|
|
|
|
@dataclass
class Author:
    """One author entry of a paper.

    All fields are optional and default to ``None`` so that an instance can
    be created from an incomplete record.
    """

    _id: Optional[str] = None  # value of the record's "_id" key
    name: Optional[str] = None  # value of the record's "name" key
    hidden: Optional[bool] = None  # value of the record's "hidden" key
|
|
|
|
|
@dataclass
class Paper:
    """Paper metadata attached to an article.

    Every field defaults to ``None`` so that an instance can be created from
    an incomplete record.
    """

    id: Optional[str] = None
    # Annotated as Optional because the default is None rather than a list.
    authors: Optional[List[Author]] = None
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    summary: Optional[str] = None
    upvotes: Optional[int] = None
    discussionId: Optional[str] = None
|
|
|
|
|
@dataclass
class SubmittedBy:
    """Account that submitted an article.

    Field names mirror the keys of the raw ``submittedBy`` record. All
    fields are optional and default to ``None``.
    """

    # Identity
    _id: Optional[str] = None
    avatarUrl: Optional[str] = None
    fullname: Optional[str] = None
    name: Optional[str] = None
    type: Optional[str] = None

    # Boolean flags copied from the record as-is
    isPro: Optional[bool] = None
    isHf: Optional[bool] = None
    isMod: Optional[bool] = None

    # Counters
    followerCount: Optional[int] = None
|
|
|
|
|
@dataclass
class Article:
    """Top-level article record, optionally carrying a paper and a submitter.

    All fields are optional and default to ``None`` so that an instance can
    be created from an incomplete record.
    """

    # Nested paper metadata, if any
    paper: Optional[Paper] = None

    # Article-level attributes
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    thumbnail: Optional[str] = None
    numComments: Optional[int] = None

    # Submitter information
    submittedBy: Optional[SubmittedBy] = None
    isAuthorParticipating: Optional[bool] = None
|
|
|
|
|
def safe_get(data: Dict, *keys: str) -> Any:
    """Follow ``keys`` through nested dictionaries without raising on gaps.

    Returns the value found at the end of the key path. Returns ``None``
    when a key is absent, when a non-dict value is met before the path is
    exhausted, or when the final value equals an empty dict.
    """
    node = data
    for name in keys:
        if not isinstance(node, dict):
            # Nothing left to descend into; the rest of the path is unreachable.
            return None
        # A missing key yields {} so that it is reported as None below.
        node = node.get(name, {})
    if node != {}:
        return node
    return None
|
|
|
|
|
def parse_article(data: Dict[str, Any]) -> Article:
    """Build an ``Article`` from a raw dictionary, tolerating bad fields.

    Missing keys become ``None``. Values of an unexpected type (a non-string
    timestamp, a non-list ``authors`` value, a non-dict author entry, a
    non-dict ``submittedBy``) are dropped instead of raising.

    Args:
        data: Raw article mapping, e.g. one element of a decoded JSON array.

    Returns:
        The populated ``Article``. ``paper`` is ``None`` when the ``"paper"``
        entry is missing or empty; ``submittedBy`` is ``None`` when the
        ``"submittedBy"`` entry is missing, empty, or not a dict.
    """

    def parse_datetime(dt_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO-8601 timestamp; return ``None`` if it cannot be parsed."""
        # Non-string values would raise AttributeError on ``endswith``; treat
        # them like any other unparseable timestamp.
        if not dt_str or not isinstance(dt_str, str):
            return None
        # Spell a trailing "Z" out as an explicit UTC offset, because
        # ``fromisoformat`` rejects the "Z" suffix before Python 3.11.
        if dt_str.endswith("Z"):
            dt_str = dt_str[:-1] + "+00:00"
        try:
            return datetime.fromisoformat(dt_str)
        except ValueError:
            return None

    # Only a real sequence of dict entries can be turned into authors;
    # anything else is ignored rather than crashing the whole parse.
    raw_authors = safe_get(data, "paper", "authors")
    if not isinstance(raw_authors, (list, tuple)):
        raw_authors = []
    authors = [
        Author(
            _id=entry.get("_id"),
            name=entry.get("name"),
            hidden=entry.get("hidden"),
        )
        for entry in raw_authors
        if isinstance(entry, dict)
    ]

    paper = None
    if safe_get(data, "paper"):
        paper = Paper(
            id=safe_get(data, "paper", "id"),
            authors=authors,
            publishedAt=parse_datetime(safe_get(data, "paper", "publishedAt")),
            title=safe_get(data, "paper", "title"),
            summary=safe_get(data, "paper", "summary"),
            upvotes=safe_get(data, "paper", "upvotes"),
            discussionId=safe_get(data, "paper", "discussionId"),
        )

    submitted_by_data = safe_get(data, "submittedBy")
    submitted_by = None
    # The isinstance check keeps a truthy non-dict value (e.g. a bare string)
    # from raising on ``.get``.
    if submitted_by_data and isinstance(submitted_by_data, dict):
        submitted_by = SubmittedBy(
            _id=submitted_by_data.get("_id"),
            avatarUrl=submitted_by_data.get("avatarUrl"),
            fullname=submitted_by_data.get("fullname"),
            name=submitted_by_data.get("name"),
            type=submitted_by_data.get("type"),
            isPro=submitted_by_data.get("isPro"),
            isHf=submitted_by_data.get("isHf"),
            isMod=submitted_by_data.get("isMod"),
            followerCount=submitted_by_data.get("followerCount"),
        )

    return Article(
        paper=paper,
        publishedAt=parse_datetime(data.get("publishedAt")),
        title=data.get("title"),
        thumbnail=data.get("thumbnail"),
        numComments=data.get("numComments"),
        submittedBy=submitted_by,
        isAuthorParticipating=data.get("isAuthorParticipating"),
    )
|
|
|
|
|
|
|
if __name__ == "__main__":
    import json

    from rich import print

    # Pass the encoding explicitly so the read does not depend on the
    # platform's locale default.
    with open("article.json", encoding="utf-8") as f:
        raw_data = json.load(f)

    articles = [parse_article(raw_article) for raw_article in raw_data]

    # Show the first parsed article as a sample; skip it when the file held
    # no articles so an empty array does not raise IndexError.
    if articles:
        print(articles[0])

    print(len(articles))
|
|