# SmartFlow_DailyPaper / fetch_paper.py
# Last commit: "feat: main feature" (c7478e3), HowardZhangdqs
from parser import parse_article
import os
import requests
import datetime
import hashlib
import json
# Endpoint queried for the daily paper listing; fetch_papers_with_date appends
# "?date=YYYY-MM-DD" to it to select a specific day.
# NOTE(review): the hostname is written with a trailing dot ("huggingface.co.")
# — confirm this fully-qualified form is intentional.
API_URL = "https://huggingface.co./api/daily_papers"
# In-process response cache: MD5 hex digest of the URL -> decoded JSON payload.
cache = {}


def make_request(url: str, *, timeout: float = 30.0, max_attempts: int = 3):
    """Fetch ``url`` as JSON, with in-memory caching and simple retries.

    Successful responses are stored in the module-level ``cache`` and reused
    on later calls for the same URL. Failed fetches are never cached, so a
    later call will try the network again.

    Proxies are taken from the ``HF_HTTP_PROXY`` / ``HF_HTTPS_PROXY``
    environment variables; when neither is set no explicit proxy mapping is
    passed to ``requests``.

    Args:
        url: Absolute URL to GET.
        timeout: Seconds to wait for the server before an attempt is treated
            as failed. Without a timeout a stalled connection would block
            forever.
        max_attempts: How many times to try before giving up.

    Returns:
        The decoded JSON body, or an empty list if every attempt failed.
    """
    # The digest is only a compact cache key; it has no security role.
    url_hash = hashlib.md5(url.encode()).hexdigest()

    if url_hash in cache:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    if http_proxy or https_proxy:
        proxies = {"http": http_proxy, "https": https_proxy}
    else:
        proxies = None

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, proxies=proxies, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on Requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"Attempt {attempt} failed: {e}")
        else:
            cache[url_hash] = data
            return data

    # Every attempt failed: hand back an empty result instead of raising.
    return []
def fetch_papers():
    """Fetch the undated listing from ``API_URL`` and parse every entry.

    Returns:
        A list with one ``parse_article`` result per item in the response.
    """
    return list(map(parse_article, make_request(API_URL)))
def fetch_papers_with_date(date: datetime.date):
    """Fetch and parse the papers listed for a single day.

    Args:
        date: The day to query. Any ``datetime.date`` (including
            ``datetime.datetime``) works; only the calendar date is sent.

    Returns:
        A list with one ``parse_article`` result per item in the response.
    """
    day = date.strftime("%Y-%m-%d")
    payload = make_request(f"{API_URL}?date={day}")
    return [parse_article(item) for item in payload]
def fetch_papers_with_daterange(start_date: datetime.date, end_date: datetime.date):
    """Fetch the papers for every day from ``start_date`` to ``end_date``.

    Both bounds are inclusive. Progress (the current day and the running
    article count) is printed as each day is fetched.

    Args:
        start_date: First day to query.
        end_date: Last day to query. If it is earlier than ``start_date``
            nothing is fetched. Pass the same type (``date`` or ``datetime``)
            as ``start_date`` so the two can be compared.

    Returns:
        The parsed articles with duplicates removed by ``article.paper.id``,
        keeping the first occurrence of each id.
    """
    # Each day's listing is independent, so just walk the range day by day.
    one_day = datetime.timedelta(days=1)
    articles = []
    current_date = start_date
    while current_date <= end_date:
        print(current_date)
        articles.extend(fetch_papers_with_date(current_date))
        print(f"Total articles: {len(articles)}")
        current_date += one_day

    # De-duplicate on each article's .paper.id; setdefault keeps the first one.
    unique_articles = {}
    for article in articles:
        unique_articles.setdefault(article.paper.id, article)
    return list(unique_articles.values())
if __name__ == "__main__":
    # Rebinds the module-level name ``print`` to Rich's version, so the
    # progress output of the functions above is rendered by Rich as well.
    from rich import print

    # Manual smoke run over a fixed three-day window.
    articles = fetch_papers_with_daterange(
        start_date=datetime.datetime(2024, 1, 28),
        end_date=datetime.datetime(2024, 1, 30),
    )
    print(f"Total articles: {len(articles)}")