import datetime
import hashlib
import os

import requests

from parser import parse_article

API_URL = "https://huggingface.co./api/daily_papers"

cache = {}


def make_request(url: str):
    # Create a hash of the URL to use as the cache key
    url_hash = hashlib.md5(url.encode()).hexdigest()

    # Check if the response is already cached
    if url_hash in cache:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    proxies = {
        "http": http_proxy,
        "https": https_proxy,
    } if http_proxy or https_proxy else None

    attempts = 0
    while attempts < 3:
        try:
            response = requests.get(url, proxies=proxies)
            response.raise_for_status()
            data = response.json()

            # Cache the response
            cache[url_hash] = data
            return data
        except requests.RequestException as e:
            attempts += 1
            print(f"Attempt {attempts} failed: {e}")
            if attempts == 3:
                return []


def fetch_papers():
    data = make_request(API_URL)
    return [parse_article(item) for item in data]


def fetch_papers_with_date(date: datetime.datetime):
    formatted_date = date.strftime("%Y-%m-%d")
    data = make_request(API_URL + "?date=" + formatted_date)
    return [parse_article(item) for item in data]


def fetch_papers_with_daterange(start_date: datetime.datetime, end_date: datetime.datetime):
    # Each day's data is independent, so simply iterate over the date range.
    articles = []
    current_date = start_date
    while current_date <= end_date:
        print(current_date)
        articles.extend(fetch_papers_with_date(current_date))
        print(f"Total articles: {len(articles)}")
        current_date += datetime.timedelta(days=1)

    # Deduplicate articles by their paper.id.
    unique_articles = {}
    for article in articles:
        if article.paper.id not in unique_articles:
            unique_articles[article.paper.id] = article

    return list(unique_articles.values())


if __name__ == "__main__":
    from rich import print

    start_date = datetime.datetime(2024, 1, 28)
    end_date = datetime.datetime(2024, 1, 30)
    articles = fetch_papers_with_daterange(start_date=start_date, end_date=end_date)
    print(f"Total articles: {len(articles)}")