|
from parser import parse_article |
|
import os |
|
import requests |
|
import datetime |
|
import hashlib |
|
import json |
|
|
|
|
|
# Endpoint of the Hugging Face daily-papers API queried by the fetch helpers.
# The host is written without a trailing root dot so the Host header matches
# the canonical name.
API_URL = "https://huggingface.co/api/daily_papers"

# In-memory response cache used by make_request; keys are the MD5 hex digest
# of the requested URL, values are the decoded JSON payloads.
cache = {}
|
|
|
|
|
def make_request(url: str, *, timeout: float = 30.0, max_attempts: int = 3):
    """Fetch JSON from *url*, caching successful responses in ``cache``.

    Proxies are read from the ``HF_HTTP_PROXY`` and ``HF_HTTPS_PROXY``
    environment variables and are only passed to ``requests`` when at least
    one of them is set.

    Args:
        url: Address to GET.
        timeout: Seconds to wait for the server before an attempt is treated
            as failed. Without a timeout a stalled connection would block
            forever.
        max_attempts: How many times the request is tried before giving up.

    Returns:
        The decoded JSON payload, or an empty list when every attempt failed.
    """
    # MD5 only derives a compact cache key here; it is not used for security.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    if url_hash in cache:
        print(f"Cache hit for URL: {url}")
        return cache[url_hash]

    http_proxy = os.getenv("HF_HTTP_PROXY")
    https_proxy = os.getenv("HF_HTTPS_PROXY")
    proxies = None
    if http_proxy or https_proxy:
        proxies = {"http": http_proxy, "https": https_proxy}

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, proxies=proxies, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on requests versions
            # whose JSONDecodeError does not derive from RequestException.
            print(f"Attempt {attempt} failed: {e}")
        else:
            cache[url_hash] = data
            return data

    # Every attempt failed: hand back an empty payload so callers can still
    # iterate over the result.
    return []
|
|
|
|
|
def fetch_papers():
    """Fetch the listing served at ``API_URL`` and parse every entry.

    Returns:
        A list with the result of ``parse_article`` for each item returned by
        ``make_request(API_URL)``.
    """
    raw_items = make_request(API_URL)
    parsed = []
    for entry in raw_items:
        parsed.append(parse_article(entry))
    return parsed
|
|
|
|
|
def fetch_papers_with_date(date: datetime.date):
    """Fetch and parse the papers listed for a single day.

    Args:
        date: Day to query. Any ``datetime.date`` (including
            ``datetime.datetime``) works; only the year, month and day are
            sent to the API.

    Returns:
        A list with the result of ``parse_article`` for each item returned by
        the request.
    """
    formatted_date = date.strftime("%Y-%m-%d")
    data = make_request(f"{API_URL}?date={formatted_date}")
    return [parse_article(item) for item in data]
|
|
|
|
|
def fetch_papers_with_daterange(start_date: datetime.date, end_date: datetime.date):
    """Fetch papers for every day from *start_date* to *end_date*, inclusive.

    Each day is fetched with ``fetch_papers_with_date``; progress (the current
    day and the running, not yet de-duplicated, total) is printed as it goes.

    Args:
        start_date: First day to fetch.
        end_date: Last day to fetch. Must be the same kind of object as
            *start_date* so the two can be compared.

    Returns:
        The fetched articles with duplicates removed. Articles are identified
        by ``article.paper.id`` and the first occurrence is kept. The list is
        empty when *start_date* is later than *end_date*.
    """
    one_day = datetime.timedelta(days=1)

    articles = []
    current_date = start_date
    while current_date <= end_date:
        print(current_date)
        articles.extend(fetch_papers_with_date(current_date))
        print(f"Total articles: {len(articles)}")
        current_date += one_day

    # A paper can be listed on more than one day; setdefault keeps the first
    # article seen for each paper id and preserves encounter order.
    unique_articles = {}
    for article in articles:
        unique_articles.setdefault(article.paper.id, article)

    return list(unique_articles.values())
|
|
|
|
|
if __name__ == "__main__":
    # Importing rich's print here rebinds the module-level name, so the
    # progress output of the functions above is rendered by rich as well.
    from rich import print

    # Demo run: fetch three consecutive days and report the unique total.
    range_start = datetime.datetime(2024, 1, 28)
    range_end = datetime.datetime(2024, 1, 30)
    collected = fetch_papers_with_daterange(
        start_date=range_start,
        end_date=range_end,
    )

    print(f"Total articles: {len(collected)}")
|
|