Spaces:

usmanyousaf
/

AI-WebScraper-App

Running

File size: 2,305 Bytes

bce428a
 
135b855
 
 
2a0bf8d
2d64614
 
2a0bf8d
 
135b855
2a0bf8d
530d27a
135b855
 
 
 
 
2a0bf8d
135b855
 
 
 
2a0bf8d
 
 
135b855
2a0bf8d
 
6a10786
2a0bf8d
135b855
2a0bf8d
 
135b855
 
 
 
2a0bf8d
135b855
 
2d64614
 
135b855
 
2d64614
 
530d27a
2d64614
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530d27a

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

def scrape_website(website, captcha_timeout=60, poll_interval=5):
    """Load *website* in headless Chrome and return the rendered page source.

    After navigation the page source is polled for the word "captcha"
    (case-insensitive). Because the browser runs headless, nobody can
    actually solve a CAPTCHA, and ordinary pages may also mention the word,
    so the polling is bounded by ``captcha_timeout`` instead of looping
    forever.

    Args:
        website: URL to open.
        captcha_timeout: Maximum number of seconds to keep polling while the
            page source still contains "captcha". Pass ``None`` to wait
            without limit (the historical behaviour).
        poll_interval: Seconds to sleep between two polls.

    Returns:
        The page source as a string. If the timeout expires while "captcha"
        is still present, the last page source seen is returned as-is.
        ``None`` is returned when a ``WebDriverException`` occurs.
    """
    print("Connecting to Chrome Browser...")

    # Setup ChromeDriver options
    options = Options()
    options.add_argument("--headless")  # No visible window; needed on servers
    options.add_argument('--no-sandbox')  # Chrome sandbox is often unavailable in containers
    options.add_argument('--disable-dev-shm-usage')  # Avoid the small /dev/shm in containers

    # Initialize the driver without a specified service (assumes ChromeDriver is in PATH)
    wd = None
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)  # Set the window size
        wd.get(website)
        wd.implicitly_wait(10)
        print("Waiting for CAPTCHA to be solved manually (if present)...")

        # A monotonic clock keeps the deadline immune to wall-clock changes.
        deadline = None
        if captcha_timeout is not None:
            deadline = time.monotonic() + captcha_timeout

        html = wd.page_source
        while "captcha" in html.lower():
            if deadline is not None and time.monotonic() >= deadline:
                print(
                    f"CAPTCHA still detected after {captcha_timeout} seconds; "
                    "returning the page as-is."
                )
                return html
            print("CAPTCHA detected, waiting...")
            time.sleep(poll_interval)
            html = wd.page_source

        print("CAPTCHA solved or not present. Scraping page content...")
        # Return the exact snapshot that passed the check above.
        return html

    except WebDriverException as e:
        print(f"WebDriverException occurred: {e}")
        return None  # Callers treat None as "nothing scraped"

    finally:
        # Always shut the browser down, including on early return or error.
        if wd is not None:
            wd.quit()

def extract_body_content(html_content):
    """Return the ``<body>`` element of *html_content* as a string.

    An empty string is returned when *html_content* is ``None`` or when the
    parsed document has no ``<body>`` element.
    """
    # Nothing was scraped, so there is nothing to parse.
    if html_content is None:
        return ""

    body = BeautifulSoup(html_content, "html.parser").body
    if body:
        return str(body)
    return ""

def clean_body_content(body_content):
    """Reduce an HTML fragment to its visible text, one non-blank line each.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Every remaining line is stripped of surrounding whitespace and
    blank lines are dropped; the result is joined with newlines.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Detach script/style nodes so their source does not end up in the text.
    for tag in parsed.find_all(["script", "style"]):
        tag.extract()

    raw_text = parsed.get_text(separator="\n")

    kept_lines = []
    for raw_line in raw_text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept_lines.append(stripped)

    return "\n".join(kept_lines)

def split_dom_content(dom_content, max_length=6000):
    """Split *dom_content* into consecutive chunks of at most *max_length*.

    The chunks are returned in order as a list; the last one may be shorter.
    Empty input yields an empty list.
    """
    chunks = []
    for start in range(0, len(dom_content), max_length):
        end = start + max_length
        chunks.append(dom_content[start:end])
    return chunks