usmanyousaf committed on
Commit
135b855
·
verified ·
1 Parent(s): bce428a

Update scrape.py

Browse files
Files changed (1) hide show
  1. scrape.py +23 -16
scrape.py CHANGED
@@ -1,41 +1,48 @@
1
-
2
  from selenium import webdriver
3
  from selenium.common.exceptions import WebDriverException
4
- from selenium import webdriver # type: ignore
5
- from selenium.webdriver.chrome.service import Service # type: ignore
6
- from selenium.webdriver.chrome.options import Options # type: ignore
7
- from bs4 import BeautifulSoup # type: ignore
8
  import time
9
 
10
- # Define the ChromeDriver path directly
11
- CHROME_DRIVER_PATH = "./chrome"
12
-
13
  def scrape_website(website):
14
  print("Connecting to Chrome Browser...")
15
 
16
- # Setup ChromeDriver service and options
17
- service = Service(CHROME_DRIVER_PATH)
18
  options = Options()
19
  options.add_argument("--headless") # Run in headless mode for deployment
20
- driver = webdriver.Chrome(service=service, options=options)
21
-
 
 
 
22
  try:
23
- driver.get(website)
 
 
 
24
  print("Waiting for CAPTCHA to be solved manually (if present)...")
25
 
26
  # Optional waiting loop for manual CAPTCHA solving
27
- while "captcha" in driver.page_source.lower():
28
  print("CAPTCHA detected, waiting...")
29
  time.sleep(5)
30
 
31
  print("CAPTCHA solved or not present. Scraping page content...")
32
- html = driver.page_source
33
  return html
34
 
 
 
 
 
35
  finally:
36
- driver.quit()
 
37
 
38
  def extract_body_content(html_content):
 
 
39
  soup = BeautifulSoup(html_content, "html.parser")
40
  body_content = soup.body
41
  return str(body_content) if body_content else ""
 
 
1
  from selenium import webdriver
2
  from selenium.common.exceptions import WebDriverException
3
+ from selenium.webdriver.chrome.service import Service
4
+ from selenium.webdriver.chrome.options import Options
5
+ from bs4 import BeautifulSoup
 
6
  import time
7
 
 
 
 
8
  def scrape_website(website):
9
  print("Connecting to Chrome Browser...")
10
 
11
+ # Setup ChromeDriver options
 
12
  options = Options()
13
  options.add_argument("--headless") # Run in headless mode for deployment
14
+ options.add_argument('--no-sandbox') # Overcome limited resource problems
15
+ options.add_argument('--disable-dev-shm-usage') # Overcome limited resource problems
16
+
17
+ # Initialize the driver without a specified service (assumes ChromeDriver is in PATH)
18
+ wd = None
19
  try:
20
+ wd = webdriver.Chrome(options=options)
21
+ wd.set_window_size(1080, 720) # Set the window size
22
+ wd.get(website)
23
+ wd.implicitly_wait(10)
24
  print("Waiting for CAPTCHA to be solved manually (if present)...")
25
 
26
  # Optional waiting loop for manual CAPTCHA solving
27
+ while "captcha" in wd.page_source.lower():
28
  print("CAPTCHA detected, waiting...")
29
  time.sleep(5)
30
 
31
  print("CAPTCHA solved or not present. Scraping page content...")
32
+ html = wd.page_source
33
  return html
34
 
35
+ except WebDriverException as e:
36
+ print(f"WebDriverException occurred: {e}")
37
+ return None # Return None or an empty string based on your requirement
38
+
39
  finally:
40
+ if wd:
41
+ wd.quit()
42
 
43
  def extract_body_content(html_content):
44
+ if html_content is None:
45
+ return "" # Return empty if there is no content
46
  soup = BeautifulSoup(html_content, "html.parser")
47
  body_content = soup.body
48
  return str(body_content) if body_content else ""