usmanyousaf committed on
Commit
530d27a
·
verified ·
1 Parent(s): 2e18161

Update scrape.py

Browse files
Files changed (1) hide show
  1. scrape.py +5 -11
scrape.py CHANGED
@@ -2,13 +2,10 @@ from selenium import webdriver # type: ignore
2
  from selenium.webdriver.chrome.service import Service # type: ignore
3
  from selenium.webdriver.chrome.options import Options # type: ignore
4
  from bs4 import BeautifulSoup # type: ignore
5
- from dotenv import load_dotenv # type: ignore
6
- import os
7
  import time
8
 
9
- load_dotenv()
10
-
11
- CHROME_DRIVER_PATH = os.getenv("./chrome")
12
 
13
  def scrape_website(website):
14
  print("Connecting to Chrome Browser...")
@@ -16,6 +13,7 @@ def scrape_website(website):
16
  # Setup ChromeDriver service and options
17
  service = Service(CHROME_DRIVER_PATH)
18
  options = Options()
 
19
  driver = webdriver.Chrome(service=service, options=options)
20
 
21
  try:
@@ -37,9 +35,7 @@ def scrape_website(website):
37
  def extract_body_content(html_content):
38
  soup = BeautifulSoup(html_content, "html.parser")
39
  body_content = soup.body
40
- if body_content:
41
- return str(body_content)
42
- return ""
43
 
44
  def clean_body_content(body_content):
45
  soup = BeautifulSoup(body_content, "html.parser")
@@ -55,6 +51,4 @@ def clean_body_content(body_content):
55
  return cleaned_content
56
 
57
  def split_dom_content(dom_content, max_length=6000):
58
- return [
59
- dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
60
- ]
 
2
  from selenium.webdriver.chrome.service import Service # type: ignore
3
  from selenium.webdriver.chrome.options import Options # type: ignore
4
  from bs4 import BeautifulSoup # type: ignore
 
 
5
  import time
6
 
7
# Location of the ChromeDriver executable, hard-coded as a relative path.
# It is handed to selenium's Service() when the browser is started.
CHROME_DRIVER_PATH = "./chrome"
 
9
 
10
  def scrape_website(website):
11
  print("Connecting to Chrome Browser...")
 
13
  # Setup ChromeDriver service and options
14
  service = Service(CHROME_DRIVER_PATH)
15
  options = Options()
16
+ options.add_argument("--headless") # Run in headless mode for deployment
17
  driver = webdriver.Chrome(service=service, options=options)
18
 
19
  try:
 
35
def extract_body_content(html_content):
    """Return the ``<body>`` element of an HTML document as a string.

    Args:
        html_content: HTML markup to parse with BeautifulSoup's
            ``html.parser`` backend.

    Returns:
        The serialized ``<body>`` tag (the tag itself plus its contents),
        or an empty string when the parsed document has no ``<body>``.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    # ``soup.body`` is None when the document contains no <body> tag.
    if body_content is None:
        return ""
    return str(body_content)
 
 
39
 
40
  def clean_body_content(body_content):
41
  soup = BeautifulSoup(body_content, "html.parser")
 
51
  return cleaned_content
52
 
53
def split_dom_content(dom_content, max_length=6000):
    """Split *dom_content* into consecutive chunks of bounded size.

    Args:
        dom_content: The sequence (typically a string) to split.
        max_length: Maximum length of each chunk. Must be positive.

    Returns:
        A list of slices of *dom_content*, in order. Every chunk has
        exactly *max_length* items except possibly the last one. An empty
        input yields an empty list.

    Raises:
        ValueError: If *max_length* is zero or negative. A negative step
            would otherwise produce an empty list and silently drop all
            of the content.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]