Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions backend/onyx/connectors/web/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from playwright.sync_api import BrowserContext
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from playwright.sync_api import Route
from playwright.sync_api import Request
from requests_oauthlib import OAuth2Session # type:ignore
from urllib3.exceptions import MaxRetryError

Expand Down Expand Up @@ -328,6 +330,13 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
return playwright, context


def abort_unnecessary_resources(route: Route, request: Request) -> None:
    """Playwright route handler that drops requests not needed for scraping.

    Requests whose ``resource_type`` is one of image, stylesheet, font,
    media, websocket, manifest or other are aborted; every other request
    is allowed to proceed unchanged.

    Args:
        route: The intercepted route; exactly one of ``abort()`` or
            ``continue_()`` is called on it.
        request: The request being routed; only ``resource_type`` is read.
    """
    blocked_types = {
        "image",
        "stylesheet",
        "font",
        "media",
        "websocket",
        "manifest",
        "other",
    }

    # Guard clause: anything not on the block list goes through as-is.
    if request.resource_type not in blocked_types:
        route.continue_()
        return

    route.abort()


def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
try:
response = requests.get(
Expand Down Expand Up @@ -465,13 +474,15 @@ def __init__(
batch_size: int = INDEX_BATCH_SIZE,
scroll_before_scraping: bool = False,
remove_by_selector: list = [],
timeout: int = 30000,
**kwargs: Any,
) -> None:
self.mintlify_cleanup = mintlify_cleanup
self.batch_size = batch_size
self.recursive = False
self.scroll_before_scraping = scroll_before_scraping
self.remove_by_selector = remove_by_selector or []
self.timeout = timeout
self.web_connector_type = web_connector_type

if not isinstance(self.remove_by_selector, list):
Expand Down Expand Up @@ -556,13 +567,21 @@ def _do_scrape(
return result

page = session_ctx.playwright_context.new_page()
page.route("**/*", abort_unnecessary_resources)
try:
# Can't use wait_until="networkidle" because it interferes with the scrolling behavior
page_response = page.goto(
initial_url,
timeout=30000, # 30 seconds
wait_until="domcontentloaded", # Wait for DOM to be ready
timeout=self.timeout, # milliseconds; defaults to 30000 (30 seconds)
wait_until="commit",
)
page.wait_for_function("document.readyState === 'interactive'")
page.evaluate("""
() => {
const images = document.querySelectorAll('img');
images.forEach(img => img.remove());
}
""")
page.wait_for_function("document.readyState === 'complete'") # wait for the page, including sub-resources, to finish loading

last_modified = page_response.header_value(
"Last-Modified") if page_response else None
Expand All @@ -588,7 +607,7 @@ def _do_scrape(
page.evaluate(
"window.scrollTo(0, document.body.scrollHeight)")
# wait for the content to load if we scrolled
page.wait_for_load_state("networkidle", timeout=30000)
page.wait_for_load_state("networkidle", timeout=self.timeout)
time.sleep(0.5) # let javascript run

new_height = page.evaluate("document.body.scrollHeight")
Expand Down
9 changes: 9 additions & 0 deletions web/src/lib/connectors/connectors.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,15 @@ export const connectorConfigs: Record<
name: "remove_by_selector",
optional: true
},
{
type: "number",
query: "Timeout (milliseconds):",
label: "Timeout (milliseconds)",
description:
"Timeout for the website to load the desired content",
name: "timeout",
optional: true,
},
],
overrideDefaultFreq: 60 * 60 * 24,
},
Expand Down
Loading