63 changes: 63 additions & 0 deletions docs/guides/code_examples/http_crawlers/lexbor_parser.py
@@ -0,0 +1,63 @@
import asyncio

from pydantic import ValidationError
from selectolax.lexbor import LexborHTMLParser
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
crawler = HttpCrawler(
max_request_retries=1,
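        # Limit the crawl to 10 requests; remove or raise this to crawl more pages.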
max_requests_per_crawl=10,
)

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using Selectolax with the Lexbor backend.
parsed_html = LexborHTMLParser(await context.http_response.read())

        # Extract data from the page.
        title_tag = parsed_html.css_first('title')
        data = {
            'url': context.request.url,
            # Guard against pages without a <title> element.
            'title': title_tag.text() if title_tag else None,
'h1s': [h1.text() for h1 in parsed_html.css('h1')],
'h2s': [h2.text() for h2 in parsed_html.css('h2')],
'h3s': [h3.text() for h3 in parsed_html.css('h3')],
}
await context.push_data(data)

        # CSS selector for links, excluding fragment, javascript: and mailto: hrefs.
links_selector = (
'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
)
base_url = URL(context.request.url)
extracted_requests = []

# Extract links.
for item in parsed_html.css(links_selector):
href = item.attributes.get('href')
if not href:
continue

# Convert relative URLs to absolute if needed.
url = str(base_url.join(URL(href)))
try:
request = Request.from_url(url)
except ValidationError as exc:
context.log.warning(f'Skipping invalid URL "{url}": {exc}')
continue
extracted_requests.append(request)

# Add extracted requests to the queue with the same-domain strategy.
await context.add_requests(extracted_requests, strategy='same-domain')

await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
61 changes: 61 additions & 0 deletions docs/guides/code_examples/http_crawlers/lxml_parser.py
@@ -0,0 +1,61 @@
import asyncio

from lxml import html
from pydantic import ValidationError

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
crawler = HttpCrawler(
max_request_retries=1,
max_requests_per_crawl=10,
)

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Parse the HTML content using lxml.
parsed_html = html.fromstring(await context.http_response.read())

# Extract data from the page.
data = {
'url': context.request.url,
'title': parsed_html.findtext('.//title'),
'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')],
'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')],
'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')],
}
await context.push_data(data)

# Convert relative URLs to absolute before extracting links.
parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)

        # XPath 1.0 selector for href attributes, excluding fragment,
        # javascript: and mailto: links.
links_xpath = (
'//a/@href[not(starts-with(., "#")) '
'and not(starts-with(., "javascript:")) '
'and not(starts-with(., "mailto:"))]'
)

extracted_requests = []

# Extract links.
for url in parsed_html.xpath(links_xpath):
try:
request = Request.from_url(url)
except ValidationError as exc:
context.log.warning(f'Skipping invalid URL "{url}": {exc}')
continue
extracted_requests.append(request)

# Add extracted requests to the queue with the same-domain strategy.
await context.add_requests(extracted_requests, strategy='same-domain')

await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
77 changes: 77 additions & 0 deletions docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py
@@ -0,0 +1,77 @@
import asyncio

from lxml import html
from pydantic import ValidationError
from saxonche import PySaxonProcessor

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
crawler = HttpCrawler(
max_request_retries=1,
max_requests_per_crawl=10,
)

# Create Saxon processor once and reuse across requests.
saxon_proc = PySaxonProcessor(license=False)
xpath_proc = saxon_proc.new_xpath_processor()

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Parse HTML with lxml.
parsed_html = html.fromstring(await context.http_response.read())
# Convert relative URLs to absolute before extracting links.
parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)
# Convert parsed HTML to XML for Saxon processing.
xml = html.tostring(parsed_html, encoding='unicode', method='xml')
# Parse XML with Saxon.
parsed_xml = saxon_proc.parse_xml(xml_text=xml)
# Set the parsed context for XPath evaluation.
xpath_proc.set_context(xdm_item=parsed_xml)

        # Extract data using the XPath 2.0 string() function.
        title_item = xpath_proc.evaluate_single('.//title/string()')
        data = {
            'url': context.request.url,
            # Convert the XDM item to a string so the record stays JSON-serializable.
            'title': str(title_item) if title_item is not None else None,
'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])],
'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])],
'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])],
}
await context.push_data(data)

# XPath 2.0 with distinct-values() to get unique links and remove fragments.
links_xpath = """
distinct-values(
for $href in //a/@href[
not(starts-with(., "#"))
and not(starts-with(., "javascript:"))
and not(starts-with(., "mailto:"))
]
return replace($href, "#.*$", "")
)
"""

extracted_requests = []

# Extract links.
for item in xpath_proc.evaluate(links_xpath) or []:
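            # Each result is an XDM atomic value; use its string form as the URL.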
url = item.string_value
try:
request = Request.from_url(url)
except ValidationError as exc:
context.log.warning(f'Skipping invalid URL "{url}": {exc}')
continue
extracted_requests.append(request)

# Add extracted requests to the queue with the same-domain strategy.
await context.add_requests(extracted_requests, strategy='same-domain')

await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
64 changes: 64 additions & 0 deletions docs/guides/code_examples/http_crawlers/pyquery_parser.py
@@ -0,0 +1,64 @@
import asyncio

from pydantic import ValidationError
from pyquery import PyQuery
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
crawler = HttpCrawler(
max_request_retries=1,
max_requests_per_crawl=10,
)

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Parse the HTML content using PyQuery.
parsed_html = PyQuery(await context.http_response.read())

# Extract data using jQuery-style selectors.
data = {
'url': context.request.url,
'title': parsed_html('title').text(),
'h1s': [h1.text() for h1 in parsed_html('h1').items()],
'h2s': [h2.text() for h2 in parsed_html('h2').items()],
'h3s': [h3.text() for h3 in parsed_html('h3').items()],
}
await context.push_data(data)

        # CSS selector for links, excluding fragment, javascript: and mailto: hrefs.
links_selector = (
'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
)
base_url = URL(context.request.url)

extracted_requests = []

# Extract links.
for item in parsed_html(links_selector).items():
href = item.attr('href')
if not href:
continue

# Convert relative URLs to absolute if needed.
url = str(base_url.join(URL(str(href))))
try:
request = Request.from_url(url)
except ValidationError as exc:
context.log.warning(f'Skipping invalid URL "{url}": {exc}')
continue
extracted_requests.append(request)

# Add extracted requests to the queue with the same-domain strategy.
await context.add_requests(extracted_requests, strategy='same-domain')

await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
74 changes: 74 additions & 0 deletions docs/guides/code_examples/http_crawlers/scrapling_parser.py
@@ -0,0 +1,74 @@
import asyncio

from pydantic import ValidationError
from scrapling.parser import Selector
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
crawler = HttpCrawler(
max_request_retries=1,
max_requests_per_crawl=10,
)

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Parse the HTML content using Scrapling.
page = Selector(await context.http_response.read(), url=context.request.url)

        # Extract data using XPath selectors, with the .get_all_text() method for the
        # full text content of each element.
title_el = page.xpath_first('//title')
data = {
'url': context.request.url,
'title': title_el.text if isinstance(title_el, Selector) else title_el,
'h1s': [
h1.get_all_text() if isinstance(h1, Selector) else h1
for h1 in page.xpath('//h1')
],
'h2s': [
h2.get_all_text() if isinstance(h2, Selector) else h2
for h2 in page.xpath('//h2')
],
'h3s': [
h3.get_all_text() if isinstance(h3, Selector) else h3
for h3 in page.xpath('//h3')
],
}
await context.push_data(data)

        # CSS selector for links, excluding fragment, javascript: and mailto: hrefs.
links_selector = (
'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
)
base_url = URL(context.request.url)
extracted_requests = []

# Extract links.
for item in page.css(links_selector):
href = item.attrib.get('href') if isinstance(item, Selector) else None
if not href:
continue

# Convert relative URLs to absolute if needed.
url = str(base_url.join(URL(href)))
try:
request = Request.from_url(url)
except ValidationError as exc:
context.log.warning(f'Skipping invalid URL "{url}": {exc}')
continue
extracted_requests.append(request)

# Add extracted requests to the queue with the same-domain strategy.
await context.add_requests(extracted_requests, strategy='same-domain')

await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
38 changes: 38 additions & 0 deletions docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py
@@ -0,0 +1,38 @@
import asyncio

from crawlee.crawlers import (
AdaptivePlaywrightCrawler,
AdaptivePlaywrightCrawlerStatisticState,
AdaptivePlaywrightCrawlingContext,
)
from crawlee.statistics import Statistics

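# SelectolaxLexborParser is a custom static parser from a sibling example module.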
from .selectolax_parser import SelectolaxLexborParser


async def main() -> None:
crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler(
max_requests_per_crawl=10,
# Use custom Selectolax parser for static content parsing.
static_parser=SelectolaxLexborParser(),
# Set up statistics with AdaptivePlaywrightCrawlerStatisticState.
statistics=Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState),
)

@crawler.router.default_handler
async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')
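        # query_selector_one works the same whether this request was rendered by the
        # static Selectolax parser or by Playwright.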
data = {
'url': context.request.url,
'title': await context.query_selector_one('title'),
}

await context.push_data(data)

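        # Discover links on the page and enqueue them for crawling.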
await context.enqueue_links()

await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
asyncio.run(main())