使用 Playwright PageMethods

使用 Playwright PageMethods

要使用 scrapy-playwright 与页面交互,我们需要使用 PageMethod 类。

PageMethod允许我们在页面上做很多不同的事情,包括:

  • 等待元素加载后再返回响应
  • 滚动页面
  • 点击页面元素
  • 截取页面截图
  • 创建页面的 PDF

首先,要在您的爬虫中使用 PageMethod 功能,您需要将 playwright_include_page 设置为 True,以便我们可以访问 Playwright 的 Page 对象,并将任何回调(即 def parse)定义为协程函数(async def),以便等待提供的 Page 对象。

# spiders/quotes.py

import scrapy
from quotes_js_scraper.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    """Minimal spider showing how to get access to the Playwright Page object."""

    name = 'quotes'

    def start_requests(self):
        url = 'https://quotes.toscrape.com/js/'
        yield scrapy.Request(url, meta=dict(
            playwright=True,
            # Expose the Playwright Page object to the callback via response.meta.
            playwright_include_page=True,
        ))

    # Callbacks must be coroutines (async def) so they can await the Page.
    async def parse(self, response):
        ...

注意:设置'playwright_include_page': True时还建议您设置请求错误回调,以确保即使请求失败也会关闭页面(如果 playwright_include_page=False 或未设置,则遇到异常时页面会自动关闭)。

# spiders/quotes.py

import scrapy
from quotes_js_scraper.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    """Scrape quotes from the JS-rendered site, closing the Playwright page safely."""

    name = 'quotes'

    def start_requests(self):
        url = 'https://quotes.toscrape.com/js/'
        yield scrapy.Request(url, meta=dict(
            playwright=True,
            playwright_include_page=True,
            # With playwright_include_page=True an errback is required so the
            # page is closed even when the request fails.
            errback=self.errback,
        ))

    async def parse(self, response):
        # Close the page as soon as we have the rendered response; the HTML
        # is already in `response`, so the browser page is no longer needed.
        page = response.meta["playwright_page"]
        await page.close()

        for quote in response.css('div.quote'):
            quote_item = QuoteItem()
            quote_item['text'] = quote.css('span.text::text').get()
            quote_item['author'] = quote.css('small.author::text').get()
            quote_item['tags'] = quote.css('div.tags a.tag::text').getall()
            yield quote_item

    async def errback(self, failure):
        # Ensure the browser page is released on request failure.
        page = failure.request.meta["playwright_page"]
        await page.close()


1. 等待页面元素

为了在 JavaScript 渲染完成、向抓取器返回响应之前等待特定的页面元素,我们只需要在请求的 playwright_page_methods 键中添加一个 PageMethod,并指定 wait_for_selector 方法。

现在,当我们运行爬虫时,scrapy-playwright 会持续渲染页面,直到页面上出现带有 quote 类的 div 元素。

# spiders/quotes.py

import scrapy
from quotes_js_scraper.items import QuoteItem
from scrapy_playwright.page import PageMethod


class QuotesSpider(scrapy.Spider):
    """Wait for a selector to appear before the rendered response is returned."""

    name = 'quotes'

    def start_requests(self):
        url = "https://quotes.toscrape.com/js/"
        yield scrapy.Request(url, meta=dict(
            playwright=True,
            playwright_include_page=True,
            # Block until at least one quote <div> has been rendered.
            playwright_page_methods=[PageMethod('wait_for_selector', 'div.quote')],
            errback=self.errback,
        ))

    async def parse(self, response):
        # The rendered HTML is already in `response`; release the page first.
        page = response.meta["playwright_page"]
        await page.close()

        for quote in response.css('div.quote'):
            quote_item = QuoteItem()
            quote_item['text'] = quote.css('span.text::text').get()
            quote_item['author'] = quote.css('small.author::text').get()
            quote_item['tags'] = quote.css('div.tags a.tag::text').getall()
            yield quote_item

    async def errback(self, failure):
        # Ensure the browser page is released on request failure.
        page = failure.request.meta["playwright_page"]
        await page.close()

2. 抓取多个页面

通常我们需要在 javascript 呈现的网站上抓取多个页面。我们将通过检查页面上是否存在下一页链接,然后使用从页面抓取的 URL 请求该页面来实现此目的。

# spiders/quotes.py

import scrapy
from quotes_js_scraper.items import QuoteItem
from scrapy_playwright.page import PageMethod


class QuotesSpider(scrapy.Spider):
    """Crawl every page of the JS-rendered site by following the next-page link."""

    name = 'quotes'

    def start_requests(self):
        url = "https://quotes.toscrape.com/js/"
        yield scrapy.Request(url, meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_methods=[
                PageMethod('wait_for_selector', 'div.quote'),
            ],
            errback=self.errback,
        ))

    async def parse(self, response):
        # Release the browser page before parsing the already-rendered HTML.
        page = response.meta["playwright_page"]
        await page.close()

        for quote in response.css('div.quote'):
            quote_item = QuoteItem()
            quote_item['text'] = quote.css('span.text::text').get()
            quote_item['author'] = quote.css('small.author::text').get()
            quote_item['tags'] = quote.css('div.tags a.tag::text').getall()
            yield quote_item

        next_page = response.css('.next>a ::attr(href)').get()

        if next_page is not None:
            # urljoin keeps the scheme/host of the current response
            # (the original hard-coded 'http://quotes.toscrape.com', which
            # downgraded the crawl from https to http).
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[
                    PageMethod('wait_for_selector', 'div.quote'),
                ],
                errback=self.errback,
            ))

    async def errback(self, failure):
        # Ensure the browser page is released on request failure.
        page = failure.request.meta["playwright_page"]
        await page.close()

3. 向下滚动无限滚动页面

当网站使用无限滚动加载数据时,我们还可以配置 scrapy-playwright 来向下滚动页面。

在这个例子中,Playwright 将等待div.quote出现,然后向下滚动页面,直到到达第 10 条引言。

# spiders/quotes.py

import scrapy
from quotes_js_scraper.items import QuoteItem
from scrapy_playwright.page import PageMethod


class QuotesSpider(scrapy.Spider):
    """Scroll an infinite-scroll page until the second batch of quotes loads."""

    name = 'quotes'

    def start_requests(self):
        url = "https://quotes.toscrape.com/scroll"
        yield scrapy.Request(url, meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_methods=[
                # 1) Wait for the first quotes to render.
                PageMethod("wait_for_selector", "div.quote"),
                # 2) Scroll to the bottom to trigger loading of the next batch.
                PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                # 3) Wait until the 11th quote appears (10 per page).
                PageMethod("wait_for_selector", "div.quote:nth-child(11)"),
            ],
            errback=self.errback,
        ))

    async def parse(self, response):
        # Release the browser page before parsing the rendered HTML.
        page = response.meta["playwright_page"]
        await page.close()

        for quote in response.css('div.quote'):
            quote_item = QuoteItem()
            quote_item['text'] = quote.css('span.text::text').get()
            quote_item['author'] = quote.css('small.author::text').get()
            quote_item['tags'] = quote.css('div.tags a.tag::text').getall()
            yield quote_item

    async def errback(self, failure):
        # Ensure the browser page is released on request failure.
        page = failure.request.meta["playwright_page"]
        await page.close()

4. 截取页面

截取页面屏幕截图也很简单。

在这里我们等待 Playwright 看到选择器div.quote,然后它截取页面的屏幕截图。

# spiders/quotes.py

import scrapy
from quotes_js_scraper.items import QuoteItem
from scrapy_playwright.page import PageMethod


class QuotesSpider(scrapy.Spider):
    """Take a full-page screenshot once the quotes have been rendered."""

    name = 'quotes'

    def start_requests(self):
        url = "https://quotes.toscrape.com/js/"
        yield scrapy.Request(url, meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_methods=[
                PageMethod("wait_for_selector", "div.quote"),
            ],
            # playwright_include_page=True requires an errback so the page
            # is closed even if the request fails.
            errback=self.errback,
        ))

    async def parse(self, response):
        page = response.meta["playwright_page"]
        # `screenshot` both saves the file and returns the image bytes.
        screenshot = await page.screenshot(path="example.png", full_page=True)
        # screenshot contains the image's bytes
        await page.close()

    async def errback(self, failure):
        # Ensure the browser page is released on request failure.
        page = failure.request.meta["playwright_page"]
        await page.close()


使用 Scrapy Playwright

在 Scrapy Playwright 中,可以通过在 PLAYWRIGHT_LAUNCH_OPTIONS 设置中指定 proxy 键,在浏览器级别配置代理。

我们还需要在 scrapy.Request 的 playwright_context_kwargs 中设置 ignore_https_errors 键:

在下面的示例中,我们将展示如果您使用ScrapeOps Proxy API Aggregator ,它是如何工作的。您唯一需要更改的部分是YOUR_API_KEY_HERE- 将其替换为您的 scrapeops api 密钥。

(以下代码自 2023 年 10 月起有效)

# spiders/quotes.py

import scrapy


class ProxySpider(scrapy.Spider):
    """Route all Playwright browser traffic through the ScrapeOps proxy."""

    name = "quotes"
    # Browser-level proxy: applied to every page the Playwright browser opens.
    custom_settings = {
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "proxy": {
                "server": "http://proxy.scrapeops.io:5353",
                "username": "scrapeops",
                "password": "YOUR_API_KEY_HERE",
            },
        }
    }

    def start_requests(self):
        # httpbin echoes the request origin, which lets us verify the proxy
        # is being used. (The original also defined an unused `url` local.)
        yield scrapy.Request(
            "http://httpbin.org/get",
            meta=dict(
                playwright=True,
                # The proxy intercepts TLS, so certificate errors must be ignored.
                playwright_context_kwargs={
                    "ignore_https_errors": True,
                },
            )
        )

    def parse(self, response):
        print(response.text)

import scrapy
from scrapy_playwright.page import PageMethod


class DynamicClickSpider(scrapy.Spider):
    """Click a series of buttons on one page and yield the HTML after each click."""

    name = 'dynamic_click_spider'

    def start_requests(self):
        url = 'https://example.com'  # replace with the real start URL
        yield scrapy.Request(url, meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_methods=[
                # Wait until the load-more button is present before parsing.
                PageMethod("wait_for_selector", "button.load-more"),
            ],
        ), callback=self.parse, errback=self.errback)

    async def parse(self, response):
        page = response.meta["playwright_page"]

        # try/finally guarantees the page is closed even if a click or wait
        # raises (the original leaked the page on any in-loop exception).
        try:
            # Capture the initial page content.
            initial_content = await page.content()
            yield {
                'html': initial_content,
            }

            # Assume several buttons each load different content.
            for i in range(1, 6):  # click five times
                button_selector = f"button.load-more-{i}"  # replace with the real selector

                await page.click(button_selector)
                await page.wait_for_timeout(2000)  # give the new content time to load

                # Capture the content after each click.
                updated_content = await page.content()
                yield {
                    'html': updated_content,
                }
        finally:
            await page.close()

    async def errback(self, failure):
        # The page may be absent if the failure happened before rendering.
        page = failure.request.meta.get("playwright_page")
        if page:
            await page.close()

常用的 PageMethod 操作
wait_for_selector(selector):

等待指定的选择器在页面中可用。
用法:PageMethod("wait_for_selector", "div.quote")
fill(selector, value):

向指定的输入框填充值。
用法:PageMethod("fill", "input[name='username']", "your_username")
click(selector):

点击指定的元素。
用法:PageMethod("click", "button[type='submit']")
evaluate(expression):

执行自定义的 JavaScript 代码在页面上下文中。
用法:PageMethod("evaluate", "document.title") (获取页面标题)
goto(url):

导航到指定的 URL。
用法:PageMethod("goto", "https://example.com")
wait_for_navigation():

等待页面导航完成。
用法:PageMethod("wait_for_navigation")(注意:较新版本的 Playwright Python API 中没有此方法,请改用 wait_for_url 或 wait_for_load_state——请以官方文档为准)
scroll_to(selector):

滚动到指定元素的位置(虽然在 Scrapy Playwright 中不一定直接支持,通常可以通过 evaluate 来实现)。
用法:PageMethod("evaluate", "document.querySelector('div').scrollIntoView()")
screenshot(path):

截取当前页面的屏幕截图,并保存到指定路径。
用法:PageMethod("screenshot", {"path": "screenshot.png"})
close():

关闭当前页面。
用法:PageMethod("close")(通常不直接在请求中使用,而是在解析或错误处理中使用)。