playwright的数据爬取

对于playwright的爬虫,一般先使用XPath Helper来查找自己想要的内容。关于XPath Helper:按下Ctrl+Shift+X可以弹出界面;然后只按住Ctrl+Shift,把鼠标移到想要查找的元素上,就能显示出来,并且会直接把文本内容显示出来。接着使用playwright codegen一步一步到达目标所在位置,最后使用循环来进行批量爬取。

#导入所需要的包
import re
from playwright.sync_api import Playwright, sync_playwright, expect
from openpyxl import Workbook
import os
import time

# Initialise the Excel workbook.
# The workbook lives in memory; run() appends one row per scraped table line
# and writes the file to disk with wb.save().
wb = Workbook()
ws = wb.active
# Header row: team, material name, type, unit, quantity. The order must match
# the order in which run() appends the scraped cell values.
ws.append(["队伍", "材料名称", "类型", "单位", "数量"])
============

# Set the Excel output path: "output.xlsx" inside the "Desktop" folder of the
# current user's home directory.
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = os.path.join(desktop_path, "output.xlsx")
============

# Browser automation: log in, navigate to the target list, scrape every page.

# XPath of the text <span> inside one cell of the vxe-table body, addressed by
# 1-based row index and by the column's CSS class.
_CELL_XPATH = (
    "//table[contains(@class, 'vxe-table--body')]"
    "//tr[{row}]//td[contains(@class, '{col}')]//span"
)
# Column classes, in the same order as the header row written to the sheet:
# team, material name, type, unit, quantity.
_COLUMN_CLASSES = ("col_22", "col_32", "col_43", "col_44", "col_45")
# The page size selected in the UI ("100条/页").
_ROWS_PER_PAGE = 100


def _log_in(page):
    """Fill in the login form on ``page`` and open the "应用中心" entry."""
    # NOTE(review): the credentials are hard-coded in the source; consider
    # moving them to environment variables or a local config file.
    username_box = page.get_by_role("textbox", name="请输入用户名")
    username_box.click()
    username_box.fill("17353190891")
    username_box.press("Tab")
    # fill() sets the field value directly, so the CapsLock presses and the
    # partial fill("A") recorded by codegen were redundant and are dropped.
    page.get_by_role("textbox", name="请输入密码").fill("Aa736188huang")
    page.get_by_role("button", name="登 录").click()
    page.get_by_text("应用中心").click()


def _open_issue_list(page):
    """Open the "材料管理系统" popup and navigate to the "领料出库" list.

    Returns the popup page, with the pager switched to 100 rows per page.
    """
    # Clicking the application icon opens a new window; capture it.
    with page.expect_popup() as popup_info:
        page.locator("#app-group-0").get_by_role("img", name="材料管理系统").click()
    page1 = popup_info.value

    page1.locator("div").filter(has_text=re.compile(r"^库存管理$")).click()
    page1.locator("div").filter(has_text=re.compile(r"^材料出库$")).click()
    page1.get_by_role("link", name="领料出库").click()

    page1.wait_for_timeout(5000)  # give the list page time to load

    # Switch the page-size selector to 100 rows per page.
    page1.get_by_role("textbox", name="请选择").click()
    page1.get_by_text("100条/页").click()
    return page1


def _scrape_current_page(page1):
    """Append every row of the currently displayed table page to ``ws``."""
    for row in range(1, _ROWS_PER_PAGE + 1):
        cells = [
            page1.locator(_CELL_XPATH.format(row=row, col=col))
            for col in _COLUMN_CLASSES
        ]
        # count() does not wait, so a page with fewer than 100 rows ends
        # immediately instead of stalling for the default locator timeout.
        # Row 1 is exempt so that it still auto-waits for a slow table.
        if row > 1 and cells[0].count() == 0:
            break
        try:
            values = [cell.first.inner_text() for cell in cells]
            # Print the row and write it to the sheet.
            print("|".join(values))
            ws.append(values)
        except Exception as e:
            # Best effort: report the failing row and give up on this page.
            print(f"第{row}行出错:{e}")
            break


def run(playwright: Playwright) -> None:
    """Log in, scrape every page of the "领料出库" list and save it to Excel.

    Launches a visible Chromium window, logs in, opens the list inside the
    "材料管理系统" popup, then walks the pager and appends each table row to
    the module-level worksheet ``ws``.

    The workbook ``wb`` is saved to ``file_path`` and the browser is closed
    even when a step fails, so rows collected before the failure are kept.
    The exception itself still propagates to the caller.
    """
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    try:
        page = context.new_page()
        page.goto("https://********")

        _log_in(page)
        page1 = _open_issue_list(page)

        # Read the total number of pages.
        # wait_for_timeout is used instead of time.sleep, as the Playwright
        # documentation recommends for manual waits.
        page1.wait_for_timeout(5000)
        # Presumably the fifth pager button shows the last page number;
        # TODO confirm against the real pager markup.
        total_pages = int(
            page1.locator("//button[@class='vxe-pager--num-btn'][5]").first.inner_text()
        )
        print(f"共 {total_pages} 页")

        for page_num in range(1, total_pages + 1):
            _scrape_current_page(page1)
            print("============================================")

            # Go to the next page by clicking its number in the pager.
            if page_num < total_pages:
                page1.get_by_role("button", name=str(page_num + 1), exact=True).click()
                page1.wait_for_timeout(3000)  # wait for the next page to load
    finally:
        try:
            # Save whatever has been collected so far.
            wb.save(file_path)
        finally:
            context.close()
            browser.close()

# Start the Playwright sync driver and run the scraper; leaving the `with`
# block shuts the driver down again.
with sync_playwright() as playwright:
    run(playwright)