Web Scraping Study Notes
Based on this guru's study notes | https://www.acwing.com/blog/content/41265/
I skipped the earlier captcha-related parts out of laziness.
Below is a simple Selenium example:
# Imports
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import sleep for fixed waits
from time import sleep

driver = webdriver.Chrome()
driver.get("https://www.taobao.com/")
# Type the query "充电宝" (power bank) into the Taobao search box
driver.find_element(By.ID, 'q').send_keys("充电宝")
sleep(5)
# Scroll to the bottom of the page
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
sleep(5)
# Locate the search button and click it
driver.find_element(By.CLASS_NAME, 'btn-search').click()
sleep(3)
# Navigate to Baidu and run the same search there
driver.get('https://www.baidu.com')
sleep(2)
driver.find_element(By.ID, 'kw').send_keys("充电宝")
sleep(2)
driver.find_element(By.ID, 'su').click()
sleep(2)
# Go back, then forward, in browser history
driver.back()
sleep(2)
driver.forward()
sleep(5)
# Close the browser
driver.quit()
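The fixed sleep() calls above are fragile: the page may load faster or slower than the hard-coded delay. A more robust pattern is Selenium's explicit waits. Below is a minimal sketch; the 10-second timeout is arbitrary and the element ID 'q' is simply reused from the Taobao example above.

# Minimal sketch: wait for an element instead of sleeping blindly
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.taobao.com/")
# Block for up to 10 seconds until the search box is present in the DOM
search_box = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'q'))
)
search_box.send_keys("充电宝")
driver.quit()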
Headless mode and evading automation detection
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Import sleep for fixed waits
from time import sleep

chrome_options = Options()
# Run Chrome headless (no visible browser window)
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Basic evasion: hide the "Chrome is being controlled by automated software" banner
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(options=chrome_options)
driver.get('https://www.baidu.com')
# Headless evasion is still imperfect: some servers can still detect the automation
print(driver.page_source)
sleep(15)
driver.quit()  # Close the browser when done
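Since servers can still spot automation, one commonly used extra step is to overwrite navigator.webdriver through the Chrome DevTools Protocol before any page script runs. This is only a sketch of that idea, not a guarantee against detection; it reuses the same chrome_options setup as above.

# Minimal sketch: hide navigator.webdriver on every new page (Selenium 4 + Chrome)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(options=chrome_options)
# Inject a script that runs before any page script, masking the webdriver flag
driver.execute_cdp_cmd(
    'Page.addScriptToEvaluateOnNewDocument',
    {'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"}
)
driver.get('https://www.baidu.com')
print(driver.execute_script('return navigator.webdriver'))  # should now be None
driver.quit()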
Simple QQ Zone (QQ 空间) login example
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import sleep for fixed waits
from time import sleep

if __name__ == "__main__":
    driver = webdriver.Chrome()
    driver.get('https://i.qq.com/')
    sleep(1)
    # The login form lives inside an iframe; switch into it first
    driver.switch_to.frame('login_frame')
    # Switch from QR-code login to account/password login
    driver.find_element(By.ID, 'switcher_plogin').click()
    sleep(2)
    driver.find_element(By.ID, 'u').send_keys('*@qq.com')
    sleep(2)
    driver.find_element(By.ID, 'p').send_keys("*")
    sleep(2)
    driver.find_element(By.ID, 'login_button').click()
    sleep(6)
    driver.quit()
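Once the login succeeds, a common next step is to pull the session cookies out of the browser so a plain requests session can reuse them. A minimal sketch follows; it would run before driver.quit() in the snippet above, and the target URL is only an example.

# Minimal sketch: reuse the logged-in Selenium session's cookies with requests
import requests

session = requests.Session()
for cookie in driver.get_cookies():               # list of dicts with 'name'/'value' keys
    session.cookies.set(cookie['name'], cookie['value'])
resp = session.get('https://user.qzone.qq.com/')  # example URL, adjust as needed
print(resp.status_code)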
Simple drag-and-drop example
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
# Import sleep for fixed waits
from time import sleep

driver = webdriver.Chrome()
driver.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
# The draggable element lives inside an iframe
driver.switch_to.frame('iframeResult')
div = driver.find_element(By.ID, 'draggable')
action = ActionChains(driver)
# Press and hold the element, then drag it to the right in small steps
action.click_and_hold(div)
for i in range(5):
    action.move_by_offset(17, 0).perform()
    sleep(1)
# Release the mouse button (perform() is required to actually execute it)
action.release().perform()
sleep(5)
driver.quit()
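ActionChains also offers a one-shot helper that replaces the manual press/move/release loop. A minimal sketch, meant to be used in place of the loop above before driver.quit(); the 85-pixel offset simply mirrors the 5 × 17-pixel steps.

# Minimal sketch: the same drag as a single chained call
div = driver.find_element(By.ID, 'draggable')
ActionChains(driver).drag_and_drop_by_offset(div, 85, 0).perform()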
Using Scrapy to scrape chapter titles from Biquge (笔趣阁)
import scrapy

class FirstSpider(scrapy.Spider):
    name = "first"
    # allowed_domains = ["www.baidu.com"]
    start_urls = ["https://www.bq90.cc/book/136005/"]

    def parse(self, response):
        # Each <dd> under the chapter list holds one chapter link
        dd_list = response.xpath('//*[@class="listmain"]/dl/dd')
        for dd in dd_list:
            # .get() extracts the text of the first matching node
            title = dd.xpath('.//a/text()').get()
            print(title)
Sample output (run with scrapy crawl first):
第1章 选妻,还是参军?
第2章 我全都要
第3章 三个人
第4章 遭遇劫匪
第5章 剿匪
第6章 手腕强硬
第7章 出手阔绰
第8章 齐人之福不好享
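To go one step further and fetch each chapter's body text, the spider could follow every link it finds with response.follow. Below is a minimal sketch; the content selector div[@id="chaptercontent"] is an assumption about the page layout, not verified against the site.

import scrapy

class ChapterSpider(scrapy.Spider):
    name = "chapters"
    start_urls = ["https://www.bq90.cc/book/136005/"]

    def parse(self, response):
        for dd in response.xpath('//*[@class="listmain"]/dl/dd'):
            title = dd.xpath('.//a/text()').get()
            href = dd.xpath('.//a/@href').get()
            if href:
                # response.follow resolves relative URLs and schedules the request
                yield response.follow(href, callback=self.parse_chapter,
                                      cb_kwargs={'title': title})

    def parse_chapter(self, response, title):
        # Hypothetical selector for the chapter body; adjust to the real page structure
        body = response.xpath('//div[@id="chaptercontent"]//text()').getall()
        yield {'title': title, 'content': ''.join(body)}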