基于xpath的爬虫
爬取起点的热门书籍名称,作者,月票以及简介,并将结果保存在xiaoshuo.txt中
# Scrape Qidian's popular book list (title, author, monthly tickets, intro)
# across the first 5 result pages and append everything to xiaoshuo.txt.
import requests
from lxml import etree
import time

# Python 3 handles Unicode natively, so the old Python 2
# reload(sys)/sys.setdefaultencoding("utf8") hack is unnecessary (and
# no longer possible) — it has been removed.
HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}

# `with` guarantees the file is closed even if a request raises.
with open("xiaoshuo.txt", "w", encoding="utf-8") as fo:
    for page in range(5):
        url = ("https://www.qidian.com/all?orderId=&style=1&pageSize=20"
               "&siteid=1&pubflag=0&hiddenField=0&page=%d" % page)
        data = requests.get(url, headers=HEADER).text
        f = etree.HTML(data)
        # Detail-page links for every book on this list page.
        hrefs = f.xpath('/html/body/div[1]/div[5]/div[2]/div[2]/div/ul/li/div[2]/h4/a/@href')
        for href in hrefs:
            # The links are protocol-relative ("//book.qidian.com/...").
            href = "https:" + href
            book = requests.get(href, headers=HEADER).text
            e = etree.HTML(book)
            title = e.xpath('/html/body/div/div[6]/div[1]/div[2]/h1/em/text()')[0]
            zuozhe = e.xpath('/html/body/div/div[6]/div[1]/div[2]/h1/span/a/text()')[0]
            jieshao = e.xpath('/html/body/div/div[6]/div[4]/div[1]/div[1]/div[1]/p/text()')
            yuepiao = e.xpath('//*[@id="monthCount"]/text()')[0]
            # BUG FIX: the original embedded a literal line break inside a
            # single-quoted string (a SyntaxError); use '\n' instead.  Also
            # renamed the variable so it no longer shadows the builtin `str`.
            line = '<----->' + title + '<----->' + zuozhe + '<----->' + yuepiao + '\n'
            fo.write(line)
            for te in jieshao:
                fo.write(te)
基于selenium的爬虫
目的是爬取校园网上个人基本信息,未完成。最终目的是做出批量查询(学号密码有固定形式)
# Selenium-based crawler: log into the campus portal and dump the landing
# page.  Unfinished — the eventual goal is batch lookups, since student
# IDs and passwords follow a fixed pattern.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# NOTE(review): find_element_by_* never managed to locate the button needed
# to reach the next page; next step is to try combining selenium with the
# requests library.
driver = webdriver.Chrome()
driver.get("http://cas.hdu.edu.cn/cas/login?service=http%3A%2F%2Fonce.hdu.edu.cn%2Fdcp%2Findex.jsp")
elem1 = driver.find_element_by_id("un")
elem2 = driver.find_element_by_id("pd")
elem1.send_keys("学号")  # replace with your real student ID
elem2.send_keys("密码")  # replace with your real password
driver.find_element_by_id('index_login_btn').click()
driver.find_element_by_class_name('quickhome_item_link').click()
# BUG FIX: `print x` is Python 2 syntax and a SyntaxError in Python 3;
# use the print() function.
print(driver.page_source)
基于正则表达式的爬虫
贴吧图片批量下载(注:当前示例入口抓取的是一个豆瓣电影页面,仅用于调试正则匹配)
import re
import urllib
import urllib.request
def gethtml(url):
    """Download *url* and return the response body as text.

    BUG FIX: ``urllib.urlopen`` is Python 2 only; Python 3 moved it to
    ``urllib.request.urlopen``.  The raw bytes are decoded (UTF-8, with
    replacement on bad bytes) so callers can run regexes over a ``str``,
    and the socket is closed even if read() fails.
    """
    page = urllib.request.urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()  # release the connection deterministically
    return html.decode("utf-8", errors="replace")
def getimg(html):
    """Return every .jpg URL in *html* that appears as ``src="..." size``.

    BUG FIX: the original pattern ``r'src="(.+?.jpg)" size'`` left the dot
    before ``jpg`` unescaped, so it matched ANY character (e.g. ``xjpg``);
    ``\\.jpg`` matches a literal ``.jpg`` extension only.
    """
    imgre = re.compile(r'src="(.+?\.jpg)" size')
    return imgre.findall(html)
def downimg(imglist):
    """Download each URL in *imglist* to D:/VScode/image/ as 0.jpg, 1.jpg, ...

    BUG FIX: ``urllib.urlretrieve`` is Python 2 only; Python 3 provides
    ``urllib.request.urlretrieve``.  ``enumerate`` replaces the manual
    counter variable.
    """
    local = 'D:/VScode/image/'  # NOTE(review): target dir must exist — confirm
    for x, img in enumerate(imglist):
        urllib.request.urlretrieve(img, local + '%s.jpg' % x)
# Smoke test: fetch one page and print it so the regex can be tuned by eye.
html = gethtml("https://movie.douban.com/subject/26942674/")
# BUG FIX: `print html` is Python 2 syntax; Python 3 requires print().
print(html)