1. Code:
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import os
from urllib import parse
# Spider
class BtcSpider(object):
    def __init__(self):
        # Crawl images from a Tieba forum (the original target is the 美女 bar).
        # URL template: kw is the forum keyword, pn selects the page
        # (pn = 0 is page 1, pn = 50 is page 2, and so on).
        self.url = 'https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        }
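    # Page-to-pn mapping used throughout: page n maps to pn = (n - 1) * 50,
    # e.g. page 1 -> pn = 0, page 2 -> pn = 50, page 3 -> pn = 100.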
    # Send a request and return the response body as decoded text
    def get_response(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.text
        return data
    # Send a request and return the response body as raw bytes (content),
    # which is what we need for binary data such as images
    def get_data(self, url):
        data = requests.get(url, headers=self.headers).content
        return data
    # Parse the page: a thin wrapper around lxml's XPath evaluation
    def get_xpath(self, html, pattern):
        # Build the element tree
        p = etree.HTML(html)
        # Evaluate the XPath expression and return the matching attributes
        result = p.xpath(pattern)
        return result
    # Download the images from every thread on one forum page
    def download_src(self, url):
        html = self.get_response(url)
        # The thread list can be shipped inside an HTML comment; stripping the
        # "<!--" marker exposes it so the XPath below can match it
        html = html.replace("<!--", "")
        pattern1 = '//div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
        # Get the relative URL of every thread on the page
        url_lists = self.get_xpath(html, pattern1)
        for i in url_lists:
            # Build the full thread URL
            tie_url = "http://tieba.baidu.com" + i
            tie_html = self.get_response(tie_url)
            pattern2 = '//img[@class="BDE_Image"]/@src | //div[@class="video_src_wrapper"]/embed/@data-video'
            img_lists = self.get_xpath(tie_html, pattern2)
            # Download every image found in the thread
            self.save_data(img_lists)
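    # Note: pattern2 above also matches the data-video attribute of embedded
    # videos, so video source URLs can land in the same download list.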
    # Save the downloaded data to disk
    def save_data(self, url_lists):
        for url in url_lists:
            img_data = self.get_data(url)
            # Use the last 10 characters of the URL as the file name
            file_name = url[-10:]
            print('Downloading image:', file_name)
            # Build the output path in a platform-independent way
            file_path = os.path.join(self.dir, file_name)
            with open(file_path, 'wb') as f:
                f.write(img_data)
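    # Note: url[-10:] keeps only the tail of the image URL (typically a short
    # hash plus the extension); unrelated URLs could in principle collide.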
    def run(self):
        word = input("Enter the forum keyword: ")
        begin = int(input("Enter the start page: "))
        end = int(input("Enter the end page: "))
        # URL-encode the keyword
        name = parse.quote(word)
        # Save the images into a directory named after the keyword
        os.makedirs(word, exist_ok=True)
        self.dir = word
        # Crawl every requested page (end page included)
        for i in range(begin, end + 1):
            pn = (i - 1) * 50
            url = self.url.format(name, pn)
            self.download_src(url)
if __name__ == "__main__":
    spider = BtcSpider()
    spider.run()
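To make the parsing step concrete, here is a minimal, self-contained sketch of the XPath extraction the spider relies on, run against a hand-written HTML fragment. The fragment and URLs are invented for illustration; only the class names and XPath patterns come from the spider above.

# -*- coding: utf-8 -*-
from lxml import etree

# Hand-written fragment mimicking the relevant Tieba markup (invented data)
sample_html = '''
<div class="threadlist_title pull_left j_th_tit ">
    <a href="/p/1234567890">some thread title</a>
</div>
<img class="BDE_Image" src="https://example.com/forum/pic/abcdef.jpg"/>
'''

tree = etree.HTML(sample_html)
# Same patterns as pattern1 / pattern2 in the spider
thread_links = tree.xpath('//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
image_srcs = tree.xpath('//img[@class="BDE_Image"]/@src')
print(thread_links)   # ['/p/1234567890']
print(image_srcs)     # ['https://example.com/forum/pic/abcdef.jpg']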