import scrapy
import requests
import json
import re
from copy import deepcopy
from bs4 import BeautifulSoup
from urllib.parse import urlencode
class BookSpider(scrapy.Spider):
name = 'book'
allowed_domains = ['suning.com']
start_urls = ['http://book.suning.com/']
def parse(self, response):
"""
进入图书首页,分析图书类别和子类别
该方法仅解析了一个子类别图书
:param response:
:return:
"""
        # First-level category blocks and the matching sub-menu blocks (second/third level)
        one_types = response.xpath('//div[@class="menu-list"]/div[@class="menu-item"]')
        two_types = response.xpath('//div[@class="menu-list"]/div[@class="menu-sub"]/div[@class="submenu-left"]')
for one_index, menu_item in enumerate([one_types[0]]):
one_type = menu_item.xpath('./dl/dt/h3/a/text()').extract_first()
two_type_list = two_types[one_index].xpath('./p[@class="submenu-item"]/a/text()').extract()
for two_index, two_type_item in enumerate([two_type_list[0]]):
two_type = two_type_item
three_types = two_types[one_index].xpath('./ul')[two_index].xpath('./li')
for three_type_a in [three_types[0]]:
three_type = three_type_a.xpath('./a/text()').extract_first()
url = three_type_a.xpath('./a/@href').extract_first()
item = {}
item["one_type"] = one_type
item["two_type"] = two_type
item["three_type"] = three_type
item["type_url"] = url
yield scrapy.Request(item["type_url"], callback=self.get_book_page_num, meta={"item": deepcopy(item)})
def get_book_page_num(self, response):
"""进入图书列表页面获取每个图书详情地址
由于列表页面默认只会加载30条数据,因此通过接口的方式获取图书详情地址
该方法仅解析了第一页数据
"""
item = response.meta.get("item", {})
        # The last pager link exposes the total page count in its "pagenum" attribute
        page_num = int(response.xpath('//div[@id="bottom_pager"]/a[@role="menuitem"]')[-1].xpath('./@pagenum').extract_first())
        item["page_num"] = page_num
        # The category id ("ci") is the second dash-separated field of the listing URL
        ci = item["type_url"].split("-")[1]
        for i in range(1):  # only the first results page; use range(page_num) to crawl them all
            params = [
                ('ci', str(ci)),
                ('pg', '03'),
                ('cp', str(i)),
                ('il', '0'),
                ('iy', '0'),
                ('adNumber', '0'),
                ('n', '1'),
                ('ch', '4'),
                ('prune', '0'),
                ('sesab', 'ACBAABC'),
                ('id', 'IDENTIFYING'),
                ('cc', '089'),
            ]
            book_list_api = "https://list.suning.com/emall/showProductList.do?" + urlencode(params)
            # First 30 items of this page
            yield scrapy.Request(book_list_api, callback=self.parse_book_list, meta={"item": deepcopy(item)})
            # Remaining 30 items: the API expects the extra paging parameters, so rebuild the URL
            params.append(('paging', '1'))
            params.append(('sub', '0'))
            book_list_api = "https://list.suning.com/emall/showProductList.do?" + urlencode(params)
            yield scrapy.Request(book_list_api, callback=self.parse_book_list, meta={"item": deepcopy(item)})
def parse_book_list(self, response):
"""
接口返回的数据为存在缺失的html代码,xpath解析有误,因此使用BeautifulSoup解析获取详情页地址
:param response:
:return:
"""
item = response.meta.get("item", {})
soup = BeautifulSoup(response.text, "lxml")
books = soup.find_all('a', attrs={'class': 'sellPoint'})
for book in books:
detail_url = "https:" + book.get('href')
yield scrapy.Request(detail_url, callback=self.parse_book_detail, meta={"item": deepcopy(item)})
def parse_book_detail(self, response):
"""
解析详情页获取图书名称、价格、作者、出版社信息
由于详情页有反爬措施,xpath无法解析因此使用BeautifulSoup
:param response:
:return:
"""
price = self.get_price(response)
item = response.meta.get("item", {})
soup = BeautifulSoup(response.text, "html.parser")
li_list = soup.find_all('li', attrs={'class': 'pb-item'})
        if len(li_list) > 0:
            item["author"] = self.replace(li_list[0].text)
        if len(li_list) > 1:
            item["press"] = self.replace(li_list[1].text)
        if len(li_list) > 2:
            item["time"] = self.replace(li_list[2].text)
name = soup.find('h1', attrs={"id": "itemDisplayName"}).text.replace("\n", "").replace("\u3000", " ")
image_url = response.xpath('//div[@class="imgzoom-main"]/a/img/@src').extract_first()
item["name"] = name
item["price"] = price
item["image_url"] = "https:" + image_url
        yield item  # hand the completed item to Scrapy (item pipelines / feed exports)
def get_price(self, response):
"""
获取价格
通过接口分析参数后发现仅passPartNumber、vendorCode控制价格信息因解析该参数即可
由于详情页有反爬措施,xpath无法解析因此使用BeautifulSoup
:param response:
:return:
"""
        # Pull passPartNumber and vendorCode out of the raw page source with regular expressions
        passPartNumber_str = re.findall(r'"passPartNumber":"[0-9]*?"', response.text)[0]
        passPartNumber = passPartNumber_str.split('"')[-2]
        vendorCode_str = re.findall(r'"vendorCode":"[0-9]*?"', response.text)[0]
        vendorCode = vendorCode_str.split('"')[-2]
        # Price API URL: only the partNumber and vendorCode segments vary per book
        url = "https://pas.suning.com/nspcsale_0_{}_{}_{}_300_089_0890199_502282_1000347_8999_100138_Z001___R9011205_3.0____0001400PA____0___16.0_2__502320_502687_.html?callback=pcData&_=1637305043921".format(
            passPartNumber, passPartNumber, vendorCode
        )
        # Synchronous request outside Scrapy's scheduler (blocks briefly; acceptable for a small demo)
        r = requests.get(url=url)
        # Strip the pcData(...) JSONP wrapper to leave plain JSON
        json_data = r.text.replace("pcData(", "")[:-2]
        price = json.loads(json_data)["data"]["price"]["saleInfo"][0]["netPrice"]
return price
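
    # Optional sketch, not called by the spider above: a more defensive way to unwrap a
    # JSONP payload, assuming the callback name ("pcData") might change. The method name
    # _unwrap_jsonp is illustrative and not part of the original code.
    def _unwrap_jsonp(self, text, default="{}"):
        match = re.search(r'^\s*[\w$]+\((.*)\)\s*;?\s*$', text, re.S)
        return match.group(1) if match else default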
    def replace(self, text):
        """Clean up special characters:
        1. remove newlines
        2. remove tabs
        3. remove ASCII spaces
        4. replace ideographic (U+3000) and non-breaking (U+00A0) spaces with a normal space
        """
        temp = text.replace("\n", "").replace("\t", "").replace(" ", "").replace("\u3000", " ").replace(u'\xa0', u' ')
        return temp
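

# A minimal, self-contained way to run this spider as a script (a sketch, assuming no
# Scrapy project settings are required). The FEEDS path, USER_AGENT and DOWNLOAD_DELAY
# values below are illustrative, not part of the original project.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        "FEEDS": {"books.jl": {"format": "jsonlines"}},  # export yielded items as JSON lines
        "USER_AGENT": "Mozilla/5.0",
        "DOWNLOAD_DELAY": 1,
    })
    process.crawl(BookSpider)
    process.start()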