"""
抓取
解析
存储
"""
import re
from urllib import parse

import requests
from scrapy import Selector

from models import *

domain = "http://www.91jf.com/"
# Helper used during debugging to dump fetched HTML into a local file.
def write_txt(html_data):
    with open("a.txt", "w", encoding="utf-8") as f:
        f.write(html_data)
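
# Background for the "&amp;"/"&quot;" handling throughout this file:
# Selector(...).extract() returns re-serialized HTML, in which "&" inside
# attribute values comes back as "&amp;". A minimal illustration:
#
#   tag = Selector(text='<a href="a.php?x=1&y=2">t</a>').xpath("//a").extract()[0]
#   # tag == '<a href="a.php?x=1&amp;y=2">t</a>'
#   tag.replace("&amp;", "&")  # restores a usable query string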
def get_nodes_json():
    # Fetch the home page and collect the serialized <a> tags of the
    # left-hand category menu.
    left_menu_text = requests.get("http://www.91jf.com/").text
    sel = Selector(text=left_menu_text)
    all_divs = sel.xpath("//div[@class='class_child_li']//a[@href]").extract()
    if all_divs:
        nodes_lists = []
        for nodes_str in all_divs:
            # "&" in the href is escaped to "&amp;" during serialization,
            # so it has to be converted back before the URL can be reused.
            nodes_lists.append(nodes_str.replace("&amp;", "&"))
        return nodes_lists
    return []
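
# Each entry returned above is one serialized menu anchor: the category URL
# is wrapped in "&quot;" entities (the links are js-driven, so the URL sits
# inside an attribute's JavaScript string) and the label sits in a <span>.
# process_nodes_list() and get_level1_list() both regex against that shape.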
def process_nodes_list(nodes_list):
    # Pull a (url, name) pair out of each js-style menu anchor.
    url_list_names = []
    for item in nodes_list:
        # The URL: a &quot;-delimited relative path ending in a numeric id.
        url = re.search(r'&quot;.*\d&quot;', item)
        url = url.group(0).replace("&quot;", "")
        url = parse.urljoin(domain, url)
        # The product-category label sits inside the <span> tag.
        name = re.search("<span>.*</span>", item)
        name = name.group(0).replace("<span>", "").replace("</span>", "")
        url_list_names.append([url, name])
    return url_list_names
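
# A possibly simpler alternative (sketch): html.unescape() undoes all HTML
# entities in one pass, after which a plain quote-delimited regex suffices:
#
#   import html
#   text = html.unescape(item)            # "&quot;" -> '"', "&amp;" -> "&"
#   m = re.search(r'"([^"]*\d)"', text)   # quoted URL ending in a digit
#   url = parse.urljoin(domain, m.group(1)) if m else None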
def get_level1_list(nodes_list):
    # Build the first listing page of every category, sorted by sales
    # volume in descending order.
    level1_url = []
    for item in nodes_list:
        # Same quoted-URL extraction as in process_nodes_list().
        url = re.search(r'&quot;.*\d&quot;', item)
        url = url.group(0).replace("&quot;", "")
        url1 = parse.urljoin(domain, url + "&okey=salenum&order=desc&page=1")
        level1_url.append(url1)
    return level1_url
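
# The resulting first-page URL looks roughly like this (hypothetical path
# and category id; the query suffix is the part appended above):
#   http://www.91jf.com/goods_list.php?cid=215&okey=salenum&order=desc&page=1
# i.e. the category listing sorted by sales volume, descending.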
def get_last_urls():
    # Assemble the final list of category listing URLs to crawl.
    nodes_list = get_nodes_json()
    url_names = process_nodes_list(nodes_list)  # (url, name) pairs, currently unused
    level1_url = get_level1_list(nodes_list)
    return level1_url
def parse_product(url):
    # Parse one listing page: product details plus sales numbers, building
    # one Product record per item.
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    res_li = sel.xpath("//div[@class='pro_list_div g-clearfix c']/ul//li[@class='goods_offset']")
    for item in res_li:
        name = item.xpath("./div[@class='row row-2 title']/a/text()").extract_first("")
        price = item.xpath('./div[@id="goods_detail_b"]/div[@class="row row-1"]/div[@class="g_price fm2"]/strong/text()').extract_first("")
        sales_num = item.xpath("./div[@id='goods_detail_b']/div[2]/p[1]/text()").extract_first("")
        merchant = item.xpath("./div[@id='goods_detail_b']/div[2]/p[2]/text()").extract_first("")
        main_Products = item.xpath("./div[@id='goods_detail_b']/div[2]/p[3]/text()").extract_first("")
        merchant_Place = item.xpath("./div[@id='goods_detail_b']/div[2]/p[4]/text()").extract_first("")

        product = Product()
        product.name = name
        product.price = price
        product.sales_num = sales_num
        product.merchant = merchant
        product.main_Products = main_Products
        product.merchant_Place = merchant_Place
        #product.save()

    # Follow the "next page" link; fewer than a full page of items
    # (32 per page) means this was the last page.
    next_page = sel.xpath("//*[@class='pagination2']/a[@href]").extract()
    if len(next_page) > 2 and len(res_li) > 31:
        url_next = re.search(r'&quot;.*\d&quot;', next_page[-1])
        url_next = url_next.group().replace("&amp;", "&")  # "&" is escaped to "&amp;" and must be converted back
        url_next = url_next.replace("&quot;", "")
        url_next = parse.urljoin(domain, url_next)
        parse_product(url_next)
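
# Note: pagination here is recursive, so very deep categories could in
# principle hit Python's recursion limit (~1000 frames by default). An
# iterative sketch under the same page-structure assumptions:
#
#   def parse_product_iter(url):
#       while url:
#           ...  # fetch and parse one page as above
#           url = next_page_url_or_none  # hypothetical: None on the last page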
def parse_data_last(url):
    # Walk a category listing (sorted by sales) and print the detail-page
    # URL of every product, following pagination recursively.
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    res_li = sel.xpath("//div[@class='pro_list_div g-clearfix c']/ul//li[@class='goods_offset']")
    for item in res_li:
        data = item.xpath("./div[@class='pro_pic_box']/a").extract()
        data = re.search(r'&quot;.*\d&quot;', data[0])
        data = data.group().replace("&amp;", "&")
        data = data.replace("&quot;", "")
        # Detail-page link of a single product, in sales order.
        data_url = parse.urljoin(domain, data)
        print(data_url)

    # Step to the next page of the product listing, if there is one.
    next_page = sel.xpath("//*[@class='pagination2']/a[@href]").extract()
    if len(next_page) > 2 and len(res_li) > 31:
        url_next = re.search(r'&quot;.*\d&quot;', next_page[-1])
        url_next = url_next.group().replace("&amp;", "&")  # "&" is escaped to "&amp;" and must be converted back
        url_next = url_next.replace("&quot;", "")
        url_next = parse.urljoin(domain, url_next)
        parse_data_last(url_next)
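
# Note: requests.get() is used everywhere without a timeout; for a long
# crawl, a timeout and basic error handling would be prudent, e.g.:
#
#   res = requests.get(url, timeout=10)
#   res.raise_for_status()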
if __name__ == "__main__":
last_urls = get_last_urls()
for url in last_urls:
#parse_product(url)
parse_data_last(url)
#print(last_urls)