"""91家纺网 (91jf.com) scraper — "两次数据连接更新" revision.

Pipeline stages: fetch (抓取), parse (解析), store (存储).
"""

import re

#import ast

from urllib import parse

from datetime import datetime

import requests

import time

from scrapy import Selector

from models import *

# Module-level URL accumulators. Neither list is referenced by the code
# visible in this file.
store_list_urls = []

product_list_urls = []

# Site root; used as the base for parse.urljoin() when resolving the relative
# links scraped from pages.
domain = "http://www.91jf.com/"

# Merchant listing URL ending in an empty "page=" parameter; handed to
# parse_store_id() by get_last_store_id().
store_domain = "http://www.91jf.com/default.php?act=corp&sort=list&page="

def write_txt(html_data):
    """Dump *html_data* to ``a.txt`` in the current directory (debug helper).

    The file is overwritten on every call.

    Args:
        html_data: Text to write, e.g. the HTML of a fetched page.
    """
    # Explicit UTF-8: the scraped pages contain Chinese text that the
    # platform's default encoding may be unable to represent. The context
    # manager closes the file even if the write raises.
    with open("a.txt", "w", encoding="utf-8") as f:
        f.write(html_data)

def get_nodes_json():
    """Fetch the site home page and return its category links as HTML strings.

    Returns:
        A list holding the serialized markup of every ``<a href=...>``
        element found under a ``div`` of class ``class_child_li``, with
        ``&amp;`` converted back to ``&``. Empty when no such anchor exists.
    """
    left_menu_text = requests.get("http://www.91jf.com/").text
    sel = Selector(text=left_menu_text)
    all_divs = sel.xpath("//div[@class='class_child_li']//a[@href]").extract()
    # In the extracted markup "&" inside href values shows up escaped as
    # "&amp;"; undo that so the query strings can be used as URLs later.
    return [node.replace("&amp;", "&") for node in all_divs]

# Shared accumulator: process_nodes_list() appends to it and returns it, so
# entries persist across calls.
url_list_names = []


def process_nodes_list(nodes_list):
    """Extract ``[url, name]`` pairs from serialized category anchors.

    Args:
        nodes_list: HTML strings of ``<a>`` elements, each expected to hold a
            quoted link ending in a digit and a ``<span>`` label.

    Returns:
        The module-level ``url_list_names`` list, extended with one
        ``[absolute_url, series_name]`` pair per usable node. Nodes lacking
        either the link or the ``<span>`` label are skipped.
    """
    for item in nodes_list:
        # The link is the quoted value that ends in a digit.
        url_match = re.search(r'".*\d"', item)
        # The series label is whatever sits between <span> and </span>.
        name_match = re.search(r'<span>.*</span>', item)
        if url_match is None or name_match is None:
            continue
        url = parse.urljoin(domain, url_match.group(0).replace('"', ''))
        name = name_match.group(0).replace("<span>", "").replace("</span>", "")
        url_list_names.append([url, name])  # [series link, series name]
    return url_list_names

def get_level1_list(nodes_list):
    """Build the first-page listing URL for every category node.

    Args:
        nodes_list: HTML strings of ``<a>`` elements, each expected to hold a
            quoted link ending in a digit.

    Returns:
        A list of absolute URLs: each node's link with
        ``&okey=salenum&order=desc&page=1`` appended, resolved against
        ``domain``. Nodes without a matching link are skipped.
    """
    level1_url = []
    for item in nodes_list:
        # The link is the quoted value that ends in a digit.
        url_match = re.search(r'".*\d"', item)
        if url_match is None:
            continue
        url = url_match.group(0).replace('"', '')
        level1_url.append(
            parse.urljoin(domain, url + "&okey=salenum&order=desc&page=1")
        )
    return level1_url

def get_last_urls():
    """Crawl every category listing and return the product URLs collected.

    For each category's first listing page this calls parse_product() and
    then parse_data_last(), whose returned product links are accumulated.

    Returns:
        A list of the product URLs returned by parse_data_last() for all
        categories.
    """
    url_list = []
    nodes_list = get_nodes_json()
    level1_url = get_level1_list(nodes_list)  # first-page URL of every category
    for url in level1_url:
        parse_product(url)
        # Accumulate across categories; rebinding url_list here would keep
        # only the links of the last category visited.
        category_urls, _store_ids = parse_data_last(url)
        url_list.extend(category_urls)
    return url_list

def parse_product(url):
    """Scrape one listing page, save its products, then follow pagination.

    Every ``li.goods_offset`` entry with a numeric price and a sales count of
    at least 1 is saved as a ``Product``. Entries whose price is not a number
    print a notice and are skipped; entries without a parsable sales count
    are skipped silently.

    When the pagination block has more than two links and the last sales
    count parsed on this page is positive, the last pagination link is
    followed recursively.

    Args:
        url: Absolute URL of a product listing page.
    """
    res_text = requests.get(url).text
    print(url)
    sel = Selector(text=res_text)
    res_li = sel.xpath("//div[@class='pro_list_div g-clearfix c']/ul//li[@class='goods_offset']")
    flag_num = 0
    for item in res_li:
        # Product name
        name = ''.join(item.xpath("./div[@class='row row-2 title']/a/text()").extract())
        # Displayed price
        price = ''.join(item.xpath('./div[@id="goods_detail_b"]/div[@class="row row-1"]/div[@class="g_price fm2"]/strong/text()').extract())
        try:
            price = float(price)
        except ValueError:
            print("价格会员可见|价格请咨询商家")
            continue

        # Sales count: the number following the "销量：" label
        sales_text = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[1]/text()").extract())
        try:
            sales_num = int(sales_text.split('销量：')[1])
        except (IndexError, ValueError):
            continue
        flag_num = sales_num
        if sales_num < 1:
            continue

        # Merchant, main products and merchant location
        merchant = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[2]/text()").extract())
        main_Products = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[3]/text()").extract())
        merchant_Place = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[4]/text()").extract())

        product = Product()
        product.name = name
        product.price = price
        product.sales_num = sales_num
        product.merchant = merchant
        product.main_Products = main_Products
        product.merchant_Place = merchant_Place

        # NOTE(review): Product is defined in models (not visible here);
        # the save/force_insert split is kept exactly as in the original.
        existed_name = Product.select().where(Product.name == product.name)
        if existed_name:
            product.save()
        else:
            product.save(force_insert=True)

    next_page = sel.xpath("//*[@class='pagination2']/a[@href]").extract()
    if len(next_page) > 2 and flag_num > 0:
        url_next = re.search(r'".*\d"', next_page[-1])
        if url_next is not None:
            # "&" arrives escaped as "&amp;" in the extracted markup.
            url_next = url_next.group().replace("&amp;", "&").replace('"', "")
            url_next = parse.urljoin(domain, url_next)
            # Never re-request the page we are on; that would recurse forever.
            if url_next != url:
                parse_product(url_next)

# Collect product links from a listing page and process each product.
def parse_data_last(url):
    """Visit every product on a listing page and on the pages that follow it.

    For each ``li.goods_offset`` entry the product link is resolved against
    ``domain``, then parse_store_data() and parse_product_data() are called
    with it. Entries without a parsable sales count or without a product
    link are skipped.

    When the pagination block has more than two links and the last sales
    count parsed on this page is positive, the last pagination link is
    followed recursively and its results are merged into this page's.

    Args:
        url: Absolute URL of a product listing page.

    Returns:
        A ``(url_list, store_id_list)`` tuple: the product URLs visited and
        the value parse_store_data() returned for each of them.
    """
    url_list = []
    store_id_list = []
    flag_num = 0
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    res_li = sel.xpath("//div[@class='pro_list_div g-clearfix c']/ul//li[@class='goods_offset']")
    for item in res_li:
        # Sales count: the number following the "销量：" label
        sales_text = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[1]/text()").extract())
        try:
            flag_num = int(sales_text.split('销量：')[1])
        except (IndexError, ValueError):
            continue

        data = item.xpath("./div[@class='pro_pic_box']/a").extract()
        link = re.search(r'".*\d"', data[0]) if data else None
        if link is None:
            continue
        # "&" arrives escaped as "&amp;" in the extracted markup.
        link = link.group().replace("&amp;", "&").replace('"', "")
        data_url = parse.urljoin(domain, link)  # single-product link

        print("开始获取商品：{}".format(data_url))
        store_id_list.append(parse_store_data(data_url))
        parse_product_data(data_url)
        url_list.append(data_url)

    # Move on to the next listing page, if any.
    next_page = sel.xpath("//*[@class='pagination2']/a[@href]").extract()
    if len(next_page) > 2 and flag_num > 0:
        url_next = re.search(r'".*\d"', next_page[-1])
        if url_next is not None:
            url_next = url_next.group().replace("&amp;", "&").replace('"', "")
            url_next = parse.urljoin(domain, url_next)
            # Never re-request the page we are on; that would recurse forever.
            if url_next != url:
                # Merge the following pages' results instead of dropping them.
                more_urls, more_store_ids = parse_data_last(url_next)
                url_list.extend(more_urls)
                store_id_list.extend(more_store_ids)
    return url_list, store_id_list

# Fetch and store the detailed data of a single product.
def parse_product_data(url):
    """Scrape one product page and save a ``Product_attributes`` row for it.

    Nothing is stored when the page has no ``is_price`` input, or when the
    first digit found in that input's markup is not ``0`` (per the original
    comments, ``0`` means the price does not have to be requested from the
    merchant).

    The base price is the mean of the per-row prices found in the
    ``goods_spec_list`` table (0.0 when none can be parsed); quantity-based
    discount tiers are deliberately ignored. Buyer count, sales count and
    repurchase rate come from a POST to ``default.php?act=evallist``.

    Args:
        url: Absolute product URL; the text after ``id=`` is used as the
            product id.
    """
    product_id = url.split('id=')[1]  # also used as the id of the sales query
    res_text = requests.get(url).text
    sel = Selector(text=res_text)

    # Tells whether the price must be requested from the merchant.
    Is_price = sel.xpath("//input[contains(@id,'is_price')]").extract()
    print(Is_price)
    if len(Is_price) < 1:
        print("页面数据为空")
        return  # nothing to index below; the original fell through and crashed

    is_value = re.search(r'\d', Is_price[0])
    if is_value is None or is_value.group() != '0':
        # Price has to be requested from the merchant: nothing to store.
        return

    datas = sel.xpath("//div[contains(@class,'show_all')]/table[contains(@class,'goods_spec_list')]//tr")
    prices = []
    for row in datas:
        price_inputs = row.xpath("./input[3]").extract()
        if not price_inputs:
            continue
        value_attr = re.search(r'value=".*"', price_inputs[0])
        if value_attr is None:
            continue
        number = re.search(r'\d.*\d', value_attr.group())
        if number is None:
            continue
        try:
            prices.append(float(number.group()))
        except ValueError:
            continue
    # Base price = mean of the parsed row prices; guard against no rows.
    price_base = sum(prices) / len(prices) if prices else 0.0

    # Product description attributes
    attributes_list = sel.xpath("//span[contains(@class,'attributes-list')]//li/text()").extract()
    # Normalise non-breaking spaces, whether still written as the entity or
    # already decoded to U+00A0.
    str_attributes = ' '.join(attributes_list).replace("&nbsp;", " ").replace("\xa0", " ")

    # Query the purchase statistics of this product.
    url_sales = parse.urljoin(domain, 'default.php?act=evallist')
    data = {
        'id': product_id,
        'page': '0',
        'info_type': 'sale',
    }
    response = requests.post(url_sales, data=data)
    sales_info = response.json()  # decode the body once
    buyer_num = sales_info.get("member")  # number of buyers
    sale_num = sales_info.get('num')  # units sold
    buyer_rate = sales_info.get('re_buyer_rate')  # repurchase rate

    product_id = int(product_id)
    product_attributes = Product_attributes()
    product_attributes.product_id = product_id
    product_attributes.price_base = price_base
    product_attributes.attributes = str_attributes
    product_attributes.buyer_num = buyer_num
    product_attributes.sale_num = sale_num
    product_attributes.buyer_rate = buyer_rate

    # NOTE(review): Product_attributes is defined in models (not visible
    # here); the save/force_insert split is kept exactly as in the original.
    existed_id = Product_attributes.select().where(Product_attributes.product_id == product_id)
    if existed_id:
        product_attributes.save()
    else:
        product_attributes.save(force_insert=True)

# Resolve the merchant id that belongs to a single product page.
def parse_store_data(url):
    """Return the merchant (store) id shown on a product page.

    Args:
        url: Absolute URL of a product page.

    Returns:
        The integer following ``storeid=`` in the link inside
        ``span.container_title_span``. ``0`` when the page has no
        ``is_price`` input, when the first digit in that input's markup is
        not ``0``, or when no store id can be found.
    """
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    store_id = 0

    # Tells whether the price must be requested from the merchant.
    Is_price = sel.xpath("//input[contains(@id,'is_price')]").extract()
    if len(Is_price) < 1:
        print("页面数据为空")
        return store_id

    is_value = re.search(r'\d', Is_price[0])
    if is_value is None or is_value.group() != '0':
        return store_id

    store_link = ''.join(sel.xpath('//span[@class="container_title_span"]/a[@href]').extract())
    # Require at least one digit so int() never receives an empty string.
    id_match = re.search(r'storeid=(\d+)"', store_link)
    if id_match is not None:
        store_id = int(id_match.group(1))
    # NOTE: the original carried a disabled draft that read merchant level and
    # location from ul.gy_info_list; it was never executed and is not kept.
    return store_id

# Collect every merchant id from the merchant listing pages.
def parse_store_id(url):
    """Return the merchant ids listed on *url* and on the pages after it.

    Each ``div.supply-list`` entry contributes the integer following
    ``storeid=`` in its ``supply-left-tltle`` link; entries without one are
    skipped.

    Paging: when the label of the last pagination link is not an integer,
    that link is followed recursively and its ids are appended.
    NOTE(review): presumably a non-numeric last label is the "next page"
    control — confirm against the live markup.

    Args:
        url: Absolute URL of a merchant listing page.

    Returns:
        A list of integer merchant ids.
    """
    print(url)  # shows which listing page is being processed
    store_id_list = []
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    res_li = sel.xpath("//div[contains(@class ,'corp_list')]//div[@class='supply-list']")
    for item in res_li:
        anchor_html = ''.join(item.xpath(".//a[contains(@class,'supply-left-tltle')]").extract())
        # Require at least one digit so int() never receives an empty string.
        id_match = re.search(r'storeid=(\d+)"', anchor_html)
        if id_match is None:
            continue
        store_id_list.append(int(id_match.group(1)))

    # Move on to the next merchant listing page, if any.
    next_page = ''.join(sel.xpath("//*[@class='pagination2']/a[@href][last()]/text()").extract())
    try:
        int(next_page)
    except ValueError:
        url_next = ''.join(sel.xpath("//*[@class='pagination2']/a[@href][last()]").extract())
        url_next = re.search(r'".*\d"', url_next)
        if url_next is not None:
            # "&" arrives escaped as "&amp;" in the extracted markup.
            url_next = url_next.group().replace("&amp;", "&").replace('"', "")
            url_next = parse.urljoin(domain, url_next)
            # Never re-request the page we are on; that would recurse forever.
            if url_next != url:
                # Keep the ids found on the following pages.
                store_id_list.extend(parse_store_id(url_next))
    return store_id_list

def get_last_store_id():
    """Collect merchant ids starting from the merchant listing URL.

    Returns:
        The list produced by ``parse_store_id(store_domain)``.

    NOTE(review): the original comment mentioned returning URLs joined from
    these ids, but no such URL is built anywhere in this function.
    """
    return parse_store_id(store_domain)

if __name__ == "__main__":
    # Run the full crawl, then report how many product links were collected
    # and how long it took (both values were previously computed but unused).
    start_time = datetime.now()
    last_urls = get_last_urls()
    end_time = datetime.now()
    print("抓取完成，共{}个商品链接，耗时：{}".format(len(last_urls), end_time - start_time))
# --- 以下为博客页面残留内容，非脚本代码 ---
# 相关阅读:
# 是河南大学的悲哀？？？
# 装完manjaro先要卸载
#  技术博客
#  VIM从入门到中级教程
#  HTTP中GET与POST的区别
#  AngularJS 拦截器实现全局$http请求loading效果
#  angular指令监听ng-repeat渲染完成后执行自定义事件方法
#  icheck如何修改样式大小
#  Sublime text3 代码格式化插件
#  代理模式小试
# 原文地址：https://www.cnblogs.com/dog-and-cat/p/13256177.html