个人冲刺（七）

苏宁爬虫基本完成
import requests
import re
import time
import mysql
import json


# Patterns are compiled once at import time instead of once per product.
# Product id / shop id pair carried by each price <span> on the search page.
_ID_RE = re.compile(r'<span class="def-price" datasku="(.*?),(.*?)" brand_id=".*?" mdmGroupId=".*?">')
# Product description (alt text) and image URL on the search page.
_DETAIL_RE = re.compile(r'<img alt="(.*?)" src="(.*?)" picPriority=".*?">')
# Shop rating / logistics rating / after-sales rating in the shop-score response.
_SCORE_RE = re.compile(
    r'{"parentIndexName":"评价","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"物流","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"售后","parentIndexScore":"(.*?)"')
# Article number cell on the product detail page.
_ITEM_NO_RE = re.compile(r'<span>货号</span> </div> </td> <td class="val">(.*?)</td>')
# Shop link and shop title on the product detail page.
_SHOP_RE = re.compile(r'<a id="chead_indexUrl" href="(.*?)" title="(.*?)">')
# Price fields in the price response; gbPrice is tried first, then netPrice.
_GB_PRICE_RE = re.compile(r'"gbPrice":"(.*?)"')
_NET_PRICE_RE = re.compile(r'"netPrice":"(.*?)"')

# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 10


def _fetch(url):
    """Return the body of *url* as text, or "" when the request fails.

    Uses the module-level ``headers`` dict. A timeout is always set so a
    stalled connection cannot block the crawl indefinitely; network errors
    are reported on stdout and turned into an empty body so that callers
    fall through to their "not available" branches.
    """
    try:
        return requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT).text
    except requests.RequestException as exc:
        print("请求失败:" + url + " " + str(exc))
        return ""


def _print_shop_scores(shop_id):
    """Fetch and print the shop / logistics / after-sales scores of a shop."""
    # The score endpoint is addressed without leading zeros; ids that are
    # not purely numeric are passed through unchanged instead of raising.
    short_id = str(int(shop_id)) if shop_id.isdecimal() else shop_id
    score_url = "https://product.suning.com/pds-web/ajax/getApiRemoteMap_" + short_id + "_shopScoreCallback.html?"
    # Backslashes are removed so the pattern can match plain double quotes.
    body = _fetch(score_url).replace("\\", "")
    scores = _SCORE_RE.findall(body)
    if scores:
        print("店铺评分:" + str(scores[0][0]))
        print("物流评分:" + str(scores[0][1]))
        print("售后评分:" + str(scores[0][2]))
    else:
        print("评分:暂无店铺各项评分")


def _fetch_item_no(detail_url):
    """Print article number and shop info of a product; return the article number.

    Returns "暂无" when the detail page carries no article number.
    """
    page = _fetch(detail_url)

    numbers = _ITEM_NO_RE.findall(page)
    if numbers:
        item_no = str(numbers[0])
        print("货号:" + item_no)
    else:
        item_no = "暂无"
        print("货号:暂无此商品货号")

    shops = _SHOP_RE.findall(page)
    if shops:
        shop_link, shop_name = shops[0]
        print("店铺:"+shop_name+" 店铺链接:"+shop_link)
    else:
        print("店铺:" + "暂无" + " 店铺链接:" + "暂无")

    return item_no


def _print_review_labels(prd_id, shop_id):
    """Fetch and print the review keyword labels and their counts."""
    label_url = "https://review.suning.com/ajax/getClusterReview_labels/style--0000000"+prd_id+"-"+shop_id+"-----commodityrLabels.htm?"
    # Strip the JSONP wrapper "commodityrLabels( ... )" to leave bare JSON.
    body = _fetch(label_url).replace('commodityrLabels(', '').replace('})', '}')
    if not body:
        print("无")
        return
    try:
        data = json.loads(body)
    except ValueError:
        # Error pages / anti-bot responses are not JSON; treat as "no labels".
        print("无")
        return
    labels = data.get("commodityLabelCountList") if isinstance(data, dict) else None
    labels = labels or []
    print("评论关键字个数:"+str(len(labels)))
    for label in labels:
        print("label:"+label["labelName"]+" num:"+str(label["labelCnt"]))


def _fetch_price(prd_id, shop_id):
    """Fetch, print and return the price of a product as a string.

    ``gbPrice`` is preferred, ``netPrice`` is the fallback; "无" is returned
    when neither field is present in the response.
    """
    price_url = (
        "https://pas.suning.com/nspcsale_0_0000000" + prd_id + "_0000000" + prd_id + "_" + shop_id
        + "_60_311_3110199_20089_1000095_9095_10638_Z001___R1901001_0.36_0___000060021____0___448.224_2_01_20002_20006__.html?"
    )
    body = _fetch(price_url)
    for pattern in (_GB_PRICE_RE, _NET_PRICE_RE):
        found = pattern.findall(body)
        if found:
            price = str(found[0])
            print("价格:" + price)
            return price
    print("无价格:")
    return "无"


def getitem(keyword, n):
    """Scrape one Suning search-result page and store its products.

    For every product found on the page the function prints its description,
    image, detail URL, shop scores, article number, shop, review labels and
    price, then hands all collected rows to ``mysql.insert_item`` in one call.
    Each row is ``[date, article_no, price, name, link, image, "苏宁"]``.

    The module-level ``headers`` dict must be defined before calling this
    function; it is sent with every HTTP request.

    Args:
        keyword: Search keyword, concatenated into the search URL as-is.
        n: Value forwarded as the ``paging`` query parameter of the search URL.
    """
    itemurl = "https://search.suning.com/emall/searchV1Product.do?keyword=" + keyword + "&pg=01&cp=" + "0" + "&paging=" + str(n)
    print("itemurl:"+itemurl)
    # "|||||" separates product id and shop id inside the datasku attribute;
    # turning it into "," lets _ID_RE capture the two halves.
    response = _fetch(itemurl).replace("|||||", ",")

    id_list = _ID_RE.findall(response)
    detail_list = _DETAIL_RE.findall(response)
    print("长度:"+str(len(id_list)))

    today = time.strftime("%Y-%m-%d")
    origin = "苏宁"
    item = []
    # zip() stops at the shorter list, so a page with fewer matched images
    # than ids no longer raises IndexError.
    for index, ((prd_id, shop_id), (name, image)) in enumerate(zip(id_list, detail_list), start=1):
        print("——————————————————————第"+str(index)+"件商品——————————————————————")
        print("描述:"+name+"\n图片:"+image)

        link = "https://product.suning.com/" + shop_id + "/" + prd_id + ".html"
        print("商品网址:" + link)

        _print_shop_scores(shop_id)
        item_no = _fetch_item_no(link)
        _print_review_labels(prd_id, shop_id)
        price = _fetch_price(prd_id, shop_id)

        item.append([today, item_no, price, name, link, image, origin])
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    mysql.insert_item(item)
    # if not mysql.get_conn()[1]:
    #     mysql.insert_item(item)
    # else:
    #     mysql.update_item(item)


# def getprice(prdid, shopid):
#     durl = "https://product.suning.com/"+shopid+"/"+prdid+".html"
#     print("商品网址:"+durl)
#
#     shopid2 = int(shopid)
#     pjurl = "https://product.suning.com/pds-web/ajax/getApiRemoteMap_"+str(shopid2)+"_shopScoreCallback.html?"
#     # print(pjurl)
#     response2 = requests.get(pjurl).text.replace("\\", "")
#     getpj = re.compile(r'{"parentIndexName":"评价","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"物流","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"售后","parentIndexScore":"(.*?)"')
#     pjlist = re.findall(getpj, response2)
#     if pjlist:
#         print("店铺评分:"+str(pjlist[0][0]))
#         print("物流评分:"+str(pjlist[0][1]))
#         print("售后评分:"+str(pjlist[0][2]))
#     else:
#         print("评分:暂无店铺各项评分")
#
#     response3 = requests.get(durl).text
#     getNum = re.compile(r'<span>货号</span> </div> </td> <td class="val">(.*?)</td>')
#     numlist = re.findall(getNum, response3)
#     if numlist:
#         print("货号:"+str(numlist[0]))
#     else:
#         print("货号:暂无此商品货号")
#
#     priceurl = " https://pas.suning.com/nspcsale_0_0000000"+prdid+"_0000000"+prdid+"_"+shopid+"_60_311_3110199_20089_1000095_9095_10638_Z001___R1901001_0.36_0___000060021____0___448.224_2_01_20002_20006__.html?"
#     res = requests.get(priceurl).text
#     # print(res)
#     getK = re.compile(r'"gbPrice":"(.*?)"')
#     keyL = re.findall(getK, res)
#     if keyL:
#         print("价格:"+str(keyL[0]))
#     else:
#         getK = re.compile(r'"netPrice":"(.*?)"')
#         keyL = re.findall(getK, res)
#         print("价格:"+str(keyL[0]))


if __name__ == "__main__":
    # Request headers shared by every HTTP call made from getitem(); the
    # function reads this module-level name directly.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
    }

    # Category keywords. NOTE(review): this set is defined but not used by
    # the loop below, which only searches for "外套".
    key = {
        "手机", "运营商", "智能数码", "家用电器", "帮客", "房产", "厨卫", "生活家电",
        "厨具", "电脑办公", "相机", "电竞", "家具", "家装", "家纺", "灯具",
        "食品", "酒水", "生鲜", "特产", "美妆", "个护", "清洁", "宠物",
        "母婴", "玩具", "车床", "童装", "运动", "户外", "国米", "骑行",
        "女装", "男装", "内衣", "鞋靴", "箱包", "钟表", "珠宝", "艺术",
        "汽车", "电摩", "汽车用品", "图书", "艺术", "原版", "文学", "医药健康",
        "计生情趣", "理财", "分期", "便民",
    }

    # Crawl paging values 0..199 for the fixed keyword.
    for page in range(200):
        getitem("外套", page)
相关阅读:
介绍下自己的Delphi学习环境
 我所理解的Delphi中的数组类型
 字符串的基本操作
 以太网网络变压器的作用
 S3C2416 2D加速
 DM9000AEP调试的时候注意事项
 设置activity背景图片
 如何從現有的share library開發！？
struct mntent linux挂载信息读取
 Qt中Qstring,char,int,QByteArray之间到转换
原文地址：https://www.cnblogs.com/mumulailai/p/14911983.html