• Code for crawling the reviews of all products on JD.com
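The script runs in two stages: it first pulls a batch of product SKUs from JD's recommendation feed endpoint (floor.jd.com/user-v20/feed/get), then pages through the public review endpoint (club.jd.com/comment/productPageComments.action) for each SKU, collecting the reviewer nickname, review text, time, star rating, delivery interval, client version and PLUS-membership flag, and finally writes everything to comment1.xlsx with openpyxl.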


    # -*- coding: utf-8 -*-
    # @Time    : 2022/3/14 12:49
    # @Author  : shuaichao
    # @File    : .py
    # @Software: PyCharm

    import openpyxl as op                    # write the results to an .xlsx workbook
    from bs4 import BeautifulSoup            # wrap the fetched response text
    import urllib.request, urllib.error      # build requests and fetch page data
    import time
    import random
    import json
    
    
    list_goodid = []           # product (SKU) ID
    list_id = []               # review ID
    list_content = []          # review text
    list_time = []             # review time
    list_score = []            # star rating
    list_name = []             # reviewer nickname
    list_mobileVersion = []    # client version ("pc" when empty)
    list_plusAvailable = []    # PLUS membership status
    list_days = []             # delivery interval in days
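    # Each list above becomes one column of the output workbook; values are appended in
    # lockstep, so the i-th element of every list describes the same review.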
    def askUrl(url):
        headers = {
            # A desktop browser User-Agent so the request looks like a normal browser visit.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        request = urllib.request.Request(url, headers=headers)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html
    
    
    # Fetch a page and parse the response with BeautifulSoup
    def get_info(baseurl):
        html = askUrl(baseurl)
        bs = BeautifulSoup(html, "html.parser")
        return bs
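    # Note: the responses fetched here are JSON/JSONP rather than HTML; BeautifulSoup
    # simply wraps the text, and res.text is used later to recover the raw body for json.loads().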
    
    
    # Find all elements with the given class and also return them as a string
    def transport(bs, info):
        ex_info = bs.find_all(class_=info)
        info = str(ex_info)
        return ex_info, info
    
    
    def askUrl2(url):
        headers = {
            # A desktop browser User-Agent so the request looks like a normal browser visit.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        request = urllib.request.Request(url, headers=headers)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("gbk", errors="ignore")  # the review endpoint responds in GBK (a superset of gb2312)
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html
    
    def get_info2(baseurl):
        html = askUrl2(baseurl)
        bs = BeautifulSoup(html, "html.parser")
        return bs
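    # askUrl2/get_info2 mirror askUrl/get_info above; the only difference is the GBK
    # decoding, because the review endpoint on club.jd.com does not respond in UTF-8.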
    
    if __name__ == '__main__':
        pagesize = 100     # number of products returned per feed page
        list_Goods = []    # SKU IDs of all collected products
        '''
            Stage 1: collect product SKUs from the recommendation feed
        '''
        print("Start")
        for i in range(1, 2):   # only the first feed page is crawled here; widen the range for more products
            url = 'https://floor.jd.com/user-v20/feed/get?' \
                  'page=' + str(i) + '&pagesize=' + str(pagesize) + \
                  '&area=1_2802_0_0&source=pc-home&callback=jsonpMore2Goods&_=1647233103435'
            time.sleep(random.randint(2, 5))
            res = get_info(url)
            time.sleep(random.randint(2, 5))
            # The feed returns JSONP: strip the "jsonpMore2Goods(" prefix and the trailing ")"
            # before parsing the JSON payload.
            response_data = json.loads(res.text.replace('jsonpMore2Goods(', '')[:-1])['data']
            for v in response_data:
                list_Goods.append(v['sku'])
        '''
            Stage 2: fetch reviews and related information for every product
        '''

        print("Start collecting reviews and user information")
    
        for v in list_Goods:
            try:
                time.sleep(random.randint(2, 5))
                print("Processing one product")
                # This first request is made only to read maxPage, the total number of
                # review pages available for this SKU.
                url_comment = 'https://club.jd.com/comment/productPageComments.action?callback=' \
                              '&productId=' + str(v) + '&score=0&sortType=10&page=10&pageSize=100'
                res = get_info2(url_comment)
                time.sleep(random.randint(2, 5))
                pageCount = json.loads(res.text)['maxPage']   # number of review pages
                for i in range(0, pageCount):   # JD review pages are 0-indexed
                    try:
                        print("Inner loop: fetching one review page")
                        url_comment = 'https://club.jd.com/comment/productPageComments.action?callback=' \
                                      '&productId=' + str(v) + '&score=0&sortType=10&page=' + str(i) + '&pageSize=100'
                        time.sleep(random.randint(2, 5))
                        res = get_info2(url_comment)
                        time.sleep(random.randint(2, 5))
                        response_data = json.loads(res.text)['comments']
                    except Exception:
                        print("Inner loop: request failed, retrying with the UTF-8 fetcher")
                        try:
                            time.sleep(random.randint(2, 5))
                            res = get_info(url_comment)
                            time.sleep(random.randint(2, 5))
                            response_data = json.loads(res.text)['comments']
                        except Exception:
                            print("Inner loop: second failure, skipping this page")
                            continue
                    for value in response_data:
                        list_goodid.append(v)
                        list_id.append(value['id'])
                        list_content.append(value['content'])
                        list_time.append(value['creationTime'])
                        list_score.append(value['score'])
                        list_days.append(value['days'])
                        if value['mobileVersion'] != "":
                            list_mobileVersion.append(value['mobileVersion'])
                        else:
                            list_mobileVersion.append("pc")
                        list_name.append(value['nickname'])
                        list_plusAvailable.append(value['plusAvailable'])
            except Exception:
                print("Outer loop: error, skipping this product")
                continue
            finally:
                # Rewrite the workbook after every product so partial results survive
                # an interrupted crawl.
                wb = op.Workbook()   # create a workbook object
                ws = wb['Sheet']     # the default sheet
                for i in range(len(list_id)):
                    d = (list_goodid[i], list_id[i], list_name[i], list_content[i], list_time[i],
                         list_mobileVersion[i], list_plusAvailable[i], list_score[i], list_days[i])
                    ws.append(d)
                wb.save("./comment1.xlsx")
    

      
