• python:爬虫获取淘宝/天猫的商品信息


    【需求】输入关键字(如“书包”),搜索出对应商品的信息,包括:商品标题、商品链接、最低价格、价格范围、价格差和运费;最终保留的商品需同时满足两个条件:包邮,且最高价与最低价之差小于设定的数值(price_interval_max)。

    #coding=utf-8
    """
    The following three settings can be adjusted freely: search_keyword, page, price_interval_max
    """
    # Keyword to search for
    search_keyword = "戒指"
    # Number of result pages to scan; e.g. 10 means the items on the first 10 pages of the Taobao search results (Taobao shows 44 items per page by default)
    page = 10
    # Largest acceptable difference between the highest and the lowest price
    price_interval_max = 1000
    
    import re, os, requests, sys, time, shutil
    from selenium import webdriver
    from lxml import etree
    from xlrd import open_workbook
    from xlutils.copy import copy
    # Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
    # implicit str/unicode conversions use UTF-8 instead of ASCII.
    reload(sys)
    sys.setdefaultencoding( "utf-8" )

    # Start time; the end of the script uses it to report the total run time.
    time1 = time.time()
    # PhantomJS binary located in the working directory. Only needed when the
    # driver below is switched to the commented-out alternative.
    phantomjs_path = os.path.join(os.getcwd(), "phantomjs.exe")
    driver=webdriver.PhantomJS(executable_path='D:/Python27/Scripts/phantomjs.exe')
    # driver=webdriver.PhantomJS(executable_path=phantomjs_path)
    search_url = 'https://s.taobao.com/search'
    payload = {'q':search_keyword, 's':'1', 'ie':'utf8'}  # URL query parameters, passed as a dict
    payload1 = {'ie':'utf8'}
    # result.xls is the template; tb_result.xls is the file the script writes to.
    excel_path_ori = os.path.join(os.getcwd(), "result.xls")
    excel_path = os.path.join(os.getcwd(), "tb_result.xls")
    # Always start from a fresh copy of the template.
    if os.path.exists(excel_path):
        os.remove(excel_path)
    shutil.copy(excel_path_ori, excel_path)
    # Plain-text log of every product whose detail page could be parsed.
    file = open('taobao_test.txt', 'w')

    # Sheet that receives the results, and the column index of each field.
    sheetName = "Sheet1"
    url_lineindex = 0
    title_lineindex = 1
    price_lineindex = 2
    price_interval_lineindex = 3
    interval_lineindex = 4
    fee_lineindex = 5
    
    def Write_Excel(rowIndex, lineIndex, content):
        """
        Write one cell of the result workbook and save the file.

        The workbook at ``excel_path`` is read with xlrd, copied into a writable
        workbook with xlutils, the cell on the sheet named ``sheetName`` is set,
        and the result is saved back to ``excel_path``.

        - rowIndex: row of the cell (converted with int())
        - lineIndex: column of the cell (converted with int())
        - content: value to store in the cell

        Raises ValueError if the workbook has no sheet named ``sheetName``.
        """
        rowIndex = int(rowIndex)
        lineIndex = int(lineIndex)
        # xlrd's open_workbook has no file-mode argument (its second positional
        # parameter is the log file), so open the workbook once, by path only.
        rbook = open_workbook(excel_path)
        sheetIndex = rbook.sheet_names().index(sheetName)
        wb = copy(rbook)
        wb.get_sheet(sheetIndex).write(rowIndex, lineIndex, content)
        wb.save(excel_path)
    
    def get_detail_price(url):
        """
        Load a product detail page in the browser driver and extract its price text.

        :param url: detail page URL; the XPath is chosen by whether the URL
                    contains "tmall" or "taobao" (checked in that order)
        :return: list of price strings matched on the page; empty when nothing
                 matches or when the URL belongs to neither site
        """
        driver.get(url)
        # Fixed pause after navigation, presumably to let the page render the price.
        time.sleep(1)
        selector = etree.HTML(driver.page_source)
        if "tmall" in url:
            return selector.xpath('//div[@class="tm-promo-price"]/span[@class="tm-price"]/text()')
        if "taobao" in url:
            return selector.xpath('//em[@class="tb-rmb-num"]/text()')
        # Unknown site: report "no price found" instead of failing on an
        # unassigned local variable.
        return []
    
    def get_price_interval(price):
        """
        Split a product's price text into the displayed range and its spread.

        Some products show a range such as "12.00-25.00" instead of a single price.

        :param price: sequence of price strings; only the first element is used
        :return: tuple ``(price_interval, interval)`` where ``price_interval`` is
                 the price text and ``interval`` is upper bound minus lower bound
                 as a float, or 0 when the text is not a range
        :raises IndexError: if ``price`` is empty
        :raises ValueError: if a bound of the range is not a number
        """
        # Function-call form so the line is valid on both Python 2 and Python 3.
        print(price)
        # join() leaves a plain string unchanged and flattens a sequence of fragments.
        price_interval = ''.join(price[0])
        if "-" not in price_interval:
            return price_interval, 0
        bounds = price_interval.split("-")
        interval = float(bounds[1]) - float(bounds[0])
        return price_interval, interval
    
    def get_url_test():
        """
        Crawl the search result pages and record the products that qualify.

        For each of the first ``page`` result pages, the title, listed price,
        detail URL and shipping fee of every item are pulled out of the search
        response with regular expressions. Each detail page is then loaded to
        read the price range. Every successfully parsed product is appended to
        the text log ``file``; products with free shipping ("0.00") and a price
        spread below ``price_interval_max`` are also written to the Excel sheet.

        Products whose detail page cannot be fetched or parsed are skipped.

        :return: None
        """
        j = 0  # last Excel row written; row 0 holds the header
        Write_Excel(j, url_lineindex, u"商品链接")
        Write_Excel(j, title_lineindex, u"商品标题")
        Write_Excel(j, price_lineindex, u"最低价格")
        Write_Excel(j, price_interval_lineindex, u"价格范围")
        Write_Excel(j, interval_lineindex, u"价格差")
        Write_Excel(j, fee_lineindex, u"运费")
        for k in range(0, page):
            # 's' is the offset of the first item on the page:
            # 1 for page one, 45 for page two, 89 for page three, and so on.
            payload['s'] = 44 * k + 1
            resp = requests.get(search_url, params=payload)
            # Pull the per-item fields out of the data embedded in the response.
            title = re.findall(r'"raw_title":"([^"]+)"', resp.text, re.I)
            price = re.findall(r'"view_price":"([^"]+)"', resp.text, re.I)
            url = re.findall(r'"detail_url":"([^"]+)"', resp.text, re.I)
            fee = re.findall(r'"view_fee":"([^"]+)"', resp.text, re.I)

            # zip() stops at the shortest list, so a field missing from the
            # response cannot cause an out-of-range index.
            items = zip(title, price, url, fee)
            for i, (item_title, item_price, item_url, item_fee) in enumerate(items):
                print(i)
                print('商品标题:' + item_title)
                print('最低价格:' + item_price)
                print('运费:' + item_fee)
                # The URL arrives with "=" and "&" escaped as \u003d and \u0026
                # and without a scheme. The backslash must be part of the match,
                # otherwise it is left behind in the URL.
                item_url = item_url.replace("\\u003d", "=").replace("\\u0026", "&")
                item_url = "https:" + item_url
                print('商品链接:' + item_url)
                # Read the price range from the detail page.
                try:
                    detail_price = get_detail_price(item_url)
                    price_interval, interval = get_price_interval(detail_price)
                    print('price_interval:' + price_interval)
                    print('interval:' + str(interval))
                    # Append the product to the text log.
                    file.write(
                        str(k * 44 + i + 1) +
                        '商品链接:' + item_url + '\n' +
                        '商品标题:' + item_title + '\n' +
                        '最低价格:' + item_price + '\n' +
                        '价格范围:' + str(price_interval) + '\n' +
                        '价格差:' + str(interval) + '\n')
                    # Only products that pass the filter go into the Excel sheet.
                    if item_fee == "0.00" and interval < int(price_interval_max):
                        print("该商品符合要求:包邮,且最大价格与最小价格差小于%s" % price_interval_max)
                        j = j + 1
                        Write_Excel(j, url_lineindex, item_url)
                        Write_Excel(j, title_lineindex, item_title)
                        Write_Excel(j, price_lineindex, item_price)
                        Write_Excel(j, price_interval_lineindex, price_interval)
                        Write_Excel(j, interval_lineindex, interval)
                        Write_Excel(j, fee_lineindex, item_fee)
                except Exception:
                    # Best effort: one bad product must not stop the crawl, but
                    # Ctrl-C (KeyboardInterrupt) is no longer swallowed.
                    print("该商品信息获取失败,跳过")
                    continue
    
    
    try:
        get_url_test()
    finally:
        # Environment cleanup; runs even when the crawl raises.
        file.close()
        driver.quit()
        # Fallback for PhantomJS processes that outlive quit(); taskkill exists
        # on Windows only.
        if os.name == "nt":
            os.system("taskkill /im phantomjs.exe")
    time2 = time.time()
    print(u'ok,结束!')
    print(u'总共耗时:' + str((time2 - time1)/60) + '分钟')
    每天努力一点,每天学习一点。 Keep Moving...
  • 相关阅读:
    JavaWeb网上商城项目中用户注册,使用MailServer和FoxMail搭建本地邮件服务器
    myeclipse编码问题
    Date日期类型的绑定
    springmvc学习之jdk版本,tomcat版本,spring版本
    mybatis-ehcache整合中出现的异常 ibatis处理器异常(executor.ExecutorException)解决方法
    .net里面<app.config>中value值不能填写特殊符号问题
    sqldeveloper中Excel数据的导入与导出
    IntelliJ IDEA 2019.2最新版本免费激活码(转)
    sqlserver 的一些小总结
    SQL 跨数据库同步数据 、跨数据库跨更新数据
  • 原文地址:https://www.cnblogs.com/channy14/p/9266979.html
Copyright © 2020-2023  润新知