【需求】输入关键字(如"书包"),搜索出对应商品的信息,包括:商品标题、商品链接、最低价格、价格范围、价格差和运费;最终保留的商品需同时满足:包邮,且最高价与最低价之差小于设定的数值。
#coding=utf-8
"""Search Taobao for a keyword and keep free-shipping items with a small price spread.

For every product found on the first ``page`` result pages the script logs the
link, title, lowest price, price range and price spread to ``taobao_test.txt``.
Products whose shipping fee is ``0.00`` and whose spread (highest price minus
lowest price) is below ``price_interval_max`` are additionally written to
``tb_result.xls``, which is created as a copy of the template ``result.xls``.

Three settings may be adjusted freely: ``search_keyword``, ``page`` and
``price_interval_max``.

The code is written to run unchanged on Python 2.7 and Python 3.
"""

import io
import os
import re
import shutil
import sys
import time

# requests, selenium, lxml, xlrd and xlutils are imported inside the functions
# that use them, so the pure helpers below stay importable (and testable)
# on a machine without the browser / Excel stack installed.

# Keyword to search for.
search_keyword = u"戒指"
# Number of result pages to scan (e.g. 10 = the first 10 pages); Taobao shows
# 44 products per page by default.
page = 10
# Largest acceptable difference between an item's highest and lowest price.
price_interval_max = 1000

PY2 = sys.version_info[0] == 2
_unichr = unichr if PY2 else chr  # noqa: F821 -- unichr only exists on Python 2

ITEMS_PER_PAGE = 44
REQUEST_TIMEOUT = 30  # seconds; without it a stalled search request hangs forever

# PhantomJS binary driven by Selenium. Use
# os.path.join(os.getcwd(), "phantomjs.exe") to pick up a copy next to the script.
# NOTE(review): PhantomJS support was deprecated in later Selenium releases --
# confirm the installed Selenium version still provides webdriver.PhantomJS.
phantomjs_path = 'D:/Python27/Scripts/phantomjs.exe'

search_url = 'https://s.taobao.com/search'
# Base query parameters; 's' (result offset) is overridden for every page.
payload = {'q': search_keyword, 's': '1', 'ie': 'utf8'}

excel_path_ori = os.path.join(os.getcwd(), "result.xls")
excel_path = os.path.join(os.getcwd(), "tb_result.xls")
log_path = 'taobao_test.txt'
sheetName = "Sheet1"

# Column positions inside the result sheet.
url_lineindex = 0
title_lineindex = 1
price_lineindex = 2
price_interval_lineindex = 3
interval_lineindex = 4
fee_lineindex = 5

_COLUMNS = (url_lineindex, title_lineindex, price_lineindex,
            price_interval_lineindex, interval_lineindex, fee_lineindex)
_HEADERS = (u"商品链接", u"商品标题", u"最低价格", u"价格范围", u"价格差", u"运费")

# Fields are scraped from the JSON embedded in the search result page.
_TITLE_RE = re.compile(r'"raw_title":"([^"]+)"', re.I)
_PRICE_RE = re.compile(r'"view_price":"([^"]+)"', re.I)
_URL_RE = re.compile(r'"detail_url":"([^"]+)"', re.I)
_FEE_RE = re.compile(r'"view_fee":"([^"]+)"', re.I)
_UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')

# Shared PhantomJS session, created lazily by _get_driver().
driver = None


def _get_driver():
    """Return the shared PhantomJS driver, starting it on first use."""
    global driver
    if driver is None:
        from selenium import webdriver
        driver = webdriver.PhantomJS(executable_path=phantomjs_path)
    return driver


def _write_cells(cells):
    """Write ``(row, column, value)`` triples to the result sheet with one load/save."""
    from xlrd import open_workbook
    from xlutils.copy import copy

    # xlrd workbooks are read-only, so the file is copied into a writable
    # workbook, modified, and saved back over the same path.
    rbook = open_workbook(excel_path)
    wb = copy(rbook)
    sheet = wb.get_sheet(rbook.sheet_names().index(sheetName))
    for row, column, value in cells:
        sheet.write(int(row), int(column), value)
    wb.save(excel_path)


def _write_row(row_index, values):
    """Write one value per result column into row ``row_index``."""
    _write_cells([(row_index, column, value)
                  for column, value in zip(_COLUMNS, values)])


def Write_Excel(rowIndex, lineIndex, content):
    """Write ``content`` into a single cell of the result workbook.

    :param rowIndex: zero-based row index
    :param lineIndex: zero-based column index
    :param content: value to store in the cell
    """
    _write_cells([(rowIndex, lineIndex, content)])


def normalize_detail_url(raw_url):
    """Turn a ``detail_url`` value scraped from the page JSON into a usable URL.

    JSON ``\\uXXXX`` escapes (e.g. ``\\u003d`` for ``=``, ``\\u0026`` for ``&``)
    are decoded, and protocol-relative URLs (``//host/...``) get an ``https:``
    scheme. URLs that already carry a scheme are left alone.

    :param raw_url: the raw string captured from the search page
    :return: the cleaned URL
    """
    url = _UNICODE_ESCAPE_RE.sub(
        lambda match: _unichr(int(match.group(1), 16)), raw_url)
    if url.startswith('//'):
        url = 'https:' + url
    return url


def get_detail_price(url):
    """Load a product page in PhantomJS and return its price text nodes.

    :param url: product detail URL (tmall or taobao)
    :return: list of price strings found on the page; empty when nothing
        matched or the URL belongs to neither site
    """
    from lxml import etree

    browser = _get_driver()
    browser.get(url)
    time.sleep(1)  # give the page scripts a moment to render the price
    selector = etree.HTML(browser.page_source)
    if "tmall" in url:
        return selector.xpath(
            '//div[@class="tm-promo-price"]/span[@class="tm-price"]/text()')
    if "taobao" in url:
        return selector.xpath('//em[@class="tb-rmb-num"]/text()')
    return []


def get_price_interval(price):
    """Return the price range text and the spread between its two ends.

    Some products are priced as a range such as ``12.00-25.00``; for those the
    spread is ``25.00 - 12.00``. A single price has a spread of ``0``.

    :param price: list of price strings as returned by ``get_detail_price``;
        only the first entry is used
    :return: tuple ``(price_interval, interval)``
    :raises ValueError: if ``price`` is empty or the range is not numeric
    """
    print(price)
    if not price:
        raise ValueError("no price found on the detail page")
    price_interval = price[0]
    if "-" in price_interval:
        bounds = price_interval.split("-")
        interval = float(bounds[1]) - float(bounds[0])
    else:
        interval = 0
    return price_interval, interval


def get_url_test():
    """Scrape the search results, log every product and export the matches.

    Collects title, link, lowest price, price range and price spread for each
    product; every product is appended to the text log, and products that ship
    free with a spread below ``price_interval_max`` are written to the Excel
    sheet.

    :return: None
    """
    import requests

    next_row = 0
    _write_row(next_row, _HEADERS)

    with io.open(log_path, 'w', encoding='utf-8') as log:
        for k in range(page):
            # 's' is the result offset: 1 is page one, 45 page two, 89 page three...
            params = dict(payload, s=ITEMS_PER_PAGE * k + 1)
            try:
                resp = requests.get(search_url, params=params,
                                    timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print(u"第%d页搜索结果获取失败,跳过" % (k + 1))
                print(repr(exc))
                continue

            text = resp.text
            # zip() stops at the shortest list, so a field missing from the
            # page can no longer trigger an IndexError that aborts the run.
            items = zip(_TITLE_RE.findall(text), _PRICE_RE.findall(text),
                        _FEE_RE.findall(text), _URL_RE.findall(text))

            for i, (item_title, item_price, item_fee, raw_url) in enumerate(items):
                print(i)
                print(u'商品标题:' + item_title)
                print(u'最低价格:' + item_price)
                print(u'运费:' + item_fee)
                item_url = normalize_detail_url(raw_url)
                print(u'商品链接:' + item_url)

                # Per-item failures are deliberately best-effort: report and
                # move on. `Exception` (not a bare except) keeps Ctrl-C working.
                try:
                    price_interval, interval = get_price_interval(
                        get_detail_price(item_url))
                    print(u'price_interval:' + price_interval)
                    print(u'interval:' + str(interval))

                    log.write(u'%d商品链接:%s 商品标题:%s 最低价格:%s 价格范围:%s 价格差:%s ' % (
                        k * ITEMS_PER_PAGE + i + 1, item_url, item_title,
                        item_price, price_interval, interval))

                    # Only free-shipping items with a small enough spread are exported.
                    if item_fee == "0.00" and interval < int(price_interval_max):
                        print(u"该商品符合要求:包邮,且最大价格与最小价格差小于%s"
                              % price_interval_max)
                        next_row += 1
                        _write_row(next_row, (item_url, item_title, item_price,
                                              price_interval, interval, item_fee))
                except Exception as exc:
                    print(u"该商品信息获取失败,跳过")
                    print(repr(exc))
                    continue


def main():
    """Prepare the result workbook, run the scrape and report the elapsed time."""
    started = time.time()
    # Start from a fresh copy of the template; shutil.copy overwrites an
    # existing result file.
    shutil.copy(excel_path_ori, excel_path)
    try:
        get_url_test()
    finally:
        # Shut down only the PhantomJS process this script started.
        if driver is not None:
            driver.quit()
    print(u'ok,结束!')
    print(u'总共耗时:' + str((time.time() - started) / 60) + u'分钟')


if __name__ == "__main__":
    main()