代码如下:
"""
Created on Thu Jun 10 21:42:17 2021

@author: 泥烟

Crawl Taobao search results for a user-supplied product name over a
user-supplied number of pages and save them to a CSV file. For learning
purposes only.

The script is time-sensitive: the cookie sent with each request expires,
so it has to be replaced by hand before use.
"""

import csv
import json
import re
import time
from urllib.parse import quote

# Running sequence number of the next product row; shared across pages so
# numbering continues from one page to the next.
count = 1

# Both patterns capture the value between the quotes of a JSON string field.
# The title pattern skips over backslash escapes so that an escaped quote
# inside a title does not end the match early.
_PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
_TITLE_RE = re.compile(r'"raw_title":"((?:[^"\\]|\\.)*)"')


# Step 1: submit the product search request and fetch one result page.
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    # Imported here rather than at module level so that the parsing helpers
    # stay importable when the third-party ``requests`` package is absent.
    import requests

    # The cookie value is a placeholder and must be replaced with a valid one.
    me = {'cookie': '略',
          'User-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=me, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def _decode_json_string(raw):
    """Decode the body of a JSON string literal, falling back to the raw text."""
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw


# Step 2: for each page, extract the sequence number, price and name.
def parsePage(ilt, html, page):
    """Append one ``[index, price, title]`` row to *ilt* per product in *html*.

    Prices and titles are paired in order of appearance; if the page holds
    an unequal number of each, the surplus entries are ignored.

    Args:
        ilt: List that receives the rows; it is modified in place.
        html: Page text as returned by ``getHTMLText``.
        page: Zero-based page number. Unused; kept for interface
            compatibility.

    Side effects:
        Advances the module-level ``count`` by the number of rows added.
    """
    global count
    # The captured text is used directly instead of being passed to eval(),
    # which must never see data scraped from the network. This also keeps
    # titles that contain a colon intact.
    prices = _PRICE_RE.findall(html)
    titles = _TITLE_RE.findall(html)
    for price, raw_title in zip(prices, titles):
        ilt.append([count, price, _decode_json_string(raw_title)])
        count += 1


def main():
    """Prompt for a product and a page count, crawl the pages, write the CSV."""
    print('输入要爬取的商品名字:')
    goods = input()
    print("输入要爬取的页数(仅做练习,请尽量少于10页):")
    depth = int(input())
    # Percent-encode the search term so characters such as '&', '#' or
    # spaces cannot corrupt the query string.
    basic_url = 'https://s.taobao.com/search?q=' + quote(goods)
    uList = []
    header = ["序号", "价格", "商品名称"]

    for i in range(depth):
        # The "s" offset advances by 44 per page (presumably the number of
        # results on one page).
        url = basic_url + '&s=' + str(44 * i)
        html = getHTMLText(url)
        rows_before = len(uList)
        parsePage(uList, html, i)
        # Report success only when the page actually produced rows; a failed
        # request or an expired cookie yields none.
        if len(uList) > rows_before:
            print("第" + str(i + 1) + "页爬取成功")
        else:
            print("第" + str(i + 1) + "页爬取失败")
        time.sleep(0.5)

    filename = goods + ".csv"
    # Step 3: save the rows to a file named after the product.
    # utf-8-sig keeps the Chinese text readable in spreadsheet software and
    # avoids encode errors from a narrower locale default.
    with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        # The file is opened for appending; write the header only when the
        # file is still empty so repeated runs do not duplicate it.
        if f.tell() == 0:
            writer.writerow(header)
        writer.writerows(uList)


if __name__ == '__main__':
    main()
    print("输入回车退出...")
    input()