获取淘宝特定商品信息
import re
import requests
from bs4 import BeautifulSoup
import numpy as np
import bs4
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The request carries a fixed ``User-Agent`` header and the cookies parsed
    from the ``coo`` string below (``name=value`` pairs separated by ``;``),
    and uses a 30 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails or
        the server answers with an HTTP error status.
    """
    # NOTE(review): 'Chorme' looks like a typo for 'Chrome'; it is kept as-is
    # because the value is sent to the server verbatim.
    headers = {'User-Agent': 'Chorme'}

    # Replace this placeholder with the cookie string of your own logged-in
    # browser session.
    coo = "这里输入你们自己的cookie"
    cookies = {}
    for fragment in coo.split(';'):
        name, sep, value = fragment.strip().partition('=')
        # Skip fragments that are not ``name=value`` pairs (such as the
        # untouched placeholder) instead of failing before any request is
        # sent.
        if not sep:
            continue
        cookies[name] = value

    try:
        r = requests.get(url, cookies=cookies, headers=headers, timeout=30)
        r.raise_for_status()
        # Decode with the encoding guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers treat an empty page as "nothing found".
        return ""
def parasePage(ilt, html):
    """Extract ``[price, title]`` pairs from a search page and append them.

    Prices are taken from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title. If the two counts differ, the surplus entries are dropped.

    Args:
        ilt: List that receives one ``[price, title]`` entry per item; it is
            modified in place.
        html: Page source as a string.

    Returns:
        None.
    """
    import json  # local import: the module header does not import json

    if not isinstance(html, str):
        # Same diagnostic marker the function has always printed when the
        # page cannot be parsed.
        print("2")
        return

    def _unescape(raw):
        # Decode JSON escapes (\", \uXXXX, ...) without using eval() on
        # scraped text; fall back to the raw text if it is not valid JSON.
        try:
            return json.loads('"' + raw + '"')
        except ValueError:
            return raw

    # ``[\d.]`` matches digits and the decimal point.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # Capture the string contents, allowing backslash-escaped characters so
    # an escaped quote does not end the title early. Colons inside the title
    # are preserved because the value is captured rather than split on ':'.
    titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    for price, title in zip(prices, titles):
        ilt.append([price, _unescape(title)])
def printGoodList(ilt):
    """Print the collected goods as a numbered table on standard output.

    A header row is printed first, followed by one row per entry holding a
    1-based index, the price and the title.

    Args:
        ilt: Iterable of entries whose first element is the price and whose
            second element is the title.

    Returns:
        None. If an entry cannot be formatted, ``3`` is printed and the
        remaining rows are skipped.
    """
    tplt = "{:4} {:8} {:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    try:
        for count, g in enumerate(ilt, start=1):
            print(tplt.format(count, g[0], g[1]))
    except (LookupError, TypeError, ValueError):
        # A row was too short, not indexable, or held a value the column
        # format cannot render.
        print('3')
def main():
    """Ask for a search keyword, scrape the result pages and print the goods.

    Reads the keyword from standard input, fetches ``depth`` result pages,
    collects the ``[price, title]`` entries found on them and prints the
    table. Finally waits for Enter before returning.
    """
    # Local import: only needed here to build the query URL.
    from urllib.parse import quote

    goods = input('请输入你想要查询的商品:\n')
    depth = 2  # number of result pages to fetch
    page_size = 44  # number of items Taobao shows per result page

    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot corrupt the query string.
    start_url = "https://s.taobao.com/search?q=" + quote(goods, safe="")

    infoList = []
    for page in range(depth):
        # Each page is handled on its own so one failure does not stop the
        # others; the ``s`` parameter is the offset of the page's first item.
        url = start_url + "&s=" + str(page_size * page)
        try:
            html = getHTMLText(url)
            parasePage(infoList, html)
        except Exception as exc:
            # ``Exception`` rather than a bare except, so Ctrl-C still
            # interrupts the program.
            print("第{}页处理失败: {}".format(page + 1, exc))
            continue

    printGoodList(infoList)
    input()  # wait for Enter before returning
# Run the scraper only when this file is executed as a script, so importing
# the module does not prompt for input or send network requests.
if __name__ == "__main__":
    main()
使用方式:
- 先把代码中的 cookie 占位字符串替换为自己登录淘宝后的 cookie,再运行代码
- 输入想要查询的商品信息
- 显示出所有相关商品信息