• 爬虫系列之淘宝商品爬取


     1 import re
     2 import requests
     3 
     4 def getHTMLText(url):
     5     try:
     6         r = requests.get(url, timeout = 30)
     7         r.raise_for_status()
     8         r.encoding = r.apparent_encoding
     9         return r.text
    10     except:
    11         return ""
    12 
    13 
    14 def parsePage(ilt, html):
    15     try:
    16         plt = re.findall(r'"view_price":"[d.]*"', html)
    17         tlt = re.findall(r'"raw_title":".*?"', html)
    18         for i in range(len(plt)):
    19             price = eval(plt[i].split(":")[1])  #eval就是将字符串string对象转化为有效的表达式参与求值运算返回计算结果
    20             title = eval(tlt[i].split(":")[1])
    21             ilt.append([price, title])
    22     except:
    23         print("")       
    24 
    25 def printGoodsList(ilt):
    26     tplt = "{:4}	{:8}	{:16}"  #规定输出格式
    27     print(tplt.format("序号", "价格", "商品名称"))
    28     count = 0
    29     for g in ilt:
    30         count = count + 1
    31         print(tplt.format(count, g[0], g[1]))
    32     print("")
    33 
    34 def main():
    35     goods = '书包'
    36     depth = 2
    37     start_url = 'https://s.taobao.com/search?q=' + goods
    38     infoList = []
    39     for i in range(depth):
    40         try:
    41             url = start_url + '&s=' + str(44*i)
    42             html = getHTMLText(url)
    43             parsePage(infoList,html)
    44         except:
    45             continue
    46     printGoodsList(infoList)
    47 
    48 
    49 main()

  • 相关阅读:
    Synchronized和Lock的实现原理和锁升级
    如何禁止CPU指令重排
    MESI缓存一致性
    归并排序
    强软弱虚四种引用和ThreadLocal内存泄露
    VINS-Mono代码分析与总结(完整版)
    IMU误差模型与校准
    小感
    K8S conul部署
    Centos Consul集群及Acl配置
  • 原文地址:https://www.cnblogs.com/zyb993963526/p/9090107.html
Copyright © 2020-2023  润新知