• 手动爬虫之京东笔记本栏(python3)


     1 import urllib.request as ur
     2 import urllib.error as ue
     3 import re
     4 # Target URL of the listing to crawl
     5 url = 'https://list.jd.com/list.html?cat=670,671,672'
     6 # Path prefix under which downloaded images are saved
     7 save_path = 'E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo1/images/'
     8 # Proxy server address (ip:port)
     9 proxy_add = '115.174.66.148:8118'
    10 
    11 def get_JD_pictures(url, save_path, proxy_add, page):
    12     # 根据页面设置url
    13     url = url+"&page="+str(page)
    14     # 添加报头
    15     req = ur.Request(url)
    16     req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')
    17 
    18     # 设置代理
    19     proxy = ur.ProxyHandler({'http': proxy_add})
    20     opener = ur.build_opener(proxy, ur.HTTPHandler)
    21     ur.install_opener(opener)
    22 
    23     # 爬取页面
    24     info = ur.urlopen(req).read()
    25     # 信息存档
    26     info = str(info)
    27     pattern_1 = '<div id="plist".+? <div class="page clearfix">'
    28     info = re.compile(pattern=pattern_1).findall(info)
    29     info = info[0]
    30     pattern_2 = '<img width="220" height="220" data-img="1" src="//(.+?.jpg)">'
    31     image_list = re.compile(pattern=pattern_2).findall(info)
    32     x = 1
    33     for image_url in image_list:
    34         image_name = save_path+str(page)+"_"+str(x)+".jpg"
    35         image_url = "http://"+image_url
    36         try:
    37             ur.urlretrieve(image_url, filename=image_name)
    38         except ue.HTTPError as e:
    39             if hasattr(e, 'code'):
    40                 print(e.code)
    41             if hasattr(e, 'reason'):
    42                 print(e.reason)
    43         except ue.URLError as e:
    44             if hasattr(e, 'code'):
    45                 print(e.code)
    46             if hasattr(e, 'reason'):
    47                 print(e.reason)
    48         x += 1
    49 
    50 get_JD_pictures(url, save_path, proxy_add, 1)
  • 相关阅读:
    oo第四次博客
    oo第三次博客
    oo第二次博客
    oo第一次博客
    OO第四次博客
    OO第三次博客总结
    第二次博客总结
    oo第一次博客总结
    oo第四次博客作业
    oo第三次博客作业
  • 原文地址:https://www.cnblogs.com/xiaomingzaixian/p/7107677.html
Copyright © 2020-2023  润新知