• 爬虫实例


    # 爬取糗图上的图片

    import os
    import re
    import urllib.request

    # Browser-like headers sent with every request (page fetches and image
    # downloads share the same User-Agent).
    _HEADERS = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }

    # Captures the src attribute of every <img src="..." alt="..." /> tag.
    _IMG_PATTERN = re.compile(r'<img src="(.*?)" alt=".*?" />')

    # Socket timeout in seconds, so a stalled server cannot hang the run forever.
    _TIMEOUT = 30


    def handler_request(url, page):
        """Build the request object for one listing page.

        Args:
            url: Base listing URL; expected to end with "/".
            page: Page number (anything ``str()`` can render) appended to *url*.

        Returns:
            A ``urllib.request.Request`` for ``url + str(page) + "/"`` carrying
            the shared browser-like headers.
        """
        return urllib.request.Request(url + str(page) + "/", headers=_HEADERS)


    def download_image(page, html):
        """Download every image referenced in *html* into ``./糗图``.

        Image URLs are taken from ``<img src="..." alt="..." />`` tags. Files
        are saved as ``<page>_<index>.jpg``; the separator keeps names unique
        across pages (without it, page 1 / image 12 and page 11 / image 2
        would both map to ``112.jpg``).

        A failed download is reported and skipped so that one bad image does
        not abort the rest of the page.

        Args:
            page: Page number, used only to build file names and messages.
            html: Decoded HTML text of the listing page.
        """
        src_list = _IMG_PATTERN.findall(html)
        dirs = os.path.join(os.getcwd(), "糗图")
        os.makedirs(dirs, exist_ok=True)

        for i, src in enumerate(src_list):
            # The page uses protocol-relative URLs ("//host/path"); only those
            # need a scheme. Already-absolute URLs are left untouched.
            if src.startswith("//"):
                src = "https:" + src
            image_name = "%s_%s.jpg" % (page, i)
            file_name = os.path.join(dirs, image_name)
            print("图片%s开始下载..." % image_name)
            try:
                request = urllib.request.Request(src, headers=_HEADERS)
                with urllib.request.urlopen(request, timeout=_TIMEOUT) as response:
                    image = response.read()
            except Exception as exc:
                # Best effort: report the cause and move on to the next image.
                print("图片%s下载出错了: %s" % (image_name, exc))
                continue
            with open(file_name, "wb") as f:
                f.write(image)
            # Announce success only after the bytes are actually on disk.
            print("图片%s已经下载完毕" % image_name)


    def main():
        """Prompt for a page range and download the images of each page."""
        url = "https://www.qiushibaike.com/pic/page/"
        start_page = int(input("请输入你想要查询的起始页:"))
        end_page = int(input("请输入你想要查询的结束页:"))
        for page in range(start_page, end_page + 1):
            print("第%s页开始下载..." % page)
            request = handler_request(url, page)
            with urllib.request.urlopen(request, timeout=_TIMEOUT) as response:
                content = response.read().decode()
            download_image(page, content)
            print("第%s页已经下载完毕" % page)
            print()
            print()


    if __name__ == '__main__':
        main()
  • 相关阅读:
    leetcode_697. 数组的度
    645. 错误的集合
    leetcode_448. 找到所有数组中消失的数字
    leetcode_628. 三个数的最大乘积
    leetcode_414. 第三大的数
    leetcode_495. 提莫攻击
    leetcode_485. 最大连续1的个数
    在 Mac、Linux、Windows 下Go交叉编译
    Goland基本操作
    etcd搭建及基本使用
  • 原文地址:https://www.cnblogs.com/nxrs/p/11335241.html
Copyright © 2020-2023  润新知