• 爬虫


    爬虫

    全站爬取校花网(xiaohuar.com)的列表页,大约能爬到 1000 多张图片

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import re
    import requests
    
    def save_pic(img_filename, img_data, n):
        """Write raw image bytes to disk and print a progress message.

        Args:
            img_filename: Destination path. It is opened in binary write mode,
                so an existing file at that path is overwritten.
            img_data: Bytes-like payload written to the file.
            n: Value interpolated into the progress message.
        """
        progress_template = '第%s张保存完毕'
        # 'wb' opens the file for writing in binary mode.
        with open(img_filename, mode='wb') as image_file:
            image_file.write(img_data)
            # Announce completion right after the write.
            print(progress_template % (n))
    
    def main():
        """Crawl the xiaohuar.com list pages and save the JPEG images they link to.

        Pages ``list-1-1.html`` through ``list-1-45.html`` are fetched in order.
        On each page the image links and the ``alt`` texts are extracted with two
        independent regular expressions and paired by position; the ``alt`` text
        becomes the file name. Images whose link is already absolute go to
        ``d:\\444\\``; site-relative links are prefixed with the site root and
        saved to ``d:\\55\\``.

        Network and file-system errors are reported and skipped so that one bad
        page or image does not stop the crawl.
        """
        import os  # local import: only needed here to create the target folders

        site_root = 'http://www.xiaohuar.com/'
        # Ordinary (non-raw) literals: a raw string cannot end with a backslash.
        absolute_link_dir = 'd:\\444\\'
        relative_link_dir = 'd:\\55\\'
        request_timeout = 10  # seconds; without it a stalled server hangs the crawl

        for page in range(1, 46):
            url = f'http://www.xiaohuar.com/list-1-{page}.html'
            print(url)
            try:
                response = requests.get(url, timeout=request_timeout)
            except requests.RequestException as exc:
                print('好像哪里出现了异常!', exc)
                continue
            # Decode the body as gb2312 rather than whatever requests guessed.
            response.encoding = "gb2312"
            data = response.text

            # Collect every image link and every alt text on the page.
            results = re.findall(r'" src="(.*?)" /></a>', data)
            names = re.findall(r'"  alt="(.*?)" src="', data)
            if not names:
                # Only report an empty page when nothing was actually found.
                print('没有可以爬取的信息!')
                continue

            # zip() pairs link and name by position and stops at the shorter
            # list, so a length mismatch can no longer raise IndexError.
            for index, (link, title) in enumerate(zip(results, names)):
                if not link.endswith("jpg"):
                    print('没有可以爬取的信息!')
                    continue

                if link.startswith('http'):
                    # The link is already a full URL.
                    img_result = link
                    target_dir = absolute_link_dir
                    print("条件1")
                else:
                    # Site-relative link: prefix it with the site root.
                    print(link)
                    img_result = site_root + link
                    target_dir = relative_link_dir

                try:
                    img_response = requests.get(img_result, timeout=request_timeout)
                    # Do not store an HTTP error page under a .jpg name.
                    img_response.raise_for_status()
                    os.makedirs(target_dir, exist_ok=True)
                    img_filename = target_dir + title + '.jpg'
                    save_pic(img_filename, img_response.content, index)
                except (requests.RequestException, OSError) as exc:
                    # Best effort: report the failure and move on.
                    print('好像哪里出现了异常!', exc)
    
    # Run the crawler only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        # Pause before starting; input() returns once the user presses Enter.
        input('按任意键继续')
        main()
    
  • 相关阅读:
    virtual box 虚拟机调整磁盘大小 resize partition of virtual os
    单向链表逆转
    搭建公司的React开发环境
    2018 ICPC 沈阳网络预赛 Fantastic Graph (优先队列)
    背包问题初探
    HDU 2588 GCD (欧拉函数)
    ZOJ
    ZOJ
    ZOJ
    HDU
  • 原文地址:https://www.cnblogs.com/bladecheng/p/11103627.html
Copyright © 2020-2023  润新知