• 【爬虫】python+urllib+beautifulsoup爬取花瓣网美女图片


    爬取花瓣网的图片

    #!/usr/bin/env python
    # -*- encoding:utf-8 -*-
    
    import urllib2
    from bs4 import BeautifulSoup
    import re
    import requests
    
    # Listing page that requestMain() downloads and getPins() scans for pin links.
    url = 'http://huaban.com/favorite/beauty/'
    
    
    def requestMain():
        """Fetch the page at the module-level ``url`` with browser-like headers.

        The headers are passed to the ``urllib2.Request`` constructor rather
        than assigned to ``request.headers`` afterwards.  The constructor
        routes every entry through ``add_header``, which stores the key in
        urllib2's normalised form; assigning the dict directly skips that
        step, so the opener fails to recognise the custom ``User-Agent`` and
        adds its own default ``Python-urllib`` one next to it.

        Prints the HTTP status code of the response.

        Returns:
            The open response object from ``urllib2.urlopen``; the caller is
            expected to read it.

        Raises:
            urllib2.URLError: if the request cannot be completed
                (``urllib2.HTTPError`` for HTTP error statuses).
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }
        request = urllib2.Request(url, headers=headers)

        html_doc = urllib2.urlopen(request)

        # Parenthesised single argument prints identically on Python 2 and 3.
        print(html_doc.getcode())
        return html_doc
    
    
    def getPins():
        """Download the images of every pin linked from the listing page.

        Steps:
          1. Fetch the listing page through ``requestMain()`` and collect all
             ``<a>`` tags whose ``href`` contains ``/pins/<digits>/``.
          2. For each pin (numbered from 1, the number is printed), fetch the
             pin page and look for ``<div class="image-holder">`` elements.
          3. For each holder that contains an ``<img>`` with a ``src``, print
             the image URL, download it with ``requests`` and write the bytes
             to ``<pin number>.jpg`` in the output directory.

        The file name depends only on the pin number, so when one pin page has
        several image holders the later image overwrites the earlier one.

        Returns:
            None.
        """
        listing_html = requestMain().read()

        listing = BeautifulSoup(listing_html, 'html.parser', from_encoding='utf-8')

        # ``\d+`` matches the numeric pin id; without the backslash the pattern
        # only matched literal "d" characters and found no pin links.
        pin_links = listing.find_all('a', href=re.compile(r"/pins/\d+/"))
        huaban = 'http://huaban.com'
        for i, pin in enumerate(pin_links, 1):
            pin_url = huaban + pin['href']
            resp = urllib2.urlopen(urllib2.Request(pin_url))
            try:
                pin_page = BeautifulSoup(resp, 'html.parser', from_encoding='utf-8')
            finally:
                # The parser has consumed the body; release the connection.
                resp.close()
            holders = pin_page.find_all('div', class_="image-holder")
            print(i)
            for tag in holders:
                img = tag.find('img')
                src = img.get('src') if img is not None else None
                if not src:
                    # Holder without a usable <img src>: nothing to download.
                    continue

                # The code prepends the scheme, i.e. it expects a
                # protocol-relative src ("//host/path").
                link = 'http:' + src

                print(link)

                a = requests.get(link)
                # NOTE(review): the path separators were missing in the source
                # text and are restored here as a best guess -- confirm that
                # this directory is the intended one and that it exists.
                with open(r'C:\Users\wuzhi_000\Desktop\Python\py_scrapy\image\%s.jpg' % i, 'wb') as pic:
                    pic.write(a.content)
                   
    
    if __name__ == '__main__':
        # getPins() has no return value, so printing its result only emitted
        # a stray "None" after the run; just call it.
        getPins()
    
    
    # print (soup.prettify())
    
    # print soup.title
    # print soup.title.name
    #
    # print soup.title.string
    #
    # print soup.p
    #
    # print soup.p['class']
    #
    # print soup.a
    #
    # print soup.find_all('img')
    #
    # print ('\n')
    #
    # print soup.find(href="/pins/1147154763/")
    #
    # print ('\n')
    #
    # for img in soup.find_all('img'):
    #     print (img.get('src'))
  • 相关阅读:
    Python pandas 入门 05 JSON
    Python matplotlib 画图入门 07 散点图
    Python matplotlib 画图入门 06 绘制多图
    Python pandas 入门 04 CSV 文件
    Python pandas 入门 01 安装
    Python 入门示例系列 35 迭代器与生成器
    Python matplotlib 画图入门 03 绘图线
    Python pandas 入门 目录
    Python 零散知识点琐碎知识
    Python numpy 入门系列 目录
  • 原文地址:https://www.cnblogs.com/wuzhiyi/p/6868589.html
Copyright © 2020-2023  润新知