• 【Python爬虫基础】抓取知乎页面所有图片


    抓取指定地址页面中的所有图片

    #! /usr/bin/env python
    from urlparse import urlsplit
    from os.path import basename
    import urllib2
    import re
    import requests
    import os
    import json
    
    
    # Scrape the large ("_b") versions of all images from one Zhihu question,
    # paging through the answer list API and saving files under ./images.
    url = 'https://www.zhihu.com/question/37787176'

    if not os.path.exists('images'):
        os.mkdir("images")

    print("start>>>>>>>")

    page_size = 50
    offset = 0
    # Read the total answer count from the h3 data-num attribute of the page.
    url_content = requests.get(url).text
    answers = re.findall('h3 data-num="(.*?)"', url_content)
    # Guard against the attribute being absent (page layout change): 0 pages.
    limits = int(answers[0]) if answers else 0

    while offset < limits:
        post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
        params = json.dumps({
            'url_token': 37787176,
            'pagesize': page_size,
            'offset': offset
        })
        data = {
            '_xsrf': '',  # empty CSRF token; Zhihu may reject without a valid one
            'method': 'next',
            'params': params
        }
        header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
            'Host': "www.zhihu.com",
            'Referer': url
        }
        response = requests.post(post_url, data=data, headers=header)
        answer_list = response.json()["msg"]
        # Only grab the large image variants (URLs containing "_b").
        img_urls = re.findall('img .*?src="(.*?_b.*?)"', ''.join(answer_list))
        for img_url in img_urls:
            try:
                # Use requests (already imported) instead of urllib2, with a
                # timeout so a stalled download cannot hang the whole run.
                img_data = requests.get(img_url, timeout=10).content
                file_name = basename(urlsplit(img_url)[2])
                print(file_name)
                # Context manager guarantees the file handle is closed even
                # if the write fails.
                with open('images/' + file_name, 'wb') as output:
                    output.write(img_data)
            except Exception as e:
                # Best-effort download: report the failure and continue with
                # the next image rather than silently swallowing everything.
                print("error: " + str(e))
        offset += page_size

    print("end>>>>>>>")

    正则抓取网页title

    #!/usr/bin/python  
    # coding:utf-8   
    import httplib2  
    import urllib2  
    import re #正则表达式模块  
    
    class PageClass:
        """Thin wrapper around httplib2 for fetching a page as UTF-8 text."""

        def get_page(self, url, headers):
            # Issue a GET request with the caller-supplied headers (typically
            # a cookie) and return the response body decoded as UTF-8.
            client = httplib2.Http()
            resp, body = client.request(url, 'GET', headers=headers)
            return body.decode('utf-8')
    
    def main():
        """Fetch the target page's HTML using a cookie header."""
        headers = {"cookie": 'your cookie'}  # replace with a real cookie value
        url = 'http://v.ktgj.com'
        fetcher = PageClass()
        return fetcher.get_page(url, headers)
    
    if __name__ == "__main__":
        htmltext = main()
        pattern = re.compile(r'<title>(.*?)</title>')
        # Bug fix: use search(), not match(). match() only anchors at the very
        # start of the string, and <title> never appears at the start of an
        # HTML document, so the original code could never find the title.
        match = pattern.search(htmltext)
        if match:
            print(match.group())
        print(htmltext)

    下载网页图片

    #! /usr/bin/env python
    from urlparse import urlsplit
    from os.path import basename
    import urllib2
    import re
    import requests
    import os
    import json
    import datetime
    
    # Download every image referenced on the target page into ./images,
    # logging each saved file with a timestamp.
    if not os.path.exists('images'):
        os.mkdir("images")

    print("start>>>>>>>>>>>>>>>>>>>>>>>")

    url = "http://www.ssff66.com/se/jingpintaotu/519271.html"
    response = requests.get(url)
    # Collect the src attribute of every <img> tag on the page.
    img_urls = re.findall('img .*?src="(.*?)"', response.text)

    for img_url in img_urls:
        try:
            # Use requests (already imported) instead of urllib2; keep the
            # original 5-second timeout so a stalled image cannot hang the run.
            img_data = requests.get(img_url, timeout=5).content
            file_name = basename(urlsplit(img_url)[2])
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "  " + file_name)
            # Context manager guarantees the file is closed on error.
            with open('images/' + file_name, 'wb') as output:
                output.write(img_data)
        except Exception as e:
            # Fix: "except Exception,e" is Python-2-only syntax, and e.message
            # is deprecated; str(e) works on every exception type.
            print("error : " + str(e))

    print("end>>>>>>>>>>>>>>>>>>>>>>>")
  • 相关阅读:
    指针
    Centos6.5 安装Vim7.4
    C++ Prime:指针和const
    C++ Prime:const的引用
    C++ Prime:函数
    C++ Prime:范围for语句
    python的oop概述
    脚本单独调用django模块
    xtrabackup备份之xbstream压缩
    MySQL8.0安装
  • 原文地址:https://www.cnblogs.com/jhli/p/5915329.html
Copyright © 2020-2023  润新知