• 爬取糗事百科的热门段子,以及热图链接


    # -*- coding:utf-8 -*-
    import urllib
    import urllib2
    from bs4 import BeautifulSoup
    import re
    import os
    
    
    # Python 2 script (relies on urllib2): walks the "hot" listing pages 1-9 of
    # qiushibaike.com, echoes every joke to stdout, and appends the joke text
    # followed by the thumbnail image URLs of each page to imgsrc.txt.

    # The request headers never change, so build them once outside the loop.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = { 'User-Agent' : user_agent }

    # A for loop always advances to the next page.  The previous while loop
    # only incremented the counter on success, so a single failing request
    # made it retry the same URL forever.
    for page in range(1, 10):
        url = 'http://www.qiushibaike.com/hot/page/' + str(page)

        # Keep the try body limited to the calls that can raise URLError.
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            qiubai_html = response.read()
        except urllib2.URLError as e:
            # Check both attributes independently: an error that has a reason
            # but no code must still be reported.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            continue

        soup = BeautifulSoup(qiubai_html, "html.parser")
        print('this is page  %d' % page)

        # The with block closes the output file even if writing fails midway.
        with open('imgsrc.txt', 'a') as out:
            for link in soup.find_all("a", class_="contentHerf"):
                # Skip entries that lack the <span> holding the joke text.
                if link.span is None:
                    continue
                joke = link.span.text
                print(joke)
                out.write(joke.encode('utf-8') + '\n')
                print('\n')

            # Collect thumbnail URLs, ignoring thumb blocks without an
            # <img src=...> instead of crashing on them.
            img_srcs = []
            for thumb in soup.find_all("div", class_="thumb"):
                if thumb.img is None:
                    continue
                src = thumb.img.get('src')
                if src:
                    img_srcs.append(src)

            for src in img_srcs:
                out.write(src.encode('utf-8') + '\n')

        # Echo the first thumbnail URL of the page, if the page has one.
        if img_srcs:
            print(img_srcs[0])

     

  • 相关阅读:
    spring boot 与 spring cloud 版本映射
    Java锁
    并发编程(二)
    并发工具类和线程池
    并发编程
    Map双列集合(二)
    Map双列集合(一)
    单列集合List
    类加载
    JVM字节码与代码优化
  • 原文地址:https://www.cnblogs.com/lovely7/p/6119532.html
Copyright © 2020-2023  润新知