python爬取糗百内容

#-*- coding: utf-8 -*-
import urllib
import urllib2
import re

# Fetch one page of the qiushibaike.com "hot" listing and print, for every
# text-only post matched by the regex below, its author, content and comment
# count.

# Page number to request.
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)

# The site requires a header check, so send a browser-like User-Agent.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Matches the markup of posts without pictures. The site changes its tags
# from time to time, so this pattern may need to be re-derived.
# Capture groups: (author, content, vote count, comment count).
# Alternative pattern left by the original author (captures an <a>...<img> part):
#   <div.*?author.*?users.*?<h2>(.*?)</h2>.*?content.*?<span>(.*?)</span>.*?<a.*?img.*?>(.*?)</a>
pattern = re.compile(
    '<div.*?author.*?users.*?<h2>(.*?)</h2>.*?content.*?<span>(.*?)</span>.*?vote.*?number">(.*?)</i>.*?comments.*?number">(.*?)</i>',
    re.S)

try:
    # Build the request with the custom headers and open the connection.
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        content = response.read().decode('utf-8')
    finally:
        # Always release the connection, even if reading/decoding fails.
        response.close()
except urllib2.URLError as e:
    # HTTPError (a URLError subclass) carries `code`; URLError carries `reason`.
    if hasattr(e, 'code'):
        print(e.code)
    if hasattr(e, 'reason'):
        print(e.reason)
else:
    # Only reached when the page was downloaded successfully.
    items = re.findall(pattern, content)

    for item in items:
        # Author, content and comment count; the vote count (item[2]) is
        # captured but not printed, as in the original script.
        print(u' '.join((item[0], item[1], item[3])))

相关阅读:
成都58同城快速租房的爬虫，nodeJS爬虫
`qs.parse` 的简单实现
使用windbg定位内存问题【入门级】
C#正则实现匹配一块代码段
Zeebe服务学习3-Raft算法与集群部署
Zeebe服务学习2-状态机
Zeebe服务学习1-简单部署与实现demo
C#后端接收前端的各种类型数据
大话设计模式--单例模式具体使用
大话设计模式--DI（依赖注入）

原文地址：https://www.cnblogs.com/anxiaoyu/p/6535228.html