#-*-coding:utf-8-*-
import urllib
import urllib2
import re
def get_duanzi(url):
store=[]
user_agent='Mozilla/5.0 (Windows NT 10.0; WOW64)'
headers={'User-Agent':user_agent}
request=urllib2.Request(url,headers=headers)
response=urllib2.urlopen(request)
html=response.read().decode('utf-8')
pattern=re.compile('<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<div.*?span>(.*?)</span>(.*?)<div class="stats">.*?"number">(.*?)</i>',re.S)
results=re.findall(pattern,html)
for result in results:
haveImg=re.search("img",result[2])
if not haveImg:
store.append([result[0],result[1],result[3]])
for st in store:
print st[0]
print st[1]
print st[2]
# Scrape the first 13 "hot" pages of qiushibaike.com.
# Guarded so importing this module does not trigger network requests.
if __name__ == '__main__':
    for page in range(1, 14):
        url = 'http://www.qiushibaike.com/hot/page/' + str(page)
        get_duanzi(url)