# coding=utf-8
__author__ = "carry"

import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 only: make utf-8 the default codec

import urllib2
import re


# Fetch the HTML source of one 51job search-result page
def get_content(page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
    }
    url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'
           + str(page) + '.html')
    req = urllib2.Request(url, headers=headers)
    r = urllib2.urlopen(req)
    response = r.read()
    # 51job serves GBK; re-encode the page as utf-8
    html = response.decode('gbk').encode('utf-8')
    return html


# Extract (job title, company, location, salary, date) tuples from the listing HTML
def get(html):
    reg = re.compile(
        r'class="t1 ">.*? <a target="_blank" title="(.*?)"'
        r'.*? <span class="t2"><a target="_blank" title="(.*?)"'
        r'.*?<span class="t3">(.*?)</span>'
        r'.*?<span class="t4">(.*?)</span>'
        r'.*? <span class="t5">(.*?)</span>',
        re.S)  # re.S lets .*? match across line breaks
    items = re.findall(reg, html)
    return items


# Crawl pages 1-10 and append the results to a file
for j in range(1, 11):
    print(u"Crawling page " + str(j) + "...")
    html = get_content(j)  # fetch the page source
    with open('51job.txt', 'a') as f:
        for i in get(html):
            f.write(i[0] + ' ' + i[1] + ' ' + i[2] + ' ' + i[3] + ' ' + i[4] + '\n')
            f.write("-----------------------------------------------------\n")
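
# Optional, minimal sanity check of the regex in get(). The fragment below is a
# hypothetical snippet shaped like one 51job result row (the real page markup
# may differ), so this only illustrates what the capture groups are meant to pull out.
sample = '''<p class="t1 ">
 <a target="_blank" title="Python developer" href="#">Python developer</a>
</p> <span class="t2"><a target="_blank" title="Example Co.">Example Co.</a></span><span class="t3">Shanghai</span><span class="t4">10-15k/month</span> <span class="t5">06-01</span>'''
for row in get(sample):
    print(row)  # expected: ('Python developer', 'Example Co.', 'Shanghai', '10-15k/month', '06-01')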