• python2 urllib2抓取51job网的招聘数据


     1 #coding=utf-8
     2 __author__ = "carry"
     3 
     4 
     5 import sys
     6 reload(sys)
     7 sys.setdefaultencoding('utf-8')
     8 
     9 import urllib
    10 import urllib2
    11 import re
    12 
    13 
    14 #获取源码
    15 def get_content(page):
    16     headers = {#'Host':'search.51job.com',
    17                'User-Agent':'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
    18                #'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    19                #'Connection':'keep-alive'
    20                }
    21     url ='http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'+ str(page)+'.html'
    22     req = urllib2.Request(url,headers=headers)
    23     r = urllib2.urlopen(req)
    24     response = r.read() #读取源代码并转为unicode
    25     html = response.decode('gbk').encode('utf-8')
    26     return html
    27 
    28 def get(html):
    29     reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',re.S)#匹配换行符
    30     items=re.findall(reg,html)
    31     return items
    32 
    33 #多页处理,下载到文件
    34 for  j in range(1,11):
    35     print(u"正在爬取第"+str(j)+"页数据...")
    36     html = get_content(j) #调用获取网页原码
    37     for i in get(html):
    38         #print(i[0],i[1],i[2],i[3],i[4])
    39         with open ('51job.txt','a') as f:
    40             f.write(i[0]+'	'+i[1]+'	'+i[2]+'	'+i[3]+'	'+i[4]+'
    ')
    41             f.write("-----------------------------------------------------")
    42             f.close()
  • 相关阅读:
    html 一号店静态页面
    多线程
    TCP通信
    MySQL连接查询
    Mysql数据库 DDL 数据定义语言
    MySQL数据库 DML 数据操作语言
    java字符流
    java File类
    java变量
    JDK、JRE、JVM的关系
  • 原文地址:https://www.cnblogs.com/lxs1314/p/7133844.html
Copyright © 2020-2023  润新知