• Python Web Crawler Case Study (2): Scraping a Job Listing Site


    Using Python to crawl IT-industry job postings from 51job.

      Copyright notice: sharing or reposting this content without the author's permission is strictly prohibited.

    Example code:

    # __author : "J"
    # date : 2018-03-07
    
    import urllib.request
    import re
    import pymysql
    
    connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='******', db='51job',
                                 charset='utf8')
    cursor = connection.cursor()
    
    num = 0
    textnum = 1
    while num < 18:
    
        num += 1
        # 51job IT-industry job listing URL; paginated, roughly 800+ records in total
        request = urllib.request.Request(
            "http://search.51job.com/list/120000,000000,0100,32,9,99,%2B,2," + str(
                num) + ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=")
    
        response = urllib.request.urlopen(request)
        my_html = response.read().decode('gbk')
        # print(my_html)
    
        my_re = re.compile(r'href="(.*?)" onmousedown="">')
        second_html_list = re.findall(my_re, my_html)
    
        for i in second_html_list:
            second_request = urllib.request.Request(i)
            second_response = urllib.request.urlopen(second_request)
            second_my_html = second_response.read().decode('gbk')
            # position, region, salary, company name, company profile
            # work experience, education, number of openings, posting date
            # job description, contact info, company info
            second_my_re = re.compile('<h1 title=.*?">(.*?)<input value=.*?' +
                                      '<span class="lname">(.*?)</span>.*?' +
                                      '<strong>(.*?)</strong>.*?' +
                                      'target="_blank" title=".*?">(.*?)<em class="icon_b i_link"></em></a>.*?' +
                                      '<p class="msg ltype">(.*?)</p>.*?</div>'
                                      , re.S | re.M | re.I)
            second_html_news = re.findall(second_my_re, second_my_html)[0]
            # strip CR/TAB/LF, &nbsp; entities, and spaces from each captured field
            zhiwei = second_html_news[0].replace("\r", '').replace("\t", '').replace("\n", '') \
                .replace("&nbsp;", '').replace("    ", '').replace(" ", '')
            diqu = second_html_news[1].replace("\r", '').replace("\t", '').replace("\n", '') \
                .replace("&nbsp;", '').replace("    ", '').replace(" ", '')
            gongzi = second_html_news[2].replace("\r", '').replace("\t", '').replace("\n", '') \
                .replace("&nbsp;", '').replace("    ", '').replace(" ", '')
            gongsimingcheng = second_html_news[3].replace("\r", '').replace("\t", '').replace("\n", '') \
                .replace("&nbsp;", '').replace("    ", '').replace(" ", '')
            gongsijianjie = second_html_news[4].replace("\r", '').replace("\t", '').replace("\n", '') \
                .replace("&nbsp;", '').replace("    ", '').replace(" ", '')
            # print(zhiwei,diqu,gongzi,gongsimingcheng,gongsijianjie)
            yaoqiu = ''  # initialize so the "+=" lines below still work if the first span is missing
            try:
                second_my_re = re.compile('<span class="sp4"><em class="i1"></em>(.*?)</span>'
                                          , re.S | re.M | re.I)
                yaoqiu = re.findall(second_my_re, second_my_html)[0]
            except Exception as e:
                pass
            try:
                second_my_re = re.compile('<span class="sp4"><em class="i2"></em>(.*?)</span>'
                                          , re.S | re.M | re.I)
                yaoqiu += ' | ' + re.findall(second_my_re, second_my_html)[0]
            except Exception as e:
                pass
            try:
                second_my_re = re.compile('<span class="sp4"><em class="i3"></em>(.*?)</span>'
                                          , re.S | re.M | re.I)
                yaoqiu += ' | ' + re.findall(second_my_re, second_my_html)[0]
            except Exception as e:
                pass
            try:
                second_my_re = re.compile('<span class="sp4"><em class="i4"></em>(.*?)</span>'
                                          , re.S | re.M | re.I)
                yaoqiu += ' | ' + re.findall(second_my_re, second_my_html)[0]
            except Exception as e:
                pass
            # print(yaoqiu)
            second_my_re = re.compile('<div class="bmsg job_msg inbox">(.*?)<div class="mt10">'
                                      , re.S | re.M | re.I)
            gangweizhize = re.findall(second_my_re, second_my_html)[0].replace("\r", '').replace("\t", '') \
                .replace("\n", '').replace("    ", '').replace(" ", '').replace("&nbsp;", '')
    
            dr = re.compile(r'<[^>]+>', re.S)
            gangweizhize = dr.sub('', gangweizhize)
    
            second_my_re = re.compile('<span class="bname">联系方式</span>(.*?)<div class="tBorderTop_box">'
                                      , re.S | re.M | re.I)
            lianxifangshi = re.findall(second_my_re, second_my_html)[0].replace("\r", '').replace("\t", '') \
                .replace("\n", '').replace("&nbsp;", '')
    
            dr = re.compile(r'<[^>]+>', re.S)
            lianxifangshi = dr.sub('', lianxifangshi)
            lianxifangshi = re.sub(r'\s', '', lianxifangshi)
    
            second_my_re = re.compile('<span class="bname">公司信息</span>(.*?)<div class="tCompany_sidebar">'
                                      , re.S | re.M | re.I)
            gongsixinxi = re.findall(second_my_re, second_my_html)[0].replace("&nbsp;", '')
            dr = re.compile(r'<[^>]+>', re.S)
            gongsixinxi = dr.sub('', gongsixinxi)
            gongsixinxi = re.sub(r'\s', '', gongsixinxi)
    
            print('Record ' + str(textnum) + ' **********************************************')
            print(zhiwei, diqu, gongzi, gongsimingcheng, gongsijianjie, yaoqiu, gangweizhize, lianxifangshi, gongsixinxi)
            textnum += 1
            # try:
            #     sql = "INSERT INTO `jobNews` (`position`,`region`,`Pay`,`company`,`Nature`,`Requirement`,`Job_information`,`Contact_information`,`Company_information`) VALUES ('" + zhiwei + "','" + diqu + "','" + gongzi + "','" + gongsimingcheng + "','" + gongsijianjie + "','" + yaoqiu + "','" + gangweizhize + "','" + lianxifangshi + "','" + gongsixinxi + "')"
            #     cursor.execute(sql)
            #     connection.commit()
            #     print('Saved successfully!')
            # except Exception as e:
            #     pass
    
    cursor.close()
    connection.close()
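
    A side note on the commented-out INSERT block above: building the SQL string by concatenation breaks as soon as a scraped field contains a quote character, and it is also open to SQL injection. Below is a minimal sketch of the same insert using pymysql's parameterized execute; it assumes the `jobNews` table and column names from the commented code and would sit inside the for loop in place of that block:

    # Sketch only: parameterized version of the commented-out insert above
    sql = ("INSERT INTO `jobNews` (`position`,`region`,`Pay`,`company`,`Nature`,`Requirement`,"
           "`Job_information`,`Contact_information`,`Company_information`) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    try:
        # pymysql escapes each value itself, so quotes in the scraped text are safe
        cursor.execute(sql, (zhiwei, diqu, gongzi, gongsimingcheng, gongsijianjie,
                             yaoqiu, gangweizhize, lianxifangshi, gongsixinxi))
        connection.commit()
        print('Saved successfully!')
    except Exception as e:
        connection.rollback()
        print('Insert failed:', e)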

    Result:

    I'm not very good with regular expressions, so this ended up rather clumsy; suggestions are welcome~
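
    One concrete suggestion along those lines: the repeated .replace() chains can be collapsed into a small helper that drops HTML tags, &nbsp; entities, and whitespace in a single pass with re.sub. This is a sketch only; clean_text is not part of the original code:

    import re

    def clean_text(html_fragment):
        """Strip HTML tags, &nbsp; entities, and all whitespace from a scraped fragment."""
        text = re.sub(r'<[^>]+>', '', html_fragment)  # drop tags
        text = text.replace('&nbsp;', '')             # drop non-breaking-space entities
        return re.sub(r'\s+', '', text)               # drop spaces, tabs, CR/LF

    # usage, e.g.: zhiwei = clean_text(second_html_news[0])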

    [Copyright notice] The copyright of this post belongs to the author; for any form of reposting, please contact the author for authorization and cite the source!
    [Important note] This post is a beginner's learning record; its arguments and views are personal opinions, not the definitive truth about this technology. The aim is to learn and perhaps share the experience with others, so mistakes will be humbly accepted and corrected, but that does not mean the post is currently error-free!
    [cnblogs] JayveeWong: http://www.cnblogs.com/wjw1014
    [CSDN] JayveeWong: https://blog.csdn.net/weixin_42776111
    [Gitee] Jayvee: https://gitee.com/wjw1014
    [GitHub] Jayvee: https://github.com/wjw1014
  • Original article: https://www.cnblogs.com/wjw1014/p/8657184.html