51job job-posting scraper


    # -*- coding: utf-8 -*-
    #@Time: 2020/7/7 14:47
    #@Author: liruifeng
    #@File: zhuaqu.py
    #@Software: PyCharm
    
    from bs4 import BeautifulSoup
    import urllib.request,urllib.error
    from urllib import parse
    import sqlite3
    
    # The keyword has to be URL-encoded twice (the percent signs are escaped again)
    kw = input("Enter the job keyword to search for: ")
    keyword = parse.quote(parse.quote(kw))
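    # A quick worked example of the double encoding above (illustrative keyword,
    # not taken from the original post): parse.quote("前端") gives "%E5%89%8D%E7%AB%AF",
    # and quoting that result once more gives "%25E5%2589%258D%25E7%25AB%25AF",
    # which is the form the search URL below carries; plain ASCII keywords such
    # as "python" pass through both calls unchanged.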
    pageNum = 1
    
    jobData = {}  # each record is a dict of key/value pairs
    jobList = []  # all job records go into this list; each element is one of the dicts above
    
    def main():
        for i in range(1):    # only the first list page is crawled; enlarge range() and step pageNum for more pages
            url = "https://search.51job.com/list/010000,000000,0000,00,9,99," + keyword + ",2," + str(pageNum) + ".html"
            pageList = getLink(url)   # scrape one list page and collect its detail-page links
            if len(pageList) == 0:
                break
            for jobpage in pageList:
                getData(jobpage)   # parse one detail page
        #datalist = getData(baseurl)
        dbpath = "./51job.db"
        # save the data
        #saveData2(datalist,dbpath)    # save to SQLite
        print(jobList)

    def getLink(url):
        jobLink = []
        html = askURL(url)    # fetch the list page
        bs = BeautifulSoup(html,"html.parser")
        eldiv = bs.select(".el > .t1 > span > a")
        for link in eldiv:
            jobLink.append(link["href"])
            jobList.append({"link":link["href"]})
    
        #print(jobList)
        return jobLink
    
    def getData(jobpage):
        jobHtml = askURL(jobpage)  # fetch the detail page
        bs = BeautifulSoup(jobHtml, "html.parser")
        # parse the data
        for job in jobList:
            if jobpage == job["link"]:
                jobName = bs.select(".cn > h1")   # job title
                for name in jobName:
                    print(name["title"])
                #job["title"] = jobName[0]["title"]  # store the job title in the dict

                CnameList = bs.select(".catn")    # company name
                for cName in CnameList:
                    print(cName["title"])

                days = bs.select(".ltype")        # job requirements, fields separated by "|"
                info = days[0]["title"].split("|")
                # some postings have fewer fields, so iterate instead of indexing a fixed count
                for inf in info:
                    print(inf.strip(), end=' ')

                fuli = bs.select(".sp4")          # benefits
                for fulis in fuli:
                    print(fulis.text, end=' ')

                jobMsgList = bs.select(".job_msg > p")  # job description paragraphs
                jobMsgStr = ""
                for msg in jobMsgList:
                    jobMsgStr = jobMsgStr + msg.text
                print(jobMsgStr.lstrip())   # print the assembled description once, after the loop

        #print(jobHtml)
        return jobHtml
    
    def askURL(url):
        head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
        }  # request headers: pretend to be a browser
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("gbk")   # 51job pages are GBK-encoded
            #print(html)
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html
    
    
    def init_db(dbpath):
        conn = sqlite3.connect(dbpath)  # open or create the database file
        c = conn.cursor()  # get a cursor
        sql = '''
            create table job
                (id integer primary key autoincrement,
                job_link text,
                job_name text,
                cname varchar,
                area varchar,
                ssalary numeric,
                educate numeric,
                info text)
        '''  # create the table
        c.execute(sql)  # run the SQL
        conn.commit()   # commit the transaction
        conn.close()    # close the connection
    # save the data to SQLite
    def saveData2(datalist,dbpath):
        init_db(dbpath)
        conn = sqlite3.connect(dbpath)
        cur = conn.cursor()
        for data in datalist:
            for index in range(len(data)):
                if index == 4 or index == 5:
                    continue
                data[index] = '"'+data[index]+'"'
            sql = '''
                insert into job(job_link, job_name, cname, area, ssalary, educate, info)
                values (%s)'''%",".join(data)
            cur.execute(sql)
            conn.commit()
        cur.close()
        conn.close()
    #init_db(dbpath)
    if __name__ == '__main__':
        main()
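
As written, saveData2() expects each record to be a flat list of pre-quoted strings, while jobList actually holds dictionaries that so far only carry a "link" key (the other fields are printed in getData() rather than stored), which is presumably why the save call in main() stays commented out. The sketch below shows one way the dictionaries could be written into the same job table with parameterized queries once the parsed fields are put back into each dict; the keys "title", "cname", "area", "salary", "educate" and "info" are assumed names for illustration, not something the original script defines.

    # Minimal sketch (assumed dict keys): write the jobList dictionaries to SQLite
    # with parameterized queries instead of manual string quoting.
    def saveJobDicts(jobs, dbpath):
        init_db(dbpath)                  # reuses the table created above; raises if it already exists
        conn = sqlite3.connect(dbpath)
        cur = conn.cursor()
        sql = '''insert into job(job_link, job_name, cname, area, ssalary, educate, info)
                 values (?, ?, ?, ?, ?, ?, ?)'''
        for job in jobs:
            cur.execute(sql, (
                job.get("link", ""),
                job.get("title", ""),    # assumed key, see the commented-out line in getData()
                job.get("cname", ""),    # assumed key
                job.get("area", ""),     # assumed key
                job.get("salary", ""),   # assumed key
                job.get("educate", ""),  # assumed key
                job.get("info", ""),     # assumed key
            ))
        conn.commit()
        cur.close()
        conn.close()

Calling saveJobDicts(jobList, dbpath) in place of the commented-out saveData2(datalist, dbpath) in main() would then persist whatever the crawl collected.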