• A simple crawler that scrapes 51job job listings


    #-*- coding:utf-8 -*-
    from urllib import request
    from urllib import parse
    from bs4 import BeautifulSoup
    import pymysql  # driver used by SQLAlchemy through the mysql+pymysql:// URL
    from sqlalchemy import create_engine, MetaData, Table
    def getYao(url):
        # Fetch a job-detail page and return the text of its requirements block
        urlFirst = request.Request(url)
        urlFirst.add_header("User-Agent",
                       "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
        urlFirst.add_header("Origin", "http://search.51job.com")
        postData = parse.urlencode([
            ("s", "01"),
            ("t", "0"),
        ])
        # 51job serves GBK-encoded pages, so encode the POST body and decode the response as GBK
        return_ = request.urlopen(urlFirst, data=postData.encode("gbk"))
        contentNei = return_.read().decode("gbk")
        neisp = BeautifulSoup(contentNei, "html.parser")
        job_msg = neisp.find("div", class_="job_msg")
        # Guard against detail pages that lack a job_msg block
        return job_msg.get_text() if job_msg is not None else ""
    # Reflect the existing `jobs` table from the laravel database
    # (pymysql requires the mysql+pymysql:// dialect in the URL)
    engine = create_engine("mysql+pymysql://root:root@localhost:3306/laravel?charset=utf8", echo=True)
    metadata = MetaData(engine)
    jobs_table = Table("jobs", metadata, autoload=True)
    # Walk the first 10 pages of search results
    for page in range(1, 11):
        url = "http://search.51job.com/list/000000,000000,0000,00,9,99,C,2," + str(page) + ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=102&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        rep = request.Request(url)
        rep.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
        rep.add_header("Origin", "http://search.51job.com")
        # Search filters: keyword "php" in job area 040000
        postData = parse.urlencode([
            ("fromJs", "1"),
            ("jobarea", "040000"),
            ("keyword", "php"),
            ("keywordtype", "2"),
            ("lang", "c"),
            ("stype", "2"),
            ("postchannel", "0000"),
            ("fromType", "1"),
            ("confirmdate", "9")
        ])
        return_ = request.urlopen(rep, data=postData.encode("gbk"))
        content = return_.read().decode("gbk")
        sp = BeautifulSoup(content, "html.parser")
        sql_moban = jobs_table.insert()
        # The first div.el row is the list header, so skip it
        for idx, row in enumerate(sp.find("div", class_="dw_table").find_all("div", class_="el")):
            if idx == 0:
                continue
            # Fetch the detail page once and reuse the text, instead of requesting it twice
            yaoqiu = getYao(row.find("a").get("href"))
            sql_moban.execute(zhiwei=row.find("a").get_text().strip(),
                              company=row.find("span", class_="t2").string,
                              address=row.find("span", class_="t3").string,
                              slary=row.find("span", class_="t4").string,
                              riqi=row.find("span", class_="t5").string,
                              yaoqiu=yaoqiu)
        print("Page download complete")
    
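Because the script reflects `jobs` with `autoload=True`, that table must already exist in the `laravel` database. Below is a minimal sketch of a definition that would satisfy the insert call above; the column names come from the script, but every type and length here is an assumption:

    # Hypothetical schema for the `jobs` table (types and lengths are guesses)
    from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Text

    engine = create_engine("mysql+pymysql://root:root@localhost:3306/laravel?charset=utf8")
    metadata = MetaData()
    jobs = Table(
        "jobs", metadata,
        Column("id", Integer, primary_key=True),
        Column("zhiwei", String(255)),   # job title
        Column("company", String(255)),  # company name
        Column("address", String(255)),  # work location
        Column("slary", String(64)),     # salary range (column name kept as spelled in the script)
        Column("riqi", String(32)),      # posting date
        Column("yaoqiu", Text),          # requirements text returned by getYao()
    )
    metadata.create_all(engine)          # creates the table only if it is missing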

    The scraped data gets written to the database; PHP salaries really aren't that high.
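A quick way to eyeball that claim after a run is to read the rows back. This is a minimal sketch that reuses the script's `jobs_table`; note that this implicit-execution style was removed in SQLAlchemy 1.4, so it only works on the old versions the script targets:

    # Print each scraped title and salary for a quick look at the PHP pay range
    for row in jobs_table.select().execute():
        print(row.zhiwei, row.slary)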

• Original article: https://www.cnblogs.com/summerkxy/p/7083682.html