• 04 Scraping Lagou for Python Job Postings: An Analysis Report


    # Import the required packages
    import requests
    import time, random
    from openpyxl import Workbook
    import pymysql.cursors

    # Fetch a proxy address from the proxy-pool service running on my local machine.
    def get_proxy():
        """Return a proxy (ip:port) from the local proxy pool, or None on failure"""
        try:
            PROXY_POOL_URL = 'http://localhost:5555/random'
            response = requests.get(PROXY_POOL_URL)
            print(response.text)
            if response.status_code == 200:
                return response.text
        except requests.exceptions.ConnectionError:
            return None
    # Connect to the local MySQL server; skip this and write to Excel only if you prefer.
    def get_conn():
        """Connect to the local database"""
        # Host, user name, password, target database, charset, and cursor class
        conn = pymysql.connect(host='localhost',
                               user='root',
                               password='123456',
                               db='python',
                               charset='utf8mb4',
                               cursorclass=pymysql.cursors.DictCursor)
        return conn
    # Write one record into the database
    def insert(conn, info):
        """Insert a record into the database"""
        with conn.cursor() as cursor:
            sql = "INSERT INTO `python` (`companyShortName`, `companyFullName`, `industryField`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
            cursor.execute(sql, info)
        conn.commit()
    # Fetch the job information returned for the current page
    def get_json(url, page, lang_name):
        """Return the list of positions on the current page"""
        data = {'first': 'false', 'pn': page, 'kd': lang_name}
        proxy = get_proxy()
        proxies = {
            "http": "http://" + proxy
        }
        json = ses.post(url, data, proxies=proxies).json()
        list_con = json['content']['positionResult']['result']
        info_list = []
        for i in list_con:
            info = []
            info.append(i.get('companyShortName', '无'))  # company short name
            info.append(i.get('companyFullName', '无'))   # company full name
            info.append(i.get('industryField', '无'))     # industry
            info.append(i.get('companySize', '无'))       # company size
            info.append(i.get('salary', '无'))            # salary range
            info.append(i.get('city', '无'))              # city
            info.append(i.get('education', '无'))         # education requirement
            info_list.append(info)
        return info_list
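    # For reference, get_json() expects the Ajax response to be shaped roughly like
    # {"content": {"positionResult": {"result": [{"companyShortName": ..., "salary": ..., ...}]}}};
    # only the keys read above are shown here, the real payload carries many more fields.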

    def main():
        lang_name = 'python'
        wb = Workbook()    # open an Excel workbook
        conn = get_conn()  # open the database connection; comment this out if you do not store to MySQL
        for i in ['北京', '上海', '广州', '深圳', '杭州']:  # five cities
            page = 1
            wsl = wb.active
            wsl.title = lang_name
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i)
            while page < 2:  # only the first page per city here; raise the limit (e.g. page < 31) for more pages
                info = get_json(url, page, lang_name)
                page += 1
                # time.sleep(random.randint(10, 20))
                for row in info:
                    # insert into the database; comment this out if you do not store to MySQL
                    insert(conn, tuple(row))
                    wsl.append(row)
        # close the database connection; comment this out if you do not store to MySQL
        conn.close()
        wb.save('{}职位信息.xlsx'.format(lang_name))

    if __name__ == "__main__":
        my_headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
            "Referer": "https://www.lagou.com/jobs/list_Python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"
        }
        # time.sleep(5)
        ses = requests.session()        # create the session used by get_json()
        ses.headers.update(my_headers)  # attach the headers to every request on this session
        # visit the search page first so the session picks up the cookies required by the Ajax endpoint
        ses.get(
            "https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=")
        main()
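
For `insert()` to work, a table named `python` must already exist in the local `python` database. Below is a minimal sketch for creating it; the column names follow the INSERT statement above, while the column types and lengths are my own assumptions.

    # One-off helper to create the table assumed by insert().
    # Column names follow the INSERT statement; VARCHAR lengths are assumptions.
    import pymysql

    def create_table():
        conn = pymysql.connect(host='localhost', user='root', password='123456',
                               db='python', charset='utf8mb4')
        ddl = """
            CREATE TABLE IF NOT EXISTS `python` (
                `id` INT AUTO_INCREMENT PRIMARY KEY,
                `companyShortName` VARCHAR(100),
                `companyFullName` VARCHAR(200),
                `industryField` VARCHAR(100),
                `companySize` VARCHAR(50),
                `salary` VARCHAR(50),
                `city` VARCHAR(50),
                `education` VARCHAR(50)
            ) DEFAULT CHARSET=utf8mb4
        """
        with conn.cursor() as cursor:
            cursor.execute(ddl)
        conn.commit()
        conn.close()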












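After the workbook has been saved, the data can be read back with openpyxl for a quick look at the results. The sketch below tallies positions per city; it assumes the row layout produced by `get_json()`, where `city` is the sixth column.

    # Quick tally of scraped positions per city, read back from the saved workbook.
    from collections import Counter
    from openpyxl import load_workbook

    wb = load_workbook('python职位信息.xlsx')
    ws = wb.active
    # column index 5 is `city` in the row layout produced by get_json()
    city_counter = Counter(row[5] for row in ws.iter_rows(values_only=True))
    for city, count in city_counter.most_common():
        print(city, count)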
• Original article: https://www.cnblogs.com/cong12586/p/13376765.html