• 免费简历的爬取


    # 免费的简历模板进行爬取本地保存  
    # http://sc.chinaz.com/jianli/free.html
    # http://sc.chinaz.com/jianli/free_2.html
    
    import requests
    from lxml import etree
    import os
    
    # Download the free resume templates listed on sc.chinaz.com and store each
    # archive under dirName as "<title>.rar".
    dirName = './resumeLibs'
    # makedirs(exist_ok=True) avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dirName, exist_ok=True)

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Listing pages 2..N follow this pattern; page 1 has its own URL without a suffix.
    url = 'http://sc.chinaz.com/jianli/free_%d.html'
    first_page_url = 'http://sc.chinaz.com/jianli/free.html'
    # Seconds to wait on each request; without a timeout a stalled server
    # would block the script forever.
    request_timeout = 10

    for page in range(1,2):
        new_url = first_page_url if page == 1 else url % page
        try:
            list_resp = requests.get(url=new_url, headers=headers, timeout=request_timeout)
            list_resp.raise_for_status()
        except requests.RequestException as err:
            print(new_url, 'request failed:', err)
            continue
        # Decode the listing as UTF-8 explicitly instead of letting requests
        # guess the charset and then repairing each title with an
        # encode('iso-8859-1').decode('utf-8') round trip.
        list_resp.encoding = 'utf-8'
        list_tree = etree.HTML(list_resp.text)
        a_list = list_tree.xpath('//div[@id="container"]/div/p/a')

        for a in a_list:
            hrefs = a.xpath('./@href')
            titles = a.xpath('./text()')
            if not hrefs or not titles:
                # Skip anchors that lack a link or a text title rather than
                # aborting the whole run with an IndexError.
                continue
            a_src = hrefs[0]
            a_title = titles[0]

            # Fetch the detail page that carries the download links.
            try:
                detail_resp = requests.get(url=a_src, headers=headers, timeout=request_timeout)
                detail_resp.raise_for_status()
            except requests.RequestException as err:
                print(a_title, 'detail page request failed:', err)
                continue
            detail_tree = etree.HTML(detail_resp.text)
            # NOTE(review): li[8] is a fixed position in the download list;
            # presumably one of several mirror links - confirm against the live page.
            dl_links = detail_tree.xpath('//div[@id="down"]/div[2]/ul/li[8]/a/@href')
            if not dl_links:
                print(a_title, 'download link not found, skipped')
                continue
            dl_src = dl_links[0]

            try:
                dl_resp = requests.get(url=dl_src, headers=headers, timeout=request_timeout)
                # Check the status so an HTTP error page is never saved as a .rar file.
                dl_resp.raise_for_status()
            except requests.RequestException as err:
                print(a_title, 'download failed:', err)
                continue
            resume_data = dl_resp.content

            # The title comes from a remote page; neutralise path separators so
            # it cannot point outside dirName.
            resume_name = a_title.replace('/', '_').replace('\\', '_')
            resume_path = dirName + '/' + resume_name + '.rar'
            with open(resume_path,'wb') as fp:
                fp.write(resume_data)
            print(resume_name,'下载成功!')
    
  • 相关阅读:
    学习使用linux下tags文件
    uboot常用命令详解
    U-boot中TFTP 解释
    eth0: ERROR while getting interface flags: No such device 没有eth0 有其他的eth
    取消挂载 umount 时出现的 “Device is busy”
    Linux的log日志功能
    oracle的安装
    SOA架构设计分析
    立方体模型
    质量属性的六个常见属性场景(淘宝网)
  • 原文地址:https://www.cnblogs.com/straightup/p/13664724.html
Copyright © 2020-2023  润新知