• Multi-threaded scraping of job listings for a given occupation from 51job



    # datetime:2020/10/7 14:02
    # 51job multi-threaded scraper
    import requests
    from bs4 import BeautifulSoup
    import csv
    from openpyxl import Workbook
    import random
    import time
    import threading
    
    def getOnePageInfo(url):
        # Request the page
        res = requests.get(url,
                           headers={
                               'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
                           )
        # Parse the response into a BeautifulSoup object
        soup = BeautifulSoup(res.text, 'html.parser')

        # The job data is embedded in a <script> tag rather than the static
        # HTML, so we pull it out of the page we actually received
        allstring = soup.find_all('script')[-4].string

        # Option 1: split once on '='; the second part is all the data
        data = allstring.split('=', 1)[-1]
        # Option 2 (equivalent): slice from the first '{'
        # data = allstring[allstring.find('{'):]

        # Convert the string to a dict with eval()
        # (eval on remote content is risky; a json.loads variant is sketched below)
        dict_data = eval(data)
    
        bigdata = []
        for each in dict_data['engine_search_result']:
            oneInfo = []
            # Job title: job_name
            oneInfo.append(each.get('job_name'))
            # Company name: company_name
            oneInfo.append(each.get('company_name'))
            # Salary: providesalary_text
            oneInfo.append(each.get('providesalary_text'))
            # Work location: workarea_text
            oneInfo.append(each.get('workarea_text'))
            # Posting date: updatedate
            oneInfo.append(each.get('updatedate'))
            # Company type: companytype_text
            oneInfo.append(each.get('companytype_text'))
            # Extra attributes: attribute_text
            oneInfo.append(str(each.get('attribute_text')))
            # Industry: companyind_text
            oneInfo.append(each.get('companyind_text'))
            # Append this record to the result list
            bigdata.append(oneInfo)
        return bigdata
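

    # Sketch of a safer alternative to eval(): the payload 51job embeds
    # (window.__SEARCH_RESULT__ = {...}) is plain JSON, so json.loads should
    # parse it without executing anything. This assumes the <script> layout
    # found above; getOnePageInfoJson is a name introduced here for illustration.
    import json

    def getOnePageInfoJson(url):
        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(res.text, 'html.parser')
        allstring = soup.find_all('script')[-4].string
        # Everything after the first '=' should be one JSON object
        payload = allstring.split('=', 1)[-1].strip()
        dict_data = json.loads(payload)
        # Same key as in getOnePageInfo above
        return dict_data['engine_search_result']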
    
    
    # Helper class for saving 2-D lists
    class MySave():
        def saveToCsv(self, data, fileName: str, mode='w'):
            with open(fileName, mode=mode, encoding='utf-8', newline='') as f:
                csvfile = csv.writer(f)
                # Write the rows
                for each in data:
                    csvfile.writerow(each)
                print(fileName, 'saved')

        def saveToExcel(self, data, fileName):
            # Create a workbook
            wb = Workbook()
            # Use the active worksheet
            sheet = wb.active
            # Write the rows
            for each in data:
                sheet.append(each)
            wb.save(fileName)
            print(fileName, 'saved')
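

    # Illustrative usage (an addition, not in the original post): write a
    # header row once so the rows appended below have column names. The
    # header mirrors the fields collected in getOnePageInfo.
    header = [['job_name', 'company_name', 'providesalary_text',
               'workarea_text', 'updatedate', 'companytype_text',
               'attribute_text', 'companyind_text']]
    MySave().saveToCsv(header, '51job_data.csv', 'w')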
    
    
    # Scrape multiple pages
    jobName = input('Enter a search keyword: ')
    # Lock so threads do not interleave writes to the shared CSV file
    csvLock = threading.Lock()

    def getJobInfo(jobName, startNum, endNum):
        for i in range(startNum, endNum):
            time.sleep(random.randint(1, 3))
            # Build the URL for page i
            url = f'http://search.51job.com/list/000000,000000,0000,00,9,99,{jobName},2,{i}.html'
            print(f'Scraping page {i}')
            # Fetch and parse this page
            data = getOnePageInfo(url)
            save = MySave()
            # Append this page's rows to the CSV
            with csvLock:
                save.saveToCsv(data, '51job_data.csv', 'a')
    
    # Four threads, each covering a block of pages (range end is exclusive)
    t1 = threading.Thread(target=getJobInfo, args=(jobName, 1, 25))
    t2 = threading.Thread(target=getJobInfo, args=(jobName, 25, 50))
    t3 = threading.Thread(target=getJobInfo, args=(jobName, 50, 75))
    t4 = threading.Thread(target=getJobInfo, args=(jobName, 75, 100))
    # Start all four threads, then wait for them to finish
    for t in (t1, t2, t3, t4):
        t.start()
    for t in (t1, t2, t3, t4):
        t.join()
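
A thread pool can replace the manual range bookkeeping above. The sketch below is an alternative, not part of the original post; it assumes the same getJobInfo function and splits pages 1-99 into blocks of 25:

    from concurrent.futures import ThreadPoolExecutor

    # Submit one page block per worker; exiting the with-block joins all workers
    with ThreadPoolExecutor(max_workers=4) as pool:
        for start in range(1, 100, 25):
            pool.submit(getJobInfo, jobName, start, min(start + 25, 100))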
    
    

• Original article: https://www.cnblogs.com/James-221/p/13777794.html