• Scraping Lagou job listings with Selenium


    1. Selenium

      Selenium is, at its core, a tool for testing web applications: a Selenium test runs directly in the browser, exactly as a real user would operate it. That property also makes it an attractive option for crawler development. Because it depends on a real browser, the prerequisite for using Python's selenium library is downloading the matching browser driver; the ChromeDriver download link is attached here: chromedriver (the 64-bit build is backward compatible).
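
      Before writing the spider it is worth a quick smoke test that the driver and the browser can talk to each other. A minimal sketch (the driver path below is a placeholder; point it at wherever you saved chromedriver.exe):

    from selenium import webdriver

    # hypothetical path; adjust to your own machine
    driver = webdriver.Chrome(executable_path=r'D:\chromedriver.exe')
    driver.get('https://www.lagou.com/')
    print(driver.title)  # a window opening and a title printing means the driver works
    driver.quit()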

    2. Code:

      Notes:

        1. Multi-threaded scraping (producer/consumer model); a minimal sketch of the pattern follows this list.

        2. The result set is written out as a CSV file.

        3. I'm a third-year undergrad; if you spot noisy or rough spots in the code, please point them out so we can learn from each other.
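
      The producer/consumer hand-off in note 1 boils down to a queue.Queue shared between two threads. A self-contained miniature of the same pattern, with a toy producer standing in for the real spider:

    import queue
    import threading

    q = queue.Queue()

    def producer():
        for i in range(3):
            q.put({'name': 'job-%d' % i})  # stand-in for a scraped record
        q.put(None)  # sentinel: tell the consumer to stop

    def consumer():
        while True:
            item = q.get()  # blocks until an item is available
            if item is None:
                break
            print(item)

    t1 = threading.Thread(target=producer)
    t2 = threading.Thread(target=consumer)
    t1.start(); t2.start()
    t1.join(); t2.join()

      The full spider below uses the same idea, except the consumer appends rows to a CSV file and the producer is the Selenium spider thread.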

    #coding:utf-8
    
    from selenium import webdriver
    from lxml import etree
    import csv
    import time
    import re
    import threading
    import queue
    from selenium.webdriver.common.by import By
    import random
    
    
    # shared queue: the spider thread produces job records, the writer thread consumes them
    POSITIONS_INFO = queue.Queue()
    # event used by the spider to wake the writer when new data arrives
    event = threading.Event()
    
    
    def write_to_csv(file):
        """Consumer: take job records off POSITIONS_INFO and append them to a CSV file."""
        with open(file, 'a', encoding='utf-8', newline="") as f:
            writer = csv.DictWriter(f, ['name', 'salary', 'addr', 'experience', 'degree', 'type', 'advantage', 'detail'])
            writer.writeheader()
            while True:
                if POSITIONS_INFO.empty():
                    # wait for the spider to signal that new data has arrived
                    event.wait()
                    event.clear()
                # Queue.get() itself blocks until an item is available
                writer.writerow(POSITIONS_INFO.get())
    
    
    
    class LagouSpider(threading.Thread):
        # path to the ChromeDriver binary; adjust to your local install
        driver_path = r'D:\Python\learning\chromedriver.exe'
        options = webdriver.ChromeOptions()
        # route traffic through a proxy to reduce the chance of being blocked
        options.add_argument('--proxy-server=http://117.63.87.177:9999')

        def __init__(self, kw, *args, **kwargs):
            super(LagouSpider, self).__init__(*args, **kwargs)
            self.driver = webdriver.Chrome(executable_path=self.driver_path, options=self.options)
            self.url = r'https://www.lagou.com/jobs/list_%s?labelWords=&fromSearch=true&suginput=' % kw
    
        def run(self):
            self.driver.get(self.url)
            cur_page_source = self.driver.page_source
            # total page count: the last hidefocus span is the "next page" button,
            # so the one before it holds the highest page number
            max_page_num = int(self.driver.find_element(By.XPATH, "//span[contains(@hidefocus, 'hidefocus')][last()-1]").text)
            for page in range(1, max_page_num + 1):
                self.parse_cur_page(cur_page_source)
                if page == max_page_num:
                    break
                # scrape the current page, then click "next" to fetch the following one
                next_page_tag = self.driver.find_element(By.CSS_SELECTOR, "div.pager_container .pager_next")
                next_page_tag.click()

                # sleep a random number of seconds to avoid bot detection
                rand_seconds = random.choice([2, 3]) + random.random()
                time.sleep(rand_seconds)
                cur_page_source = self.driver.page_source
    
    
        def parse_cur_page(self, cur_page_source):
            """Parse a results page and extract the URL of every job-detail page to scrape."""
            html = etree.HTML(cur_page_source)
            info_links = html.xpath("//a[contains(@class, 'position_link')]")
            for link in info_links:
                link_addr = link.get('href', None)
                self.request_info_page(link_addr)
    
    
        def request_info_page(self, page_url: str):
            """Open a job-detail page in a new tab, parse it, then return to the results tab."""
            js_code = "window.open('%s')" % page_url
            self.driver.execute_script(js_code)
            # switch to the newly opened tab and parse it
            cur_handles = self.driver.window_handles
            self.driver.switch_to.window(cur_handles[-1])
            self.parse_position_page(self.driver.page_source)
            # done parsing: close this tab and switch back to the original window
            self.driver.close()
            self.driver.switch_to.window(cur_handles[0])
    
        def parse_position_page(self, html):
            """Parse a job-detail page into a dict and push it onto the queue."""
            list_xpath = {
                "job_name_xpath" : "//div[@class='job-name']/span[@class='name']/text()",
                "job_salary_xpath" : "//dd[contains(@class, 'job_request')]//span[@class='salary']/text()",
                "job_addr_xpath" : "//dd[contains(@class, 'job_request')]//span[2]/text()",
                "job_experience_xpath" :"//dd[contains(@class, 'job_request')]//span[3]/text()",
                "job_degree_xpath" : "//dd[contains(@class, 'job_request')]//span[4]/text()",
                "job_type_xpath" : "//dd[contains(@class, 'job_request')]//span[5]/text()",
                "job_advantage_xpath": "//dd[contains(@class, 'job-advantage')]/p/text()",
                "job_detail_xpath": "//div[@class='job-detail']/*/text()",
            }
    
            key_name = ['name', 'salary', 'addr', 'experience', 'degree', 'type', 'advantage', 'detail']
    
            cur_position_info = dict()
            cur_html = etree.HTML(html)
    
            for index, xpath in enumerate(list_xpath.values()):
                cur_val = cur_html.xpath(xpath)
                if len(cur_val) == 1:
                    # strip whitespace, slashes and leftover tag characters
                    temp_value = re.sub(r'[\s/<>.*?\\]', "", cur_val[0].strip())
                else:
                    temp_value = re.sub(r'[\s/<>.*?\\\[\]]', "", "".join(cur_val))
                    # strip \xa0 (&nbsp;)
                    temp_value = "".join(temp_value.split())
                temp_key = key_name[index]
                cur_position_info[temp_key] = temp_value
    
            print(cur_position_info)
            # {'name': '高级java开发工程师', 'salary': '20k-35k', 'addr': '上海', 'experience': '经验5-10年', 'degree': '大专及以上', 'type': '全职', 'advantage': '电商微服务', 'detail': '任职资格:1、计算机或相关专业本科以上学历,具有4年以上JAVA开发经验;2、熟练掌握数据结构、多线程编程,掌握常用的设计模式;熟悉JVM,包括内存模型、类加载机制以及性能优化;3、熟练掌握Java语言,熟悉Spring、SpringMVC、mybatis、springboot等框架;4、掌握数据库设计,熟练使用数据库如Mysql、Mongodb、Redis等;5、熟悉linux常用的shell命令;6、熟悉rpc开发,有过分布式开发经验者优先;7、具有很强的分析复杂问题和解决复杂问题的能力,有强烈的责任心和使命感,良好的沟通表达能力和团队协作能力。8、熟悉常见的中间件、分布式解决方案及其原理:分布式缓存、SOA、消息中间件,负载均衡、连接池等;9、具有一定的项目规划和决策能力,善于捕捉业务需求、架构设计中存在的问题,并给出有效的解决措施和方法;'}
            POSITIONS_INFO.put(cur_position_info)
            # wake the writer thread
            event.set()
    
    def main():
        spider = LagouSpider('java')
        spider.start()
        # consumer thread: appends scraped positions to lagou.csv as they arrive
        writer = threading.Thread(target=write_to_csv, args=('lagou.csv',))
        writer.start()
    
    if __name__ == '__main__':
        main()
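
      Once the spider has run for a while, the rows can be read back with csv.DictReader using the same column names. A small usage sketch (assuming lagou.csv sits in the working directory):

    import csv

    with open('lagou.csv', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            print(row['name'], row['salary'], row['addr'])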
  • Original post: https://www.cnblogs.com/kisun168/p/10860631.html