• selenium + lxml 爬取拉勾网公司详情页


    #encoding: utf-8
    import webbrowser
    
    import requests
    from selenium import webdriver
    from selenium.webdriver.support.ui import Select,WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from lxml import etree
    import time
    import re
    import csv
    import requests
    
    class LagouSpider(object):
        """Crawl Lagou job listings with Selenium and store company details as CSV.

        Flow: listing page -> job detail page (opened in a new tab) -> company
        page (loaded in that same tab).  Every company becomes one row of
        ``lago.csv``.  Rows are buffered in ``company_lists`` and written in
        batches; ``close()`` writes whatever is still buffered.
        """

        # Absolute path to the chromedriver binary.
        driver_path ='/Users/mac126/chromedriver'
        # CSV column order; these are also the keys of every company record.
        fieldnames = ['company_name','img','scale','address','description']
        # Number of buffered records that triggers a write to disk.
        batch_size = 100

        def __init__(self):
            """Start Chrome, open ``lago.csv`` for appending and build the CSV writer."""
            # NOTE(review): ``executable_path`` is deprecated in Selenium 4 --
            # confirm against the installed Selenium version.
            self.driver = webdriver.Chrome(executable_path=self.driver_path)
            self.company_lists = []
            # Most recently parsed company record / company page URL.
            self.company_list = None
            self.third_url = None
            self.fp = open('lago.csv','a',encoding='utf-8',newline='')
            self.writer = csv.DictWriter(self.fp,self.fieldnames)
            # The file is opened in append mode: only emit the header when the
            # file is still empty, otherwise each run adds another header row.
            if self.fp.tell() == 0:
                self.writer.writeheader()

        def run(self):
            """Walk every listing page until the "next" button is disabled.

            Buffered rows are flushed and the file/browser are released even
            when the crawl stops because of an exception.
            """
            url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
            next_xpath = "//span[contains(@class,'pager_next')]"
            try:
                self.driver.get(url)
                while True:
                    # Wait for the pager so the listing is present in page_source.
                    WebDriverWait(driver=self.driver,timeout=10).until(
                        EC.presence_of_element_located((By.XPATH,next_xpath))
                    )
                    self.parse_list_page(self.driver.page_source)
                    # find_element(By.XPATH, ...) replaces find_element_by_xpath,
                    # which newer Selenium releases no longer provide.
                    next_btn = self.driver.find_element(By.XPATH,next_xpath)
                    if "pager_next_disabled" in next_btn.get_attribute('class'):
                        break
                    next_btn.click()
                    time.sleep(5)
            finally:
                self.close()

        def close(self):
            """Write the remaining buffered rows, close the CSV file and quit Chrome."""
            try:
                self._flush()
            finally:
                try:
                    self.fp.close()
                finally:
                    self.driver.quit()

        def _flush(self):
            """Write all buffered records to the CSV file and empty the buffer."""
            if self.company_lists:
                self.writer.writerows(self.company_lists)
                self.company_lists.clear()
                self.fp.flush()

        @staticmethod
        def _first(nodes, default=''):
            """Return the first XPath result, or ``default`` when nothing matched."""
            return nodes[0] if nodes else default

        def parse_list_page(self,resource):
            """Visit every job link found in a listing page.

            :param resource: HTML source of the listing page.
            :return: None
            """
            html = etree.HTML(resource)
            links = html.xpath("//a[@class='position_link']/@href")
            for link in links:
                self.parse_detail_page(link)
                time.sleep(1)

        def parse_detail_page(self,url):
            """Open a job detail page in a new tab and follow its company link.

            The tab is always closed and focus returned to the listing window,
            even when waiting or parsing fails.

            :param url: URL of the job detail page.
            :return: None
            """
            # Pass the URL as a script argument instead of splicing it into the
            # JavaScript source, so quotes in the URL cannot break the script.
            self.driver.execute_script("window.open(arguments[0])",url)
            self.driver.switch_to.window(self.driver.window_handles[-1])
            try:
                WebDriverWait(self.driver,timeout=10).until(
                    EC.presence_of_element_located((By.XPATH,"//dd[@class='job_bt']"))
                )
                html = etree.HTML(self.driver.page_source)
                # Link to the company page; skip the job when it is missing.
                company_links = html.xpath('//*[@id="job_company"]/dt/a/@href')
                if company_links:
                    self.third_url = company_links[0]
                    self.parse_three_page(self.third_url)
            finally:
                self.driver.close()
                self.driver.switch_to.window(self.driver.window_handles[0])

        def parse_three_page(self,url):
            """Load a company page in the current tab and queue its record.

            Fields whose XPath matches nothing are stored as empty strings, so
            a record is always built from the current page.

            :param url: URL of the company page.
            :return: None
            """
            self.driver.get(url)
            html = etree.HTML(self.driver.page_source)

            content_xpath = '//div[@class="company_intro_text"]/span[@class="company_content"]'
            # The description text is either wrapped in <p> tags or sits
            # directly inside the span; try both layouts.
            description = html.xpath(content_xpath + '/p/text()')
            if not description:
                description = html.xpath(content_xpath + '/text()')

            record = {
                'company_name': self._first(html.xpath(
                    '//div[@class="company_info"]/div[@class="company_main"]/h1/a/text()')),
                'img': self._first(html.xpath('//div[@class="top_info"]/div[1]/img/@src')),
                'scale': self._first(html.xpath(
                    '//*[@id="basic_container"]/div[2]/ul/li[3]/span/text()')),
                'address': self._first(html.xpath(
                    '//*[@id="basic_container"]/div[2]/ul/li[4]/span/text()')),
                'description': "".join(description),
            }
            self.company_list = record
            self.write_position(record)

        def write_position(self,company_list):
            """Buffer one company record and write a batch once enough are queued.

            :param company_list: dict keyed by ``fieldnames`` describing one company.
            :return: None
            """
            self.company_lists.append(company_list)
            print(company_list)
            if len(self.company_lists) >= self.batch_size:
                self._flush()
    
    
    
    
    def main():
        """Script entry point: create the spider and start the crawl."""
        LagouSpider().run()
    
    # Start the crawl only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()

    爬取结果如图:

  • 相关阅读:
    整数
    mysql-5.7.13-win32 安装
    Flex air修改外部xml文件 (转)
    JAVA 取得当前目录的路径/Servlet/class/文件路径/web路径/url地址
    C#中&和&&的区别
    百度UEditor1.4.3编辑器和asp.net MVC 5结合
    ASP.NET MVC ajax数组,模型绑定问题。
    MEF依赖注入无法在在构造函数中使用的解决办法
    AJaxFileUpload 文件上传<pre>,json字符串为空解决方法
    C#中字符串转换为IPAdress
  • 原文地址:https://www.cnblogs.com/liangliangzz/p/10391747.html
Copyright © 2020-2023  润新知