• python 爬虫系列09-selenium+拉勾


    使用selenium爬取拉勾网职位

     1 from selenium import webdriver
     2 from lxml import etree
     3 import re
     4 import time
     5 from selenium.webdriver.support.ui import WebDriverWait
     6 from selenium.webdriver.support import expected_conditions as EC
     7 from selenium.webdriver.common.by import By
     class LagouSpider(object):
         """Selenium-driven crawler that collects job postings from lagou.com.

         ``run`` walks the paginated search-result list, opens every job link
         in a temporary browser tab, extracts the posting's fields and appends
         them as a dict to ``self.positions``.
         """

         # chromedriver binary used when no path is passed to ``__init__``.
         # NOTE(review): this looks like a Windows path whose backslashes were
         # lost somewhere along the way -- confirm the real location before use.
         driver_path = r"D:driverchromedriver.exe"

         # Last <span> of the pager; ``run`` treats it as the "next page" button.
         NEXT_BTN_XPATH = "//div[@class='pager_container']/span[last()]"
         # Element awaited before a job-detail page is read.
         JOB_NAME_XPATH = "//div[@class='job-name']/span[@class='name']"

         def __init__(self, driver_path=None):
             """Start Chrome and initialise the crawl state.

             Args:
                 driver_path: Path to the chromedriver executable. When omitted,
                     the class-level ``driver_path`` is used.
             """
             if driver_path is None:
                 driver_path = self.driver_path
             self.driver = webdriver.Chrome(executable_path=driver_path)
             # Search-result listing; the encoded keyword is "云计算" (cloud computing).
             self.url = 'https://www.lagou.com/jobs/list_%E4%BA%91%E8%AE%A1%E7%AE%97?labelWords=&fromSearch=true&suginput='
             self.positions = []

         @staticmethod
         def _clean_field(text):
             """Return ``text`` with all whitespace and '/' separators removed."""
             return re.sub(r"[\s/]", "", text)

         @staticmethod
         def _first_text(values, default=""):
             """Return the first item of ``values`` stripped, or ``default`` if empty."""
             return values[0].strip() if values else default

         def run(self):
             """Crawl every result page, then shut the browser down.

             Paging stops when the next-page button is disabled or cannot be
             clicked. The browser is quit even if the crawl raises.
             """
             try:
                 self.driver.get(self.url)
                 while True:
                     # Wait for the pager first, so the snapshot taken below
                     # is not read before the page has rendered.
                     WebDriverWait(driver=self.driver, timeout=10).until(
                         EC.presence_of_element_located((By.XPATH, self.NEXT_BTN_XPATH))
                     )
                     self.parse_list_page(self.driver.page_source)
                     try:
                         next_btn = self.driver.find_element(By.XPATH, self.NEXT_BTN_XPATH)
                         css_classes = next_btn.get_attribute("class") or ""
                         if "pager_next_disabled" in css_classes:
                             break
                         next_btn.click()
                     except Exception as exc:
                         # Retrying here would re-scrape the page that was just
                         # parsed, so report the problem and stop paging.
                         print("Could not advance to the next page: %r" % (exc,))
                         break
                     time.sleep(1)
             finally:
                 self.driver.quit()

         def parse_list_page(self, source):
             """Visit every job link found in a result-list page.

             Args:
                 source: HTML text of a search-result page.
             """
             html = etree.HTML(source)
             for link in html.xpath("//a[@class='position_link']/@href"):
                 self.request_detail_page(link)
                 # Pause between consecutive detail pages.
                 time.sleep(1)

         def request_detail_page(self, url):
             """Open ``url`` in a new tab, parse it, then return to the list tab.

             The temporary tab is closed and focus is restored even when
             waiting for, or parsing, the detail page fails.

             Args:
                 url: Address of a job-detail page.
             """
             print()
             print(url)
             print()
             list_window = self.driver.current_window_handle
             known_handles = set(self.driver.window_handles)
             # Hand the URL over as a script argument instead of splicing it
             # into the JavaScript source, so quotes in it cannot break the call.
             self.driver.execute_script("window.open(arguments[0])", url)
             new_handles = [
                 handle for handle in self.driver.window_handles
                 if handle not in known_handles
             ]
             if not new_handles:
                 print("No new tab was opened for %s" % url)
                 return
             self.driver.switch_to.window(new_handles[-1])
             try:
                 WebDriverWait(self.driver, timeout=10).until(
                     EC.presence_of_element_located((By.XPATH, self.JOB_NAME_XPATH))
                 )
                 self.parse_detail_page(self.driver.page_source)
             finally:
                 self.driver.close()
                 self.driver.switch_to.window(list_window)

         def parse_detail_page(self, source):
             """Extract one job posting and record it in ``self.positions``.

             Fields whose element is missing from the page are stored as empty
             strings rather than raising ``IndexError``.

             Args:
                 source: HTML text of a job-detail page.

             Returns:
                 The dict that was appended to ``self.positions``.
             """
             html = etree.HTML(source)
             job_request_spans = html.xpath("//dd[@class='job_request']//span")

             def span_text(index):
                 # Stripped first text node of the index-th span, or "" if absent.
                 if index >= len(job_request_spans):
                     return ""
                 return self._first_text(job_request_spans[index].xpath(".//text()"))

             # NOTE(review): 'f1' (digit one) may be a typo for the class 'fl';
             # verify against the live page markup.
             company_name = "".join(html.xpath("//h2[@class='f1']/text()")).strip()
             position = {
                 'name': self._first_text(html.xpath("//span[@class='name']/text()")),
                 'company_name': company_name,
                 'salary': span_text(0),
                 'city': self._clean_field(span_text(1)),
                 'work_years': self._clean_field(span_text(2)),
                 'education': self._clean_field(span_text(3)),
                 'desc': "".join(html.xpath("//dd[@class='job_bt']//text()")).strip(),
             }
             self.positions.append(position)
             print(position)
             return position
    if __name__ == '__main__':
        # Script entry point: build the spider and start crawling immediately.
        LagouSpider().run()



  • 相关阅读:
    各种推荐资料汇总。。。
    不错的blog,做计算广告学的,还有机器学习的
    大数据建模,eBay的一个牛人
    factor graph和sum product和TrueSkill相关的两个blog,相当不错
    词云制作工具。。。
    CSS:nth-child选择器用法练习
    CSS控制表格隔行变色:nth-child()选择器
    CSS3 calc()函数练习(制作响应式布局)
    CSS3 box-sizing属性练习(border-box设置padding和border不会改变width和height的值)
    CSS3 clip-path练习
  • 原文地址:https://www.cnblogs.com/kingle-study/p/9953842.html
Copyright © 2020-2023  润新知