• 爬虫22-使用selenium爬取信息


    1.正常使用cookie爬取拉勾网ajax数据

    import requests
    from lxml import etree
    import time
    import re
    # Request headers for lagou.com: a logged-in session Cookie plus the
    # headers the site's anti-crawler checks inspect on AJAX requests
    # (Referer, Origin, X-Requested-With and the X-Anit-Forge pair).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "Cookie": "user_trace_token=20200226133453-084540c1-9531-4fa8-873f-0dda32aa3ca4; _ga=GA1.2.836052667.1582695295; LGUID=20200226133454-167deda5-1930-4e79-8834-719427ac01be; index_location_city=%E5%85%A8%E5%9B%BD; lagou_utm_source=A; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221707ffdf39c2c3-0001957fd8ade1-3a614f0b-2073600-1707ffdf39de5f%22%2C%22%24device_id%22%3A%221707ffdf39c2c3-0001957fd8ade1-3a614f0b-2073600-1707ffdf39de5f%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; gate_login_token=5976db005818f45ed7756b1348563965e46f1400511d886af3d4d57dd9d9166a; LG_LOGIN_USER_ID=5b895ff2a4e23c48dc4c9110a6a1361bbf709630b5b17ac6756340fef1babfbf; LG_HAS_LOGIN=1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1583857959,1583912708,1583912713; JSESSIONID=ABAAAECABGFABFF1412C84500FD39A23D7C1D5172179D66; WEBTJ-ID=20200315123348-170dc782d0e4cf-05e9fb23740e5e-3a614f0b-2073600-170dc782d0f63d; _gid=GA1.2.1720707822.1584246829; _putrc=387928C58CE0A7D1123F89F2B170EADC; login=true; unick=%E7%90%B3%E7%90%B3; TG-TRACK-CODE=index_search; X_MIDDLE_TOKEN=0a8830791829a77f99654a1bb3d568ae; LGSID=20200315140707-568ce08c-c655-44b2-9cd4-66632e1bb6f4; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5Fpython%2Fp-city%5F0%3F%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; _gat=1; SEARCH_ID=79abbbd66c2b4a59b7ca19ee8fb77e01; X_HTTP_TOKEN=9944cc335d13b0d30552524851b568c7665cd1a0ff; LGRID=20200315140911-acf5dfc4-1c8f-4943-a93f-983d364a96db",
        "Origin": "https://www.lagou.com",
        "X-Anit-Forge-Code": "0",
        # Fixed: the original key was "X -Anit-Forge-Token" (stray space),
        # which is not a valid HTTP header name and would never match the
        # header the server expects.
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest"
    }
    # Accumulates the parsed job dicts ({'name', 'salary', 'city'}) across pages.
    positions = []
    
    def request_list_page():
        """Fetch job listings from lagou's positionAjax JSON endpoint.

        Iterates the listing pages, extracts each position's id, builds its
        detail-page URL and delegates to parse_position_detail(), which
        appends the parsed result to the module-level `positions` list.

        NOTE: the trailing `break` statements limit the run to the first
        position of the first page — debugging leftovers from the original
        tutorial, kept to preserve behavior.
        """
        url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

        data = {
            # Fixed: the original sent "frist" — the lagou API's form field
            # is spelled "first" (whether this is the first request).
            "first": "false",
            "pn": "1",    # page number, overwritten in the loop below
            "kd": "python"  # search keyword
        }
        for x in range(1, 10):
            data['pn'] = x
            response = requests.post(url, data=data, headers=headers)
            # The endpoint returns JSON; .json() loads it into a dict.
            result = response.json()
            page_positions = result['content']['positionResult']['result']
            for position in page_positions:
                # positionId identifies the job's detail page.
                positionId = position['positionId']
                position_url = 'https://www.lagou.com/jobs/%s.html' % positionId
                parse_position_detail(position_url)
                break  # debug: only the first position per page
            time.sleep(2)  # throttle to avoid anti-crawler bans
            break  # debug: only the first page
    
    def parse_position_detail(url):
        """Fetch one job detail page and append {name, salary, city} to `positions`.

        :param url: absolute URL of a lagou job detail page.
        """
        response = requests.get(url, headers=headers)
        text = response.text
        html = etree.HTML(text)
        name = html.xpath("//div[@class='job-name']/@title")[0]
        job_span = html.xpath("//dd[@class='job_request']//span")
        salary = job_span[0].xpath('.//text()')[0].strip()
        city = job_span[1].xpath(".//text()")[0].strip()
        # Fixed: the original pattern r"[s/]" deleted every literal 's' from
        # the city name; \s is needed to strip whitespace along with '/'.
        city = re.sub(r"[\s/]", "", city)
        position = {
            'name': name,
            'salary': salary,
            'city': city
        }
        positions.append(position)
    
    def main():
        """Entry point: crawl the listing pages, then dump what was collected."""
        request_list_page()
        print(positions)


    if __name__ == '__main__':
        main()
    

      

    2.使用selenium爬取拉勾网ajax数据

    #encoding: utf-8
    
    from selenium import webdriver
    from lxml import etree
    import re
    import time
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    
    class LagouSpider(object):
        """Scrape python job postings from lagou.com with a real browser.

        Pages through the search results, opens each job's detail page in a
        new browser tab, and collects {'name', 'salary', 'city'} dicts in
        ``self.positions``.
        """

        def __init__(self):
            self.driver = webdriver.Firefox()
            self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
            self.positions = []  # parsed job dicts, filled by parse_detail_page

        def run(self):
            """Walk the listing pages until the 'next' button is disabled."""
            self.driver.get(self.url)
            while True:
                # Fixed: wait for the pager to be present *before* grabbing
                # page_source — the original captured the source first, so it
                # could parse a half-loaded page. The original wait XPath was
                # also malformed ("div[...]/span[last()]]": missing leading //
                # and an unbalanced bracket), so it never matched anything.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//div[@class='pager_container']/span[last()]")
                    )
                )
                source = self.driver.page_source
                self.parse_list_page(source)
                try:
                    next_btn = self.driver.find_element_by_xpath(
                        "//div[@class='pager_container']/span[last()]")
                    if "pager_next_disabled" in next_btn.get_attribute("class"):
                        break  # last page reached
                    else:
                        next_btn.click()
                except Exception:
                    # Fixed: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit. Dump the page for
                    # debugging instead of crashing (original best-effort
                    # behavior kept).
                    print(source)
                time.sleep(1)

        def parse_list_page(self, source):
            """Extract detail-page links from one listing page and visit each."""
            html = etree.HTML(source)
            links = html.xpath("//a[@class='position_link']/@href")
            for link in links:
                self.request_detail_page(link)
                time.sleep(1)  # throttle to avoid anti-crawler bans

        def request_detail_page(self, url):
            """Open *url* in a new tab, parse it, then return to the listing tab."""
            # self.driver.get(url) would lose the listing page; use a new tab.
            self.driver.execute_script("window.open('%s')" % url)
            self.driver.switch_to.window(self.driver.window_handles[1])
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']"))
            )
            source = self.driver.page_source
            self.parse_detail_page(source)
            self.driver.close()  # close the detail tab
            self.driver.switch_to.window(self.driver.window_handles[0])  # back to the listing

        def parse_detail_page(self, source):
            """Parse one job detail page and record {name, salary, city}."""
            html = etree.HTML(source)
            name = html.xpath("//div[@class='job-name']/@title")[0]
            job_span = html.xpath("//dd[@class='job_request']//span")
            salary = job_span[0].xpath('.//text()')[0].strip()
            city = job_span[1].xpath(".//text()")[0].strip()
            # Fixed: r"[s/]" deleted every literal 's' from the city name;
            # \s is needed to strip whitespace along with '/'.
            city = re.sub(r"[\s/]", "", city)
            position = {
                'name': name,
                'salary': salary,
                'city': city
            }
            self.positions.append(position)
            print(position)
            print('=' * 40)
    
    
    if __name__ == '__main__':
        # Run the spider when executed as a script.
        LagouSpider().run()
    

      

  • 相关阅读:
    jsoup使用选择器语法来查找元素
    获取MD5值
    MD5
    反射机制的实现代码
    struts
    spring
    Hibernate
    商品信息录入
    extjs
    EasyUI
  • 原文地址:https://www.cnblogs.com/wcyMiracle/p/12500036.html
Copyright © 2020-2023  润新知