• Selenium in Python: hands-on comparison with an ordinary crawler


    1. Ordinary data crawling

    """An ordinary data-scraping crawler: as soon as the site has anti-crawling measures, the cookie becomes unstable, which makes data collection unreliable (see the session-based sketch after this code block)."""
    import requests
    import re
    from lxml import etree
    
    
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
            "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
            # The Cookie has to be replaced regularly
            "Cookie": "_ga=GA1.2.1553999204.1538311958; user_trace_token=20180930205504-0cebb367-c4b0-11e8-bb68-5254005c3644; "
                      "LGUID=20180930205504-0cebbcd1-c4b0-11e8-bb68-5254005c3644; showExpriedIndex=1; showExpriedCompanyHome=1; "
                      "showExpriedMyPublish=1; index_location_city=%E6%B7%B1%E5%9C%B3; sensorsdata2015jssdkcross=%7B%22distinct_"
                      "id%22%3A%22166811f974d15e-026ab47692a8d1-181c7151-2073600-166811f974e549%22%2C%22%24device_id%22%3A%2216681"
                      "1f974d15e-026ab47692a8d1-181c7151-2073600-166811f974e549%22%7D; LG_LOGIN_USER_ID=1d0d39f3227c1f914a3f9c4d95f"
                      "4816a5c6667141cc1313edac4603b4bd6d789; hasDeliver=6; _gid=GA1.2.2026255269.1540465512; WEBTJ-ID=2018102519420"
                      "2-166ab0808f9cb-06806b898438ec-181c7151-2073600-166ab0808fb7ef; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=15401"
                      "21809,1540210602,1540465512,1540467723; LGSID=20181025194505-6ab63d2a-d84b-11e8-8168-5254005c3644; PRE_UTM=m_cf_"
                      "cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D0%2"
                      "6rsv_idx%3D1%26tn%3Dbaidu%26wd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_pq%3D8d0dc05a0000aada%26rsv_t%"
                      "3D4664T41fswButqvfw6ZM6FGWfkWjtwR%252Fmpsskb6hctTVnUHewMo9o1%252BqRGk%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D7%26r"
                      "sv_sug1%3D8%26rsv_sug7%3D100; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc;"
                      " _putrc=1D33894D7A6BEB76123F89F2B170EADC; JSESSIONID=ABAAABAAAGFABEF9CEC8B1F38F5075A286961D31667AC5C; login=true; unick=%E6%9D%A"
                      "8%E7%A6%B9; gate_login_token=b0629019d50bbe97eb829d61be9770ad4b570c1e68e239c68ae16cc71c68c808; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a"
                      "3d81c6ccf756e6=1540469398; LGRID=20181025201301-5183464a-d84f-11e8-a347-525400f775ce; TG-TRACK-CODE=index_search; SEARCH_ID=06714"
                      "3e245964eb7af08d8c8d316cd44"
        }
    
    
    def request_list_page():
        # Request the position-list API and collect detail-page URLs
        url = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false"
        data = {
            "first": "false",
            "pn": 1,
            "kd": "Python"
        }
        for x in range(1, 31):
            data["pn"] = x
            rep = requests.post(url=url, headers=headers, data=data)
            # .json() parses a JSON response straight into a dict
            result = rep.json()
            # Work out the data layout with an online JSON viewer first
            positions = result["content"]["positionResult"]["result"]
            for position in positions:
                positionId = position["positionId"]
                # Build the detail-page URL and hand it to the parser
                position_url = "https://www.lagou.com/jobs/{}.html".format(positionId)
                parse_position_detail(position_url)
                break  # demo: stop after the first position
            break  # demo: stop after the first page
    
    
    def parse_position_detail(url):
        # Parse the detail page
        res = requests.get(url, headers=headers)
        text = res.text
        html = etree.HTML(text)
        position_depart = html.xpath("//div[@class='company']/text()")
        position_names = html.xpath("//span[@class='name']/text()")
        job_requests = html.xpath("//dd[@class='job_request']//span/text()")
        salary_span = re.sub(r"[\s/]", "", job_requests[0].strip())
        addr_span = re.sub(r"[\s/]", "", job_requests[1].strip())
        exper_span = re.sub(r"[\s/]", "", job_requests[2].strip())
        education_span = re.sub(r"[\s/]", "", job_requests[3].strip())
        full_span = re.sub(r"[\s/]", "", job_requests[4].strip())
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        print(position_depart, position_names, salary_span, addr_span, exper_span, education_span, full_span, desc)
    
    
    if __name__ == '__main__':
        request_list_page()
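    The hard-coded Cookie above expires quickly once the site's anti-crawling kicks in. A common workaround is to let requests manage cookies itself: GET the list page first so the server sets fresh session cookies, then POST the Ajax endpoint from the same session. A minimal sketch, assuming the list page still sets the cookies the Ajax endpoint expects (untested against the current site):

    import requests

    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    }
    # Visiting the list page makes the server set session cookies on this Session
    session.get("https://www.lagou.com/jobs/list_python", headers=headers)
    url = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false"
    data = {"first": "true", "pn": 1, "kd": "Python"}
    # The POST now carries whatever cookies the GET produced, no manual Cookie header needed
    result = session.post(url, headers=headers, data=data).json()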

    2. Fetching data with Selenium

    """Selenium automates the crawl by driving a real browser and mimicking human actions."""
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from lxml import etree
    import random
    import csv
    import time
    import re
    
    
    class LagouSpider(object):
        # Path to chromedriver.exe
        driver_path = r"E:\Program Files\chromedriver.exe"
    
        def __init__(self):
            # Launch Chrome from the driver path
            self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
            self.url = "https://www.lagou.com/jobs/list_python?city=全国&cl=false&fromSearch=true&labelWords=&suginput="
    
        def run(self):
            # Load the list page only once, outside the while loop
            self.driver.get(self.url)
            while True:
                # The list loads asynchronously, so wait before grabbing the source
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
                )
                # Grab the full page source only after the wait succeeds
                source = self.driver.page_source
                # Hand the complete source to the parser
                self.parse_list_page(source)
                try:
                    # Find the next-page button
                    next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
                    # Last page reached: stop the loop
                    if "pager_next_disabled" in next_btn.get_attribute("class"):
                        break
                    else:
                        # Otherwise click through to the next page
                        next_btn.click()
                except Exception:
                    # If the button can't be located, dump the source for debugging
                    print(source)
                time.sleep(1)
    
        def parse_list_page(self, source):
            # Extract the detail-page URLs
            html = etree.HTML(source)
            links = html.xpath("//a[@class='position_link']/@href")
            for link in links:
                # Hand each URL to the detail fetcher
                self.request_detail_page(link)
                time.sleep(random.randint(0, 2))
    
        def request_detail_page(self, url):
            # Open the detail page in a new window
            self.driver.execute_script("window.open('%s')" % url)
            # Switch the driver to the new window
            self.driver.switch_to.window(self.driver.window_handles[1])
            # The page loads asynchronously, so wait for the job title to appear
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
            )
            source = self.driver.page_source
            # Hand the source to the parser
            self.parse_detail_page(source)
            # Close the detail window
            self.driver.close()
            # Switch back to the position-list window
            self.driver.switch_to.window(self.driver.window_handles[0])
    
        def parse_detail_page(self, source):
            # Parse the data (same approach as explained above)
            html = etree.HTML(source)
            position_name = html.xpath("//span[@class='name']/text()")[0]
            job_request_spans = html.xpath("//dd[@class='job_request']//span")
            salary = job_request_spans[0].xpath(".//text()")[0].strip()
            city = job_request_spans[1].xpath(".//text()")[0].strip()
            city = re.sub(r'[\s/]', "", city)
            work_years = job_request_spans[2].xpath(".//text()")[0].strip()
            work_years = re.sub(r'[\s/]', "", work_years)
            education = job_request_spans[3].xpath(".//text()")[0].strip()
            education = re.sub(r'[\s/]', "", education)
            desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
            company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
            position = (position_name, company_name, salary, city, work_years, education, desc)
            with open("lagou.csv", "a+", encoding="utf-8", newline="") as fp:
                writer = csv.writer(fp)
                writer.writerow(position)
    
    
    if __name__ == '__main__':
        spider = LagouSpider()
        spider.run()
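    If you don't need to watch the browser work, Chrome can also run headless. A minimal sketch using the standard ChromeOptions API (the driver path is the same placeholder as above; selenium 3-style arguments assumed):

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")     # run Chrome without opening a window
    options.add_argument("--disable-gpu")  # commonly recommended for headless on Windows
    driver = webdriver.Chrome(executable_path=r"E:\Program Files\chromedriver.exe",
                              chrome_options=options)
    driver.get("https://www.lagou.com")
    print(driver.title)
    driver.quit()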

    3. Crawling Boss Zhipin with Selenium

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from lxml import etree
    import random
    import time
    import csv
    import re
    
    
    class BossSpider(object):
        # Path to chromedriver.exe
        driver_path = r"E:\Program Files\chromedriver.exe"
    
        def __init__(self):
            # Launch Chrome from the driver path
            self.driver = webdriver.Chrome(executable_path=BossSpider.driver_path)
            self.url = "https://www.zhipin.com/job_detail/?query=python&scity=100010000&industry=&position="
    
        def run(self):
            # Load the list page
            self.driver.get(self.url)
            while True:
                # The list loads asynchronously, so wait before grabbing the source
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, "//div[@class='page']/a[last()]"))
                )
                # Grab the full page source only after the wait succeeds
                source = self.driver.page_source
                # Hand the complete source to the parser
                self.parse_list_page(source)
                try:
                    # Find the next-page button
                    next_btn = self.driver.find_element_by_xpath("//div[@class='page']/a[last()]")
                    # Last page reached: stop the loop
                    if "next disabled" in next_btn.get_attribute("class"):
                        break
                    else:
                        # Otherwise click through to the next page
                        next_btn.click()
                except Exception:
                    # If the button can't be located, dump the source for debugging
                    print(source)
                time.sleep(random.randint(1, 5))
    
        def parse_list_page(self, source):
            html = etree.HTML(source)
            # Extract the detail-page URLs
            links = html.xpath("//div[@class='info-primary']//a/@href")
            for link in links:
                url = "https://www.zhipin.com" + link
                # Hand each URL to the detail fetcher
                self.request_detail_page(url)
                time.sleep(random.randint(1, 5))
    
        def request_detail_page(self, url):
            # Open the detail page in a new window
            self.driver.execute_script("window.open('%s')" % url)
            # Switch the driver to the new window
            self.driver.switch_to.window(self.driver.window_handles[1])
            # The page loads asynchronously, so wait for the header block to appear
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='info-primary']//div[@class='name']"))
            )
            source = self.driver.page_source
            # Hand the complete source to the parser
            self.parse_detail_page(source)
            # Close the detail window
            self.driver.close()
            # Switch back to the list window
            self.driver.switch_to.window(self.driver.window_handles[0])
    
        def parse_detail_page(self, source):
            # Parse and extract the data
            html = etree.HTML(source)
            position_name = html.xpath("//div[@class='name']/h1/text()")[0].strip()
            company_name = html.xpath("//div[@class='info-company']//h3[@class='name']/a/text()")[0].strip()
            salary = html.xpath("//div[@class='name']/span[@class='badge']/text()")[0].strip()
            job_request_ps = html.xpath("//div[@class='info-primary']//p/text()")
            city = job_request_ps[0].strip()
            work_years = job_request_ps[1].strip()
            education = job_request_ps[2].strip()
            desc_tags = html.xpath("//div[@class='job-sec']/div[@class='text']")
            contents = ""
            for desc in desc_tags:
                tag_list = desc.xpath("./text()")
                for tag in tag_list:
                    # Strip whitespace and non-breaking spaces
                    text = re.sub(r"[\s\xa0]", "", tag)
                    contents += text
            position = (position_name, company_name, salary, city, work_years, education, contents)
            # Append as UTF-8; newline="" prevents blank lines between rows
            with open("boss.csv", "a+", encoding="utf-8", newline="") as fp:
                writer = csv.writer(fp)
                writer.writerow(position)
    
    
    if __name__ == '__main__':
        spider = BossSpider()
        spider.run()
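    The rows written above can be checked by reading the CSV back with the same csv module; a quick sanity check:

    import csv

    with open("boss.csv", encoding="utf-8", newline="") as fp:
        for row in csv.reader(fp):
            position_name, company_name, salary, city, work_years, education, desc = row
            print(position_name, company_name, salary, city)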

    4. A first-pass implementation of grabbing 12306 train tickets; the repo below contains the steps and an explanation

    https://github.com/yangyu57587720/12306GrabVotes
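
    The repo above documents the full steps. The core Selenium trick it builds on is the same one used in the spiders here: open the login page, let a human finish the login and captcha, then block until the URL changes before automating anything. A minimal sketch; the URLs and the long timeout are illustrative assumptions, not taken from the repo:

    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver = webdriver.Chrome(executable_path=r"E:\Program Files\chromedriver.exe")
    driver.get("https://kyfw.12306.cn/otn/login/init")  # assumed login URL
    # Wait up to 10 minutes for the human to log in and 12306 to redirect
    WebDriverWait(driver, timeout=600).until(
        EC.url_contains("initMy12306")  # assumed post-login URL fragment
    )
    print("Logged in, automated booking can start")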
  • Original article: https://www.cnblogs.com/Guishuzhe/p/9855906.html