• 携程——多线程。


    # _*_coding: utf-8 _*_
    from fake_useragent import UserAgent
    import requests
    from requests.exceptions import RequestException
    import time
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import csv
    from lxml import etree
    import threading
    from queue import Queue
    
    
    
    ua = UserAgent()  # shared fake-useragent generator used to randomize request headers
    
    class XieCheng():
        """Multi-threaded crawler for Ctrip Beijing hotel listings.

        Pipeline, each stage connected by a Queue and run by daemon workers:
            list_url -> url_queue -> get_url -> html_queue -> extract_data
            -> content_queue -> driver_get -> details_html_queue
            -> driver_data -> details_content_queue -> save_data (CSV)
        """

        def __init__(self):
            # Listing pages, paginated via .format(page_number).
            self.start_url = "http://hotels.ctrip.com/hotel/beijing1/p{}"
            self.details_url = "http://hotels.ctrip.com"     # base URL for detail pages
            self.headers = {"User-Agent": ua.random}
            self.data_list = []
            self.details_list = []
            self.url_queue = Queue()              # listing-page URLs
            self.html_queue = Queue()             # listing-page HTML
            self.content_queue = Queue()          # detail-page URLs
            # self.details_url_queue= Queue()
            self.details_html_queue = Queue()     # detail-page HTML (rendered)
            self.details_content_queue = Queue()  # parsed records ready to save
            # Several save_data workers append to one CSV file; serialize them.
            self._csv_lock = threading.Lock()
            self._header_written = False

        # Build the listing-page URL list
        def list_url(self):
            """Fill url_queue with every listing-page URL (pages 1..566)."""
            for i in range(1, 567):
                self.url_queue.put(self.start_url.format(i))

        # Fetch listing pages
        def get_url(self):
            """Worker: fetch listing pages and push decoded HTML to html_queue.

            BUG FIX: the original `return None` inside the loop made each
            worker handle at most one URL, and task_done() was skipped on
            non-200 responses and on RequestException, which would hang
            url_queue.join() forever.
            """
            while True:
                url = self.url_queue.get()
                try:
                    print(url)
                    response = requests.get(url=url, headers=self.headers)
                    if response.status_code == 200:
                        self.html_queue.put(response.content.decode())
                except RequestException:
                    # Best-effort crawl: drop this page, keep the worker alive.
                    pass
                finally:
                    # Always account for the item so url_queue.join() can finish.
                    self.url_queue.task_done()

        # Extract detail-page URLs from listing HTML
        def extract_data(self):
            """Worker: parse listing HTML and queue absolute detail-page URLs."""
            while True:
                html_str = self.html_queue.get()
                print('----' * 20)
                html_list = etree.HTML(html_str)
                div_list = html_list.xpath("//div[@id='hotel_list']/div/ul/li[2]/h2/a/@href")
                for div_url in div_list:
                    url_str = self.details_url + div_url  # href is site-relative
                    print(url_str)
                    self.content_queue.put(url_str)
                self.html_queue.task_done()

        # Render detail pages with selenium/PhantomJS
        def driver_get(self):
            """Worker: render a detail page with PhantomJS and queue its HTML.

            BUG FIX: driver.quit() now runs in a finally block so a failed
            page load no longer leaks the PhantomJS process.
            """
            while True:
                details_url_s = self.content_queue.get()
                dcap = dict(DesiredCapabilities.PHANTOMJS)
                # Spoof the User-Agent header for this browser instance.
                dcap["phantomjs.page.settings.userAgent"] = (ua.random)
                # Skip image loading to speed up rendering.
                dcap["phantomjs.page.settings.loadImages"] = False
                driver = webdriver.PhantomJS(desired_capabilities=dcap)
                try:
                    driver.get(details_url_s)
                    print(details_url_s)
                    print("数据请求中。。。。。")
                    time.sleep(3)  # crude wait for JS-rendered content — TODO: explicit wait
                    self.details_html_queue.put(driver.page_source)
                finally:
                    driver.quit()
                    self.content_queue.task_done()

        # Parse detail-page data
        def driver_data(self):
            """Worker: parse one detail page into a list of record dicts.

            The first dict carries the hotel 'name' and 'id'; each following
            dict carries one room row's 'price' and 'bed' description.
            Each xpath is now evaluated once instead of twice.
            """
            while True:
                details_html_str = self.details_html_queue.get()
                html_str = etree.HTML(details_html_str)
                details_data_list = []
                item_dict = {}
                html_str_list = html_str.xpath(".//tr[@data-disable='0']")
                name = html_str.xpath("//*[@id='J_htl_info']/div[1]/h2[1]/text()")
                item_dict['name'] = name if name else None
                hotel_id = html_str.xpath("//a[@id='linkViewMap']/@data-hotelid")
                item_dict['id'] = hotel_id if hotel_id else None
                details_data_list.append(item_dict)
                for html in html_str_list:
                    item = {}
                    price = html.xpath(".//span[@class='base_price']/text()")
                    item['price'] = price if price else None
                    bed = html.xpath(".//td[@class='col3']/text()")
                    item['bed'] = bed if bed else None
                    details_data_list.append(item)
                print(details_data_list)
                self.details_content_queue.put(details_data_list)
                self.details_html_queue.task_done()

        # Save to CSV
        def save_data(self):
            """Worker: append parsed records to xiechen.csv.

            BUG FIX: the original wrote the header row before EVERY batch and
            let several threads write the file concurrently; the header is now
            written once and writes are serialized with a lock. newline='' is
            required by the csv module to avoid blank lines on Windows.
            """
            title = ['id', 'name', 'price', 'bed']
            while True:
                details_data = self.details_content_queue.get()
                with self._csv_lock:
                    with open('xiechen.csv', 'a+', encoding='utf-8', newline='') as f:
                        f_csv = csv.DictWriter(f, title)
                        if not self._header_written:
                            f_csv.writeheader()
                            self._header_written = True
                        f_csv.writerows(details_data)
                        print("数据保存完成。。。。。。")
                self.details_content_queue.task_done()

        # Main entry: start all workers, then wait for the queues to drain
        def run(self):
            thread_list = []
            # Build the URL list
            thread_list.append(threading.Thread(target=self.list_url))
            # Listing-page fetchers. BUG FIX: the original spawned 567 threads
            # (one per page) to compensate for get_url() exiting after a single
            # URL; with looping workers a small pool is sufficient.
            for i in range(20):
                thread_list.append(threading.Thread(target=self.get_url))
            # Detail-URL extractors
            for i in range(10):
                thread_list.append(threading.Thread(target=self.extract_data))
            # Detail-page renderers (each owns a PhantomJS process)
            for i in range(7):
                thread_list.append(threading.Thread(target=self.driver_get))
            # Detail-page parsers
            for i in range(5):
                thread_list.append(threading.Thread(target=self.driver_data))
            # CSV writers
            for i in range(3):
                thread_list.append(threading.Thread(target=self.save_data))

            for t in thread_list:
                t.daemon = True  # setDaemon() is deprecated; workers die with the main thread
                t.start()

            # Block until every stage has fully processed its queue.
            for q in [self.url_queue,
                      self.html_queue,
                      self.content_queue,
                      # self.details_url_queue,
                      self.details_html_queue,
                      self.details_content_queue,]:
                q.join()

            print("主线程结束")
    if __name__ == '__main__':
        # Script entry point: build the crawler and launch the pipeline.
        XieCheng().run()

    感觉多线程和 PhantomJS 的用法还不够完善,想多写一些判断和反爬措施。

  • 相关阅读:
    MYSQL EXPLAIN 执行计划详解
    MAC idea 快捷键
    mysql decimal类型与decimal长度用法详解
    docker zookeeper 集群搭建
    docker redis4.0 集群(cluster)搭建
    SSH登录问题 .ssh/known_hosts和 ECDSA key
    docker常用命令
    Linux 软件安装到 /usr,/usr/local/ 还是 /opt 目录?
    IoC基础篇(一)--- Spring容器中Bean的生命周期
    Maven实战(六)--- dependencies与dependencyManagement的区别
  • 原文地址:https://www.cnblogs.com/ittop/p/9471729.html
Copyright © 2020-2023  润新知