• Meituan Hotels


    #!/root/.pyenv/shims/python3.6
    # -*- coding:utf-8 -*-
    # Author  : zhibo.wang
    # E-mail  : d_1206@qq.com
    # Date    : 18/04/12 16:11:28
    # Desc    : Meituan hotel crawler
    
    
    import time
    import hashlib
    import socket
    import random
    import json
    import requests
    from bs4 import BeautifulSoup
    from data_utils.time_convert import get_time_stamp
    from data_utils.conmongodb import mongo_con_keepalive
    from data_utils.ali_oss import OSS2
    
    
    class Crawl:
        is_proxy = True  # route requests through the Abuyun proxy
        proxyMeta = "http://xxxx:xxxx@proxy.abuyun.com:9020"
        proxies = {
            "http": proxyMeta,
            "https": proxyMeta,
        }
        start_url = "http://hotel.meituan.com/"
        time_stamp = get_time_stamp()
        path_dir = "hotel/meituan/{0}/".format(time_stamp)
        time_local = time.localtime(int(time_stamp))
        date = time.strftime("%Y%m%d", time_local)
    
        data_url = "https://ihotel.meituan.com/hbsearch/HotelSearch" 
                   "?utm_medium=pc" 
                   "&version_name=999.9" 
                   "&cateId=20" 
                   "&attr_28=129" 
                   "&uuid=" 
                   "&cityId=cityId" 
                   "&offset=0" 
                   "&limit=20" 
                   "&startDay={0}" 
                   "&endDay={1}" 
                   "&q=" 
                   "&sort=defaults" 
                   "&poi_attr_20022=poi_attr_20022".format(date, date)
        params_citys = "params_citys"
        website = "hotel_meituan"
        timeout = 20  # request timeout in seconds
        if is_proxy:
            wait_time = [0.16, 0.17]
        else:
            wait_time = [1, 1.1, 1.2, 1.3]     # delay between requests, in seconds
        headers = {
                "Host": "hotel.meituan.com",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Cache-Control" : "max-age=0",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Content-Type": "text/html"
            }
    
        def __init__(self):
            self.db = mongo_con_keepalive()
            self.db.get_collection('pathdir_dict').insert_one({'pathdir': self.path_dir, 'website': self.website, 'flag': False})
            self.oss = OSS2()
            super(Crawl, self).__init__()
    
    
        def req(self, url, headers, pattern=True, num=3):
            # fetch a URL; returns parsed HTML (pattern=True) or JSON (pattern=False)
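            # Non-200 responses are retried up to `num` times via recursion;
            # exceptions are printed and fall through, so callers must handle None.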
            time.sleep(random.choice(self.wait_time))
            soup = None
    
            if not num:
                return soup
            try:
                if self.is_proxy:
                    r = requests.get(url, headers=headers, timeout=self.timeout, proxies=self.proxies)
                else:
                    r = requests.get(url, headers=headers, timeout=self.timeout)
                if r.status_code == 200:
                    r.encoding = 'utf-8'
                    if pattern:
                        soup = BeautifulSoup(r.text, "html.parser")
                    else:
                        soup = r.json()
                else:
                    num -= 1
                    return self.req(url, headers, pattern, num)
    
            except Exception as e:
                print("fun req error: ", e)
            return soup
    
    
        def get_hotel_type_code(self, city_data):
            # scrape the hotel-type filter links from the city landing page
            city_url = "{0}{1}/".format(self.start_url, city_data.get("meituan_code"))
            headers = self.headers
            soup = self.req(city_url, headers, pattern=True)
            end_data = None
            if soup:
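                # Assumption: the third "search-row-content" block holds the
                # hotel-type filter links; this index is tied to the 2018 page layout.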
                txt = soup.find_all("div", class_="search-row-content")[2]
                end_data = [{"name": i.get_text().strip(), "poi_attr": i.get("href").split("/")[-2].replace("c", "")} for i in txt.find_all("a")]
    
            return end_data
    
    
        def create_filename(self, url):
            # build a collision-free filename from hostname, domain, URL hash, and timestamp
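            # e.g. "crawler01_ihotel-meituan-com_<md5-hex>_<unix-ts>.html" (values illustrative)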
            fname = '%s_%s_%s_%s.html' % (socket.gethostname(),
                                              url.split('//')[-1].split('/')[0].replace('.', '-'),
                                              hashlib.md5(url.encode()).hexdigest(),
                                              str(time.time()).split('.')[0])
            return fname
    
        def get_data_totalcount(self, tot_url, headers):
            # fetch the total result count for a search URL
            data = self.req(tot_url, headers, pattern=False)
            count = None
            if data:
                count = data.get("data").get("totalcount")
            return count
    
    
        def start(self):
            city_datas = self.db.get_collection(self.params_citys).find({})
            for city_data in city_datas:
                cityname = city_data.get("cityname")
                if city_data.get("meituan_code"):
                    referer = "{0}{1}/".format(self.start_url, city_data.get("meituan_code"))
                    hotel_type_codes = self.get_hotel_type_code(city_data)
                    # hotel-type codes scraped for this city
                    if hotel_type_codes:
                        # copy the class-level headers so later cities still see the
                        # original Host; the search API is served from ihotel.meituan.com
                        headers = dict(self.headers)
                        headers["Content-Type"] = "application/json, text/plain, */*"
                        headers["Host"] = "ihotel.meituan.com"
                        headers["Origin"] = "http://hotel.meituan.com"
                        headers["Referer"] = referer
                        for hotel_code in hotel_type_codes:
                            hotel_type_name = hotel_code.get("name")
                            hotel_type_code = hotel_code.get("poi_attr")
                            tot_url = self.data_url.replace(
                                "cityId=cityId",
                                "cityId={0}".format(city_data.get("meituan_id"))
                            ).replace(
                                "poi_attr_20022=poi_attr_20022",
                                "poi_attr_20022={0}".format(hotel_code.get("poi_attr"))
                            )
                            totalcount = self.get_data_totalcount(tot_url, headers)
                            # total number of results for this city and hotel type
                            if totalcount:
                                all_url = [tot_url.replace("offset=0", "offset={0}".format(c)) for c in range(0, totalcount+1, 20)]
                                # build every paginated URL from the total count (20 results per page)
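                                # e.g. totalcount=95 -> offsets 0, 20, 40, 60, 80 (five pages)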
                                for url_ in all_url:
                                    data = self.req(url_, headers, pattern=False)
                                    if data:
                                        file_ = "{0}{1}".format(self.path_dir, self.create_filename(url_))
                                        data["cityname"] = cityname
                                        data["hotel_type_name"] = hotel_type_name
                                        data["hotel_type_code"] = hotel_type_code
                                        self.oss.uploadfiledata(file_, json.dumps(data))
    
    
    if __name__ == "__main__":
        C = Crawl()
        C.start()
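
The three `data_utils` imports are the author's private helpers and are not included in the post. Below is a minimal sketch of stand-ins, assuming `get_time_stamp` returns a Unix-timestamp string, `mongo_con_keepalive` returns a pymongo database handle, and `OSS2.uploadfiledata` writes a string to an Aliyun OSS object; the connection string, endpoint, bucket name, and credentials are placeholders:

    # Hypothetical stand-ins for the private data_utils helpers; names and
    # signatures are inferred from how the crawler above calls them.
    import time
    from pymongo import MongoClient
    import oss2


    def get_time_stamp():
        # Unix timestamp as a string, e.g. "1523520000"
        return str(int(time.time()))


    def mongo_con_keepalive():
        # Return a pymongo Database handle; host and database name are assumptions.
        return MongoClient("mongodb://localhost:27017/")["crawler"]


    class OSS2:
        # Thin wrapper over Aliyun's oss2 SDK; fill in real credentials.
        def __init__(self):
            auth = oss2.Auth("<access-key-id>", "<access-key-secret>")
            self.bucket = oss2.Bucket(auth, "https://oss-cn-beijing.aliyuncs.com",
                                      "<bucket-name>")

        def uploadfiledata(self, filename, data):
            # Store the serialized JSON string under the given object key.
            self.bucket.put_object(filename, data)

For the crawl to produce anything, the `params_citys` MongoDB collection must already hold one document per city with at least the `cityname`, `meituan_code` (URL slug), and `meituan_id` (numeric id used by the search API) fields.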
    

   This tutorial is for technical research and study only. If it infringes any rights, contact the author and it will be removed.
