• 去哪儿网酒店数据启动程序更新


    import re
    import json
    from odps import ODPS
    from threading import Thread
    import threading
    from urllib import parse
    import datetime
    from lxml import etree

    import random 
    import requests
    import time

    from models import *

    # def write_txt(html_data):
    #     f = open("a.txt", 'a+')
    #     f.write(html_data)
    #     f.write(" ")
    #     f.close()

    # Base URL of the hotel listing pages; save_hotel_url_to_redis appends the city pinyin and query string.
    domain_hotel = "https://hotel.qunar.com/cn/"
    # District navigation endpoint; save_city_list appends the city's `url` value as the `city` parameter.
    district_url = "https://hotel.qunar.com/napi/seo?path=%2Fseo%2Fnav&city="
    hotel_comment = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1" # URL for fetching hotel comment data; the hotelSeq value must be substituted
    #domain_hotel = "https://hotel.qunar.com/cn/sanya/?fromDate=2020-08-03&toDate=2020-08-04&cityName=%E4%B8%89%E4%BA%9A"

    # Fetch the city data and store it, driven by the previously downloaded JSON text file
    def save_city_list():
        """Load the cities in ``cityList.json`` and store their districts in ``List_City``.

        For every city entry in the JSON file, the district navigation endpoint
        (``district_url`` + the entry's ``url`` value) is requested. If the first
        element of the response's ``data`` list is the city's own
        "<name>行政区酒店" group of type ``city``, one row is inserted per listed
        district. Otherwise a single row is inserted that uses the city itself
        as its district. Rows that already exist (matched on ``district_spell``
        in the first case and on ``city_name`` in the second) are skipped.

        Failures while interpreting a response or saving rows are reported on
        stdout and the crawl continues with the next city.
        """
        with open('cityList.json', 'r', encoding='utf8') as fp:
            json_data = json.load(fp)

        for data in json_data:
            for data_0 in data:
                for data_value in data_0['value']:
                    district_url_0 = district_url + str(data_value['url'])
                    response = requests.request("GET", district_url_0)
                    json_city = json.loads(response.text)
                    try:
                        head = json_city['data'][0]
                        has_districts = (
                            head["name"] == data_value['name'] + "行政区酒店"
                            and head['type'] == "city"
                        )
                        if has_districts:
                            # One (district name, district pinyin, duplicate query) per district.
                            candidates = []
                            for item in head['list']:
                                # Keep only the text before "酒店" and before the first space.
                                data_i = item["name"].split("酒店")[0].split(" ")[0]
                                candidates.append((
                                    data_i,
                                    item['id'],
                                    List_City.select().where(List_City.district_spell == item['id']),
                                ))
                        else:
                            # No district breakdown: the city doubles as its own district.
                            candidates = [(
                                data_value['name'],
                                data_value['url'],
                                List_City.select().where(List_City.city_name == data_value['name']),
                            )]

                        for district_name, district_spell, existing in candidates:
                            if existing:
                                continue  # already stored, do not insert twice
                            catalogue = List_City()
                            catalogue.district_name = district_name  # district name
                            catalogue.district_spell = district_spell  # district pinyin
                            catalogue.city_name = data_value['name']  # city name
                            catalogue.city_spell = data_value['url']  # city pinyin
                            catalogue.create_time = datetime.datetime.now()  # crawl time
                            catalogue.save(force_insert=True)
                    except Exception as exc:
                        # Was a bare ``except:``. Still best-effort, but no longer traps
                        # KeyboardInterrupt/SystemExit, and the real cause is now shown.
                        print("非大陆数据或者城市酒店数据为空")
                        print(district_url_0)
                        print(repr(exc))

    # Use the stored catalogue rows to get each city's details for building the hotel URLs
    def save_hotel_url_to_redis():
        """Queue one hotel-search URL per ``List_City`` row.

        Each URL is ``domain_hotel`` + city pinyin + a query string with today's
        and tomorrow's date and the percent-encoded city name. The row's
        district name is appended after a single space. The result is pushed
        with ``r.lpush`` onto the ``qunar.com:hotel_url`` list.
        """
        day_format = '%Y-%m-%d'
        for row in List_City.select():
            check_in = datetime.datetime.now().strftime(day_format)
            check_out = (datetime.datetime.now() + datetime.timedelta(days=1)).strftime(day_format)
            pieces = [
                domain_hotel,
                row.city_spell,
                '/?fromDate=',
                check_in,
                '&toDate=',
                check_out,
                '&cityName=',
                parse.quote(row.city_name),
                " ",
                str(row.district_name),  # district name travels after the space
            ]
            # URL to crawl for hotel data
            r.lpush('qunar.com:hotel_url', "".join(pieces))


    # Use the stored catalogue rows to build the ticket URLs
    def save_ticket_url_to_redis():
        """Queue one ticket URL per ``List_City`` row.

        The percent-encoded placeholder ``%E5%A6%82%E7%9A%8B%E5%B8%82`` inside
        ``tickect_url`` is replaced by the row's percent-encoded district name.
        The result is pushed with ``r.lpush`` onto ``qunar.com:ticket_url``.

        NOTE(review): ``tickect_url`` is not defined in the visible source;
        confirm it is provided elsewhere (and that the spelling matches).
        """
        placeholder = '%E5%A6%82%E7%9A%8B%E5%B8%82'
        for row in List_City.select():
            encoded_district = str(parse.quote(row.district_name))
            # URL to crawl for ticket data
            r.lpush('qunar.com:ticket_url', tickect_url.replace(placeholder, encoded_district))

    # Prefix of the vacation list URL; %E5%8D%97%E9%80%9A is the percent-encoded "南通".
    # save_vacation_url_to_redis appends a percent-encoded city name and a fixed query suffix.
    domain_vacation = "https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_"
    # Example of a complete URL:
    #'https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_%E8%8B%8F%E5%B7%9E_all?ti=3&tm=l01_all_search_newc'
    # Use the stored catalogue rows to get each city's details for building the URLs
    def save_vacation_url_to_redis():
        """Queue one vacation-product URL per ``List_City`` row.

        The URL is ``domain_vacation`` + the percent-encoded city name + a fixed
        query suffix. It is pushed with ``r.lpush`` onto the
        ``qunar.com:vacation_url`` list.
        """
        suffix = '_all?ti=3&tm=l01_all_search_newc'
        for row in List_City.select():
            vacation_url = "".join((domain_vacation, parse.quote(row.city_name), suffix))
            # URL of the vacation products
            r.lpush('qunar.com:vacation_url', vacation_url)

    def get_nodes_json():
        """Pop one queued hotel URL and crawl every result page for it.

        The queue entry has the form ``<listing url> <district name>`` (see
        ``save_hotel_url_to_redis``). City pinyin, encoded city name and district
        name are extracted from it and substituted into copies of the
        module-level ``payload`` and ``headers`` templates (both defined outside
        the visible source). The hotel API at ``url_hotel_api`` is first polled
        until it reports the total hotel count. ``process_response_data`` is then
        called once per page of 20 hotels.

        NOTE(review): if the queue is empty and ``r.lpop`` returns ``None``,
        ``re.search`` below raises ``TypeError``. That is unchanged from the
        original and is currently the only way a worker thread stops.
        """
        url = r.lpop('qunar.com:hotel_url')
        if isinstance(url, bytes):
            # Some client configurations return bytes; the str patterns below need text.
            url = url.decode("utf-8")

        # Stop at the first "/" or "?" so only the pinyin segment is captured
        # (the former greedy ``(.*)`` swallowed the whole query string).
        city_spell = re.search(r"cn/([^/?]+)", url).group(1)  # city pinyin
        # Percent-encoded city name only, without the " <district>" suffix.
        city_name = re.search(r"cityName=([^ &]*)", url).group(1)
        # The city name is percent-encoded, so the only CJK run is the district name.
        district_name = re.search(r"([\u2E80-\u9FFF]+)", url).group(1)

        url = url.split(" ")[0]  # drop the district suffix; used as referer

        from_date = datetime.datetime.now().strftime('%Y-%m-%d')
        to_date = (datetime.datetime.now() + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        # Fill the placeholders of the JSON payload template.
        payload_data = payload
        payload_data = payload_data.replace('"cityUrl":" "', '"cityUrl":"' + city_spell + '"')
        payload_data = payload_data.replace('"大兴区"', '"' + district_name + '"')  # district name
        payload_data = payload_data.replace('"fromDate":" "', '"fromDate":"' + from_date + '"')
        payload_data = payload_data.replace('"toDate":" "', '"toDate":"' + to_date + '"')

        # Work on a copy: assigning into the shared ``headers`` dict would leak this
        # call's cookie/referer into every other call and thread.
        headers_data = dict(headers)
        cookie_data = headers_data['cookie']
        # NOTE(review): these literals are kept exactly as published. The cookie
        # placeholders may originally have been of the form key=" " (escaped
        # quotes lost in the post); confirm against the real cookie template.
        cookie_data = cookie_data.replace("cityUrl=" "","cityUrl=" + city_spell)
        cookie_data = cookie_data.replace("cityName=" "","cityName=" + city_name)
        cookie_data = cookie_data.replace("checkInDate=" "","checkInDate=" + from_date)
        cookie_data = cookie_data.replace("checkOutDate=" "","checkOutDate=" + to_date)

        headers_data['cookie'] = cookie_data
        headers_data['referer'] = url

        # Poll until the API reports success, then read the total hotel count.
        hotel_number = 0
        while True:
            response = requests.request(
                "POST", url_hotel_api, headers=headers_data, data=payload_data.encode("utf-8")
            )
            if response.status_code == 200:
                # Parse only successful responses; error pages may not be JSON.
                json_data = json.loads(response.text)
                if json_data['bstatus']['code'] == 0:
                    hotel_number = json_data['data']['tcount']
                    break
            time.sleep(1)  # brief pause instead of hammering the API in a tight loop

        print(hotel_number)
        start_num = 0
        before_num = 0
        while hotel_number > 0:
            print(before_num,start_num,hotel_number,"before_num","start_num","hotel_number")
            # Move the "start" offset in the payload from the previous page to this one.
            payload_data = payload_data.replace('"start":' + str(before_num), '"start":' + str(start_num))
            process_response_data(headers_data, payload_data.encode("utf-8"), hotel_number, district_name)
            before_num = start_num
            start_num = start_num + 20
            hotel_number = hotel_number - 20

    # Get the detailed comment counts of a hotel
    def get_hotel_comment(hotel_id):
        """Return ``[negativeCount, neutralCount, positiveCount]`` for one hotel.

        The hotel's id replaces the sample ``nantong_5058`` sequence in the
        comment-list URL. The counts are read from ``data.ratingStat`` of the
        JSON response.
        """
        template = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1"
        comment_url = template.replace("nantong_5058", hotel_id)
        reply = json.loads(requests.request("GET", comment_url).text)
        counts = []
        for key in ("negativeCount", "neutralCount", "positiveCount"):
            counts.append(reply["data"]["ratingStat"][key])
        return counts
        

    def process_response_data(headers_data,payload_data,hotel_number,district_name):
        """Fetch one page of hotels and store every hotel in ``Hotel_data``.

        Args:
            headers_data: request headers sent to ``url_hotel_api``.
            payload_data: encoded request body selecting the page to fetch.
            hotel_number: number of hotels still to be fetched, including this
                page; the page must hold ``min(hotel_number, 20)`` hotels.
            district_name: district name stored with every hotel row.

        The request is repeated until a complete page has been received. Failed
        requests, ``-1000`` status replies and incomplete pages are retried
        after a pause, and the attempt counter is reset before it can reach
        zero, so the function only returns after the page has been saved.
        """
        # Also covers hotel_number == 20, which the former "> 20" / "< 20"
        # pair of checks let through unvalidated.
        expected_count = min(hotel_number, 20)
        connect_times = 20  # attempts left before the long back-off
        while connect_times:
            response = requests.request("POST", url_hotel_api, headers=headers_data, data = payload_data)
            if response.status_code != 200:
                print("网页请求错误")
                time.sleep(3)  # avoid retrying in a tight loop
                continue

            json_data = json.loads(response.text)
            status = json_data['bstatus']['code']
            if status == -1000:
                print("搜索条件修改")
                time.sleep(3)
                continue
            if status != 0:
                print(f"第{20 - connect_times + 1}次尝试连接")
                connect_times = connect_times - 1
                if 20 - connect_times + 1 > 18:
                    # Out of attempts: reset the counter and back off.
                    connect_times = 20
                    print("连接次数达到上线,休眠120s")
                    time.sleep(120)
                continue

            time.sleep(random.randint(0,2))  # random pause between pages
            hotel_city = json_data['data']['cityName']  # city the hotels belong to
            hotels = json_data['data']['hotels']
            print(len(hotels),"hotels的数量")
            if len(hotels) != expected_count:
                # Incomplete page: wait and fetch the same page again.
                connect_times = 20
                print(f"当前hotel_number={hotel_number}")
                print("获取残缺数据,数据不完整,跳出此处获取,重新抓取")
                print("休眠120s")
                time.sleep(120)
                continue

            for data_hotel in hotels:
                hotel_data = Hotel_data()
                hotel_data.hotel_district = district_name
                hotel_data.hotel_city = hotel_city
                hotel_data.hotel_name = data_hotel['name']
                hotel_data.hotel_level = data_hotel['dangciText']
                hotel_data.hotel_score = data_hotel['score']
                hotel_data.hotel_price = float(data_hotel['price'])
                hotel_data.hotel_commentCount = data_hotel['commentCount']
                negativeCount,neutralCount,positiveCount = get_hotel_comment(data_hotel["seqNo"])

                hotel_data.hotel_negativeCount = negativeCount
                hotel_data.hotel_neutralCount = neutralCount
                hotel_data.hotel_positiveCount = positiveCount

                hotel_data.hotel_Number = data_hotel['phoneNumber']
                hotel_data.hotel_LocationInfo = data_hotel['locationInfo']
                hotel_data.hotel_image = data_hotel["imageid"]
                hotel_data.create_time = datetime.datetime.now()  # crawl time

                hotel_data.save(force_insert=True)
            break  # page stored completely

    class parse_qunar_url_Thread(Thread):
        """Worker thread that keeps calling ``get_nodes_json``."""

        def run(self):
            """Process queued hotel URLs forever; the loop has no exit condition of its own."""
            while True:
                get_nodes_json()

    if __name__ == "__main__":
        # Create the tables, then fill the city catalogue and the three URL queues.
        create_tables()
        for seed_step in (
            save_city_list,
            save_hotel_url_to_redis,
            save_vacation_url_to_redis,
            save_ticket_url_to_redis,
        ):
            seed_step()
        # Start 100 worker threads that consume the hotel URL queue.
        for _ in range(100):
            parse_qunar_url_Thread().start()
    两年大概看此博客blog.codingnow.com/aee/
  • 相关阅读:
    Android 横屏启动activity,点击屏幕的单击、双击
    实现Android简单动画旋转案例
    当时遇到的主要难点在于TextView的内容不会刷新改变值,今天终于通过Timer和Handler实现了,分享给大家
    如何在Android当中显示网络图片
    Android的MediaRecorder架构介绍
    理解Android系统的进程间通信原理RPC机制
    Android开发WeatherForecast程序
    Android 如何导入已有的外部数据库
    百度地图API 源码
    Android TelephonyManager类
  • 原文地址:https://www.cnblogs.com/dog-and-cat/p/13536708.html
Copyright © 2020-2023  润新知