• 去哪儿网本地启动程序


    import re
    import json
    from odps import ODPS
    from threading import Thread
    import threading
    from urllib import parse
    import datetime
    from lxml import etree

    import random 
    import requests
    import time

    from models import *

    # def write_txt(html_data):
    #     f = open("a.txt", 'a+')
    #     f.write(html_data)
    #     f.write(" ")
    #     f.close()

    # Base of the per-city hotel list URL; save_hotel_url_to_redis appends the
    # city slug and the fromDate/toDate/cityName query string to it.
    domain_hotel = "https://hotel.qunar.com/cn/"
    # District lookup endpoint; save_city_list appends the city's `url` slug as the `city` value.
    district_url = "https://hotel.qunar.com/napi/seo?path=%2Fseo%2Fnav&city="
    # NOTE: get_hotel_comment builds its own local copy of this URL, so this
    # module-level value is not read by the code visible in this file.
    hotel_comment = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1" # URL for fetching hotel review data; the hotelSeq value must be substituted
    # Example of a complete hotel list URL (kept for reference):
    #domain_hotel = "https://hotel.qunar.com/cn/sanya/?fromDate=2020-08-03&toDate=2020-08-04&cityName=%E4%B8%89%E4%BA%9A"

    def get_cookies():
        """Fetch a fresh ``QN1`` cookie value from the Qunar hotel site.

        Posts a hotel-list search for the city slug ``bazhong`` (check-in today,
        check-out tomorrow) through a temporary ``requests.Session`` and reads
        the ``QN1`` cookie out of that session's cookie jar.

        Returns:
            The value of the ``QN1`` cookie.

        Raises:
            KeyError: if the session holds no ``QN1`` cookie after the request.
        """
        # Take a single clock reading so the dates in the URL and in the payload
        # always agree, even when the call straddles midnight.
        now = datetime.datetime.now()
        from_date = now.strftime('%Y-%m-%d')
        to_date = (now + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        url = "https://hotel.qunar.com/cn/bazhong/?fromDate=" + from_date + "&toDate=" + to_date + "&cityName=巴中"
        # NOTE(review): `data=` form-encodes the payload, so the nested dict under
        # 'b' is not transmitted as JSON. The call is kept as it was because only
        # the cookies of the response are used; confirm whether `json=` was intended.
        payload = {
            'b': {
                'bizVersion': 17,
                'cityUrl': 'bazhong',
                'fromDate': from_date,
                'toDate': to_date,
                'q': '',
                'qFrom': 3,
                'start': 640,
                'num': 20,
                'minPrice': 0,
                'maxPrice': -1,
                'level': '',
                'sort': 0,
                'cityType': 1,
                'fromForLog': 1,
                'uuid': '',
                'userName': '',
                'userId': '',
                'fromAction': '',
                'searchType': 0,
                'hourlyRoom': False,
                'locationAreaFilter': [],
                'comprehensiveFilter': [],
                'channelId': 1,
            },
            'qrt': 'h_hlist',
            'source': 'website',
        }
        # The context manager closes the session's connections once the cookie
        # has been read; the original left the session open.
        with requests.Session() as session:
            session.post(url, data=payload)
            cookies = requests.utils.dict_from_cookiejar(session.cookies)
        return cookies['QN1']

    def change_cookie(headers_data): # Swap a freshly fetched QN1 cookie into the headers
        """Replace the ``QN1`` value inside ``headers_data['Cookie']``.

        A new value is obtained from ``get_cookies()`` and substituted for
        whatever ``QN1=...`` value the cookie header currently carries.  The
        original code only replaced one hard-coded value, so after the first
        refresh every later call left the header unchanged.

        Args:
            headers_data: mapping of request headers containing a ``'Cookie'``
                string.  It is updated in place, as before.

        Returns:
            The same ``headers_data`` object, after the update.
        """
        cookie_data = get_cookies()
        # A callable replacement keeps backslashes in the cookie value literal;
        # `[^;]*` stops at the end of the QN1 pair inside the cookie string.
        headers_data['Cookie'] = re.sub(
            r'\bQN1=[^;]*',
            lambda _match: "QN1=" + str(cookie_data),
            headers_data['Cookie'],
        )
        print(headers_data)
        return headers_data

    # Read the city list from the local JSON file, look up each city's districts, and store them
    def save_city_list():
        """Populate ``qunar_List_City`` from ``cityList.json``.

        The file is iterated as a list of lists of groups, each group holding a
        ``value`` list of cities with ``name`` and ``url`` keys.  For every city
        the district endpoint (``district_url`` plus the city's ``url``) is
        queried.  When the first element of the response's ``data`` list is the
        city's district listing, one row per district is stored (skipped when a
        row with the same ``district_spell`` exists); otherwise a single row
        using the city itself as the district is stored (skipped when a row with
        the same ``city_name`` exists).

        Errors raised while interpreting a response or writing its rows are
        reported on stdout together with the requested URL, and processing
        continues with the next city.
        """

        def _insert_unless_present(already_stored, district_name, district_spell, city_name, city_spell):
            # `already_stored` is the lookup query for a matching row; when it is
            # truthy there is nothing to insert.
            if already_stored:
                return
            catalogue = qunar_List_City()
            catalogue.district_name = district_name  # district name
            catalogue.district_spell = district_spell  # district identifier / pinyin
            catalogue.city_name = city_name  # city name
            catalogue.city_spell = city_spell  # city pinyin slug
            catalogue.create_time = datetime.datetime.now()  # time of capture
            catalogue.save(force_insert=True)

        # Load the whole file first so the handle is not kept open during the
        # network and database work below.
        with open('cityList.json', 'r', encoding='utf8') as fp:
            json_data = json.load(fp)

        for data in json_data:
            for data_0 in data:
                for data_value in data_0['value']:
                    district_url_0 = district_url + str(data_value['url'])
                    response = requests.request("GET", district_url_0)
                    json_city = json.loads(response.text)
                    try:
                        first_entry = json_city['data'][0]
                        city_name = data_value['name']
                        if first_entry["name"] == city_name + "行政区酒店" and first_entry['type'] == "city":
                            for item in first_entry['list']:
                                # Keep the text before "酒店" and before the first space.
                                district_name = item["name"].split("酒店")[0].split(" ")[0]
                                _insert_unless_present(
                                    qunar_List_City.select().where(qunar_List_City.district_spell == item['id']),
                                    district_name,
                                    item['id'],
                                    city_name,
                                    data_value['url'],
                                )
                        else:
                            # No district listing: store the city as its own district.
                            _insert_unless_present(
                                qunar_List_City.select().where(qunar_List_City.city_name == city_name),
                                city_name,
                                data_value['url'],
                                city_name,
                                data_value['url'],
                            )
                    except Exception:
                        # Still best-effort per city, but no longer a bare `except:`,
                        # so KeyboardInterrupt / SystemExit can stop the run.
                        print("非大陆数据或者城市酒店数据为空")
                        print(district_url_0)

    # Build one hotel-list URL per stored city/district row and push it onto the hotel queue
    def save_hotel_url_to_redis():
        """Queue a hotel list URL for every row of ``qunar_List_City``.

        Each queued entry is the list URL for the row's city (check-in today,
        check-out tomorrow, URL-quoted city name) followed by a single space and
        the row's district name.  Entries are pushed with ``r.lpush`` onto the
        ``test.com:hotel_url`` key; ``r`` is defined outside this function.
        """
        day_format = '%Y-%m-%d'
        for row in qunar_List_City.select():
            city_name = row.city_name
            city_spell = row.city_spell
            district_name = row.district_name  # district name, appended after the URL
            check_in = datetime.datetime.now().strftime(day_format)
            check_out = (datetime.datetime.now() + datetime.timedelta(days=1)).strftime(day_format)
            pieces = [
                domain_hotel,
                city_spell,
                '/?fromDate=',
                check_in,
                '&toDate=',
                check_out,
                '&cityName=',
                parse.quote(city_name),
                " ",
                str(district_name),
            ]
            r.lpush('test.com:hotel_url', "".join(pieces))  # crawl URL for hotel data


    # Build one ticket URL per stored district row and push it onto the ticket queue
    def save_ticket_url_to_redis():
        """Queue a ticket URL for every row of ``qunar_List_City``.

        The URL-quoted placeholder district (the quoted form of "如皋市") inside
        ``tickect_url`` is replaced by the row's URL-quoted district name, and the
        result is pushed with ``r.lpush`` onto ``test.com:ticket_url``.

        NOTE(review): ``tickect_url`` and ``r`` are not defined in the visible
        part of this file; presumably they come from ``from models import *`` —
        confirm.
        """
        placeholder = '%E5%A6%82%E7%9A%8B%E5%B8%82'
        for row in qunar_List_City.select():
            target_district = row.district_name  # district name
            ticket_link = tickect_url.replace(placeholder, str(parse.quote(target_district)))
            r.lpush('test.com:ticket_url', ticket_link)  # crawl URL for ticket data

    # Prefix of the vacation-package list URL. The quoted segment decodes to
    # "南通"; save_vacation_url_to_redis appends the URL-quoted district name and
    # the '_all?ti=3&tm=l01_all_search_newc' suffix to it.
    domain_vacation = "https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_"
    # Example of a complete URL (second quoted segment decodes to "苏州"):
    #'https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_%E8%8B%8F%E5%B7%9E_all?ti=3&tm=l01_all_search_newc'
    # Build one vacation-package URL per stored district row and push it onto the vacation queue
    def save_vacation_url_to_redis():
        """Queue a vacation list URL for every row of ``qunar_List_City``.

        Each queued entry is ``domain_vacation`` plus the URL-quoted district
        name and a fixed query suffix, followed by a single space and the row's
        city name.  Entries are pushed with ``r.lpush`` onto
        ``test.com:vacation_url``; ``r`` is defined outside this function.
        """
        suffix = '_all?ti=3&tm=l01_all_search_newc'
        for row in qunar_List_City.select():
            quoted_district = parse.quote(row.district_name)
            entry = domain_vacation + quoted_district + suffix + " " + str(row.city_name)
            r.lpush('test.com:vacation_url', entry)  # URL for vacation products

    def get_nodes_json():
        """Pop one queued hotel-list entry and scrape every result page for it.

        An entry popped from ``test.com:hotel_url`` has the form
        ``"<list url> <district name>"`` (see ``save_hotel_url_to_redis``).  The
        request body is derived from the module-level ``payload`` template by
        filling in the city slug, district name and today's / tomorrow's dates.
        The list API (``url_hotel_api``) is first probed for the total hotel
        count (``data.tcount``), then ``process_response_data`` is called once
        per page of 20 hotels.

        ``payload``, ``headers``, ``url_hotel_api`` and ``r`` are defined outside
        this function.

        NOTE(review): if ``r.lpop`` returns ``None`` for an empty queue, the
        first ``re.search`` raises ``TypeError`` exactly as the original code
        did — confirm that this is how the worker loop is meant to stop.
        """
        entry = r.lpop('test.com:hotel_url')
        # e.g. 'https://hotel.qunar.com/cn/wuzhishan/?fromDate=2020-08-06&toDate=2020-08-07&cityName=%E4%BA%94%E6%8C%87%E5%B1%B1 <district>'

        # City slug is the path segment right after "cn/".  The previous greedy
        # pattern "cn/(.*)/?" captured the whole remainder of the URL.
        city_spell = re.search(r"cn/([^/?]+)", entry).group(1)
        # The district name is everything after the separating space; the former
        # character class had lost its "\u" escapes and matched ASCII text.
        url, _, district_name = entry.partition(" ")

        from_date = datetime.datetime.now().strftime('%Y-%m-%d')
        to_date = (datetime.datetime.now() + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        # Fill the placeholders of the request-body template.
        payload_data = payload
        payload_data = payload_data.replace('"cityUrl":" "', '"cityUrl":"' + city_spell + '"')
        payload_data = payload_data.replace('"大兴区"', '"' + district_name + '"')  # district name
        payload_data = payload_data.replace('"fromDate":" "', '"fromDate":"' + from_date + '"')
        payload_data = payload_data.replace('"toDate":" "', '"toDate":"' + to_date + '"')

        # Work on a copy so the per-URL referer does not leak into the shared
        # module-level headers (which other worker threads may be using).
        headers_data = dict(headers)
        headers_data['referer'] = url

        # Probe for the total number of hotels.  Every attempt counts toward the
        # limit, so a run of non-200 responses can no longer loop forever, and
        # the body is only parsed once the status is known to be 200.
        hotel_number = 0
        max_probe_attempts = 16
        probe_body = payload_data.encode("utf-8")
        for _attempt in range(max_probe_attempts):
            response = requests.request("POST", url_hotel_api, headers=headers_data, data=probe_body)
            print(district_name)
            if response.status_code != 200:
                continue
            json_data = json.loads(response.text)
            print(json_data['bstatus'])
            if json_data['bstatus']['code'] == 0:
                hotel_number = json_data['data']['tcount']
                break

        print(hotel_number,"酒店总数量")
        page_size = 20
        start_num = 0
        while hotel_number > 0:
            print(hotel_number,"剩余未处理酒店数量")
            # Point the body at the current page.  Matching any existing number
            # keeps paging correct whatever start value the template carries.
            payload_data = re.sub(r'"start":\s*\d+', '"start":' + str(start_num), payload_data)
            process_response_data(headers_data, payload_data.encode("utf-8"), hotel_number, district_name)
            start_num = start_num + page_size
            hotel_number = hotel_number - page_size

    # Fetch the review breakdown (negative / neutral / positive counts) for one hotel
    def get_hotel_comment(hotel_id):
        """Return the review counts of a hotel.

        Requests the first page of the hotel's review list and reads
        ``data.ratingStat`` from the JSON response.  A response body shorter
        than 40 characters is treated as a failed fetch and the request is
        repeated until a longer body arrives.

        Args:
            hotel_id: the hotel's sequence id string (substituted for the
                ``hotelSeq`` value in the URL).

        Returns:
            ``[negativeCount, neutralCount, positiveCount]``.
        """
        # Use a distinct local name so the module-level `hotel_comment` constant
        # is not shadowed.
        comment_url = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1".replace("nantong_5058", hotel_id)
        response = requests.request("GET", comment_url)
        while len(response.text) < 40:
            # Pause between retries instead of hammering the endpoint in a tight loop.
            time.sleep(1)
            response = requests.request("GET", comment_url)
        rating_stat = json.loads(response.text)["data"]["ratingStat"]
        return [
            rating_stat["negativeCount"],
            rating_stat["neutralCount"],
            rating_stat["positiveCount"],
        ]
        

    def process_response_data(headers_data,payload_data,hotel_number,district_name): # Fetch and store one page of hotel results
        """Request one page of the hotel list and store every hotel on it.

        The request is repeated until the API answers with ``bstatus.code == 0``
        and a complete page; there is no upper bound on the number of attempts.

        * Every 200th attempt the QN1 cookie is refreshed via ``change_cookie``.
        * A non-200 response or ``bstatus.code == -1000`` is retried after 3 s.
        * Any other non-zero code counts as a failure; after 18 failures in a
          row the function sleeps 120 s and starts counting again.
        * A page holding fewer hotels than expected is discarded and requested
          again after 120 s.

        Args:
            headers_data: request headers for the list API.
            payload_data: encoded request body for the page to fetch.
            hotel_number: number of hotels not yet processed, including this page.
            district_name: district name stored with every hotel row.
        """
        page_size = 20
        # Number of hotels this page must carry.  Using min() also validates the
        # case of exactly 20 remaining hotels, which was previously unchecked.
        expected_count = min(hotel_number, page_size)
        failures_before_backoff = 18
        failures = 0  # consecutive responses with an unexpected bstatus code
        attempts = 0  # total requests, drives the periodic cookie refresh
        while True:
            attempts += 1
            if attempts % 200 == 0:
                headers_data = change_cookie(headers_data)
            response = requests.request("POST", url_hotel_api, headers=headers_data, data = payload_data)
            if response.status_code != 200:
                print("网页请求错误")
                time.sleep(3)  # brief pause instead of an immediate tight retry
                continue

            json_data = json.loads(response.text)
            status_code = json_data['bstatus']['code']
            if status_code == -1000:
                print("搜索条件修改")
                time.sleep(3)
                continue
            if status_code != 0:
                failures += 1
                if failures >= failures_before_backoff:
                    failures = 0
                    # The message now states the duration that is actually slept.
                    print("连接次数达到上线,休眠120s")
                    time.sleep(120)
                continue

            failures = 0
            time.sleep(random.randint(0,2))  # random pause between successful requests
            hotel_city = json_data['data']['cityName']  # city the hotels belong to
            hotels = json_data['data']['hotels']
            print(len(hotels),"hotels的数量")
            if len(hotels) != expected_count:
                # Incomplete page: wait and fetch the same page again.
                print(f"当前hotel_number={hotel_number}")
                print("获取残缺数据,数据不完整,跳出此处获取,重新抓取")
                print("休眠120s")
                time.sleep(120)
                continue

            for data_hotel in hotels:
                _save_hotel_record(data_hotel, hotel_city, district_name)
            return


    def _save_hotel_record(data_hotel, hotel_city, district_name):
        """Insert one hotel entry of the list response as a ``qunar_Hotel_data`` row."""
        hotel_data = qunar_Hotel_data()
        hotel_data.hotel_district = district_name
        hotel_data.hotel_city = hotel_city
        hotel_data.hotel_name = data_hotel['name']
        hotel_data.hotel_level = data_hotel['dangciText']
        hotel_data.hotel_score = data_hotel['score']
        hotel_data.hotel_price = float(data_hotel['price'])
        hotel_data.hotel_commentCount = data_hotel['commentCount']
        # One extra request per hotel for the review breakdown.
        negativeCount,neutralCount,positiveCount = get_hotel_comment(data_hotel["seqNo"])
        hotel_data.hotel_negativeCount = negativeCount
        hotel_data.hotel_neutralCount = neutralCount
        hotel_data.hotel_positiveCount = positiveCount
        hotel_data.hotel_Number = data_hotel['phoneNumber']
        hotel_data.hotel_LocationInfo = data_hotel['locationInfo']
        hotel_data.hotel_image = data_hotel["imageid"]
        hotel_data.create_time = datetime.datetime.now()  # time of capture
        hotel_data.save(force_insert=True)

    class parse_qunar_url_Thread(Thread):
        """Worker thread that keeps calling ``get_nodes_json``."""

        def run(self):
            """Call ``get_nodes_json`` in an endless loop; only an exception ends it."""
            while True:
                get_nodes_json()

    if __name__ == "__main__":
        # Script entry point.
        # create_tables is not defined in the visible part of this file;
        # presumably it comes from `from models import *` — TODO confirm.
        create_tables()
        # Refresh the stored city/district rows, then queue the crawl URLs for
        # hotels, vacation packages and tickets.
        save_city_list()
        save_hotel_url_to_redis()
        save_vacation_url_to_redis()
        save_ticket_url_to_redis()
        # Worker threads are currently disabled; uncommenting the lines below
        # would start 20 parse_qunar_url_Thread workers.
        # for i in range(20):
        #     parse_qunar_url_thread = parse_qunar_url_Thread()     
        #     parse_qunar_url_thread.start() 
  • 相关阅读:
    线程同步——用户模式下线程同步——Slim读写锁实现线程同步
    线程同步——用户模式下线程同步——关键段实现线程同步
    线程同步——用户模式下线程同步——Interlocked实现线程同步
    创建线程
    GDI的 点 线 面 双缓冲 位图的绘制
    简单的windows窗口创建实例
    宽字符与多字符
    学习MFC的建议
    DataGrip 2018.3.1破解激活码
    mysql 主从复制配置
  • 原文地址:https://www.cnblogs.com/dog-and-cat/p/13615442.html
Copyright © 2020-2023  润新知