import re
import json
from odps import ODPS
from threading import Thread
import threading
from urllib import parse
import datetime
from lxml import etree
import random
import requests
import time
from models import *
# def write_txt(html_data):
#     f = open("a.txt", 'a+')
#     f.write(html_data)
#     f.write("\n")
#     f.close()
# Base URL of the Qunar hotel list pages; save_hotel_url_to_redis appends the
# city pinyin slug and the query string to it.
domain_hotel = "https://hotel.qunar.com/cn/"
# SEO navigation endpoint; save_city_list appends a city slug as the `city` value.
district_url = "https://hotel.qunar.com/napi/seo?path=%2Fseo%2Fnav&city="
hotel_comment = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1" # URL for fetching hotel review data; the hotelSeq value must be replaced
#domain_hotel = "https://hotel.qunar.com/cn/sanya/?fromDate=2020-08-03&toDate=2020-08-04&cityName=%E4%B8%89%E4%BA%9A"
def get_cookies():
    """Fetch a fresh ``QN1`` cookie value from the Qunar hotel site.

    Posts a hotel-list search payload for the city slug ``bazhong``
    (check-in today, check-out tomorrow) through a short-lived
    ``requests.Session`` and reads ``QN1`` out of the session's cookie jar.

    Returns:
        The value of the ``QN1`` cookie set during the request.

    Raises:
        KeyError: if no ``QN1`` cookie was set.
    """
    # Take a single timestamp so the URL and the payload always agree on both
    # dates, even when the call straddles midnight.
    now = datetime.datetime.now()
    from_date = now.strftime('%Y-%m-%d')
    to_date = (now + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    url = (
        "https://hotel.qunar.com/cn/bazhong/?fromDate=" + from_date
        + "&toDate=" + to_date + "&cityName=巴中"
    )
    payload = {
        'b': {
            'bizVersion': 17,
            'cityUrl': 'bazhong',
            'fromDate': from_date,
            'toDate': to_date,
            'q': '',
            'qFrom': 3,
            'start': 640,
            'num': 20,
            'minPrice': 0,
            'maxPrice': -1,
            'level': '',
            'sort': 0,
            'cityType': 1,
            'fromForLog': 1,
            'uuid': '',
            'userName': '',
            'userId': '',
            'fromAction': '',
            'searchType': 0,
            'hourlyRoom': False,
            'locationAreaFilter': [],
            'comprehensiveFilter': [],
            'channelId': 1,
        },
        'qrt': 'h_hlist',
        'source': 'website',
    }
    # NOTE(review): `data=` form-encodes the payload, so the nested dict under
    # 'b' is not sent as JSON -- confirm whether `json=payload` was intended.
    # Kept as-is because only the cookies set by the response are used here.
    with requests.Session() as session:  # close pooled connections when done
        session.post(url, data=payload)
        cookies = requests.utils.dict_from_cookiejar(session.cookies)
    return cookies['QN1']
# print(session.cookies)
# print(cookies)
# print(type(cookies))
# print(cookies['QN1'])
def change_cookie(headers_data):  # refresh the QN1 cookie
    """Swap the ``QN1`` value in ``headers_data['Cookie']`` for a fresh one.

    The previous version replaced one hard-coded QN1 value, so once that value
    had been replaced every later refresh was a silent no-op. The ``QN1=...``
    pair is now located by pattern, whatever its current value is.

    Args:
        headers_data: request-header dict containing a ``'Cookie'`` string.
            It is updated in place, as before.

    Returns:
        The same ``headers_data`` dict, after the update.

    Raises:
        KeyError: if ``headers_data`` has no ``'Cookie'`` entry, or if
            ``get_cookies`` cannot find a ``QN1`` cookie.
    """
    cookie_data = get_cookies()
    fresh_pair = "QN1=" + str(cookie_data)
    # A callable replacement keeps any backslashes in the cookie value literal.
    headers_data['Cookie'] = re.sub(
        r'\bQN1=[^;]*', lambda _match: fresh_pair, headers_data['Cookie']
    )
    print(headers_data)
    return headers_data
def _save_city_row_if_new(exists_condition, district_name, district_spell, city_name, city_spell):
    """Insert one ``qunar_List_City`` row unless a row matching ``exists_condition`` exists."""
    if qunar_List_City.select().where(exists_condition):
        return
    catalogue = qunar_List_City()
    catalogue.district_name = district_name  # administrative district name
    catalogue.district_spell = district_spell  # administrative district pinyin / id
    catalogue.city_name = city_name  # city name
    catalogue.city_spell = city_spell  # city pinyin slug
    catalogue.create_time = datetime.datetime.now()  # crawl time
    catalogue.save(force_insert=True)


# Load the city data from the saved JSON file and store one row per district.
def save_city_list():
    """Populate ``qunar_List_City`` from ``cityList.json`` and the district API.

    For every city entry in ``cityList.json`` the district endpoint
    (``district_url`` + city slug) is queried. When the first element of the
    response's ``data`` list is the city's district group (its name equals
    ``<city>行政区酒店`` and its type is ``"city"``), one row per district is
    stored. Otherwise a single row is stored with the city acting as its own
    district. Rows that already exist are skipped.

    A failure while handling one city is reported and the loop moves on to the
    next city. ``Exception`` is caught instead of a bare ``except`` so that
    Ctrl-C / ``SystemExit`` still stop the crawl.
    """
    with open('cityList.json', 'r', encoding='utf8') as fp:
        json_data = json.load(fp)
    # The file is fully parsed above, so it is closed before the slow network
    # loop starts.
    for data in json_data:
        for data_0 in data:
            for data_value in data_0['value']:
                district_url_0 = district_url + str(data_value['url'])
                response = requests.request("GET", district_url_0)
                json_city = json.loads(response.text)
                try:
                    first_group = json_city['data'][0]
                    is_district_group = (
                        first_group["name"] == data_value['name'] + "行政区酒店"
                        and first_group['type'] == "city"
                    )
                    if is_district_group:
                        for item in first_group['list']:
                            # Keep the text before "酒店", then before the first space.
                            data_i = item["name"].split("酒店")[0].split(" ")[0]
                            _save_city_row_if_new(
                                qunar_List_City.district_spell == item['id'],
                                data_i,
                                item['id'],
                                data_value['name'],
                                data_value['url'],
                            )
                    else:
                        # No district breakdown: the city stands in as its own district.
                        _save_city_row_if_new(
                            qunar_List_City.city_name == data_value['name'],
                            data_value['name'],
                            data_value['url'],
                            data_value['name'],
                            data_value['url'],
                        )
                except Exception as exc:
                    print("非大陆数据或者城市酒店数据为空")
                    print(district_url_0)
                    # Show the real cause too; the message above is only a guess.
                    print(repr(exc))
# Use the rows stored in qunar_List_City to assemble the hotel list URLs.
def save_hotel_url_to_redis():
    """Queue one hotel-list URL per stored city/district row.

    Each queued entry is the list URL (today -> tomorrow, URL-quoted city
    name) followed by a space and the district name, and is pushed with
    ``r.lpush`` onto ``'test.com:hotel_url'``. ``r`` is not defined in the
    visible part of this file -- presumably a Redis client.
    """
    date_format = '%Y-%m-%d'
    for row in qunar_List_City.select():
        check_in = datetime.datetime.now().strftime(date_format)
        check_out = (datetime.datetime.now() + datetime.timedelta(days=1)).strftime(date_format)
        pieces = [
            domain_hotel,
            row.city_spell,
            '/?fromDate=',
            check_in,
            '&toDate=',
            check_out,
            '&cityName=',
            parse.quote(row.city_name),
            " ",  # separator: the district name travels after the URL
            str(row.district_name),
        ]
        r.lpush('test.com:hotel_url', "".join(pieces))  # crawl URL for hotel data
# Use the rows stored in qunar_List_City to assemble the ticket URLs.
def save_ticket_url_to_redis():
    """Queue one ticket URL per stored district.

    The placeholder ``%E5%A6%82%E7%9A%8B%E5%B8%82`` (percent-encoded "如皋市")
    inside ``tickect_url`` is replaced with the URL-quoted district name and
    the result is pushed onto ``'test.com:ticket_url'``. ``tickect_url`` and
    ``r`` are not defined in the visible part of this file.
    """
    placeholder = '%E5%A6%82%E7%9A%8B%E5%B8%82'
    for row in qunar_List_City.select():
        ticket_link = tickect_url.replace(placeholder, parse.quote(row.district_name))
        r.lpush('test.com:ticket_url', ticket_link)  # crawl URL for ticket data
# Prefix of the vacation-package list URL; the embedded %E5%8D%97%E9%80%9A is
# percent-encoded "南通". save_vacation_url_to_redis appends the quoted
# district name and the "_all?..." suffix. A complete example URL follows.
domain_vacation = "https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_"
#'https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_%E8%8B%8F%E5%B7%9E_all?ti=3&tm=l01_all_search_newc'
# Use the rows stored in qunar_List_City to assemble the vacation list URLs.
def save_vacation_url_to_redis():
    """Queue one vacation-list URL per stored district.

    Each entry is ``domain_vacation`` + quoted district name + the fixed
    ``_all?...`` suffix, followed by a space and the city name, and is pushed
    onto ``'test.com:vacation_url'``. ``r`` is not defined in the visible part
    of this file.
    """
    suffix = '_all?ti=3&tm=l01_all_search_newc'
    for row in qunar_List_City.select():
        list_url = domain_vacation + parse.quote(row.district_name) + suffix
        entry = list_url + " " + str(row.city_name)
        r.lpush('test.com:vacation_url', entry)  # URL for vacation products
def get_nodes_json():
    """Pop one queued hotel-list entry and crawl all of its result pages.

    The entry popped from ``'test.com:hotel_url'`` has the form
    ``"<list url> <district name>"`` (see ``save_hotel_url_to_redis``).
    The city slug and district name are extracted from it, substituted into
    the ``payload`` template, and the hotel API is probed for the total hotel
    count (``tcount``). ``process_response_data`` is then called once per page
    of 20 hotels.

    ``r``, ``payload``, ``headers`` and ``url_hotel_api`` are not defined in
    the visible part of this file; ``payload`` is used as a JSON string
    template.

    Raises:
        TypeError / AttributeError: if the queue is empty or the entry does not
            have the expected shape (unchanged from the original behaviour).
    """
    url = r.lpop('test.com:hotel_url')
    # NOTE(review): assumes lpop returns str (e.g. decode_responses=True) -- confirm.
    # url = 'https://hotel.qunar.com/cn/wuzhishan/?fromDate=2020-08-06&toDate=2020-08-07&cityName=%E4%BA%94%E6%8C%87%E5%B1%B1'
    # City pinyin slug: the path segment between "cn/" and "/?". The "\?" must
    # stay escaped, otherwise ".*" swallows the whole query string.
    city_spell = re.search(r"cn/(.*?)/\?", url).group(1)
    # District name: the first run of CJK characters. The city name in the URL
    # is percent-encoded ASCII, so this matches the district after the space.
    district_name = re.search(r"([\u2E80-\u9FFF]+)", url).group(1)
    url = url.split(" ")[0]

    # One timestamp for both dates so they cannot disagree across midnight.
    now = datetime.datetime.now()
    from_date = now.strftime('%Y-%m-%d')
    to_date = (now + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    # Fill the placeholders of the JSON payload template.
    payload_data = payload
    payload_data = payload_data.replace('"cityUrl":" "', '"cityUrl":"' + city_spell + '"')
    payload_data = payload_data.replace('"大兴区"', '"' + district_name + '"')  # administrative district name
    payload_data = payload_data.replace('"fromDate":" "', '"fromDate":"' + from_date + '"')
    payload_data = payload_data.replace('"toDate":" "', '"toDate":"' + to_date + '"')

    # NOTE(review): this aliases the shared `headers` dict, so the referer set
    # here is visible to every other user of `headers` -- kept as in the original.
    headers_data = headers
    headers_data['referer'] = url

    # Probe for the total hotel count. Every attempt counts towards the limit
    # (previously only HTTP-200 replies did, so non-200 replies looped forever),
    # and the body is parsed only after the status check.
    hotel_number = 0
    max_probes = 16  # same number of evaluated replies as the original loop
    payload_data_0 = payload_data.encode("utf-8")
    for _attempt in range(max_probes):
        response = requests.request("POST", url_hotel_api, headers=headers_data, data=payload_data_0)
        print(district_name)
        if response.status_code != 200:
            continue
        try:
            json_data = json.loads(response.text)
        except ValueError:
            # Non-JSON body (e.g. an anti-bot page): treat as a failed attempt.
            continue
        print(json_data['bstatus'])
        if json_data['bstatus']['code'] == 0:
            hotel_number = json_data['data']['tcount']
            break
    print(hotel_number, "酒店总数量")

    # Walk the result pages, 20 hotels at a time. `hotel_number` counts the
    # hotels not yet processed and is passed on so the page handler knows how
    # many hotels to expect on the current page.
    start_num = 0
    before_num = 0
    while hotel_number > 0:
        print(hotel_number, "剩余未处理酒店数量")
        # Move the paging offset from the previous page's value to this page's.
        payload_data = payload_data.replace('"start":' + str(before_num), '"start":' + str(start_num))
        payload_data_0 = payload_data.encode("utf-8")
        process_response_data(headers_data, payload_data_0, hotel_number, district_name)
        before_num = start_num
        start_num = start_num + 20
        hotel_number = hotel_number - 20
# Fetch the detailed review counts of one hotel.
def get_hotel_comment(hotel_id):
    """Return ``[negativeCount, neutralCount, positiveCount]`` for a hotel.

    Requests the first page of the hotel's review list (the ``nantong_5058``
    placeholder in the URL is replaced by ``hotel_id``). The request is
    repeated, without delay or limit, until the response body is at least
    40 characters long, then the three counters are read from
    ``data.ratingStat``.
    """
    comment_url = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1".replace(
        "nantong_5058", hotel_id
    )
    while True:
        response = requests.request("GET", comment_url)
        if len(response.text) >= 40:
            break
    rating_stat = json.loads(response.text)["data"]["ratingStat"]
    return [
        rating_stat["negativeCount"],
        rating_stat["neutralCount"],
        rating_stat["positiveCount"],
    ]
# except:
# time.sleep(3)
# try:
# hotel_comment = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1"
# hotel_comment = hotel_comment.replace("nantong_5058",hotel_id)
# response = requests.request("GET", hotel_comment)
# # print(hotel_comment,response.status_code)
# json_data = json.loads(response.text)
# negativeCount = json_data["data"]["ratingStat"]["negativeCount"]
# neutralCount = json_data["data"]["ratingStat"]["neutralCount"]
# positiveCount = json_data["data"]["ratingStat"]["positiveCount"]
# return [negativeCount,neutralCount,positiveCount]
# except:
# hotel_comment = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1"
# hotel_comment = hotel_comment.replace("nantong_5058",hotel_id)
# print(hotel_comment,"没有评论数的酒店信息")
# return [0,0,0]
def _save_hotel_row(data_hotel, hotel_city, district_name):
    """Persist one hotel entry of the list response as a ``qunar_Hotel_data`` row."""
    hotel_data = qunar_Hotel_data()
    hotel_data.hotel_district = district_name
    hotel_data.hotel_city = hotel_city
    hotel_data.hotel_name = data_hotel['name']
    hotel_data.hotel_level = data_hotel['dangciText']
    hotel_data.hotel_score = data_hotel['score']
    hotel_data.hotel_price = float(data_hotel['price'])
    hotel_data.hotel_commentCount = data_hotel['commentCount']
    negativeCount, neutralCount, positiveCount = get_hotel_comment(data_hotel["seqNo"])
    hotel_data.hotel_negativeCount = negativeCount
    hotel_data.hotel_neutralCount = neutralCount
    hotel_data.hotel_positiveCount = positiveCount
    hotel_data.hotel_Number = data_hotel['phoneNumber']
    hotel_data.hotel_LocationInfo = data_hotel['locationInfo']
    hotel_data.hotel_image = data_hotel["imageid"]
    hotel_data.create_time = datetime.datetime.now()  # crawl time
    hotel_data.save(force_insert=True)


def process_response_data(headers_data, payload_data, hotel_number, district_name):  # handle one page of the hotel-list response
    """Fetch one page of the hotel list and store every hotel on it.

    The request is repeated until a complete page has been received and saved.
    There is no overall retry limit: after repeated API errors the function
    sleeps and starts counting again.

    Args:
        headers_data: request headers; refreshed through ``change_cookie`` on
            every 200th request made by this call.
        payload_data: encoded request body for this page.
        hotel_number: number of hotels not yet processed, including this page.
            The page is expected to hold ``min(hotel_number, 20)`` hotels.
        district_name: district name stored with every hotel row.

    ``url_hotel_api`` is not defined in the visible part of this file.
    """
    page_size = 20
    # The original checks used `> 20` and `< 20`, so a page with exactly 20
    # remaining hotels was never validated; min() covers that boundary too.
    expected_count = min(hotel_number, page_size)
    connect_times = 20  # retry budget before the long back-off sleep
    flag_num = 0  # request counter; triggers the periodic cookie refresh
    while connect_times:
        flag_num = flag_num + 1
        if flag_num % 200 == 0:
            headers_data = change_cookie(headers_data)
        response = requests.request("POST", url_hotel_api, headers=headers_data, data=payload_data)
        if response.status_code != 200:
            print("网页请求错误")
            continue
        json_data = json.loads(response.text)
        status = json_data['bstatus']['code']
        if status == 0:
            time.sleep(random.randint(0, 2))  # random pause between successful requests
            hotel_city = json_data['data']['cityName']  # city the hotels belong to
            hotels = json_data['data']['hotels']
            print(len(hotels), "hotels的数量")
            if len(hotels) != expected_count:
                # Truncated page: restore the retry budget, wait, fetch again.
                connect_times = 20
                print(f"当前hotel_number={hotel_number}")
                print("获取残缺数据,数据不完整,跳出此处获取,重新抓取")
                print("休眠120s")
                time.sleep(120)
                continue
            for data_hotel in hotels:
                _save_hotel_row(data_hotel, hotel_city, district_name)
            connect_times = 0  # page stored: leave the loop
        elif status == -1000:
            print("搜索条件修改")
            time.sleep(3)
        else:
            connect_times = connect_times - 1
            if 20 - connect_times + 1 > 18:
                # Budget nearly used up: reset it and back off.
                connect_times = 20
                # NOTE(review): the message says 900s but the sleep is 120s -- confirm which is intended.
                print("连接次数达到上线,休眠900s")
                time.sleep(120)
class parse_qunar_url_Thread(Thread):
    """Worker thread that calls ``get_nodes_json`` in an endless loop."""

    def run(self):
        """Process queued hotel URLs until ``get_nodes_json`` raises."""
        while True:
            get_nodes_json()
# Script entry point: create the tables, store the city list, then queue the
# hotel, vacation and ticket URLs -- in that order.
if __name__ == "__main__":
    pipeline = (
        create_tables,
        save_city_list,
        save_hotel_url_to_redis,
        save_vacation_url_to_redis,
        save_ticket_url_to_redis,
    )
    for step in pipeline:
        step()
    # for i in range(20):
    #     parse_qunar_url_thread = parse_qunar_url_Thread()
    #     parse_qunar_url_thread.start()