马蜂窝景点数据采集

马蜂窝数据采集的难点在于请求中的 `_sn` 签名参数：先把请求参数拼接成 JSON 字符串，在末尾追加一个固定字符串，对结果计算 MD5，再截取十六进制摘要的第 3 到第 12 位（共 10 个字符）作为 `_sn`。

import hashlib
import pymongo
import pandas
import requests
import time
from pyquery import PyQuery as pq
from retry import retry


class ScenicSpot:
    """Scrape Mafengwo scenic-spot listings into MongoDB and an Excel file.

    The listing endpoint expects a ``_sn`` signature.  It is produced by
    serialising the request parameters as a compact JSON object (in the fixed
    key order used by ``_signed_form``), appending ``SIGN_SALT``, hashing the
    result with MD5 and keeping characters 2..11 of the hex digest.
    """

    # Fixed string appended to the serialised parameters before hashing.
    SIGN_SALT = 'c9d6618dbc657b41a66eb0af952906f1'
    # Value of the ``sAct`` parameter sent to the listing endpoint.
    LIST_ACTION = 'KMdd_StructWebAjax|GetPoisByTag'
    # Prefix for the relative detail-page links found in the listing HTML.
    BASE_URL = "http://www.mafengwo.cn"
    # Seconds to wait on each HTTP call so a stalled connection cannot hang
    # the whole crawl.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        """Set up endpoints, request headers, the Mongo collection and proxies."""
        self.scenic_url = "http://www.mafengwo.cn/ajax/router.php"
        # Headers for the AJAX listing request.
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        # Despite its name this attribute is the ``ScenicSpot`` collection of
        # the ``ScenicSpot`` database on the default MongoDB client.
        self.client = pymongo.MongoClient().ScenicSpot.ScenicSpot
        # Headers for the detail pages.
        # NOTE(review): the Cookie is a hard-coded browser session and will
        # need refreshing; ideally it would come from configuration.
        self.pio_headers = {
            'Host': 'www.mafengwo.cn',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
            'Cookie': 'PHPSESSID=o98g37f4squ0aq4ubcr07d84f2; mfw_uuid=5fe69be4-4047-f9a5-fa34-c5d367eb316b; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222020-12-26+10%3A11%3A48%22%3B%7D; __jsluid_h=8b69bee30f0e6459c08df76385484c05; __omc_chl=; __omc_r=; __mfwc=direct; __mfwa=1608948709480.47993.1.1608948709480.1608948709480; __mfwb=0418cf79c433.1.direct; __mfwlv=1608948709; __mfwvn=1; __mfwlt=1608948709; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1608948710; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1608948710; uva=s%3A78%3A%22a%3A3%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1608948710%3Bs%3A10%3A%22last_refer%22%3Bs%3A6%3A%22direct%22%3Bs%3A5%3A%22rhost%22%3Bs%3A0%3A%22%22%3B%7D%22%3B; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1608948710%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A0%3A%22%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5fe69be4-4047-f9a5-fa34-c5d367eb316b; bottom_ad_status=1; UM_distinctid=1769cd106714ce-0be3d1620af6d5-3e604809-1fa400-1769cd106729c0; CNZZDATA30065558=cnzz_eid%3D1359122677-1608945050-%26ntime%3D1608945050; __jsl_clearance=1608948735.479|0|LP8kMR7h6lJOyF4aqU9yvnUg4Ek%3D'
        }
        # Every scraped record, later exported to Excel by ``run``.
        self.all_list = []
        # NOTE(review): proxy credentials are hard-coded in source; they
        # should be moved to configuration or environment variables.
        tunnel = "tps198.kdlapi.com:15818"
        username = "t10886694756492"
        password = "bjgfg7jn"
        proxy_auth = {"user": username, "pwd": password, "proxy": tunnel}
        self.proxies = {
            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % proxy_auth,
            "https": "https://%(user)s:%(pwd)s@%(proxy)s/" % proxy_auth,
        }

    def par(self, t):
        """Return characters 2..11 of the MD5 hex digest of ``t``.

        Args:
            t: Bytes to hash (callers encode the signing string as UTF-8).

        Returns:
            A 10-character lowercase hexadecimal string.
        """
        return hashlib.md5(t).hexdigest()[2:12]

    def _signed_form(self, page, ts, mddid='10794'):
        """Build the POST form for one listing page, including its ``_sn``.

        The signing string and the form are built from the same ``ts`` string
        so the value that is hashed always matches the value that is sent.

        Args:
            page: Page number as a string.
            ts: Timestamp string sent as ``_ts``.
            mddid: Destination id sent as ``iMddid``.

        Returns:
            Dict of form fields ready to pass as ``data=`` to ``requests.post``.
        """
        raw = '{"_ts":"%s","iMddid":"%s","iPage":"%s","iTagId":"0","sAct":"%s"}%s' % (
            ts, mddid, page, self.LIST_ACTION, self.SIGN_SALT)
        return {
            'sAct': self.LIST_ACTION,
            'iMddid': mddid,
            '_ts': ts,
            'iPage': page,
            'iTagId': '0',
            '_sn': self.par(raw.encode('utf-8')),
        }

    def get_page(self, start_page=18, stop_page=21, mddid='10794'):
        """Fetch listing pages and scrape every scenic spot linked from them.

        Args:
            start_page: First page number to request.
            stop_page: Page number to stop before (exclusive, as in ``range``).
            mddid: Destination id sent as ``iMddid``.
        """
        for page_no in range(start_page, stop_page):
            print('第' + str(page_no) + '页')
            # Millisecond timestamp, stringified once so the signature and
            # the posted ``_ts`` are identical.
            ts = str(time.time() * 1000)
            form = self._signed_form(str(page_no), ts, mddid)
            response = requests.post(url=self.scenic_url, headers=self.headers, data=form,
                                     proxies=self.proxies, timeout=self.REQUEST_TIMEOUT)
            # The JSON payload carries the listing as an HTML fragment.
            list_html = response.json()['data']['list']
            for li in pq(list_html)('li').items():
                link = li('a')
                href = link.attr("href")
                if not href:
                    # An entry without a link has no detail page to visit.
                    continue
                self.get_point_info(self.BASE_URL + href, link.attr("title"))
                # Pause between detail pages to limit the request rate.
                time.sleep(2)

    def get_point_info(self, url, title):
        """Download one scenic-spot page, then parse and store its details.

        Note: the hard-coded cookie is known to be problematic.

        Args:
            url: Absolute URL of the detail page.
            title: Name of the scenic spot taken from the listing.

        Raises:
            RuntimeError: If the site answers with HTTP 521.
        """
        self.pio_headers['Referer'] = url
        print(url)
        poi_resp = requests.get(url, headers=self.pio_headers, timeout=self.REQUEST_TIMEOUT)
        if poi_resp.status_code == 521:
            # A bare ``raise`` with no active exception only yields an opaque
            # RuntimeError; raise the same type with a useful message.
            raise RuntimeError(
                "HTTP 521 received for %s; the hard-coded Cookie is probably "
                "stale or rejected" % url)
        poi_dict = self._parse_point(pq(poi_resp.content), title)
        self._store(poi_dict)
        print(poi_dict)

    def _parse_point(self, poi_doc, title):
        """Extract the record fields from a parsed detail page.

        Args:
            poi_doc: PyQuery document of the detail page.
            title: Name of the scenic spot.

        Returns:
            Dict holding the name, introduction, ticket and opening-hours
            text (when present) and the location.
        """
        poi_dict = {
            '景区名称': title,
            '景区介绍': poi_doc('.summary').text(),
        }
        for dl in poi_doc('.mod.mod-detail dl').items():
            dt = dl('dt').text()
            # Only the ticket and opening-hours rows are kept; the first
            # matching label wins, other rows are ignored.
            for field in ('门票', '开放时间'):
                if field in dt:
                    poi_dict[field] = dl('dd').text()
                    break
        poi_dict['景点位置'] = poi_doc('.mod.mod-location p').text()
        return poi_dict

    def _store(self, poi_dict):
        """Persist one record to MongoDB and keep it for the Excel export."""
        # insert_one() adds an ``_id`` key to the dict it receives; pass a
        # copy so that key does not end up in the exported spreadsheet.
        self.client.insert_one(dict(poi_dict))
        self.all_list.append(poi_dict)

    def run(self):
        """Crawl the listing pages and write all records to ``旅游景点.xlsx``."""
        self.get_page()
        pandas.DataFrame(self.all_list).to_excel('旅游景点.xlsx', index=False)


if __name__ == '__main__':
    # Script entry point: build the scraper, then crawl and export.
    spider = ScenicSpot()
    spider.run()

相关阅读:
python格式化输出之format用法
 Mybatis插入数据返回主键
 DBC 和 Mybatis连接mysql数据库的时候，设置字符集编码
 工具列表
 Idea的Git如何回退到上一个版本
 mybatis-plus id主键生成的坑
 JAVA 线上故障排查完整套路，从 CPU、磁盘、内存、网络、GC 一条龙！
DDD-快速hold住业务的利器
 深入理解ThreadLocal的原理和内存泄漏问题
 VUE开发--环境配置
原文地址：https://www.cnblogs.com/lqn404/p/14231287.html