• 马蜂窝景点数据采集


    马蜂窝景点数据采集。难点在于请求参数 _sn 的加密:将请求参数拼成的 JSON 字符串末尾加上一段固定字符串,计算其 MD5 值,再截取十六进制摘要的第 3~12 位(共 10 个字符)作为 _sn。

    import hashlib
    import pymongo
    import pandas
    import requests
    import time
    from pyquery import PyQuery as pq
    from retry import retry
    
    
    class ScenicSpot:
        """Scrape scenic-spot (POI) details from mafengwo.cn.

        The listing endpoint requires a ``_sn`` signature derived from the request
        parameters (see ``par`` and ``_signed_form``).  Every scraped spot is
        stored in MongoDB and also collected in ``self.all_list`` so that ``run``
        can export the result to an Excel file.
        """

        def __init__(self):
            """Set up endpoint URLs, request headers, the Mongo collection and proxies."""
            self.scenic_url = "http://www.mafengwo.cn/ajax/router.php"
            # Minimal headers used for the AJAX listing request.
            self.headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
            }
            # Despite the attribute name this is a *collection*: collection
            # ``ScenicSpot`` of database ``ScenicSpot`` on a default MongoClient.
            self.client = pymongo.MongoClient().ScenicSpot.ScenicSpot
            # Browser-like headers used for the POI detail pages.
            # NOTE(review): the Cookie below is a hard-coded captured session and
            # will presumably expire; get_point_info's original note says the
            # cookie is problematic.
            self.pio_headers = {
                'Host': 'www.mafengwo.cn',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
                'Cookie': 'PHPSESSID=o98g37f4squ0aq4ubcr07d84f2; mfw_uuid=5fe69be4-4047-f9a5-fa34-c5d367eb316b; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222020-12-26+10%3A11%3A48%22%3B%7D; __jsluid_h=8b69bee30f0e6459c08df76385484c05; __omc_chl=; __omc_r=; __mfwc=direct; __mfwa=1608948709480.47993.1.1608948709480.1608948709480; __mfwb=0418cf79c433.1.direct; __mfwlv=1608948709; __mfwvn=1; __mfwlt=1608948709; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1608948710; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1608948710; uva=s%3A78%3A%22a%3A3%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1608948710%3Bs%3A10%3A%22last_refer%22%3Bs%3A6%3A%22direct%22%3Bs%3A5%3A%22rhost%22%3Bs%3A0%3A%22%22%3B%7D%22%3B; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1608948710%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A0%3A%22%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5fe69be4-4047-f9a5-fa34-c5d367eb316b; bottom_ad_status=1; UM_distinctid=1769cd106714ce-0be3d1620af6d5-3e604809-1fa400-1769cd106729c0; CNZZDATA30065558=cnzz_eid%3D1359122677-1608945050-%26ntime%3D1608945050; __jsl_clearance=1608948735.479|0|LP8kMR7h6lJOyF4aqU9yvnUg4Ek%3D'
                }
            # Every scraped POI dict, in scrape order; exported by run().
            self.all_list = []
            # NOTE(review): proxy credentials are hard-coded in source; they should
            # be moved to configuration / environment variables and rotated.
            tunnel = "tps198.kdlapi.com:15818"
            username = "t10886694756492"
            password = "bjgfg7jn"
            self.proxies = {
                "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
                "https": "https://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
            }

        def par(self, t):
            """Return characters 2..11 (10 hex chars) of the MD5 hex digest of ``t``.

            :param t: bytes to hash.
            :return: 10-character lowercase hex string used as the ``_sn`` value.
            """
            return hashlib.md5(t).hexdigest()[2:12]

        def _signed_form(self, page, ts):
            """Build the form payload for one listing page, including ``_sn``.

            The signature is computed over a JSON-like string of the parameters
            (keys in the fixed order ``_ts, iMddid, iPage, iTagId, sAct``) followed
            by a fixed suffix string.

            :param page: page number as a string.
            :param ts: timestamp value sent as ``_ts`` (signed via ``str(ts)``).
            :return: dict suitable for ``requests.post(data=...)``.
            """
            signed_text = (
                '{"_ts":"' + str(ts) + '","iMddid":"10794","iPage":"' + str(page)
                + '","iTagId":"0","sAct":"KMdd_StructWebAjax|GetPoisByTag"}'
                + 'c9d6618dbc657b41a66eb0af952906f1'
            )
            return {
                'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
                'iMddid': '10794',
                '_ts': ts,
                'iPage': page,
                'iTagId': '0',
                '_sn': self.par(signed_text.encode('utf-8')),
            }

        def get_page(self, start_page=18, end_page=21):
            """Fetch listing pages and scrape every scenic spot linked from them.

            :param start_page: first page to fetch (inclusive); default keeps the
                original hard-coded value.
            :param end_page: page to stop before (exclusive); default keeps the
                original hard-coded value.
            """
            for i in range(start_page, end_page):
                t = time.time() * 1000
                print('第' + str(i) + '页')
                form = self._signed_form(str(i), t)
                response = requests.post(url=self.scenic_url, headers=self.headers, data=form, proxies=self.proxies)
                # The JSON response carries an HTML fragment under data.list.
                list_html = response.json()['data']['list']
                doc = pq(list_html)
                for li in doc('li').items():
                    anchor = li('a')
                    href = anchor.attr("href")
                    if not href:
                        # No link on this entry: nothing to follow (previously a
                        # TypeError from concatenating None).
                        continue
                    title = anchor.attr("title")
                    title_url = "http://www.mafengwo.cn" + href
                    self.get_point_info(title_url, title)
                    # Pause between detail requests.
                    time.sleep(2)

        def get_point_info(self, url, title):
            """Scrape one scenic-spot detail page, store it and remember it.

            Known issue (from the original note): the cookie is problematic.

            :param url: absolute URL of the POI detail page.
            :param title: scenic-spot name taken from the listing.
            :raises RuntimeError: if the site answers with HTTP 521
                (presumably its anti-crawler challenge -- TODO confirm).
            """
            poi_dict = {}
            poi_dict['景区名称'] = title
            self.pio_headers['Referer'] = url
            print(url)
            poi_resp = requests.get(url, headers=self.pio_headers)
            if poi_resp.status_code == 521:
                # A bare ``raise`` here had no active exception and produced an
                # unrelated "No active exception to re-raise" RuntimeError; keep
                # the exception type but say what actually happened.
                raise RuntimeError("HTTP 521 returned for %s" % url)
            poi_doc = pq(poi_resp.content)
            poi_dict['景区介绍'] = poi_doc('.summary').text()
            place_station = poi_doc('.mod.mod-location p').text()
            for dl in poi_doc('.mod.mod-detail dl').items():
                dt = dl('dt').text()
                if '门票' in dt:
                    poi_dict['门票'] = dl('dd').text()
                elif '开放时间' in dt:
                    poi_dict['开放时间'] = dl('dd').text()
            poi_dict['景点位置'] = place_station
            # insert_one() adds an ``_id`` key to the document it is given; insert
            # a copy so the ObjectId does not leak into the printed dict or the
            # Excel export built from all_list.
            self.client.insert_one(dict(poi_dict))
            print(poi_dict)
            self.all_list.append(poi_dict)

        def run(self):
            """Scrape the listing pages, then export all results to an Excel file."""
            self.get_page()
            pandas.DataFrame(self.all_list).to_excel('旅游景点.xlsx', index=False)
    
    
    if __name__ == '__main__':
        # Script entry point: build the scraper and run the full
        # scrape-and-export pipeline.
        spider = ScenicSpot()
        spider.run()
    

      

  • 相关阅读:
    python格式化输出之format用法
    Mybatis插入数据返回主键
    DBC 和 Mybatis连接mysql数据库的时候,设置字符集编码
    工具列表
    Idea的Git如何回退到上一个版本
    mybatis-plus id主键生成的坑
    JAVA 线上故障排查完整套路,从 CPU、磁盘、内存、网络、GC 一条龙!
    DDD-快速hold住业务的利器
    深入理解ThreadLocal的原理和内存泄漏问题
    VUE开发--环境配置
  • 原文地址:https://www.cnblogs.com/lqn404/p/14231287.html
Copyright © 2020-2023  润新知