• 爬取大众点评评论


    # -*- coding: utf-8 -*-
    # @Time    : 2020/12/2 9:58
    # @Author  : ward
    # @File    : 3.py
    
    
    import re
    import requests
    import random
    from parsel import Selector
    
    
    class DaZongDianPing:
        """Scrape the review text of one Dianping shop page.

        The page hides some characters behind empty ``<svgmtsi class="...">``
        tags.  Each class is given a background offset by a stylesheet whose
        URL contains ``svgtextcss``; that offset points at one character inside
        an SVG sprite.  This class downloads the page, the stylesheet and the
        sprite, rebuilds the tag -> character mapping, substitutes it into the
        HTML and prints the text of every ``review-words Hide`` block.

        Instantiating the class runs the whole pipeline (``__init__`` calls
        ``main``).
        """

        # Seconds to wait for any single HTTP request before giving up, so a
        # stalled connection cannot hang the script forever.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            """Build the request headers and immediately run the crawl."""
            # NOTE(review): a logged-in session cookie is hard-coded below.
            # It is a credential and will expire; consider loading it from the
            # environment or a config file instead of committing it.
            self.headers = {
                # The trailing number is randomised so each run sends a
                # slightly different User-Agent string.
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.%s' % random.randint(1, 1000000),
                'Host': 'www.dianping.com',
                "Cookie": "_lxsdk_cuid=17451850da2c8-0151b42767558d-5a40201d-1fa400-17451850da3c8; _lxsdk=17451850da2c8-0151b42767558d-5a40201d-1fa400-17451850da3c8; _hc.v=3de5d487-65ad-a36a-71b1-c2eca91f37ef.1599095509; fspop=test; s_ViewType=10; dplet=e2e81f83472f4fe27e2a8928f00d24b7; dper=7f72060feeb63c283b680d6f38a54880bd9a44235ca03a0e460480fcf5d55e5a625885d5ad489752cfeb6709c61e2f9296fe9e733dd377f255892ff076df1a1d21f866b2ca757945796d0888f0951bf12e33b34f69f96426e173c128ea192a2d; ll=7fd06e815b796be3df069dec7836c3df; ua=%E5%90%8E%E5%A4%A9%E3%80%82%E3%80%82%E3%80%82; ctu=b5785c51f597a565ad3d8e65f9b30f75b29ec4a827de86a259bfeb6fbf80ce70; aburl=1; cy=2; cye=beijing; __utma=205923334.1472379769.1606804705.1606804705.1606804705.1; __utmz=205923334.1606804705.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmc=205923334; looyu_id=fcdb89be99ee3adbf016cb59105e993b84_51868%3A1; cityid=2; switchcityflashtoast=1; default_ab=citylist%3AA%3A1%7Cindex%3AA%3A3; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1606290415,1606879423; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1606879438; _lxsdk_s=1762174b956-be1-f70-a8f%7C%7C327",
                'Accept-Encoding': 'gzip',
            }

            self.main()

        def main(self):
            """Run the pipeline: fetch page, build the glyph map, print reviews."""
            html = self.get_index()
            if not html:
                # get_index() returns None on a non-200 response; there is
                # nothing to parse in that case.
                print('Failed to fetch the review page; nothing to parse.')
                return
            css_url, class_name = self.get_url_and_tag(html)
            # Without the stylesheet no glyph can be decoded; still print the
            # plain (un-obfuscated) part of the reviews as a best effort.
            di = self.get_css_and_svg(css_url, class_name) if css_url else {}
            self.parse_index(html, di)

        def get_index(self):
            """Download the shop's review page.

            Returns:
                The response body as text when the status code is 200,
                otherwise ``None``.
            """
            url = 'http://www.dianping.com/shop/l2xKmVcQ5nWBYCXA/review_all'
            resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            print(resp)
            print(resp.text)
            if resp.status_code == 200:
                return resp.text
            return None

        def get_url_and_tag(self, html):
            """Extract the stylesheet URL and the obfuscated-glyph class names.

            Args:
                html: Source of the review page.

            Returns:
                A ``(css_url, class_name)`` tuple.  ``css_url`` is the first
                ``href`` containing ``svgtextcss`` prefixed with ``http:``, or
                an empty list when no such link exists.  ``class_name`` is the
                list of ``class`` values of every ``<svgmtsi>`` tag, in page
                order and including duplicates.
            """
            css_url = re.findall(r'href="(.*?svgtextcss.*?)"', html)
            if css_url:
                # The href captured here has no scheme, so one is prepended.
                css_url = 'http:' + css_url[0]
            class_name = re.findall(r'<svgmtsi class="(.*?)">', html)
            return css_url, class_name

        @staticmethod
        def _extract_svg_url(css_text):
            """Return the review sprite URL from whitespace-stripped CSS, or None."""
            match = re.search(r'svgmtsi.*?url\((.*?)\);', css_text)
            if match is None:
                return None
            return 'http:' + match.group(1)

        @staticmethod
        def _parse_offset(css_text, name):
            """Return the integer ``(x, y)`` background offset of class *name*.

            *css_text* must already have newlines and spaces removed.  The
            class name is escaped and anchored on the leading ``.`` so that a
            name which is a suffix of another class cannot match the wrong
            rule.  Returns ``None`` when the rule is missing or malformed.
            """
            match = re.search(
                r'\.%s\{background:-(.*?)px-(.*?)px;\}' % re.escape(name), css_text)
            if match is None:
                return None
            try:
                return int(float(match.group(1))), int(float(match.group(2)))
            except ValueError:
                return None

        @staticmethod
        def _resolve_char(rows, font_size, css_x, css_y):
            """Pick the character addressed by a CSS offset.

            Args:
                rows: ``(y, text)`` pairs, one per ``<text>`` row of the sprite.
                font_size: Glyph width in pixels.
                css_x: Horizontal offset from the CSS rule.
                css_y: Vertical offset from the CSS rule.

            Returns:
                The matching character, or ``None`` when no row or column
                corresponds to the offset.
            """
            if font_size <= 0:
                return None
            candidates = [row for row in rows if row[0] >= css_y]
            if not candidates:
                return None
            # The row to use is the nearest one whose y is >= the CSS y.
            _, line = min(candidates, key=lambda row: row[0])
            # Every glyph is font_size wide, so the column is x // font_size.
            position = css_x // font_size
            if not 0 <= position < len(line):
                return None
            return line[position]

        @staticmethod
        def _clean_text(desc):
            """Strip tabs, newlines and spaces from a review text fragment."""
            return desc.replace('\t', '').replace('\n', '').replace(' ', '')

        def get_css_and_svg(self, css_url, class_name):
            """Build the ``<svgmtsi>`` tag -> real character replacement dict.

            Downloads the stylesheet, finds the SVG sprite it references for
            ``svgmtsi`` elements, and looks up each class's background offset
            in that sprite.  The stylesheet references three sprites:

            * ``cc[class^="wgx"]``      phone number
            * ``bb[class^="wnu"]``      address
            * ``svgmtsi[class^="kvg"]`` review text (the only one used here)

            Only sprites laid out as ``<text y="...">`` rows are handled.

            Args:
                css_url: Absolute URL of the ``svgtextcss`` stylesheet.
                class_name: Iterable of ``svgmtsi`` class names found in the page.

            Returns:
                Dict mapping ``'<svgmtsi class="NAME"></svgmtsi>'`` to the
                character it stands for.  Classes that cannot be resolved are
                omitted; the dict is empty when the sprite cannot be located.
            """
            # Whitespace is removed so the regexes can match rules regardless
            # of how the stylesheet is formatted.
            css_resp = requests.get(
                css_url, timeout=self.REQUEST_TIMEOUT).text.replace("\n", "").replace(" ", "")

            svg_url = self._extract_svg_url(css_resp)
            if svg_url is None:
                return {}
            svg_resp = requests.get(svg_url, timeout=self.REQUEST_TIMEOUT).text

            # Glyph size and sprite rows are the same for every class, so they
            # are parsed once rather than per class name.
            font_sizes = re.findall(r'font-size:(\d+)px', svg_resp)
            if not font_sizes:
                return {}
            font_size = int(font_sizes[0])

            rows = []
            for node in Selector(svg_resp).xpath('//text'):
                y_attr = node.attrib.get('y')
                if y_attr is None:
                    continue
                try:
                    y_val = int(float(y_attr))
                except ValueError:
                    continue
                rows.append((y_val, node.xpath('./text()').extract_first() or ''))

            d = {}
            # dict.fromkeys de-duplicates while keeping order: the same class
            # appears many times in a page but only needs resolving once.
            for name in dict.fromkeys(class_name):
                offset = self._parse_offset(css_resp, name)
                if offset is None:
                    continue
                char = self._resolve_char(rows, font_size, *offset)
                if char is not None:
                    d[name] = char

            # Key on the complete tag so it can be substituted directly in HTML.
            return {f'<svgmtsi class="{k}"></svgmtsi>': v for k, v in d.items()}

        def parse_index(self, html, di):
            """Substitute decoded glyphs into the page and print each review.

            Args:
                html: Source of the review page.
                di: Mapping of full ``<svgmtsi>`` tag strings to characters, as
                    returned by ``get_css_and_svg``.
            """
            for key, value in di.items():
                html = html.replace(key, value)
            selector = Selector(html)
            # Full review bodies; the truncated summaries live in
            # div.review-truncated-words and are not used here.
            desc_li = selector.xpath('//div[@class="review-words Hide"]/text()').extract()
            for desc in desc_li:
                print(self._clean_text(desc))
    
    
    if __name__ == '__main__':
        # Creating the instance is all that is needed: the constructor calls
        # main() itself, which performs the crawl.
        crawler = DaZongDianPing()
  • 相关阅读:
    HCTF2018-admin
    SUCTF 2019-EasySQL
    BUUCTF-WEB-easy_tornado
    黑客攻防技术宝典web实战篇:攻击数据存储区习题
    可持久化数据结构·主席树(静态)
    Luogu P2661 [NOIP2015] 信息传递
    Luogu P2700 逐个击破
    Luogu P4779 【模板】单源最短路径(标准版)(Dijkstra+堆优化模板)
    Luogu P1962 斐波那契数列(矩阵乘法模板)
    Luogu P3366 【模板】最小生成树
  • 原文地址:https://www.cnblogs.com/xiao-xue-di/p/14073860.html
Copyright © 2020-2023  润新知