• html_parser


    import json
    from lxml import etree
    
    
    class HtmlParser(object):
        """Parse an HTML page and list the absolute links found in it.

        ``_get_new_urls`` and ``_get_new_data`` are unimplemented placeholders
        (both return ``None``), so for now ``parser`` only prints the links it
        discovers and returns ``(None, None)``.
        """

        def _get_new_urls(self):
            """Placeholder for URL extraction; not implemented, returns None."""
            pass

        def _get_new_data(self):
            """Placeholder for content extraction; not implemented, returns None."""
            pass

        def parser(self, page_url, html_cont_str):
            """Print every absolute or protocol-relative link in the page.

            Args:
                page_url: URL the page was fetched from. Only checked against
                    ``None``; it is not otherwise used.
                html_cont_str: HTML source of the page.

            Returns:
                ``None`` when either argument is ``None``; otherwise the tuple
                ``(new_urls, new_data)`` produced by the two placeholder
                helpers, which is currently ``(None, None)``.
            """
            if page_url is None or html_cont_str is None:
                return None

            # Build an element tree from the raw HTML text.
            html_etree = etree.HTML(html_cont_str)

            # Select <a> nodes whose href starts with "http" or with "//"
            # (protocol-relative); other relative links are not matched.
            node_list = html_etree.xpath(
                "//a[starts-with(@href,'http')]|//a[starts-with(@href,'//')]"
            )
            print(len(node_list))

            # Number the links from 1 for the printed listing.
            for index, node in enumerate(node_list, 1):
                # The XPath above only matches nodes that carry an href,
                # so taking the first result is safe.
                a_href = node.xpath("./@href")[0]
                print('No.%3s: %s' % (index, a_href))

            new_urls = self._get_new_urls()
            new_data = self._get_new_data()

            return new_urls, new_data
    

      

  • 相关阅读:
    Redis数据类型和基本操作
    Redis持久化
    Redis安装
    MySQL5.7二进制包安装
    Django ORM多表操作
    Django中启用事务
    Django ORM单表操作
    MySQL事务
    用顺序栈实现十进制向二进制转化
    顺序栈
  • 原文地址:https://www.cnblogs.com/andy9468/p/8060372.html
Copyright © 2020-2023  润新知