• 有时间的时候可以看看


    # from django.test import TestCase
    #
    # # Create your tests here.
    #
    #
    # # 可变数据类型  [] {}
    # # 不可变数据类型 数字 字符串  元组
    #
    # # s="hello".upper()
    # # print(s)
    #
    #
    # # l=[1,2,3]
    # #
    # # c=[4,5]
    # #
    # # l.append(c)
    # # c.append(7)
    # #
    # # print(c)  #
    # # print(l)
    #
    #
    #
    #
    # # d1={"name":"yuan"}
    # # d2={"age":12}
    # # d1["xxx"]=d2
    # #
    # # d2["height"]="180cm"
    # #
    # # print(d1)   # {"name":"yuan","xxx":{"age":12,"height":"180cm"}}
    #
    #
    #
    #
    # # d1={1:{"xxx":[12,34]},2:{"xxx":[34,56,[777,888,999],[11,8238,99]]}}
    # # d2={"xxx":[777,888,999,[11,8238,99]]}
    # # d3={"xxx":[11,8238,99]}
    # #
    # # d1[2]["xxx"].append(d2["xxx"])
    # # d2["xxx"].append(d3["xxx"])
    # #
    # # print(d1)   #  {1:{"xxx":[12,34]},2:{"xxx":[34,56,[777,888,999,[11,8238,99]]]}}
    # # print(d2)   #  {"xxx":[777,888,999,[11,8238,99]]}
    # # print(d3)   #  d3={"xxx":[11,8238,99]}
    #
    #
    #
    #
    # #=========================================================================
    # '''
    # [
    #   {'id': 1, 'content': '...', 'Pid': None, 'chidren_commentList': [{'id': 4, 'content': '...', 'Pid': 1, 'chidren_commentList': [{'id': 6, 'content': '...', 'Pid': 4, 'chidren_commentList': []},]}, {'id': 5, 'content': '...', 'Pid': 1, 'chidren_commentList': []},]},
    #   {'id': 2, 'content': '...', 'Pid': None, 'chidren_commentList': []},
    #   {'id': 3, 'content': '...', 'Pid': None, 'chidren_commentList': [{'id': 7, 'content': '...', 'Pid': 3, 'chidren_commentList': [{'id': 8, 'content': '...', 'Pid': 7, 'chidren_commentList': []},]},]},
    #   {'id': 4, 'content': '...', 'Pid': 1, 'chidren_commentList': [{'id': 6, 'content': '...', 'Pid': 4, 'chidren_commentList': []},]},
    #   {'id': 5, 'content': '...', 'Pid': 1, 'chidren_commentList': []},
    #   {'id': 6, 'content': '...', 'Pid': 4, 'chidren_commentList': []},
    #   {'id': 7, 'content': '...', 'Pid': 3, 'chidren_commentList': [{'id': 8, 'content': '...', 'Pid': 7, 'chidren_commentList': []},]},
    #   {'id': 8, 'content': '...', 'Pid': 7, 'chidren_commentList': []},
    #   {'id': 9, 'content': '...', 'Pid': None, 'chidren_commentList': []}
    #
    #  ]
    #
    # '''
    #
    # comment_list=[
    #
    #     {"id":1,"content":"...","Pid":None},
    #     {"id":2,"content":"...","Pid":None},
    #     {"id":3,"content":"...","Pid":None},
    #     {"id":4,"content":"...","Pid":1},
    #     {"id":5,"content":"...","Pid":1},
    #     {"id":6,"content":"...","Pid":4},
    #     {"id":7,"content":"...","Pid":3},
    #     {"id":8,"content":"...","Pid":7},
    #     {"id":9,"content":"...","Pid":None},
    #
    # ]
    #
    # comment_dict={}
    #
    # for comment in comment_list:
    #     comment["chidren_commentList"]=[]
    #     comment_dict[comment["id"]] = comment
    #
    # print(comment_dict)
    #
    # '''
    #
    # comment_dict:
    #
    # {
    # 1: {'id': 1, 'content': '...', 'Pid': None, 'chidren_commentList': []},
    # 2: {'id': 2, 'content': '...', 'Pid': None, 'chidren_commentList': []},
    # 3: {'id': 3, 'content': '...', 'Pid': None, 'chidren_commentList': []},
    # 4: {'id': 4, 'content': '...', 'Pid': 1, 'chidren_commentList': []},
    # 5: {'id': 5, 'content': '...', 'Pid': 1, 'chidren_commentList': []},
    # 6: {'id': 6, 'content': '...', 'Pid': 4, 'chidren_commentList': []},
    # 7: {'id': 7, 'content': '...', 'Pid': 3, 'chidren_commentList': []},
    # 8: {'id': 8, 'content': '...', 'Pid': 7, 'chidren_commentList': []},
    # 9: {'id': 9, 'content': '...', 'Pid': None, 'chidren_commentList': []}
    #
    # }
    #
    #
    # '''
    #
    # ret=[]
    #
    # for comment in comment_list:   # comment :  {'id': 1, 'content': '...', 'Pid': None, 'chidren_commentList': [{'id': 5, 'content': '...', 'Pid': 1, 'chidren_commentList': []},]},
    #     pid=comment.get("Pid")
    #     if  pid:
    #         print(comment)         #  {'id': 4, 'content': '...', 'Pid': 1, 'chidren_commentList': []}
    #         comment_dict[pid]["chidren_commentList"].append(comment)
    #     else:
    #         ret.append(comment)
    #
    #
    # print(ret)
    #
    #
    #
    #
    
    
    #####################################################################################################
    
    # Sample HTML document used as parser input by the BeautifulSoup examples
    # in this script. It contains a title paragraph, a <div id="d1">, three
    # <a> links (ids link1..link3, classes sister0..sister2), one inline
    # <script>, and a trailing <p> carrying two classes ("story sister2").
    html_doc = """
    
    <html>
     <head>
      <title>
       The Dormouse's story
      </title>
     </head>
     <body>
      <p class="title">
       <b>
        The Dormouse's story
       </b>
      </p>
      <div id="d1" class="d1">
        <b>
        The Dormouse's story2
        </b></div>
      <p class="story">
       Once upon a time there were three little sisters; and their names were
       <a class="sister0" href="http://example.com/elsie" id="link1">
        Elsie
       </a>
       ,
       <a class="sister1" href="http://example.com/lacie" id="link2">
        Lacie
       </a>
       and
       <a class="sister2" href="http://example.com/tillie" id="link3">
        Tillie
       </a>
       ;
    and they lived at the bottom of a well.
      </p>
       <script>alert(1234)</script>
      <p class="story sister2">
       ...
      </p>
     </body>
    </html>
    """
    
    
    # Step 1: parse the sample document with Python's built-in "html.parser".
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Step 2: pretty-print the parsed tree (not useful here, kept for reference).
    # print(soup.prettify())

    # Step 3: look up tags.

    # print(soup.a)  # attribute access only finds the first matching tag
    # print(soup.find_all("a"))  # finds every tag that matches

    # print(soup.a["class"])

    #

    # print(soup.find_all(name="a",attrs={"class":"sister2"}))

    # for ele_a in soup.find_all("a"):
    #     print(ele_a["class"])


    # Remove the "class" attribute from every <a> tag, then show the result.
    # for ele_a in soup.find_all('a'):
    #     print(ele_a.attrs)  # {'class': ['sister0'], 'href': 'http://example.com/elsie', 'id': 'link1'}
    #     del ele_a["class"]
    #
    # for ele_a in soup.find_all('a'):
    #     print(ele_a)

    # Remove the "class" attribute from every tag that has one.
    # for ele in soup.find_all():
    #     if ele.attrs:
    #         if ele.attrs.get("class"):
    #            print(ele.attrs)  # {'class': ['sister0'], 'href': 'http://example.com/elsie', 'id': 'link1'}
    #            del ele["class"]
    #
    # print(soup)


    # for ele_a in soup.find_all("a"):
    #     print(ele_a.string)    # text content of the tag

    # Neutralise inline scripts by replacing their source text with a
    # harmless JavaScript comment.
    for ele in soup.find_all("script"):
        # Tag.string is None when the tag is empty (e.g. <script src=...>)
        # or has more than one child; there is no single text node to
        # replace then, so skip it instead of raising AttributeError.
        if ele.string is None:
            continue
        ele.string.replace_with("// 别瞎玩")

    # Print the whole document after the script contents were rewritten.
    print(soup)
  • 相关阅读:
    【Lucene4.8教程之五】Luke
    【Tika基础教程之一】Tika基础教程
    【Lucene4.8教程之四】分析
    【Lucene4.8教程之六】QueryParser与Query子类:如何生成Query对象
    【Lucene4.8教程之三】搜索
    Java路径问题最终解决方案—可定位所有资源的相对路径寻址
    java.util.logging.Logger基础教程
    【Lucene4.8教程之二】索引
    【Lucene4.8教程之一】使用Lucene4.8进行索引及搜索的基本操作
    【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析
  • 原文地址:https://www.cnblogs.com/ctztake/p/8001216.html
Copyright © 2020-2023  润新知