• 爬取自己的博客园呵呵


    import urllib
    import io
    from bs4 import BeautifulSoup
    import requests
    import os
    import re
    
    
    def GetessayContent(essayUrl):
        """Download one blog post and save its text as ``essay/<title>.txt``.

        The page is fetched with ``requests``; the title is read from the
        ``<a id="cb_post_title_url">`` element and the text from the
        ``<div id="cnblogs_post_body">`` element. Pages lacking either element
        are skipped with a message instead of crashing the crawl.

        Args:
            essayUrl: URL of the post page to fetch.

        Raises:
            requests.RequestException: if the request fails or times out.
            OSError: if the output directory or file cannot be written.
        """
        html_content = requests.get(essayUrl, timeout=30).text
        # Name the parser explicitly so results do not depend on which optional
        # parsers (lxml, html5lib) happen to be installed.
        bs = BeautifulSoup(html_content, "html.parser")
        title_tag = bs.find("a", attrs={"id": "cb_post_title_url"})
        body_tag = bs.find("div", attrs={"id": "cnblogs_post_body"})
        if title_tag is None or body_tag is None:
            print("跳过(未找到标题或正文): " + str(essayUrl))
            return

        # Titles may contain characters that are illegal in file names
        # (path separators, colons, newlines, ...); replace them.
        safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title_tag.text).strip()
        if not safe_title:
            safe_title = "untitled"

        out_dir = os.path.abspath("essay")
        os.makedirs(out_dir, exist_ok=True)  # the directory may not exist yet
        out_path = os.path.join(out_dir, safe_title + ".txt")
        # Write UTF-8 explicitly: the platform default encoding combined with
        # errors="ignore" silently dropped characters it could not encode.
        with open(out_path, "w", encoding="utf-8", errors="ignore") as out_file:
            out_file.write(body_tag.get_text())
        print("下载成功")
    
    
    def GetessayList(SideHrefUrl):
        """Fetch a listing page and download every post it links to.

        Each ``<div class="entrylistPosttitle">`` on the page is expected to
        contain an ``<a>`` whose ``href`` is a post URL; that URL is passed to
        ``GetessayContent``. Divs without a usable link are skipped.

        Args:
            SideHrefUrl: URL of the listing page to fetch.

        Raises:
            requests.RequestException: if the request fails or times out.
        """
        html_content = requests.get(SideHrefUrl, timeout=30).text
        # Explicit parser keeps behavior independent of installed parsers.
        bs = BeautifulSoup(html_content, "html.parser")
        title_divs = bs.find_all("div", attrs={"class": "entrylistPosttitle"})
        for title_div in title_divs:
            link = title_div.find("a")
            # A div with no anchor (or an anchor with no href) used to raise
            # TypeError/KeyError and abort the whole crawl.
            if link is None or not link.get("href"):
                continue
            GetessayContent(link["href"])
    
    
    def GetSideList():
        """Fetch the blog's side column and crawl every ``.html`` link in it.

        Downloads the hard-coded ``sidecolumn.aspx`` page, collects the markup
        of all ``<div class="catListPostCategory">`` elements (presumably the
        post-category list -- confirm against the live page), extracts every
        ``https:...html`` URL from that markup and hands each one to
        ``GetessayList``.

        Raises:
            requests.RequestException: if the request fails or times out.
        """
        blog_url = "https://www.cnblogs.com/zaranet/mvc/blog/sidecolumn.aspx"
        html_content = requests.get(blog_url, timeout=30).text
        # Explicit parser keeps behavior independent of installed parsers.
        bs = BeautifulSoup(html_content, "html.parser")
        category_divs = bs.find_all("div", attrs={"class": "catListPostCategory"})
        markup = "".join(str(div) for div in category_divs)
        # The dot before "html" is escaped, and the URL body may not contain
        # whitespace, quotes or angle brackets, so a match cannot run past the
        # end of one attribute into the next tag.
        link_pattern = re.compile(r'https:[^\s"\'<>]*?\.html')
        # dict.fromkeys drops duplicate URLs while preserving page order.
        for category_url in dict.fromkeys(link_pattern.findall(markup)):
            GetessayList(category_url)
    
    
    # Start the crawl. This call sits at module level, so it runs whenever the
    # file is executed or imported.
    GetSideList()

    呵呵

  • 相关阅读:
    springmvcdemo
    src/main/resorces applicationContext.xml
    maven 中setting.xml
    Git
    刘相兵 AWR
    Oracle 查询SQL 执行次数---hash value
    Oracle 查询重复索引列
    Oracle 史上最全近百条Oracle DBA日常维护SQL脚本指令
    Oracle 查询表对应的索引
    11g RAC 自动升级 PSU
  • 原文地址:https://www.cnblogs.com/ZaraNet/p/9593442.html
Copyright © 2020-2023  润新知