• 爬取自己的博客园呵呵


    import urllib
    import io
    from bs4 import BeautifulSoup
    import requests
    import os
    import re
    
    
    def GetessayContent(essayUrl):
        """Download one blog post and save its text as ``essay/<title>.txt``.

        The page at ``essayUrl`` is fetched; the post title is read from the
        ``<a>`` element with id ``cb_post_title_url`` and the post text from
        the ``<div>`` element with id ``cnblogs_post_body``. A page lacking
        either element is skipped with a message instead of raising
        ``AttributeError``.

        Args:
            essayUrl: URL of the post page to download.
        """
        # A timeout keeps one stalled connection from hanging the whole crawl.
        html_content = requests.get(essayUrl, timeout=30).text
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed (and bs4 does not warn about guessing).
        bs = BeautifulSoup(html_content, "html.parser")
        title_tag = bs.find("a", attrs={"id": "cb_post_title_url"})
        body_tag = bs.find("div", attrs={"id": "cnblogs_post_body"})
        if title_tag is None or body_tag is None:
            print("skipped (title or body not found): " + essayUrl)
            return
        # Post titles can contain characters that are not valid in file names
        # (path separators, ':', '?', line breaks, ...); replace them.
        safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title_tag.text.strip())
        if not safe_title:
            safe_title = "untitled"
        # The output directory is not guaranteed to exist on a first run.
        os.makedirs("essay", exist_ok=True)
        target = os.path.abspath(os.path.join("essay", safe_title + ".txt"))
        # Write UTF-8 explicitly so non-ASCII text is not silently dropped when
        # the platform's default encoding cannot represent it.
        with open(target, "w", encoding="utf-8", errors="ignore") as out:
            out.write(body_tag.get_text())
        print("下载成功")
    
    
    def GetessayList(SideHrefUrl):
        """Download every post linked from one category listing page.

        The page at ``SideHrefUrl`` is fetched and each ``<div>`` with class
        ``entrylistPosttitle`` is inspected; the ``href`` of the first ``<a>``
        inside it is passed to ``GetessayContent``. Entries without an anchor
        or without an ``href`` are skipped rather than raising.

        Args:
            SideHrefUrl: URL of the category listing page.
        """
        # A timeout keeps one stalled connection from hanging the whole crawl.
        html_content = requests.get(SideHrefUrl, timeout=30).text
        # Explicit parser: deterministic across installations, no bs4 warning.
        bs = BeautifulSoup(html_content, "html.parser")
        for entry in bs.find_all("div", attrs={"class": "entrylistPosttitle"}):
            link = entry.find("a")
            href = link.get("href") if link is not None else None
            if not href:
                continue
            GetessayContent(href)
    
    
    def GetSideList():
        """Crawl the blog's side column and download the posts of each category.

        The side-column page is fetched, every ``<div>`` with class
        ``catListPostCategory`` is scanned for anchors, and each anchor whose
        ``href`` contains an ``https:...html`` URL is passed to
        ``GetessayList``.
        """
        blog_url = "https://www.cnblogs.com/zaranet/mvc/blog/sidecolumn.aspx"
        # A timeout keeps one stalled connection from hanging the whole crawl.
        html_content = requests.get(blog_url, timeout=30).text
        # Explicit parser: deterministic across installations, no bs4 warning.
        bs = BeautifulSoup(html_content, "html.parser")
        # The dot before "html" is escaped so only a literal ".html" ends a URL.
        side_pattern = re.compile(r'https:.*?\.html')
        for category_div in bs.find_all("div", attrs={"class": "catListPostCategory"}):
            # Read the href attributes directly instead of pattern-matching the
            # serialised markup, so a match can never span several tags or pick
            # up HTML-escaped attribute text.
            for link in category_div.find_all("a", href=True):
                found = side_pattern.search(link["href"])
                if found:
                    GetessayList(found.group(0))
    
    
    # Script entry point: start the crawl from the blog's side column as soon
    # as this file is executed.
    GetSideList()

    呵呵

  • 相关阅读:
    Java中子类继承了父类的私有属性及方法吗?
    为什么静态成员、静态方法中不能用this和super关键字
    poj 3378 二维树状数组
    poj 3034 动态规划
    poj 2498 动态规划
    poj 2029 二维树状数组
    hdu 3280 动态规划
    hdu 2586 LCA
    poj 3689 树形dp
    poj 1947 树形dp
  • 原文地址:https://www.cnblogs.com/ZaraNet/p/9593442.html
Copyright © 2020-2023  润新知