• 古诗文网爬虫


    0x00 代码

    #coding:utf-8
    import requests
    import re
    
    def parse_page(url):
        """Fetch one poem listing page, print every poem on it, and return them.

        Args:
            url: Address of the listing page to download.

        Returns:
            A list of dicts with the keys 'title', 'dynastiy', 'author' and
            'content', one per poem, in page order. The list is empty when
            nothing on the page matches the expected markup.

        Raises:
            requests.exceptions.RequestException: If the download fails or
                exceeds the timeout.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        }
        # Without a timeout a single stalled connection blocks the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        poems = _extract_poems(response.text)
        for poem in poems:
            print(poem)
            print('=' * 40)
        return poems


    def _extract_poems(text):
        """Parse listing-page HTML into a list of poem dicts (no I/O)."""
        # re.DOTALL lets '.' cross line breaks; '.*?' keeps every match
        # non-greedy so each capture stops at the first closing tag.
        titles = re.findall(
            r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
        # First link inside the "source" paragraph.
        dynasties = re.findall(
            r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
        # Second link inside the same "source" paragraph.
        authors = re.findall(
            r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
        content_tags = re.findall(
            r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)
        # Drop inline markup (e.g. <br />) left inside the poem body.
        contents = [re.sub(r'<.*?>', "", tag).strip() for tag in content_tags]

        # zip() pairs the four parallel lists and stops at the shortest one.
        # The 'dynastiy' key spelling is kept so the printed output and any
        # consumer of these dicts stay compatible.
        return [
            {
                'title': title,
                'dynastiy': dynasty,
                'author': author,
                'content': content,
            }
            for title, dynasty, author, content
            in zip(titles, dynasties, authors, contents)
        ]
    
    
    def main():
        """Crawl listing pages 1 through 10 and print the poems on each."""
        # %s is filled with the page number; hoisted since it never changes.
        url_template = "https://www.gushiwen.org/default_%s.aspx"
        for page in range(1, 11):
            parse_page(url_template % page)
    # Start the crawl only when this file is run as a script, not on import.
    if __name__ == "__main__":
        main()
    

    0x01 效果

  • 相关阅读:
    VBA.replace替换单引号或双引号
    读取文件
    UPDATE
    alter update
    SQL日期格式
    python map的用法
    python os模块用法
    python re.I compile search
    python 正则匹配
    通过list中值得名称查询索引号
  • 原文地址:https://www.cnblogs.com/wangtanzhi/p/12416397.html
Copyright © 2020-2023  润新知