• Python3 爬虫：爬取古诗文（来自古诗文网站）


     1 # -*- coding: utf-8 -*-
     2 #author:zxy
     3 #Date:2018-10-19
     4 
     5 
     6 import requests
     7 import re
     8 HEADERS={
     9     "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    10                  "Chrome/69.0.3497.100 Safari/537.36"
    11 }
    12 
    13 
    14 def parse_url(url):
    15     response=requests.get(url,headers=HEADERS)
    16     text=response.text
    17     titles=re.findall(r'<divsclass="cont">.*?<b>(.*?)</b>',text,re.DOTALL) #r raw
    18     dynasties=re.findall(r'<psclass="source">.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    19     authors=re.findall(r'<psclass="source">.*?<a.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    20     content_tags=re.findall(r'<divsclass="contson".*?>(.*?)</div>',text,re.DOTALL)
    21     contents=[]
    22     for content_tag in content_tags:
    23         x=re.sub('<.*?>','',content_tag)
    24         xx=re.sub('', '',x)
    25         contents.append(xx.strip())
    26     poems=[]
    27     for value in zip(titles,dynasties,authors,contents):
    28         title,dynasty,author,content=value
    29         poem={
    30             "title":title,
    31             "dynasty":dynasty,
    32             "author":author,
    33             "content":content
    34         }
    35         poems.append(poem)
    36 
    37     with open('poems.txt','w',encoding="utf-8") as f:
    38         for poem in poems:
    39             for (key,value) in poem.items():
    40                 if(key=="title"):
    41                     f.write("{}
    ".format(value))
    42                 if (key == "dynasty"):
    43                     f.write("	{}
    ".format(value))
    44                 if(key=="author"):
    45                     str="	{}
    "
    46                     f.write(str.format(value))
    47                 if(key=="content"):
    48                     print(value)
    49                     f.write("{}
    
    
    ".format(value))
    50                     # print(x+"{}
    
    
    ".format(value))
    51 
    52 if __name__ == '__main__':
    53     url="https://www.gushiwen.org/default_1.aspx"
    54     parse_url(url)
  • 相关阅读:
    iOS开发UI篇—Quartz2D使用(绘图路径)
    iOS开发UI篇—Quartz2D简单使用(三)
    iOS开发UI篇—Quartz2D使用(图片剪切)
    WordPress主题开发: 制作文章页面single.php
    Wordpress页面判断
    为WordPress某个文章添加额外的样式
    Wordpress本地伪静态设置
    Wordpress添加关键词和描述
    Wordpress制作sidebar.php
    Wordpress固定链接设置
  • 原文地址:https://www.cnblogs.com/z-712/p/9815253.html
Copyright © 2020-2023  润新知