A crawler that scrapes the text of 博客园 (cnblogs) articles [practice exercise]


    Note: only the article text is scraped; anything else (images, code blocks, and so on) is left for you to implement yourself. See the sketch right below for one possible starting point.
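    As one possible direction (not part of the original script), here is a minimal sketch of pulling the image URLs out of a post body with the same lxml XPath approach used later in the listing; get_image_urls is a hypothetical helper and assumes the post body still lives in the cnblogs_post_body div:

      from lxml import etree

      def get_image_urls(html):
          # Hypothetical helper: collect the src of every <img> inside the post body
          x_html = etree.HTML(html)
          return x_html.xpath('//div[@id="cnblogs_post_body"]//img/@src')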

      import time

      import requests
      from lxml import etree


      class blog():

          def __init__(self, url):
              self.base_url = url
              # Pretend to be a normal browser so the site does not reject the request
              self.headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}

          def get_html(self, url):
              # Plain GET request; headers must be passed as a keyword argument
              response = requests.get(url, headers=self.headers, timeout=5)
              if response.status_code == 200:  # 200 means the page came back normally
                  response.encoding = response.apparent_encoding
                  return response.text
              return None

          def get_url1(self, html):
              # Collect the article titles and URLs found on one listing page
              x_html = etree.HTML(html)
              url_1 = x_html.xpath('//a[@class="postTitle2"]/@href')
              name = x_html.xpath('//a[@class="postTitle2"]/text()')
              names = [i.strip() for i in name]  # strip surrounding whitespace
              if names:
                  return names, url_1
              return None

          def get_url2(self, s):
              # Listing pages follow the pattern <home page>default.html?page=<n>
              return self.base_url + "default.html?page=" + s

          def get_text(self, html):
              # Keep only the paragraph text inside the post body
              x_html = etree.HTML(html)
              return x_html.xpath('//div[@id="cnblogs_post_body"]//p//text()')

          def save_text(self, name, txt):
              # Save the article text as a .txt file (adjust the directory to your machine)
              print(name + " loading...")
              with open("C:/Users/25766/Desktop/sa/" + name + '.txt', 'w', encoding='utf-8') as f:
                  for i in txt:
                      f.write(i)
                      f.write('\n')
              print("finish")


      surl = input("Enter the home page URL of the blog to scrape: ")
      c = blog(surl)
      for i in range(1, 200):  # walk the listing pages; the URL pattern was found by inspection
          url = c.get_url2(str(i))
          ls = c.get_html(url)
          time.sleep(2)  # pause between requests to stay polite
          names, urls = c.get_url1(ls)
          for name, url in zip(names, urls):
              html = c.get_html(url)
              txt = c.get_text(html)
              c.save_text(name[:5], txt)  # file is named after the first 5 characters of the title
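    Two fragile spots in the listing above: the hard-coded range(1, 200) keeps requesting listing pages after the blog runs out of posts (and the tuple unpacking crashes once get_url1 returns None), and name[:5] may contain characters that Windows forbids in file names. Below is a minimal sketch of a safer driver loop reusing the blog class above; sanitize_name is a hypothetical helper, and the assumption that an empty listing page signals the end may not hold for every cnblogs theme.

      import re
      import time

      def sanitize_name(title, max_len=20):
          # Hypothetical helper: replace characters Windows forbids in file names
          return re.sub(r'[\\/:*?"<>|]', '_', title)[:max_len]

      c = blog(input("Enter the home page URL of the blog to scrape: "))
      page = 1
      while True:
          listing = c.get_html(c.get_url2(str(page)))
          result = c.get_url1(listing) if listing else None
          if not result:  # no posts found on this page: stop instead of crashing
              break
          names, urls = result
          for name, url in zip(names, urls):
              html = c.get_html(url)
              if html:
                  c.save_text(sanitize_name(name), c.get_text(html))
          time.sleep(2)  # stay polite between listing pages
          page += 1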
Original post: https://www.cnblogs.com/rstz/p/14391031.html