Python Web Scraping in Practice: Crawling Static Pages with BeautifulSoup + urllib + Flask


    Target site type:

        Forum-style websites

    Main third-party modules:

        BeautifulSoup: parse and traverse pages

        urllib: issue URL requests

        Flask: a lightweight web framework

    Overview:

            urllib is used to fetch the page data, BeautifulSoup then parses the page, and the results are returned as JSON.

    Feature points:

    • urllib fetches the page content via GET for a given URL;

    • configuration is kept in a JSON file;

    • the page structure is parsed and returned as JSON data;

    • a REST service is exposed for callers.

       

    Highlights:

    1. A data-description service: total pages, items per page, and total item count;

    2. Incremental requests: only fetch data published between the previous request and now;

    3. A throttled request interval to avoid getting the IP banned (a minimal sketch of points 2 and 3 follows this list);

    4. Paged data requests;

    5. The search keyword can be changed, and the last keyword used is remembered.
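
    To make points 2 and 3 concrete, here is a minimal, hypothetical sketch of the idea (the project's actual implementation is the checkRequestTime and parseHTML code further below): remember the Unix timestamp of the previous request, refuse a new request until the configured gap has elapsed, and skip any item older than that timestamp.

    import time

    # Hypothetical sketch of the throttling / incremental-fetch idea (points 2 and 3 above)
    last_request_time = 0.0   # Unix timestamp of the previous request (0 = never requested)
    request_gap = 60          # minimum number of seconds between two requests

    def may_request(now=None):
        """Return True when enough time has passed since the previous request."""
        global last_request_time
        now = time.time() if now is None else now
        if last_request_time and now < last_request_time + request_gap:
            return False            # too soon; the caller should retry later
        last_request_time = now     # remember this request for the next check
        return True

    def is_new(item_timestamp, since):
        """Incremental fetch: keep only items published after the previous request."""
        return since == 0 or item_timestamp > since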

     

    Main code structure:

    - Common request wrapper

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from urllib import request
    from urllib.parse import quote
    import string
    import requests
    
    # Base class for static pages
    class StaticBase():
        # Fetch page content (implemented with the requests library)
        def __getHTMLText(self,url,code="utf-8"):
            try:
                r = requests.get(url)
                r.raise_for_status()
                r.encoding = code
                return r.text
            except:
                return ""
        # GET request (implemented with urllib)
        def getUrl(self,url,code='utf-8'):
            url = quote(url,safe=string.printable)
            req = request.Request(url)
            req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
            with request.urlopen(req) as f:
                print('Status:',f.status,f.reason)
                return f.read().decode(code)
    
    #s = StaticBase()
    #print(s.getUrl('http://www.baidu.com/','utf-8'))
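
    Concrete scrapers inherit from StaticBase and call getUrl, as the parse service below does. A minimal, hypothetical subclass to try the wrapper on its own:

    # Hypothetical example: a scraper that reuses the common request wrapper
    class ExampleScraper(StaticBase):
        def fetch(self, url):
            # Delegates the GET request (with the User-Agent header) to the base class
            return StaticBase.getUrl(self, url)

    # print(ExampleScraper().fetch('http://www.baidu.com/'))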

    - Configuration file

    {
       "host": "http://shangyu.108sq.cn", 
       "base_url": "http://shangyu.108sq.cn/shuo/search?sertype=4", 
       "key_words": "污染", 
       "page_size": 30, 
       "search_key": "", 
       "last_request_time": 1562142204.149511, 
       "request_gap": 60
      }
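
    Field meanings, as used by the parse service below: host is the site root used to build paging URLs, base_url the search endpoint, key_words the default search keywords, search_key the last keyword actually used, page_size the number of items per page, last_request_time the Unix timestamp of the previous request, and request_gap the minimum number of seconds between two requests. The final request URL is assembled from base_url plus the keyword, for example:

    # How the request URL is built from this configuration (see __getUrl in the parse service)
    base_url = "http://shangyu.108sq.cn/shuo/search?sertype=4"
    keywords = "污染"
    print(base_url + "&key=" + keywords)
    # -> http://shangyu.108sq.cn/shuo/search?sertype=4&key=污染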

    - Parse service

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from bs4 import BeautifulSoup
    from datetime import datetime
    import json
    from common.staticBase import StaticBase
    
    class Shangyh108(StaticBase):
    
        __config_file = "config.json"   # configuration file
        __text = ""                     # fetched page content to parse
        __config_dict = {}              # configuration dictionary
        __url = ""                      # request URL
        __keywords = ""                 # search keywords
        __last_request_time = 0         # time of the previous request
        # Constructor
        def __init__(self):
            self.__config_dict = self.__getConfig()
            # Search keywords: prefer the last keyword used, fall back to the default
            if len(self.__config_dict['search_key']) > 0:
                self.__keywords = self.__config_dict['search_key']
            else:
                self.__keywords = self.__config_dict['key_words']
            self.__url = self.__getUrl()
        # Fetch the page content
        def getText(self):
            print(self.__url)
            self.__text = StaticBase.getUrl(self,self.__url)
    
        # Fetch the first page
        def getFirstPageText(self,url=''):
            if self.checkRequestTime():
                if len(url) == 0:
                    url = self.__getUrl()
                self.__text = StaticBase.getUrl(self,url)
                return self.parseHTML()
            else:
                print("操作频繁,请稍后重试")
        # Fetch the next page
        def getNextPage(self,url):
            url = self.__config_dict['host']+url
            print(url)
            self.__text = StaticBase.getUrl(self,url)
            return self.parseHTML()
    
        # To avoid overloading the server, enforce a minimum interval (request_gap seconds) between requests
        def checkRequestTime(self):
            request_gap = self.__config_dict['request_gap']
            last_request_time = self.__config_dict['last_request_time']
            dt_now = datetime.now().timestamp()
            self.__last_request_time = last_request_time  # remember the previous request time for incremental fetching
            if last_request_time == 0:  # first request, let it through
                last_request_time = dt_now
            elif last_request_time + request_gap > dt_now:
                print("请求过度频繁,请稍后重试")
                return False
            else:
                last_request_time = dt_now
    
            self.__setConfig('last_request_time',last_request_time)
            return True
    
        # Get the page description (item count, page count, paging URLs)
        def getDesc(self):
            self.getText()
            soup = BeautifulSoup(self.__text,'html.parser')
            obj_count = soup.select('.count')[0]
            count_str = str(obj_count.string).replace("(共","").replace("条)","")
            count = int(count_str)
            pageSize = int(self.__config_dict['page_size'])
            host = self.__config_dict['host']
            if count % pageSize == 0:
                pages = count // pageSize
            else:
                pages = count // pageSize + 1
            desc = {}
            desc['host'] = host
            desc['count'] = count
            desc['page_size'] = pageSize
            desc['total_page'] = pages
            # Add the paging URLs
            if pages > 0:
                pageUrls = soup.select(".TCPage__middle > a")
                page_url = []
                for i in range(len(pageUrls)-1):
                    tag = pageUrls[i+1]
                    page_url.append(tag['href'])
                desc['page_url'] = page_url
            return json.dumps(desc,ensure_ascii=False)
    
        # Parse the page content
        def parseHTML(self):
            soup = BeautifulSoup(self.__text, 'html.parser')
            list_li = soup.select('.TCSayList .TCSayList_li')
            data_list = []
            for i in range(len(list_li)):
                item = {}
                temp = list_li[i]
                publish_time = temp.select('.TCSayList_li_time')[0]
                int_dt = int(publish_time['data-time'])
    
                # Only keep items published after the previous request (incremental fetch)
                if self.__last_request_time == 0 or self.__last_request_time < int_dt:
                    # Publish time
                    item['publish_time_long'] = publish_time['data-time']
                    item['publish_time_str'] = datetime.fromtimestamp(int(publish_time['data-time'])).strftime(
                        '%Y-%m-%d %H:%M:%S')
    
                    # Data tag
                    item['data-tag'] = temp['data-tag']
                    # Author
                    author = temp.select('.TCSayList_li_author')[0]
                    item['author_name'] = author.string
                    item['author_url'] = author['href']
    
                    # Title
                    if len(temp.select('.TCSayList__title a')) > 0:
                        title = temp.select('.TCSayList__title a')[0]
                        item['title'] = title.string
                        item['link_url'] = title['href']
                    # Content
                    item['content'] = temp.select('.TCSayList_li_content')[0]['data-short']
    
                    data_list.append(item)
            return data_list
        # Read the request configuration
        def __getConfig(self):
            with open(self.__config_file, "r",encoding="utf-8") as load_f:
                load_dict = json.load(load_f)
            return load_dict
        # Update a configuration entry and persist it
        def __setConfig(self,key,value):
            self.__config_dict[key] = value
            print(self.__config_dict)
            with open(self.__config_file,'w',encoding="utf-8") as f:
                f.write(json.dumps(self.__config_dict,ensure_ascii=False))
    
        # Prompt for search keywords and remember them
        def getKeywords(self):
            self.__keywords = input("请输入查询的关键字,多个关键字用“+”连接,默认关键字:环保+污染+投诉,使用默认关键字可直接按Enter:")
            if len(self.__keywords) == 0:
                self.__keywords = self.__config_dict['key_words']
            else:
                self.__setConfig("search_key",self.__keywords)
    
        # Build the request URL
        def __getUrl(self):
            base_url = self.__config_dict['base_url']
            # Assemble the query parameters
            url = base_url + "&key=" + self.__keywords
            return url
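
    A minimal, hypothetical way to exercise the parse service on its own (assumes config.json sits in the working directory and the target site is reachable):

    # Hypothetical standalone usage of the parse service
    if __name__ == '__main__':
        scraper = Shangyh108()
        print(scraper.getDesc())             # JSON description: item count, page count, paging URLs
        print(scraper.getFirstPageText(''))  # items published since the last request; None if throttled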

    - REST server (only a few endpoints are exposed so far, but the underlying service methods are all implemented)

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from flask import Flask,jsonify,abort,request
    from changshuo108.shangyu.Shangyh108 import Shangyh108
    
    app = Flask(__name__)
    # Allow Chinese (non-ASCII) characters in JSON responses
    app.config['JSON_AS_ASCII'] = False
    shangyue = Shangyh108()
    
    @app.route('/shangyu108/api/desc',methods=['GET'])
    def get_desc():
        return shangyue.getDesc()
    
    @app.route('/shangyu108/api/first_page',methods=['GET'])
    def get_firstPage():
        return jsonify({'data':shangyue.getFirstPageText('')})
    
    @app.route('/shangyu108/api/page',methods=['POST'])
    def get_article():
        if not request.json or 'url' not in request.json:
            abort(400)
        print(request.json['url'])
        return jsonify({'data':shangyue.getNextPage(request.json['url'])})
    
    if __name__ == '__main__':
        app.run(debug=True)
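
    A quick, hypothetical client to exercise the endpoints once the Flask app is running (the development-server default port 5000 is assumed; /page expects one of the relative URLs returned by /desc):

    import requests

    BASE = "http://127.0.0.1:5000/shangyu108/api"   # Flask development server default port assumed

    # Page description: item count, page count, paging URLs
    desc = requests.get(BASE + "/desc").json()
    print(desc)

    # First page of results (only items newer than the last request)
    print(requests.get(BASE + "/first_page").json())

    # Subsequent pages: POST one of the relative URLs from desc["page_url"]
    if desc.get("page_url"):
        r = requests.post(BASE + "/page", json={"url": desc["page_url"][0]})
        print(r.json())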
