• python 爬虫学习--Beautiful Soup插件


    Beautiful Soup插件学习使用参考:https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#recursive

    使用Beautiful Soup前的准备,先确保安装了该插件(该版本为:beautifulsoup4):

    C:\Program Files (x86)\Python36-32\Scripts>pip install beautifulsoup4
    Requirement already satisfied: beautifulsoup4 in c:\program files (x86)\python36-32\lib\site-packages (4.6.3)

    通过 Python 自带的 urllib.request 模块请求目标网址,获取页面数据:

    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    from bs4 import BeautifulSoup
    import urllib.request
    import time
    from python.common.ConnectDataBase import ConnectionMyslq
    
    
    # Fetch the raw page data for a news section
    def getHtmlData(url, encoding='gbk'):
        """Download *url* and return its body decoded to text.

        Args:
            url: Address of the page to fetch.
            encoding: Charset used to decode the response bytes. Defaults to
                'gbk', which is what this function has always used.

        Returns:
            The decoded response body as a str.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid in *encoding*.
        """
        request = urllib.request.Request(url)

        # The context manager closes the response (and its socket) even when
        # reading fails, instead of leaving it to the garbage collector.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
        return raw.decode(encoding)

    获取到需要解析的页面数据后,开始调用Beautiful Soup插件解析页面,提取需要的数据:

    # Pick the wanted content with a CSS selector
    def soupData2(data):
        """Extract the "today's recommendation" links and store each one.

        Every anchor matched by ``.today_news > ul > li > a`` becomes a dict
        with the keys ``title``, ``href`` and ``time`` (today's date as
        YYYY-MM-DD), which is handed to ``insertInfo``.

        Args:
            data: HTML text of the page to parse.

        Returns:
            The list of item dicts, in the order the anchors were matched.
        """
        # Name the parser explicitly: without it Beautiful Soup picks whichever
        # parser happens to be installed, so results can differ between
        # machines, and it emits a warning.
        soup = BeautifulSoup(data, 'html.parser')
        anchors = soup.select('.today_news > ul > li > a')

        # The date is the same for every item, so compute it once.
        today = time.strftime('%Y-%m-%d', time.localtime())

        infos = []
        for anchor in anchors:
            info = {
                'title': anchor.get_text(),
                'href': anchor.get('href'),
                'time': today,
            }
            # NOTE: the article body behind each href is not fetched here;
            # only title, link and date are stored.
            insertInfo(info)
            infos.append(info)
        return infos

    ==============================================分割线===================================================================

    接下来数据库的操作,前提条件安装了pymysql:

    pymysql参考地址:https://github.com/PyMySQL/PyMySQL

    C:\Program Files (x86)\Python36-32\Scripts>python -m pip install PyMySQL
    Requirement already satisfied: PyMySQL in c:\program files (x86)\python36-32\lib\site-packages\pymysql-0.7.4-py3.6.egg (0.7.4)

    导入数据库连接模块:

    # 导入 pymysql
    import pymysql

    创建数据库连接类 和 对应的数据库操作方法:

    class ConnectionMyslq(object):
        """Single-use wrapper around a PyMySQL connection.

        The connection is opened in ``__init__`` and closed by the first call
        to ``findList`` or ``exe_sql``, so create a new instance for every
        statement.
        """

        def __init__(self, ip, user_name, passwd, db, port, char='utf8'):
            """Open the connection.

            Args:
                ip: Host name or address of the MySQL server.
                user_name: Login user.
                passwd: Login password.
                db: Name of the database to use.
                port: TCP port of the server.
                char: Connection character set.
            """
            self.ip = ip
            self.port = port
            self.username = user_name
            self.passwd = passwd
            self.mysqldb = db
            self.char = char

            self.MySQL_db = pymysql.connect(
                host=self.ip,
                user=self.username,
                passwd=self.passwd,
                db=self.mysqldb,
                port=self.port,
                charset=self.char)

        def _close(self):
            """Close the connection, tolerating one that is already closed."""
            try:
                self.MySQL_db.close()
            except pymysql.Error:
                # Closing twice raises in PyMySQL; there is nothing left to
                # release in that case.
                pass

        # Query data
        def findList(self, sql, params=None):
            """Run a query and return all of its rows, then close the connection.

            Args:
                sql: Statement to execute. May contain ``%s`` placeholders when
                    *params* is given.
                params: Optional values for the placeholders; passed to the
                    driver so that it does the escaping.

            Returns:
                Whatever ``cursor.fetchall()`` returns, or an empty tuple when
                the query failed (the error is printed, not raised).
            """
            # Bound up front so the failure path has something to return.
            results = ()
            try:
                cursor = self.MySQL_db.cursor()
                try:
                    cursor.execute(sql, params)
                    results = cursor.fetchall()
                finally:
                    cursor.close()
            except Exception as exc:
                print("Error: unable to fetch data")
                print(exc)
            finally:
                # Exactly one close, on both the success and the failure path.
                self._close()
            return results

        # Insert / update / delete
        def exe_sql(self, sql, params=None):
            """Run a data-modifying statement, commit it, then close the connection.

            Args:
                sql: Statement to execute. May contain ``%s`` placeholders when
                    *params* is given.
                params: Optional values for the placeholders; passed to the
                    driver so that it does the escaping.

            Errors are printed, not raised; nothing is committed in that case.
            """
            try:
                cursor = self.MySQL_db.cursor()
                try:
                    cursor.execute(sql, params)
                    self.MySQL_db.commit()
                finally:
                    cursor.close()
            except Exception as exc:
                print("Error: unable to execute SQL")
                print(exc)
            finally:
                # Exactly one close, on both the success and the failure path.
                self._close()

    编写数据库操作的方法:

    # Store one scraped item in the database
    def insertInfo(info):
        """Insert one news item into the ``news_info`` table.

        Args:
            info: Mapping with the keys 'title', 'href' and 'time'.
        """
        conn = ConnectionMyslq("localhost", "root",
                               "Gepoint", "pythondb", 3306)
        # Titles and links come from scraped HTML, so a quote character in
        # them must not be able to break or alter the statement. Let the
        # driver escape the values (mogrify performs the same substitution
        # that execute() would) instead of using Python string formatting.
        cursor = conn.MySQL_db.cursor()
        try:
            sql = cursor.mogrify(
                'insert into news_info(title, link, time) values(%s, %s, %s)',
                (info['title'], info['href'], info['time']))
        finally:
            cursor.close()
        conn.exe_sql(sql)
    # Query the stored data
    def findAll():
        """Print every row of the ``news_info`` table."""
        query = 'select * from news_info'
        rows = ConnectionMyslq(
            "localhost", "root", "Gepoint", "pythondb", 3306).findList(query)
        print(rows)

    最后执行方法:

    if __name__ == "__main__":

        # Middle-column data endpoint: http://temp.163.com/special/00804KVA/cm_guonei_02.js?callback=data01_callback
        # NetEase "today's recommendation" news page
        data = getHtmlData(url='http://news.163.com/domestic/')
        soupData2(data)

        # Print the stored rows. Kept inside the guard so that importing this
        # module does not query the database as a side effect.
        findAll()
     

    执行结果:

    初心回归,时光已逝!
  • 相关阅读:
    PHP学习笔记(一)
    切记数据类型的最大范围
    (转)女程序员的生活,真苦涩。
    向模态窗体传递参数和获取返回值
    在ASP.NET 中实现单点登录(单域名)
    错误:访问IIS 元数据库失败
    Page.registerstartupscript方法用法介绍
    IIS安装配置的问题
    GridView的双击等几个事件
    解决IIS不能启动,提示服务器没有及时响应启动或控制请求
  • 原文地址:https://www.cnblogs.com/yin1361866686/p/9842777.html
Copyright © 2020-2023  润新知