• Python web scraper: Douban Movie Top 250


    I. Introduction to Python web scraping
    1. What is a web crawler?
    A web crawler is a program or script that automatically fetches information from the internet according to certain rules. Because internet data is highly varied and resources are limited, fetching the pages relevant to the user's needs and then analyzing them has become the mainstream crawling strategy.
    2. What can a crawler do?
    Grab images from the web, download the videos you want to watch; any data you can reach through a browser can also be obtained by a crawler.
    3. The essence of a crawler:
    Simulate a browser opening a web page and extract the part of the page data we want, as the minimal sketch below illustrates.
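
    A minimal sketch of that idea using only the standard library (the URL and the regular expression here are illustrative placeholders, not part of the original post):

    import re
    import urllib.request

    # Pretend to be a browser, fetch one page, and pull out the piece we care about.
    req = urllib.request.Request(
        "http://example.com",
        headers={"User-Agent": "Mozilla/5.0"},  # make the request look like a browser
    )
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    title = re.search(r"<title>(.*?)</title>", html, re.S)  # keep only the <title> text
    print(title.group(1) if title else "no title found")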

    II. Fetching the data
    1. Using the urllib module

    import urllib.request
    import urllib.parse
    import urllib.error
    #GET request: fetch the Baidu page source and decode it with utf-8
    response = urllib.request.urlopen( "http://www.baidu.com" )
    print(response.read().decode("utf-8"))
    
    #POST request: wrap the form data as bytes and decode the response with utf-8
    data = bytes(urllib.parse.urlencode({"hello":"world"}),encoding="utf-8")
    response = urllib.request.urlopen("http://httpbin.org/post",data=data)
    print(response.read().decode("utf-8"))
    
    #Timeout handling
    try:
        response = urllib.request.urlopen("http://httpbin.org/get",timeout=1)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("time out")
    
    #Get the response status code / headers
    response = urllib.request.urlopen( "http://www.baidu.com" )
    print(response.status)
    print(response.getheaders())
    
    #Fetch douban.com while sending browser-like request headers
    url = "http://www.douban.com"
    headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    }
    #data = bytes(urllib.parse.urlencode({"name":"eric"}),encoding="utf-8")
    req = urllib.request.Request(url=url,headers=headers,method="POST")
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))

    2. Example: fetching the data

    #Fetch the page content of a given URL
    def askURl(url):
        head={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
        request = urllib.request.Request(url,headers=head)
        html = ""   #default in case the request fails
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
            #print(html)
        except urllib.error.URLError as e:
            if hasattr(e,"code"):
                print(e.code)
        return html

    III. Parsing the data
    1. The BeautifulSoup module

    #!/usr/bin/python3
    # @DESC: BeautifulSoup4 turns a complex HTML document into a tree structure in which every node is a Python object; all objects fall into 4 types: Tag, NavigableString, BeautifulSoup, Comment
    import re
    from bs4 import BeautifulSoup
    
    file = open("./baidu.html","rb")
    html = file.read().decode("utf-8")
    bs = BeautifulSoup(html,"html.parser")
    #1.Tag: a tag and its content; returns the first match found
    print(bs.title) #print the title tag
    print(bs.a)     #print the first a tag
    print(bs.head)  #print the head tag
    
    # 2.NavigableString: the text inside a tag
    print(bs.title.string)  #print the string inside title
    print(bs.a.attrs)  #print all attributes of the tag
    print(bs.a.string) #print the string inside the tag
    
    # 3.BeautifulSoup: represents the whole document
    print(bs.name)
    print(bs.attrs)
    print(bs)
    
    # 4.Comment: a special NavigableString; its output does not include the comment markers
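    # (An added illustration, not from the original post: the Comment type shown with a small
    #  standalone snippet instead of baidu.html.)
    snippet = BeautifulSoup("<p><!-- a hidden note --></p>", "html.parser")
    print(type(snippet.p.string))  # <class 'bs4.element.Comment'>
    print(snippet.p.string)        # prints "a hidden note" without the <!-- --> markers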
    
    # 5.Traversing the document
    print(bs.head.contents)
    print(bs.head.contents[0])
    
    # 6.Searching the document
    # 6.1 find_all() with a string filter: finds content that exactly matches the string
    t_list = bs.find_all("a") #find all a tags
    print(t_list)
    
    # 6.2 Searching with a regular expression: matching is done via search()
    t_list = bs.find_all(re.compile("a"))
    print(t_list)
    
    # 6.3 Searching with a filter function
    def name_is_exists(tag):
        return tag.has_attr("name")
    t_list = bs.find_all(name_is_exists)
    for item in t_list:
        print(item)
    
    # 6.4 Keyword (kwargs) arguments
    t_list = bs.find_all(id="head",name=True,limit=3)
    #t_list = bs.find_all(text="贴吧")
    for item in t_list:
        print(item)
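    # (An added illustration, not from the original post: string/text search matches the text
    #  nodes themselves; the strings below are link texts that appear on the Baidu homepage.)
    t_list = bs.find_all(string=["hao123","地图","贴吧"])
    for item in t_list:
        print(item)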
    
    # 6.5 CSS selectors
    t_list = bs.select('title')  #select by tag name
    t_list = bs.select('#u1')   #select by id
    t_list = bs.select(".mnav")   #select by class name
    t_list = bs.select("a[class]")   #select by attribute
    t_list = bs.select(".mnav ~ .bri")   #select sibling nodes
    for item in t_list:
        print(item)

    2. The re module

    import re
    #Create a pattern object, then search
    pat = re.compile("AA")  #"AA" is the regular expression used to check other strings
    m = pat.search("CBA")  #the argument of search() is the string being checked
    m = pat.search("ABCAA")
    m = pat.search("BAACABCAA")
    print(m) #prints a Match object with the index span of the first match, half-open [start, end)
    #search without a pattern object
    m = re.search("asd","Aasd")  #the first string is the pattern, the second is the string being checked
    #print(m)
    #findall returns a list of all matching substrings
    print(re.findall("a","ASDaDEFGAa")) #the first string is the pattern, the second is the string being checked
    print(re.findall("[A-Z]","ASDaDEFGAa")) #returns each uppercase letter
    print(re.findall("[A-Z]+","ASDaDEFGAa")) #consecutive matching letters are returned together
    #Non-greedy match of the content in between: (.*?)
    print(re.findall("AS(.*?)Aa","ASDaDEFGAa"))
    #sub: regex replacement
    print(re.sub("a","A","abcdcasd"))  #replace "a" with "A" everywhere in the third argument
    print(re.sub("\n","","ab\ndca\nsd"))  #strip newlines
    #Tip: prefix the pattern string with r (raw string) so you don't have to worry about escaping
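    # (A small added illustration of that tip; the pattern here is not from the original post.)
    print(re.findall("\\d+", "abc123def45"))   # without r"", the backslash itself must be escaped
    print(re.findall(r"\d+", "abc123def45"))   # with a raw string the pattern reads like the regex: ['123', '45']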

    3. Example: parsing the data

    #Create regular expression objects describing the patterns to extract
    findLink = re.compile(r'<a href="(.*?)">')   #rule for the film detail link
    findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S)   #rule for the poster image link; re.S lets . match newlines
    findTitle = re.compile(r'<span class=".*">(.*?)</span>') #rule for the film title
    findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>') #rule for the rating
    fingCommentNum = re.compile(r'<span>(\d*?)人评价</span>')  #rule for the number of ratings
    findInq = re.compile(r'<span class="inq">(.*?)</span>') #rule for the one-line summary
    findBD = re.compile(r'<p class="">(.*?)</p>',re.S) #rule for the related details
    
    #Crawl the pages
    def getData(baseurl):
        datalist = []
        #crawl and parse the list page by page
        for i in range(10):      #call the page-fetching function 10 times (25 films per page)
            url = baseurl + str(i*25)
            html = askURl(url)
            #2.parse the data
            soup = BeautifulSoup(html,"html.parser")
            for item in soup.find_all("div",class_="item"): #find every matching film block; returns a list
                #print(item) #uncomment to inspect one full film item
                data = [] #holds all the information of one film
                item=str(item)
                # the re library extracts the wanted strings via the regular expressions defined above
                link=re.findall(findLink,item)[0]  #detail page link
                data.append(link)
                imgSrc=re.findall(findImgSrc,item)[0] #poster image link
                data.append(imgSrc)
                titles=re.findall(findTitle,item) #film titles; there may be several
                for i in range(0,3):
                    res = titles[i].replace("/","").replace(" ","").replace("\xa0","") #strip irrelevant characters
                    data.append(res)
                rating = re.findall(findRating, item)[0] #rating
                data.append(rating)
                commentNum = re.findall(fingCommentNum, item)[0] #number of ratings

                data.append(commentNum)
                inq = re.findall(findInq, item) #one-line summary
                if len(inq) !=0:
                    inq = inq[0].replace(".","").replace(" ","") #strip irrelevant characters
                    data.append(inq)
                else:
                    data.append("")
                bd = re.findall(findBD, item)[0] #related details
                bd = re.sub(r'<br(\s+)?/>(\s+)?',"",bd)  #remove <br/>
                bd = re.sub('/',"",bd)
                bd = re.sub('\xa0',"",bd)
                bd = re.sub(' ',"",bd)
                data.append(bd.strip())

                datalist.append(data) #append the processed film to datalist
                #print(datalist)
        return datalist

    IV. Saving the data
    1. The xlwt module

    import xlwt
    workbook = xlwt.Workbook(encoding="utf-8") #create a workbook object
    worksheet = workbook.add_sheet('sheet1') #create a worksheet
    worksheet.write(0,0,'hello') #write data: the first argument is the row, the second the column, the third the content
    workbook.save('student.xls') #save the spreadsheet

    2. The sqlite3 module

    import sqlite3
    #1.Open or create the database file
    conn = sqlite3.connect("test.db")
    #In PyCharm, install the Database Navigator plugin and restart to browse the file
    print("Opened database successfully")
    c = conn.cursor()  #get a cursor
    
    #2.Create the table
    sql_creatTabel = '''
        create table if not exists company
            (id int primary key not null,
            name text not null,
            age int not null,
            address char(50),
            salary real);
    '''
    c.execute(sql_creatTabel)  #execute the sql statement
    conn.commit()  #commit the operation
    #conn.close()   #close the connection
    print("Create table successfully")
    #3.Insert data
    sql_insertData1 = '''
        insert into company(id,name,age,address,salary)
        values(1,'张三',35,'南京',10000);
    '''
    sql_insertData2 = '''
        insert into company(id,name,age,address,salary)
        values(2,'李四',27,'北京',15000);
    '''
    c.execute(sql_insertData1)
    c.execute(sql_insertData2)
    conn.commit()  #commit the operation
    print("Insert Data successfully")
    
    #4.Query the data
    sql_queryData = ' select * from company '
    cursor = c.execute(sql_queryData)
    for row in cursor:
        print("id=",row[0],end=" ")
        print("name=",row[1],end=" ")
        print("address=",row[3],end=" ")
        print("salary=",row[4],end="\n")
    print("Query Data successfully")
    conn.close()

    3. Example: saving to xls

    #Save the data to an xls file
    def saveData(datalist,savepath):
        print("save......")
        book = xlwt.Workbook(encoding="utf8",style_compression=0)
        sheet = book.add_sheet("豆瓣电影Top250",cell_overwrite_ok=True)
        col = ('电影详情链接',"图片链接","片名1","片名2","片名3","评分","评价数","概括","相关信息")
        for i in range(9):
            sheet.write(0,i,col[i]) #column headers
        for i in range(250):
            print("第%d条"%(i+1))
            data = datalist[i]
            for j in range(0,9):
                sheet.write(i+1,j,data[j])
        book.save(savepath)    #save

    4. Example: saving to the database

    #Initialize the database
    def init_db(dbpath):
        sql = '''
            create table if not exists movie250(
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            name1 varchar,
            name2 varchar,
            name3 varchar,
            score numeric,
            rated numeric,
            instroduction text,
            info text
            )
        ''' #create the data table
        conn = sqlite3.connect(dbpath)
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        conn.close()
    
    #Save the data into the database
    def saveData2DB(datalist, dbpath):
        init_db(dbpath)
        conn=sqlite3.connect(dbpath)
        cur = conn.cursor()
    
        for data in datalist:
            for index in range(len(data)):
                if index ==5 or index ==6:
                    continue
                data[index] = '"'+data[index]+'"'
            sql = '''
            insert into movie250(
            info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
            values(%s)'''%",".join(data)
            #print(sql)
            cur.execute(sql)
            conn.commit()
        cur.close()
        conn.close()
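
    The insert above splices values into the SQL text by wrapping each field in double quotes, which breaks if a field itself contains a double quote. A sketch of the same save using sqlite3's "?" placeholders (an alternative approach, not the original post's; the name saveData2DB_param is made up here):

    def saveData2DB_param(datalist, dbpath):
        #Same idea as saveData2DB, but sqlite3 handles quoting via parameterized placeholders
        init_db(dbpath)
        conn = sqlite3.connect(dbpath)
        cur = conn.cursor()
        sql = '''insert into movie250(
            info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
            values(?,?,?,?,?,?,?,?,?)'''
        for data in datalist:
            cur.execute(sql, data)   #each row is a sequence of 9 values
        conn.commit()
        cur.close()
        conn.close()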

    V. Complete source code

    #!/usr/bin/python3
    # -*- coding:utf-8 -*-
    # @Time:2021/8/21 11:43
    # @author: Mrwhite
    # @File:spiderdouban250.py
    # @DESC:
    
    from bs4 import BeautifulSoup    #parse the pages and extract data
    import re      #regular expressions for text matching
    import urllib.request,urllib.error   #build URL requests and fetch page data
    import xlwt    #excel operations
    import sqlite3 #database operations
    
    def main():
        #base url of the Douban movie Top 250 list
        baseurl = "https://movie.douban.com/top250?start="

        #1-2.crawl the pages and parse them
        datalist=getData(baseurl)
        savepath = "豆瓣电影Top250.xls"
        dbpath = "movie.db"

        #3.save the data
        #saveData(datalist,savepath)
        saveData2DB(datalist,dbpath)
    
    #Create regular expression objects describing the patterns to extract
    findLink = re.compile(r'<a href="(.*?)">')   #rule for the film detail link
    findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S)   #rule for the poster image link; re.S lets . match newlines
    findTitle = re.compile(r'<span class=".*">(.*?)</span>') #rule for the film title
    findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>') #rule for the rating
    fingCommentNum = re.compile(r'<span>(\d*?)人评价</span>')  #rule for the number of ratings
    findInq = re.compile(r'<span class="inq">(.*?)</span>') #rule for the one-line summary
    findBD = re.compile(r'<p class="">(.*?)</p>',re.S) #rule for the related details
    
    #Crawl the pages
    def getData(baseurl):
        datalist = []
        #crawl and parse the list page by page
        for i in range(10):      #call the page-fetching function 10 times (25 films per page)
            url = baseurl + str(i*25)
            html = askURl(url)
            #2.parse the data
            soup = BeautifulSoup(html,"html.parser")
            for item in soup.find_all("div",class_="item"): #find every matching film block; returns a list
                #print(item) #uncomment to inspect one full film item
                data = [] #holds all the information of one film
                item=str(item)
                # the re library extracts the wanted strings via the regular expressions defined above
                link=re.findall(findLink,item)[0]  #detail page link
                data.append(link)
                imgSrc=re.findall(findImgSrc,item)[0] #poster image link
                data.append(imgSrc)
                titles=re.findall(findTitle,item) #film titles; there may be several
                for i in range(0,3):
                    res = titles[i].replace("/","").replace(" ","").replace("\xa0","") #strip irrelevant characters
                    data.append(res)
                rating = re.findall(findRating, item)[0] #rating
                data.append(rating)
                commentNum = re.findall(fingCommentNum, item)[0] #number of ratings

                data.append(commentNum)
                inq = re.findall(findInq, item) #one-line summary
                if len(inq) !=0:
                    inq = inq[0].replace(".","").replace(" ","") #strip irrelevant characters
                    data.append(inq)
                else:
                    data.append("")
                bd = re.findall(findBD, item)[0] #related details
                bd = re.sub(r'<br(\s+)?/>(\s+)?',"",bd)  #remove <br/>
                bd = re.sub('/',"",bd)
                bd = re.sub('\xa0',"",bd)
                bd = re.sub(' ',"",bd)
                data.append(bd.strip())

                datalist.append(data) #append the processed film to datalist
                #print(datalist)
        return datalist
    #Fetch the page content of a given URL
    def askURl(url):
        head={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
        request = urllib.request.Request(url,headers=head)
        html = ""   #default in case the request fails
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
            #print(html)
        except urllib.error.URLError as e:
            if hasattr(e,"code"):
                print(e.code)
        return html
    
    #Save the data to an xls file
    def saveData(datalist,savepath):
        print("save......")
        book = xlwt.Workbook(encoding="utf8",style_compression=0)
        sheet = book.add_sheet("豆瓣电影Top250",cell_overwrite_ok=True)
        col = ('电影详情链接',"图片链接","片名1","片名2","片名3","评分","评价数","概括","相关信息")
        for i in range(9):
            sheet.write(0,i,col[i]) #column headers
        for i in range(250):
            print("第%d条"%(i+1))
            data = datalist[i]
            for j in range(0,9):
                sheet.write(i+1,j,data[j])
        book.save(savepath)    #save
    
    #Initialize the database
    def init_db(dbpath):
        sql = '''
            create table if not exists movie250(
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            name1 varchar,
            name2 varchar,
            name3 varchar,
            score numeric,
            rated numeric,
            instroduction text,
            info text
            )
        ''' #create the data table
        conn = sqlite3.connect(dbpath)
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        conn.close()
    
    #Save the data into the database
    def saveData2DB(datalist, dbpath):
        init_db(dbpath)
        conn=sqlite3.connect(dbpath)
        cur = conn.cursor()
    
        for data in datalist:
            for index in range(len(data)):
                if index ==5 or index ==6:
                    continue
                data[index] = '"'+data[index]+'"'
            sql = '''
            insert into movie250(
            info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
            values(%s)'''%",".join(data)
            #print(sql)
            cur.execute(sql)
            conn.commit()
        cur.close()
        conn.close()
    
    if __name__ == "__main__":  #runs when the script is executed directly
        #call the main function
        main()
        print("爬取完毕")   # "crawl finished"