• [Python] Baidu Tieba - The Voice of China (中国好声音) comment crawler [self-practice done; csv output formatting and stray whitespace in extracted comments still to be improved]


    Code-writing approach:

    Learning points:

    1. class="a b": suppose a is a font (SimSun) and b a color (blue); a class attribute can hold two values, a and b, at the same time, separated by a space.

    2. Extending point 1: to soup out an element whose class holds several values, look for a pattern across the class values of the elements you want and match on the value they all share (see the sketch after this list).

    For example, inspecting the class values reveals a pattern: every wanted element's class contains "l_post_bright", so writing the query as follows finds them all:

    soup.find_all('div', class_="l_post_bright")
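
    A minimal sketch of point 2, assuming a made-up HTML snippet (the class values are modeled on Tieba's, but the markup here is invented for illustration):

    # coding=utf-8
    from bs4 import BeautifulSoup

    # both posts share "l_post_bright"; the other class values vary
    html = '''
    <div class="l_post l_post_bright j_l_post clearfix">post one</div>
    <div class="l_post_bright noborder">post two</div>
    <div class="sidebar">not a post</div>
    '''
    soup = BeautifulSoup(html, "lxml")

    # a single class value matches any element whose class list contains it
    for div in soup.find_all('div', class_="l_post_bright"):
        print(div.get_text())  # prints "post one" then "post two"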

    Baidu Tieba - The Voice of China comment crawler

    # coding=utf-8
    import csv
    import http.cookiejar
    import urllib.request
    from urllib import parse
    from bs4 import BeautifulSoup

    # crawl targets
    #homeUrl = "http://tieba.baidu.com" # Tieba home page
    subUrl = "http://tieba.baidu.com/f?kw=%E4%B8%AD%E5%9B%BD%E5%A5%BD%E5%A3%B0%E9%9F%B3&ie=utf-8&pn=0" # The Voice of China forum page
    #childUrl = "http://tieba.baidu.com/p/5825125387" # first thread of the forum

    # path of the output csv file
    outputFilePath = "E:scriptpython-scriptlaidutiebapapacsvfile_output.csv"

    def GetWebPageSource(url):

        values = {}
        data = parse.urlencode(values).encode('utf-8')  # empty form data

        # header
        user_agent = ""
        headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

        # declare the cookie jar and the opener
        cookie_filename = 'cookie.txt'
        cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
        handler = urllib.request.HTTPCookieProcessor(cookie)
        opener = urllib.request.build_opener(handler)

        # build the request
        req = urllib.request.Request(url, data, headers)
        # read the response
        response = opener.open(req)
        html = response.read().decode('utf-8')
        # save the cookies
        cookie.save(ignore_discard=True, ignore_expires=True)

        return html

    # write the scraped content into a new csv file
    # main
    if __name__ == "__main__":
        #outputString = []
        maxSubPage = 2  # how many forum list pages to flip through? (range(1, maxSubPage) visits maxSubPage - 1 pages)
        m = 0  # starting page offset
        with open("E:scriptpython-scriptlaidutiebapapacsvfile_output.csv", "w", newline="", encoding='utf-8') as datacsv:
            headers = ['title', 'url', 'comment']
            csvwriter = csv.writer(datacsv)
            csvwriter.writerow(headers)  # write the header row once
            for n in range(1, int(maxSubPage)):
                subUrl = "http://tieba.baidu.com/f?kw=%E4%B8%AD%E5%9B%BD%E5%A5%BD%E5%A3%B0%E9%9F%B3&ie=utf-8&pn=" + str(m)  # pn is a plain offset, so no ".html" suffix
                print(subUrl)
                subhtml = GetWebPageSource(subUrl)
                # pick out the wanted elements with BeautifulSoup
                subsoup = BeautifulSoup(subhtml, "lxml")
                # print the forum page title
                print(subsoup.title)
                # print the thread titles on this forum page
                all_titles = subsoup.find_all('div', class_="threadlist_title pull_left j_th_tit ")  # extract the title blocks
                for title in all_titles:
                    print('--------------------------------------------------')
                    print("Thread title: " + title.get_text())  # each thread's title on the list page
                    commentUrl = str(title.a['href'])
                    childPage = 1  # comment-page counter
                    maxChildPage = 6  # how many comment pages to flip through per thread?
                    for i in range(1, int(maxChildPage)):
                        childUrl = "http://tieba.baidu.com" + commentUrl + "?pn=" + str(childPage)
                        print("Thread URL: " + childUrl)
                        childhtml = GetWebPageSource(childUrl)
                        childsoup = BeautifulSoup(childhtml, "lxml")
                        all_comments = childsoup.find_all('div', class_="d_post_content_main")  # extract the comment blocks
                        for comment in all_comments:
                            print("User comment: " + comment.get_text())
                            csvwriter.writerow([title.get_text(), childUrl, comment.get_text()])
                        print('--------------------------------------------------')
                        childPage = childPage + 1
                m = m + 50  # pagination control: each list page advances pn by 50
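
    One thing to note: GetWebPageSource saves the cookies to cookie.txt on every request but never loads them back. A small sketch, assuming the cookie.txt written by the code above, of how a later run could reuse them:

    # coding=utf-8
    import http.cookiejar
    import urllib.request

    cookie = http.cookiejar.MozillaCookieJar('cookie.txt')
    cookie.load(ignore_discard=True, ignore_expires=True)  # restore the cookies saved last time
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    html = opener.open("http://tieba.baidu.com").read().decode('utf-8')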
    

    Screenshot after the run:

    How it looks in the csv file:

    If the csv generated above comes out garbled, open it in a plain-text editor such as Notepad, save it again with ANSI encoding, and then open it in Excel: the mojibake is gone. For more on fixing garbled csv files, see: https://www.cnblogs.com/zhuzhubaoya/p/9675203.html
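
    An alternative to re-saving in Notepad is to write the file with a UTF-8 byte order mark in the first place; Excel detects the BOM and decodes the Chinese text correctly. A minimal sketch (the file name and rows are placeholders):

    # coding=utf-8
    import csv

    # 'utf-8-sig' prepends a BOM, so Excel opens the file as UTF-8 instead of ANSI
    with open("csvfile_output.csv", "w", newline="", encoding='utf-8-sig') as datacsv:
        csvwriter = csv.writer(datacsv)
        csvwriter.writerow(['title', 'url', 'comment'])
        csvwriter.writerow(['贴吧标题', 'http://tieba.baidu.com/p/5825125387', '用户评论'])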

     ----------------------------------------------------------------------------------------

    Upgraded version - code still to be polished:

    # coding=utf-8
    import csv
    import http.cookiejar
    import urllib.request
    from urllib import parse
    from bs4 import BeautifulSoup

    # crawl targets
    #homeUrl = "http://tieba.baidu.com" # Tieba home page
    subUrl = "http://tieba.baidu.com/f?kw=%E4%B8%AD%E5%9B%BD%E5%A5%BD%E5%A3%B0%E9%9F%B3&ie=utf-8&pn=0" # The Voice of China forum page
    #childUrl = "http://tieba.baidu.com/p/5825125387" # first thread of the forum

    # path of the output csv file
    outputFilePath = "F:Pythonscriptspestpapacsvfile_output.csv"

    def GetWebPageSource(url):

        values = {}
        data = parse.urlencode(values).encode('utf-8')  # empty form data

        # header
        user_agent = ""
        headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

        # declare the cookie jar and the opener
        cookie_filename = 'cookie.txt'
        cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
        handler = urllib.request.HTTPCookieProcessor(cookie)
        opener = urllib.request.build_opener(handler)

        # build the request
        req = urllib.request.Request(url, data, headers)
        # read the response
        response = opener.open(req)
        html = response.read().decode('utf-8')
        # save the cookies
        cookie.save(ignore_discard=True, ignore_expires=True)

        return html

    # write the scraped content into a new csv file
    # main
    if __name__ == "__main__":
        #outputString = []
        maxSubPage = 2  # how many forum list pages to flip through? (range(1, maxSubPage) visits maxSubPage - 1 pages)
        m = 0  # starting page offset
        with open("F:Pythonscriptspestpapacsvfile_output.csv", "w", newline="", encoding='utf-8') as datacsv:
            headers = ['title', 'url', 'comment']
            csvwriter = csv.writer(datacsv)
            for n in range(1, int(maxSubPage)):
                subUrl = "http://tieba.baidu.com/f?kw=%E4%B8%AD%E5%9B%BD%E5%A5%BD%E5%A3%B0%E9%9F%B3&ie=utf-8&pn=" + str(m)  # pn is a plain offset, so no ".html" suffix
                print(subUrl)
                subhtml = GetWebPageSource(subUrl)
                # pick out the wanted elements with BeautifulSoup
                subsoup = BeautifulSoup(subhtml, "lxml")
                # print the forum page title
                print(subsoup.title)
                # the thread titles on this forum page
                all_titles = subsoup.find_all('div', class_="threadlist_title pull_left j_th_tit ")  # extract the title blocks
                for title in all_titles:
                    print('--------------------------------------------------')
                    print("Thread title: " + title.get_text())  # each thread's title on the list page
                    commentUrl = str(title.a['href'])
                    childPage = 1  # comment-page counter
                    maxChildPage = 6  # how many comment pages to flip through per thread?
                    csvwriter.writerow(headers)  # header row, repeated before each thread's comments
                    for i in range(1, int(maxChildPage)):
                        childUrl = "http://tieba.baidu.com" + commentUrl + "?pn=" + str(childPage)
                        print("Thread URL: " + childUrl)
                        childhtml = GetWebPageSource(childUrl)
                        childsoup = BeautifulSoup(childhtml, "lxml")
                        #all_comments = childsoup.find_all('div', class_="d_post_content_main")  # extract just the comment blocks
                        allCommentList = childsoup.find_all('div', class_="l_post_bright")  # one block per post, holding author, level and comment
                        for post in allCommentList:

                            authorName = post.find_all('li', class_="d_name")
                            for name in authorName:  # looping avoids an error when nothing matches: zero items, zero iterations
                                print("Author: " + name.get_text().strip())
                            authorLev = post.find_all('div', class_="d_badge_lv")
                            for lev in authorLev:
                                print("Level: " + lev.get_text().strip())

                            all_comments = post.find_all('div', class_="d_post_content_main")
                            for comment in all_comments:
                                print("Comment: " + comment.get_text().strip())
                                csvwriter.writerow([title.get_text(), childUrl, comment.get_text()])
                            print('--------------------------------------------------')
                        childPage = childPage + 1
                m = m + 50  # pagination control: each list page advances pn by 50
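
    The comment in the loop above notes why find_all plus a for loop beats indexing into the result. A tiny illustration of the difference, with a made-up post that has no d_name element:

    # coding=utf-8
    from bs4 import BeautifulSoup

    childsoup = BeautifulSoup('<div class="l_post_bright">bare post</div>', "lxml")
    post = childsoup.find('div', class_="l_post_bright")

    # find_all returns an empty list here, so the loop body runs zero times and nothing breaks
    for i in post.find_all('li', class_="d_name"):
        print("Author: " + i.get_text().strip())

    # indexing the same empty result would raise IndexError:
    # post.find_all('li', class_="d_name")[0]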

     ————————————————————————————————————————————————————————————————————————————

    Finalized code:

    # coding=utf-8
    import csv
    import http.cookiejar
    import urllib.request
    from urllib import parse
    from bs4 import BeautifulSoup

    # crawl targets
    #homeUrl = "http://tieba.baidu.com" # Tieba home page
    subUrl = "http://tieba.baidu.com/f?kw=%E4%B8%AD%E5%9B%BD%E5%A5%BD%E5%A3%B0%E9%9F%B3&ie=utf-8&pn=0" # The Voice of China forum page
    #childUrl = "http://tieba.baidu.com/p/5825125387" # first thread of the forum

    # path of the output csv file
    outputFilePath = "E:scriptpython-scriptlaidutiebapapacsvfile_output.csv"

    def GetWebPageSource(url):

        values = {}
        data = parse.urlencode(values).encode('utf-8')  # empty form data

        # header
        user_agent = ""
        headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

        # declare the cookie jar and the opener
        cookie_filename = 'cookie.txt'
        cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
        handler = urllib.request.HTTPCookieProcessor(cookie)
        opener = urllib.request.build_opener(handler)

        # build the request
        req = urllib.request.Request(url, data, headers)
        # read the response
        response = opener.open(req)
        html = response.read().decode('utf-8')
        # save the cookies
        cookie.save(ignore_discard=True, ignore_expires=True)

        return html

    # print a thread's comment section and write it to the csv file
    def PrintUserComment(childUrl):
        childhtml = GetWebPageSource(childUrl)
        childsoup = BeautifulSoup(childhtml, "lxml")

        allCommentList = childsoup.find_all('div', class_="l_post_bright")  # one block per post, holding author, level and comment
        for post in allCommentList:
            print('--------------------------------------------------')
            authorName = ""
            for i in post.find_all('li', class_="d_name"):  # looping avoids an error when nothing matches: zero items, zero iterations
                authorName = i.get_text().strip()
                print("Author: " + authorName)
            authorLev = ""
            for i in post.find_all('div', class_="d_badge_lv"):
                authorLev = i.get_text().strip()
                print("Level: " + authorLev)

            all_comments = post.find_all('div', class_="d_post_content_main")
            for comment in all_comments:
                commentRes = comment.get_text()
                print("Comment: " + comment.get_text().strip())
                # titleName and csvwriter are globals set in the main block below
                csvwriter.writerow([titleName, childUrl, authorName, authorLev, commentRes])
            print('--------------------------------------------------')

    # main
    if __name__ == "__main__":

        # initial values of the control variables
        m = 0  # starting page offset
        maxSubPage = 2  # how many forum list pages to flip through? (range(1, maxSubPage) visits maxSubPage - 1 pages)
        maxChildPage = 6  # how many comment pages to flip through per thread?

        # walk the forum top-down, list page to thread to comment section, writing each thread's comments to the csv file
        with open("E:scriptpython-scriptlaidutiebapapacsvfile_output.csv", "w", newline="", encoding='utf-8') as datacsv:
            csvwriter = csv.writer(datacsv)

            # forum list pages
            for n in range(1, int(maxSubPage)):
                subUrl = "http://tieba.baidu.com/f?kw=%E4%B8%AD%E5%9B%BD%E5%A5%BD%E5%A3%B0%E9%9F%B3&ie=utf-8&pn=" + str(m)  # list-page link; pn is a plain offset, so no ".html" suffix
                print(subUrl)
                subhtml = GetWebPageSource(subUrl)
                subsoup = BeautifulSoup(subhtml, "lxml")
                print(subsoup.title)  # print the forum page title

                # iterate over the thread titles on this list page
                all_titles = subsoup.find_all('div', class_="threadlist_title pull_left j_th_tit ")  # extract the title blocks
                for title in all_titles:
                    titleName = title.get_text()
                    print("Thread title: " + titleName)  # each thread's title on the list page
                    commentUrl = str(title.a['href'])  # the thread's relative link from 'href'
                    csvwriter.writerow(['title', 'url', 'author', 'level', 'comment'])  # header row, repeated before each thread's comments

                    childPage = 1  # comment-page counter
                    # iterate over this thread's comment pages
                    for i in range(1, int(maxChildPage)):
                        childUrl = "http://tieba.baidu.com" + commentUrl + "?pn=" + str(childPage)
                        print("Thread URL: " + childUrl)

                        # print the comment section and log it to csv
                        PrintUserComment(childUrl)
                        childPage = childPage + 1
                m = m + 50  # pagination control: each list page advances pn by 50
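
    The title of this post notes that stray whitespace in the extracted comments is still to be improved. One possible cleanup, a sketch built around a hypothetical clean_text helper that could wrap each get_text() call before writerow:

    # coding=utf-8
    import re

    def clean_text(text):
        # collapse runs of whitespace (spaces, newlines, full-width spaces) into single spaces
        return re.sub(r'\s+', ' ', text).strip()

    print(clean_text("  好声音\n   第一期   好看  "))  # -> "好声音 第一期 好看"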

    Result after running:

    The automatically generated csv (if it comes out garbled, convert the encoding as described above before opening and it is fine):

  • Original article: https://www.cnblogs.com/zhuzhubaoya/p/9675376.html