• Scraper exercise: Tieba https://tieba.baidu.com/f?kw=友谊已走到尽头


    
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Time : 2020/8/28 10:05
    # @Author : aqiong
    # @Site : 
    # @File : 贴吧爬虫.py
    # @Software: PyCharm
    #https://tieba.baidu.com/f?kw=友谊已走到尽头
    # pagination suffix: &ie=utf-8&pn=<offset>
    # Steps: 1. request the page and fetch the HTML  2. parse the data  3. store the data
    import re
    import xlwt  # store the data
    from bs4 import BeautifulSoup  # parse the page
    import urllib.request,urllib.error  # request the page
    
    
    # thread title
    findTitle = re.compile(r'<a class="j_th_tit".*?title=".*?".*?>(.*?)</a>',re.S)
    # thread author
    findauthor = re.compile(r'<span class="tb_icon_author" data-field=.*?title="(.*?)"><i class=.*?</span>')
    # publish time
    findTime = re.compile(r'<div class="threadlist_abs threadlist_abs_onlyline">(.*?)</span>')
    # post content (abstract)
    findContent = re.compile(r'<div class="threadlist_abs threadlist_abs_onlyline">(.*?)</div>',re.S)
    # last replier
    findLastReader = re.compile(r'<span class="tb_icon_author_rely j_replyer" title="(.*?)">')
    # last reply time
    findLastTime=re.compile(r'<span class="threadlist_reply_date pull_right j_reply_data" title="最后回复时间">(.*?)</span>',re.S)
    # cover image(s)
    #findImgSrc = re.compile(r' <ul class="threadlist_media j_threadlist_media clearfix">*?src="(.*?).*?</ul>',re.S)
    findImgSrc = re.compile(r'<img attr=.*?bpic="(.*?)".*?class="threadlist_pic j_m_pic".*?src=""/>',re.S)
    # reply count (fixed: \d was written as d, so the pattern never matched the digits)
    findReadNum = re.compile(r'<span class="threadlist_rep_num center_text" title="回复">(\d*?)</span>')
    def main():
        # 1. request the page and fetch the HTML
        baseUrl = 'https://tieba.baidu.com/f?kw=%E5%8F%8B%E8%B0%8A%E5%B7%B2%E8%B5%B0%E5%88%B0%E5%B0%BD%E5%A4%B4'
        # fetch the first n threads (50 per page)
        n = 200
        datalist = getdata(baseUrl,n)
        name = '贴吧帖子.xls'
        saveData(datalist,name)
    
    
    def getdata(baseUrl,n):
        datalist = []
        for index in range(0,n,50):
            # fetch one page of results
            html = askURL(baseUrl+'&ie=utf-8&pn='+str(index))
            #html=html.replace('div class="t_con cleafix"','div class="t_concleafix"').replace('<!--','').replace('-->','')
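            # Note: Tieba pages deliver part of the thread list inside HTML
            # comments (<!-- ... -->); stripping the markers exposes that
            # markup to BeautifulSoup.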
            html = html.replace('<!--', '').replace('-->', '')
    
            soup = BeautifulSoup(html,'html.parser')
    
    
            # parse the data
           # for item in soup.find_all('div',class_='t_concleafix'):
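            # 't_con cleafix' (sic) is copied verbatim from Tieba's page source;
            # the misspelling is Baidu's, not a typo here.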
            for item in soup.find_all('div',class_='t_con cleafix'):
                data = []
                item = str(item)
    
                #print(re.findall(findReadNum,item))
                #print(re.findall(findImgSrc, item))
                data.append(re.findall(findTitle,item)[0])
                if re.findall(findauthor,item) :
                    data.append(re.findall(findauthor, item)[0])
                else :
                    data.append(' ')
    
                if re.findall(findTime,item):
                    data.append(re.findall(findTime,item)[0])
                else:
                    data.append('')
                data.append(re.findall(findContent,item)[0])
                data.append(re.findall(findLastReader,item)[0])
                data.append(re.findall(findLastTime,item)[0])
                # re.findall returns a list; appending the list itself is what made
                # writing to Excel fail once several image links were captured too
                # (see the note after the listing), so append the first match only
                readNum = re.findall(findReadNum, item)
                data.append(readNum[0] if readNum else '')
                imglist = re.findall(findImgSrc,item)
                if imglist:
                    # join all image URLs into one cell, one per line; the original
                    # loop shadowed `item` with its loop variable, contained a broken
                    # string literal, and pushed extra columns past the header row
                    data.append('\n'.join(imglist))
                else:
                    data.append(' ')
                #data.append(re.findall(findReadNum, item))
                #print(re.findall(findReadNum,str(item)))
                datalist.append(data)
        return datalist
    
    
    def askURL(baseUrl):
        html = ''
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36","Accept-Language": "zh-CN,zh;q=0.9"}
        request=urllib.request.Request(url=baseUrl,headers=headers)
        try:
            html = urllib.request.urlopen(request).read().decode('utf-8')
        except urllib.error.URLError as e:
            print(e)
        return html
    
    def saveData(datalist,name):
        workbook = xlwt.Workbook(encoding='utf-8',style_compression=0)
        worksheet = workbook.add_sheet('帖子内容',cell_overwrite_ok=True)
        # column headers: title, author, publish time, content, last replier,
        # last reply time, reply count, cover image
        col = ('标题','主题作者','发布时间','发布内容','最后回复人','最后回复时间','回复数','镇楼图')
        for i in range(0,len(col)):
            worksheet.write(0,i,col[i])
        # one row per thread
        for item in range(0,len(datalist)):
            data = datalist[item]
            for index in range(0,len(data)):
                worksheet.write(item+1,index,data[index])
        workbook.save(name)
    
    
    if __name__ == '__main__':
        main()
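
  • A note on the question that was in the reply-count code above: re.findall
    always returns a list, and xlwt's worksheet.write raises an exception for
    cell values it does not support, a Python list included. So any row that
    still contains a list fails later in saveData, which is why capturing
    several image links at once appeared to "break" the reply count. A minimal
    sketch of the failure (the sheet name 'test' is just for illustration):

        import xlwt

        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet('test')
        ws.write(0, 0, '123')        # a plain string: fine
        try:
            ws.write(1, 0, ['123'])  # a list, as re.findall returns one
        except Exception as e:
            print(e)                 # xlwt rejects the unexpected data type

    The fix used in getdata above is to keep only match[0], or '\n'.join(...)
    for the image links, so that every cell is a plain string.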
    
  • Original post: https://www.cnblogs.com/aqiong/p/13631494.html