• 无法解决的问题


    学习python时做了一个爬虫爬取百度贴吧的内容,但是用BeautifulSoup得到的结果使用find_all函数却无法获取。

    getCommentInfo.py:

     1 from urllib import request
     2 import requests
     3 from bs4 import BeautifulSoup
     4 from mylog import MyLog as mylog
     5 import random
     6 
     7 class Item(object):
     8     title = None    #帖子标题
     9     firstAuthor = None  #创建者
    10     firstTime = None    #创建时间
    11     reNum = None    #总回复数
    12     content = None  #最后回复内容
    13     lastAuthor = None   #最后回复者
    14     lastTime = None     #最后回复时间
    15 
    16 class GetTiebaInfo(object):
    17     def __init__(self,url):
    18         self.url = url
    19         self.log = mylog()
    20         self.pageSum = 5
    21         self.urls = self.getUrls(self.pageSum)
    22         self.items = self.spider(self.urls)
    23         self.pipelines(self.items)
    24 
    25     def getUrls(self,pageSum):
    26         urls = []
    27         pns = [str(i*50) for i in range(pageSum)]
    28         ul = self.url.split('=')
    29         for pn in pns:
    30             ul[-1] = pn
    31             url = '='.join(ul)
    32             urls.append(url)
    33         self.log.info(u"获取URLS成功 ")
    34         return urls
    35 
    36     def spider(self,urls):
    37         items = []
    38         for url in urls:
    39             htmlContent = self.getResponseContent(url)
    40             with open("content.html","w",encoding='utf-8') as f:
    41                 f.write(htmlContent)
    42             soup = BeautifulSoup(htmlContent,'lxml')
    43             with open('soup.txt','w',encoding='utf-8') as fp:
    44                 fp.write(soup.text)
    45 
    46             tagsli = soup.find_all('li',attrs={'class':'j_thread_list clearfix'})
    47             for tag in tagsli:
    48                 item = Item()
    49                 item.title = tag.find('a',attrs={'class':'j_th_tit '}).get_text().strip()
    50                 item.firstAuthor = tag.find('span',attrs={'class':'frs-author-name-wrap'}).a.get_text().strip()
    51                 item.firstTime = tag.find('span',attrs={'title':u'创建时间'.encode('utf-8')}).get_text().strip()
    52                 item.reNum = tag.find('span',attrs={'title':u'回复'.encode('utf-8')}).get_text().strip()
    53                 item.content = tag.find('div',attrs={'class':'threadlist_abs threadlist_abs_onlyline '}).get_text().strip()
    54                 item.lastAuthor = tag.find('span',attrs={'class':'tb_icon_author_rely j_replyer'}).a.get_text().strip()
    55                 item.lastTime = tag.find('span',attrs={'title':u'最后回复时间'.encode('utf-8')}).get_text().strip()
    56                 items.append(item)
    57                 self.log.info(u'获取标题为<<%s>>的项成功 ...' %item.title)
    58         return items
    59 
    60     def pipelines(self,items):
    61         fileName = u'百度贴吧_权力的游戏.txt'.encode('utf-8')
    62         with open(fileName,'w') as fp:
    63             for item in items:
    64                 fp.write('title:%s 	 author:%s 	 firstTime:%s 
     content:%s 
     return:%s 
     lastAuthor:%s 	 lastTime:%s 
    
    
    
    '
    65                          %(item.title.encode('utf-8'),item.firstAuthor.encode('utf-8'),item.firstTime.encode('utf-8'),item.content.encode('utf-8'),item.lastTime.encode('utf-8')))
    66                 self.log.info(u'标题为<<%s>>的项输入到"%s"成功' %(item.title,fileName.decode('utf-8')))
    67 
    68     def getResponseContent(self,url):
    69         header = {
    70             'Accept': 'text/heml,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    71             'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'zh_CN,zh;q=0.8',
    72             'Connect': 'keep-alive',
    73             'User-Agent': 'Mozilla/5.0(Windows NT 6.3;WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/43.0.235'
    74         }
    75         timeout = random.choice(range(80, 180))
    76         try:
    77             response = requests.get(url,headers = header,timeout = timeout)
    78             response.encoding = 'utf-8'
    79         except:
    80             self.log.error(u'Python 返回 URL:%s 数据失败' %url)
    81         else:
    82             self.log.info(u'Python 返回URL:%s 数据成功' %url)
    83             return response.content.decode('utf-8')
    84 if __name__=='__main__':
    85     url = u'http://tieba.baidu.com/f?kw=权力的游戏&ie=utf-8&pn=50'
    86     GTI = GetTiebaInfo(url)
    View Code

    mylog.py

     1 import logging
     2 import getpass
     3 import sys
     4 
     5 #定义MyLog类
     6 class MyLog(object):
     7     def __init__(self):
     8         self.user = getpass.getuser()
     9         self.logger = logging.getLogger(self.user)
    10         self.logger.setLevel(logging.DEBUG)
    11 
    12         #日志文件名
    13         self.logFile = sys.argv[0][0:-3] + '.log'
    14         self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s
    ')
    15 
    16         #文件显示到屏幕并输出到日志文件
    17         self.logHand = logging.FileHandler(self.logFile,encoding='utf-8')
    18         self.logHand.setFormatter(self.formatter)
    19         self.logHand.setLevel(logging.DEBUG)
    20 
    21         self.logHandSt = logging.StreamHandler()
    22         self.logHand.setFormatter(self.formatter)
    23         self.logHandSt.setLevel(logging.DEBUG)
    24 
    25         self.logger.addHandler(self.logHand)
    26         self.logger.addHandler(self.logHandSt)
    27 
    28     def debug(self,msg):
    29         self.logger.debug(msg)
    30 
    31     def info(self,msg):
    32         self.logger.info(msg)
    33 
    34     def warn(self,msg):
    35         self.logger.warning(msg)
    36 
    37     def error(self,msg):
    38         self.logger.error(msg)
    39 
    40     def critical(self,msg):
    41         self.logger.critical(msg)
    42 
    43 # if __name__=='__main__':
    44 # #     mylog = MyLog()
    45 # #     mylog.debug(u"I'm debug 测试中文")
    46 # #     mylog.info("I'm info")
    47 # #     mylog.warn("I'm warn")
    48 # #     mylog.error(u"I'm error 测试中文")
    49 # #     mylog.critical("I'm critical")
    View Code

    错误:

      在getCommentInfo.py中40行左右的htmlContent可得到原html的正确内容,但经BeautifulSoup后,返回的soup内容变化,导致无法爬取结果。可从两个调式文件content.html和soup.txt得知。

  • 相关阅读:
    零点起飞学FlashCS6动画制作
    注意 方法的执行 顺序,并且 如果 为 nil的话,bool类型的数据 也默认是有值的,
    datepicker 的一个属性,
    用户体验 的一个原则,
    break 一下 便会 跳出 整个 switch ,
    原来 同一个 bundleid的项目 的下面 可以 通过这个 解决,诡异的问题,
    section 和 row,
    缓存,plist 和 json
    本地通知,UILocalNotification
    bundle id
  • 原文地址:https://www.cnblogs.com/ShadowCharle/p/10739512.html
Copyright © 2020-2023  润新知