• Scraping Baidu Tieba


    I was stuck on the encoding problem for quite a while; the BeautifulSoup side of it was already analyzed in the previous blog post.

    In the end, writing the output file with gb18030 encoding solved it.

    Encoding    Used for
    utf8        all languages
    gbk         Simplified Chinese
    gb2312      Simplified Chinese
    gb18030     Simplified Chinese
    big5        Traditional Chinese
    big5hkscs   Traditional Chinese

    UnicodeEncodeError: 'gbk' codec can't encode character '\xXX' in position XX

    Sigh, GBK is exactly what tripped me up.
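
    To make the difference concrete, here is a minimal sketch (not part of the original crawler) using the non-breaking space '\xa0', a character that often sneaks into scraped HTML and is a common trigger of this error: gbk rejects it, while gb18030, which covers the full Unicode range, accepts it.

    # Minimal sketch: compare gbk and gb18030 on a character gbk cannot encode.
    ch = '\xa0'  # non-breaking space, common in scraped HTML

    try:
        ch.encode('gbk')
    except UnicodeEncodeError as e:
        print('gbk failed:', e)

    print('gb18030 ok:', ch.encode('gb18030'))  # gb18030 covers all of Unicode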

    from bs4 import BeautifulSoup
    from multiprocessing.dummy import Pool as ThreadPool  # imported but not used below
    import requests
    import re
    import os
    # import io
    # import sys
    import traceback

    # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # change stdout's default encoding

    def writeRes(Res):
        filename = 'data/test.txt'  # the data/ directory must already exist
        with open(filename, 'a', encoding='gb18030') as f:
            f.write('回帖时间:' + str(Res['date']) + '\n')   # reply time
            f.write('回帖人:' + Res['user_name'] + '\n')      # reply author
            f.write('回帖内容:' + Res['text'] + '\n\n')       # reply text

    def getHTML(url, pages, header):
        try:
            parameters = {'pn': pages}
            r = requests.get(url, params=parameters, headers=header)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except:
            print('网站获取失败')  # failed to fetch the page
            return ""

    def parse(url):
        # parse every page of the thread
        for pages in range(1, 700):
            try:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
                html = getHTML(url, pages, header)
                Soup = BeautifulSoup(html, 'html.parser')
                InfoList = Soup.find_all(class_=re.compile('l_post j_l_post l_post_bright'))
                # the data-field attribute holds a JSON-like string; turn it into a dict
                Res = {}
                for Info in InfoList:
                    s = Info.attrs['data-field']
                    s = s.replace('null', 'None')
                    s = s.replace('true', 'True')
                    s = s.replace('false', 'False')
                    s = eval(s)
                    temp = Info.find(attrs={'class': 'd_post_content'})
                    Res['user_name'] = s['author']['user_name']
                    Res['date'] = s['content']['date']
                    Res['text'] = temp.text.replace(' ', '')
                    # print('Hello')
                    writeRes(Res)
                print('第{}页解析成功'.format(pages))  # page parsed successfully
            except:
                # traceback.print_exc()
                print('第{}页解析失败'.format(pages))  # page failed to parse
                continue

    def main():
        url = 'http://tieba.baidu.com/p/3522395718'
        parse(url)

    if __name__ == '__main__':
        main()
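
    One note on the parsing above: the data-field attribute is JSON, and the null/true/false replacements followed by eval() are a workaround for that. Since eval() on page content is risky, a safer variant is to parse it with the json module; this is my own sketch, not part of the original script:

    import json

    def parse_data_field(info):
        # `info` is one tag from InfoList; json.loads handles null/true/false
        # natively, so no string replacement or eval() is needed.
        field = json.loads(info.attrs['data-field'])
        return field['author']['user_name'], field['content']['date']

    The rest of the loop would stay the same; only the eval() block changes.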
• Original post: https://www.cnblogs.com/ducklu/p/9010469.html