在编码问题上被困了好久,BeautifulSoup的相关问题在上一篇blog里也分析过了。
最后是在写文件时改用gb18030编码才解决的。
| 编码名称 | 用途 |
| --- | --- |
| utf8 | 所有语言 |
| gbk | 简体中文 |
| gb2312 | 简体中文 |
| gb18030 | 简体中文 |
| big5 | 繁体中文 |
| big5hkscs | 繁体中文 |
UnicodeEncodeError: 'gbk' codec can't encode character '\xXX' in position XX
哎,我就是栽在了GBK手上
"""Scrape the replies of one Baidu Tieba thread and append them to a text file.

For every page of the thread the script downloads the HTML, locates each post
element, reads the author name and date from the post's ``data-field``
attribute and the body text from its ``d_post_content`` child, and appends the
result to ``data/test.txt`` encoded as gb18030.
"""
import json
import os
import re
# import io
# import sys
import traceback
from multiprocessing.dummy import Pool as ThreadPool

import requests
from bs4 import BeautifulSoup

# Change the default encoding of standard output (kept disabled, as in the
# original listing).
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

# Class-attribute pattern identifying one post element; compiled once because
# it does not change between pages.
_POST_CLASS = re.compile('l_post j_l_post l_post_bright')

# Browser-like User-Agent sent with every page request.
_HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}


def writeRes(Res):
    """Append one reply record to ``data/test.txt`` (gb18030 encoded).

    Args:
        Res: Mapping with the keys ``'date'``, ``'user_name'`` and ``'text'``.

    Raises:
        KeyError: If one of the three keys is missing.
        OSError: If the output file or its directory cannot be created.
    """
    filename = 'data/test.txt'
    # The output directory may not exist on a fresh checkout; without this
    # every write would fail.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # NOTE(review): the three fields are separated by single spaces here; the
    # published listing may have lost '\n' escapes -- confirm the intended
    # record layout before changing it.
    # str() on every field so a non-string value (e.g. a JSON null user name)
    # is written out instead of raising TypeError.
    record = ''.join((
        '回帖时间:' + str(Res['date']) + ' ',
        '回帖人:' + str(Res['user_name']) + ' ',
        '回帖内容:' + str(Res['text']) + ' ',
    ))
    with open(filename, 'a', encoding='gb18030') as f:
        f.write(record)


def getHTML(url, pages, header, timeout=10):
    """Download one page of the thread and return its decoded HTML.

    Args:
        url: Thread URL.
        pages: Page number, sent as the ``pn`` query parameter.
        header: HTTP headers for the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or ``""`` when the request fails (a failure message is
        printed in that case).
    """
    parameters = {'pn': pages}
    try:
        r = requests.get(url, params=parameters, headers=header,
                         timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Connection errors, timeouts and HTTP error statuses all land here;
        # KeyboardInterrupt is deliberately not caught.
        print('网站获取失败')
        return ""
    # Decode with the encoding detected from the body rather than the one
    # declared in the response headers.
    r.encoding = r.apparent_encoding
    return r.text


def _extract_post(Info):
    """Build the reply record (user_name, date, text) for one post tag."""
    # data-field carries JSON-style text. Parse it as JSON rather than
    # rewriting null/true/false into Python literals and eval()-ing content
    # that comes from a remote page.
    field = json.loads(Info.attrs['data-field'])
    content = Info.find(attrs={'class': 'd_post_content'})
    return {
        'user_name': field['author']['user_name'],
        'date': field['content']['date'],
        'text': content.text.replace(' ', ''),
    }


def parse(url, first_page=1, last_page=699):
    """Fetch and store the replies of pages ``first_page``..``last_page``.

    Each page is handled independently: a page that cannot be fetched or
    parsed is reported and skipped, and the loop moves on to the next one.
    Records of a page are written as they are extracted, so posts preceding a
    failing post on the same page are still stored.

    Args:
        url: Thread URL.
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
    """
    for pages in range(first_page, last_page + 1):
        try:
            html = getHTML(url, pages, _HEADER)
            if not html:
                # A failed download must not be reported as a parsed page.
                print('第{}页解析失败'.format(pages))
                continue
            Soup = BeautifulSoup(html, 'html.parser')
            for Info in Soup.find_all(class_=_POST_CLASS):
                writeRes(_extract_post(Info))
            print('第{}页解析成功'.format(pages))
        except Exception:
            # Best effort per page, but Ctrl-C still stops the run.
            # Call traceback.print_exc() here to see the cause when debugging.
            print('第{}页解析失败'.format(pages))
            continue


def main():
    """Scrape the hard-coded Tieba thread."""
    url = 'http://tieba.baidu.com/p/3522395718'
    parse(url)


if __name__ == '__main__':
    main()