因为编码问题卡在这里了,还是不知道咋解决。先记录下代码,明天去研究。
# coding=utf-8
"""Report where a domain ranks in Baidu's results for a search phrase.

All text is kept as unicode internally and is encoded explicitly (UTF-8)
only at the I/O boundaries: the query string, the console and the output
file. The helpers avoid Python-2-only syntax so they can be imported and
tested on their own.
"""
import io
import re

try:
    from urllib.parse import quote_plus  # Python 3
except ImportError:
    from urllib import quote_plus  # Python 2

# The unicode text type: ``unicode`` on Python 2, ``str`` on Python 3.
_TEXT = type(u'')

# Compiled once at import time instead of on every call.
_ANCHOR_RE = re.compile(r'<a .*?>(.*?)</a>')
_CACHE_DATE_RE = re.compile(
    r'<span class="g">.*?(\d{4}-\d{1,2}-\d{1,2})\s*</span>')

# Domain whose ranking is looked up; read by getrank().
domain = 'www.douban.com/'


def _to_text(value):
    """Return *value* as unicode text, decoding byte strings as UTF-8."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return _TEXT(value)


def _emit(text):
    """Print unicode *text* without relying on an implicit ASCII codec."""
    if str is bytes:
        # Python 2: printing unicode to a non-UTF-8 or piped stdout raises
        # UnicodeEncodeError, so hand print an explicit UTF-8 byte string.
        text = text.encode('utf-8')
    print(text)


def createURL(checkWord):
    """Build the Baidu search URL (100 results per page) for *checkWord*.

    Surrounding whitespace is stripped first, then the phrase is
    percent-encoded as UTF-8 with inner spaces turned into ``+``.
    Accepts unicode text or a UTF-8 byte string.
    """
    if not isinstance(checkWord, bytes):
        checkWord = checkWord.encode('utf-8')
    # Strip BEFORE encoding; once spaces are '+' there is nothing to strip.
    query = quote_plus(checkWord.strip())
    return 'http://www.baidu.com/s?wd=%s&rn=100' % query


def getLastURL(rawurl):
    """Fetch *rawurl* and return the final URL reported by the response."""
    # Imported here so the parsing helpers stay importable without requests.
    import requests

    return requests.get(rawurl).url


def getAtext(atext):
    """Return the text of the first ``<a>`` element in *atext*.

    ``<em>`` highlight tags are removed. Returns None if no anchor matches.
    """
    found = _ANCHOR_RE.findall(atext)
    if not found:
        return None
    return found[0].replace('<em>', '').replace('</em>', '')


def getCacheDate(t):
    """Return the ``YYYY-M-D`` date inside a ``<span class="g">`` snippet.

    Returns None when the snippet carries no date.
    """
    found = _CACHE_DATE_RE.findall(t)
    if not found:
        return None
    return found[0]


def getrank(checkWord):
    """Search Baidu for *checkWord* and report the first hit on ``domain``.

    The report line is ``word , rank , title , cache date , final URL``,
    or ``>100`` when none of the returned results mention the domain.
    The line is printed and also returned as unicode text so that callers
    can write it to a file.
    """
    # Third-party modules are imported lazily; see getLastURL().
    import BeautifulSoup
    import requests

    checkWord = _to_text(checkWord)
    page = requests.get(createURL(checkWord), allow_redirects=False).content
    soup = BeautifulSoup.BeautifulSoup(page)

    for result in soup.findAll('table', {'class': 'result'}):
        snippet = result.find('span', {'class': 'g'})
        # TODO: match the domain with a regex instead of a substring test.
        if domain not in _to_text(snippet):
            continue
        link = result.find('h3').a
        fields = [
            checkWord,
            result['id'],  # the result table's id attribute, printed as the rank
            getAtext(_to_text(link)),
            getCacheDate(_to_text(snippet)),
            getLastURL(link['href']),
        ]
        line = u' , '.join(_to_text(field) for field in fields)
        break
    else:
        # No result in the page mentioned the domain.
        line = u'>100'

    _emit(line)
    return line


if __name__ == '__main__':
    # Run the lookup first so a failed request does not truncate r.txt.
    report = getrank('梦天空 解梦')
    with io.open('r.txt', 'w', encoding='utf-8') as f:
        f.write(report)
最后放上2个链接供学习:
http://blog.wahahajk.com/2009/08/unicodedecodeerror-ascii-codec-cant.html