果断使用BeautifulSoup!
不想多写了,逻辑挺简单的,直接贴代码:
"""Poll a list of web pages and report hyperlinks that were not there before.

The pages to watch are the documents of the ``work`` collection (each one
carries a ``url`` field).  For every page, the ``state`` collection keeps one
document holding the hrefs seen on the previous pass (``url_set``).

Written for Python 2 (urllib2, BeautifulSoup 3).
"""
import contextlib
import time
import urllib2

import chardet
import pymongo
from BeautifulSoup import BeautifulSoup

# Seconds to sleep between two polling passes.
POLL_INTERVAL = 60
# Seconds after which a stalled download is abandoned, so that one
# unresponsive server cannot block the poller forever.
FETCH_TIMEOUT = 30

# Handle to the "notice" database of the default MongoDB server.
# NOTE(review): newer pymongo releases replace Connection with MongoClient --
# confirm against the installed version.
db = pymongo.Connection().notice  # user notice


def load_work():
    """Return a cursor over every document of the ``work`` collection."""
    return db.work.find()


def work_insert(data):
    """Insert ``data`` into the ``work`` collection."""
    db.work.insert(data)


def update_state(idx, data):
    """Replace the ``state`` document matching ``idx`` with ``data``.

    The document is created when nothing matches (upsert).
    """
    db.state.update(idx, data, upsert=True)


def to_notice(item):
    """Print the new items, or 'No news' when ``item`` is empty."""
    if not item:
        print('No news')
    else:
        print(item)


def _known_urls(page_url):
    """Return the set of hrefs stored for ``page_url`` by the previous pass."""
    # One query per page instead of one query per link.
    state = db.state.find_one({'url': page_url})
    if state is None:
        return set()
    return set(state.get('url_set', []))


def _fetch_links(page_url):
    """Download ``page_url`` and return its links as (text, href) pairs.

    Anchors without an ``href`` attribute are skipped.  Raises IOError
    (urllib2.URLError is a subclass) when the download fails.
    """
    # closing() guarantees the HTTP response is released even if read() fails.
    with contextlib.closing(
            urllib2.urlopen(page_url, timeout=FETCH_TIMEOUT)) as response:
        data = response.read()

    # chardet reports None when it cannot guess; BeautifulSoup then falls
    # back to its own detection.
    charset = chardet.detect(data)['encoding']
    if charset is not None:
        charset = charset.lower()
        if charset == 'gb2312':
            # GBK is a superset of GB2312; presumably chosen so that pages
            # straying outside plain GB2312 still decode.
            charset = 'GBK'

    soup = BeautifulSoup(data, fromEncoding=charset)
    links = []
    for anchor in soup.findAll('a'):
        # Look the attribute up by name: the first attribute of a tag is not
        # necessarily href, and a bare <a> has no attributes at all.
        href = anchor.get('href')
        if href:
            links.append((anchor.getText(), href))
    return links


def main_work():
    """Run one polling pass over every watched page.

    For each page: fetch it, collect the links that were not stored by the
    previous pass, hand them to ``to_notice`` and store the current link list.
    A page that cannot be downloaded is reported and skipped; its stored
    state is left untouched.
    """
    work = load_work()
    print('=====begin======')
    print(time.ctime())
    for job in work:
        page_url = job['url']
        try:
            links = _fetch_links(page_url)
        except IOError as err:
            # Keep polling the remaining pages instead of dying on one error.
            print('fetch failed: %s (%s)' % (page_url, err))
            continue

        seen = _known_urls(page_url)
        new_item = []
        url_set = []
        for text, url in links:
            url_set.append(url)
            if url not in seen:
                # Remember it so a link repeated on the page is reported once.
                seen.add(url)
                new_item.append((text, url))

        if new_item:
            # TODO notice
            to_notice(new_item)
        update_state({'url': page_url}, {'url': page_url, 'url_set': url_set})
    print('=====end======')


if __name__ == '__main__':
    while True:
        main_work()
        time.sleep(POLL_INTERVAL)