不多说了，上代码：
import re

import pymongo
from bs4 import BeautifulSoup
from requests import request


class SpiderDouBan:
    """Scrape the Douban "Top 250" movie chart and persist one document per
    movie into MongoDB (database ``spider_db``, collection
    ``douban_movie_top250``).
    """

    def __init__(self):
        # NOTE(review): connection parameters are hard-coded; assumes a
        # MongoDB server on localhost:27017.
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client['spider_db']
        self.collection = db['douban_movie_top250']

    def get_html(self, url):
        """Download one listing page and return it parsed.

        :param url: absolute URL of a Top-250 listing page
        :return: BeautifulSoup tree of the page
        """
        html = request('get', url).text
        return BeautifulSoup(html, 'lxml')

    def get_one_page(self, soup, order):
        """Extract every movie on one listing page and insert it into MongoDB.

        :param soup: parsed page returned by :meth:`get_html`
        :param order: zero-based rank offset of the first movie on the page
        """
        # Chinese title only: alternate-title spans contain a '/' separator.
        movie_names = [span.string
                       for span in soup.find_all('span', class_='title')
                       if not re.search('/', span.string)]
        # Director/actor + year/region/genre line; drop spaces and the
        # non-breaking spaces Douban pads with, then split on '/'.
        # BUG FIX: the original pattern ' |xa0' matched the literal letters
        # "xa0" instead of the NBSP character '\xa0'.
        movie_actors = [re.sub('[ \xa0]', '', p.get_text().strip()).split('/')
                        for p in soup.find_all('p', class_='')]
        movie_rates = [span.string
                       for span in soup.find_all('span', class_='rating_num')]
        # Vote counts live in a text sibling of <span property="v:best">.
        # BUG FIX: 'w+' matched a literal 'w', not digits; use r'\d+' and
        # skip siblings whose .string is None (re.search would raise).
        comment_nums = [sib.string
                        for span in soup.find_all(attrs={'property': 'v:best'})
                        for sib in span.next_siblings
                        if sib.string and re.search(r'\d+', sib.string)]
        short_comments = [re.sub('。', '', span.string)
                          for span in soup.find_all(class_='inq')]
        for index, name in enumerate(movie_names):
            print(f'正在爬取第{order + index + 1}条数据...')
            info = movie_actors[index] if index < len(movie_actors) else []
            # Year is the first run of digits in the third-from-last field;
            # guard against malformed rows instead of raising IndexError.
            year_digits = re.findall('[0-9]+', info[-3]) if len(info) >= 3 else []
            year = year_digits[0] if year_digits else ''
            region = info[-2] if len(info) >= 2 else ''
            genre = info[-1] if info else ''
            movie_info = {
                'order': f'No.{order + index + 1}',
                'movie_name': name,
                'movie_type': f'{year}年/{region}/{genre}',
                # BUG FIX: the original stored movie_rates[index][0] — only
                # the first character of the rating ('9' instead of '9.7').
                'movie_rate': f'{movie_rates[index]}分',
                # BUG FIX: comment_num was computed but never stored.
                'comment_num': comment_nums[index] if index < len(comment_nums) else '',
                # Some movies have no one-line quote; default to '' rather
                # than crashing on index misalignment.
                'short_comment': short_comments[index] if index < len(short_comments) else '',
            }
            self.collection.insert_one(movie_info)

    def main(self, url, order):
        """Fetch one listing page and store all of its movies.

        :param url: listing-page URL
        :param order: rank offset of the page's first movie
        """
        soup = self.get_html(url)
        self.get_one_page(soup, order)


if __name__ == '__main__':
    # Reuse one spider (and therefore one MongoDB connection) for all ten
    # pages instead of constructing a new client per page.
    spider = SpiderDouBan()
    for offset in range(0, 250, 25):
        # offset doubles as the rank of the page's first movie.
        url = f'https://movie.douban.com/top250?start={offset}'
        spider.main(url, offset)
运行结果:
MongoDB存储效果: