# 将爬取出来的数据存入mongodb中 (store the scraped data into MongoDB)
import pymongo
import requests
from lxml import etree


class QiushiSpider:
    """Scrape joke posts from qiushibaike.com and store them in MongoDB.

    One document per post is written to the ``Qiushi.qiushiinfo``
    collection with keys: name, content, laughNum, pingNum.
    """

    def __init__(self):
        # Desktop UA so the site serves the normal HTML layout.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/51.0.2704.106 "
                          "Safari/537.36 OPR/38.0.2220.41"
        }
        # Connection object
        self.conn = pymongo.MongoClient("localhost", 27017)
        # Database object
        self.db = self.conn["Qiushi"]
        # Collection object
        self.myset = self.db["qiushiinfo"]

    def getPage(self, url):
        """Fetch *url*, print the HTTP status code, and parse the body."""
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        print(res.status_code)
        self.parsePage(html)

    def parsePage(self, html):
        """Extract each post from *html* and insert it into MongoDB.

        Posts missing any expected field (content, laugh count, comment
        count) are skipped instead of crashing the whole run; a missing
        author name falls back to the site's anonymous-user label.
        """
        parseHtml = etree.HTML(html)
        # Every post lives in a <div id="qiushi_tag_...">.
        base_list = parseHtml.xpath('//div[contains(@id,"qiushi_tag_")]')
        print(base_list)
        for base in base_list:
            name = base.xpath('./div/a/h2/text()')
            if not name:
                # BUGFIX: original did `name[0] = ...` on an empty list,
                # which raises IndexError; replace the list instead.
                name = ["匿名用户"]
            content = base.xpath('./a/div/span')
            laughNum = base.xpath("./div/span[1]/i")
            pingNum = base.xpath("./div/span[2]/a/i")
            # Skip structurally incomplete posts rather than crash on [0].
            if not (content and laughNum and pingNum):
                continue
            d = {
                "name": name[0].strip(),
                "content": content[0].text.strip(" "),
                "laughNum": laughNum[0].text,
                "pingNum": pingNum[0].text,
            }
            # BUGFIX: Collection.insert was removed in pymongo 4.
            self.myset.insert_one(d)
            print("成功")


if __name__ == '__main__':
    spider = QiushiSpider()
    url = "https://www.qiushibaike.com/8hr/page/1/"
    spider.getPage(url)