1. 利用正则表达式爬取数据,并将爬取出来的数据存储到 CSV 文件以及 MySQL、MongoDB 数据库中。
import csv
import re
import warnings

import pymongo
import pymysql
import requests


class LIANJIA:
    """Scrape second-hand housing listings from lianjia.com using a regex.

    Each listing is reduced to a 3-tuple ``(name, price, sq_mPrice)`` taken
    from the page HTML and can be stored in a CSV file, a MySQL table or a
    MongoDB collection. ``parse`` currently sends results to MongoDB only.

    Args:
        mysql_config: Optional keyword arguments for ``pymysql.connect``
            (for example ``{"host": "localhost", "user": "root",
            "password": "...", "charset": "utf8"}``). When omitted, no MySQL
            connection is opened and ``saveTomysql`` cannot be used.
    """

    # One match per listing; the three groups are, in order, the text of the
    # data-el="region" link, the first <span> inside the totalPrice element
    # and the next <span> that is closed right before a </div>.
    LISTING_PATTERN = re.compile(
        '<div class="info clear">.*?data-el="region">(.*?)</a>.*?class="totalPrice"><span>(.*?)</span>.*?<span>(.*?)</span></div>',
        re.S,
    )

    def __init__(self, mysql_config=None):
        # "%s" is filled with the city abbreviation entered in workOn().
        self.url = "https://%s.lianjia.com/ershoufang/"
        # NOTE(review): requests selects a proxy by the lowercase URL scheme
        # ("http"/"https"), so this upper-case "HTTP" entry does not appear
        # to be applied to the https URLs fetched here — confirm whether a
        # proxy is actually wanted before changing the key.
        self.proxies = {"HTTP": "http://116.255.162.107:16816"}
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}

        # MySQL is optional and kept on its own attributes so that it does
        # not collide with the MongoDB handle stored in self.db below.
        self.mysql_db = None
        self.cursor = None
        if mysql_config is not None:
            # PyMySQL 1.0+ accepts connection parameters as keywords only.
            self.mysql_db = pymysql.connect(**mysql_config)
            self.cursor = self.mysql_db.cursor()

        # MongoDB: database "lianjia", collection "lianjiafang".
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn["lianjia"]
        self.tab = self.db.lianjiafang

    @staticmethod
    def _price_to_yuan(price_text):
        """Convert the scraped total-price text to an integer times 10000.

        Accepts decimal prices such as ``"358.5"`` (plain ``int()`` would
        raise ``ValueError`` on them). Presumably the page quotes the price
        in units of 10,000 yuan — confirm against the site.
        """
        return int(round(float(price_text.strip()) * 10000))

    def getHtml(self, url):
        """Download ``url`` and pass the decoded HTML on to ``parse``."""
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(
            url, proxies=self.proxies, headers=self.headers, timeout=10
        )
        response.encoding = "utf-8"
        self.parse(response.text)

    def parse(self, html):
        """Extract all listings from ``html`` and store them in MongoDB."""
        # findall returns a list of 3-tuples, one per matched listing.
        result_list = self.LISTING_PATTERN.findall(html)
        print(result_list)
        self.saveTomongo(result_list)

    def saveTocsv(self, result_list):
        """Append every tuple in ``result_list`` as a row of lianjia.csv."""
        if not result_list:
            return
        # Open the file once per batch instead of once per row, and pin the
        # encoding so the output does not depend on the platform default.
        with open("lianjia.csv", "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            for result in result_list:
                print(result)
                writer.writerow(result)

    def saveTomysql(self, result_list):
        """Insert ``result_list`` into the MySQL table lianjia.lianjiafang.

        The database and table are created on first use.

        Raises:
            RuntimeError: If the instance was created without
                ``mysql_config`` and therefore has no MySQL connection.
        """
        if self.cursor is None or self.mysql_db is None:
            raise RuntimeError(
                "MySQL is not configured; pass mysql_config to LIANJIA()"
            )

        cd_db = "create database if not exists lianjia charset utf8"
        u_db = "use lianjia"
        c_tab = "create table if not exists lianjiafang( id int primary key auto_increment, name varchar(100), price varchar(100), sq_mPrice varchar(100) )"
        ins = "insert into lianjiafang(name, price, sq_mPrice) values(%s, %s, %s)"

        # Silence "already exists" style warnings from the setup statements.
        warnings.filterwarnings("ignore")
        try:
            self.cursor.execute(cd_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_tab)
        except pymysql.MySQLError as err:
            # Setup stays best-effort, but the failure is no longer hidden.
            print("MySQL schema setup failed:", err)

        for name, price, sq_m_price in result_list:
            row = [name.strip(), self._price_to_yuan(price), sq_m_price.strip()]
            # Parameterized insert: the driver escapes the scraped values.
            self.cursor.execute(ins, row)
        # A single commit covers the whole batch.
        self.mysql_db.commit()
        print("OK")

    def saveTomongo(self, result_list):
        """Insert one MongoDB document per tuple in ``result_list``."""
        for name, price, sq_m_price in result_list:
            # NOTE(review): the keys "star" and "time" hold the price and the
            # per-square-metre text; they look like leftovers from another
            # scraper but are kept so existing documents stay consistent.
            d = {
                "name": name.strip(),
                "star": self._price_to_yuan(price),
                "time": sq_m_price.strip(),
            }
            # Collection.insert() was removed in PyMongo 4; insert_one is
            # the supported single-document API.
            self.tab.insert_one(d)
        print("OK")

    def workOn(self):
        """Ask for a city and a page count, then scrape pages 1..count."""
        city = input("请输入你要搜索的城市首拼音:")
        end = int(input("爬取多少页:"))
        base_url = self.url % city
        for x in range(1, end + 1):
            # Page 1 is the bare listing URL; later pages append "pgN/".
            url = base_url if x == 1 else base_url + "pg" + str(x) + "/"
            self.getHtml(url)

    def close(self):
        """Release the MySQL (if opened) and MongoDB connections."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.mysql_db is not None:
            self.mysql_db.close()
            self.mysql_db = None
        self.conn.close()


if __name__ == "__main__":
    lianjia = LIANJIA()
    try:
        lianjia.workOn()
    finally:
        lianjia.close()
得到csv中的结果,mysql和mongo结果就不显示了: