# -*- coding: utf-8 -*-
# @Time: 2020/6/22 10:09
# @Author: liruifeng
# @File: demo1.py
# @Software: PyCharm

from bs4 import BeautifulSoup          # parse the page and extract data
import re                              # regular expressions for text matching
import urllib.request, urllib.error    # build requests and fetch page data
import xlwt                            # Excel output
import sqlite3                         # SQLite database output


def main():
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)
    # savepath = ".\\豆瓣电影Top250.xls"
    dbpath = "movie.db"
    # save the data
    # saveData(datalist, savepath)    # save to Excel
    saveData2(datalist, dbpath)       # save to SQLite


# rule for the movie detail link
findLink = re.compile(r'<a href="(.*?)">')    # compiled regex object describing the string pattern
# rule for the poster image
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)    # re.S lets . match newlines too
# movie title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# movie rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# number of ratings
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# featured one-line review
findInq = re.compile(r'<span class="inq">(.*)</span>')
# related info
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)


# crawl the pages
def getData(baseurl):
    datalist = []
    for i in range(0, 10):    # call the page fetcher 10 times, 25 movies per page
        url = baseurl + str(i * 25)
        html = askURL(url)    # fetch and keep the page source
        # parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):    # collect the matching tags into a list
            data = []    # holds all fields of one movie
            item = str(item)
            # movie detail link
            link = re.findall(findLink, item)[0]    # re.findall extracts the substrings matching the pattern
            data.append(link)         # add the link
            images = re.findall(findImgSrc, item)[0]
            data.append(images)       # add the image
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]
                data.append(ctitle)   # add the Chinese title
                otitle = titles[1].replace("/", "")    # strip the stray separator
                data.append(otitle)   # add the foreign title
            else:
                data.append(titles[0])
                data.append(' ')      # leave the foreign title blank
            rating = re.findall(findRating, item)[0]
            data.append(rating)       # add the rating
            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)     # add the number of ratings
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")    # drop the trailing full stop
                data.append(inq)      # add the featured review
            else:
                data.append(" ")
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)    # remove <br/>
            bd = re.sub('/', " ", bd)                     # replace /
            data.append(bd.strip())   # strip leading/trailing whitespace
            datalist.append(data)     # a finished movie record goes into datalist
    # print(datalist)
    return datalist


# fetch the page content at the given url
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }    # fake request header / user agent
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# save the data
#### save to Excel ####
# def saveData(datalist, savepath):
#     book = xlwt.Workbook(encoding="utf-8", style_compression=0)    # create the workbook object
#     sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)    # create the worksheet
#     col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价数", "概况", "相关信息")
#     for i in range(0, 8):
#         sheet.write(0, i, col[i])
#     for i in range(0, 250):
#         print("第%d条" % (i + 1))
#         data = datalist[i]
#         for j in range(0, 8):
#             sheet.write(i + 1, j, data[j])
#     book.save(savepath)    # save the workbook
####


#### save to SQLite ####
def saveData2(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        for index in range(len(data)):
            if index == 4 or index == 5:
                continue    # score and rated are numeric, leave them unquoted
            data[index] = '"' + data[index] + '"'
        sql = '''
            insert into movie250
            (info_link, pic_link, cname, ename, score, rated, introduction, info)
            values (%s)''' % ",".join(data)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()
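# A hedged alternative, not part of the original script: the string-splicing insert in
# saveData2 breaks if a field contains a double quote, and building SQL with % formatting
# is injection-prone. saveData2_param (a hypothetical name) sketches the same save using
# sqlite3 parameter placeholders, which quote and escape each field automatically.
def saveData2_param(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    sql = '''insert into movie250
             (info_link, pic_link, cname, ename, score, rated, introduction, info)
             values (?, ?, ?, ?, ?, ?, ?, ?)'''
    cur.executemany(sql, datalist)    # one parameterized insert per 8-field movie record
    conn.commit()
    cur.close()
    conn.close()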
def init_db(dbpath):
    conn = sqlite3.connect(dbpath)    # open or create the database file
    c = conn.cursor()                 # get a cursor
    sql = '''
        create table if not exists movie250
        (id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar,
        ename varchar,
        score numeric,
        rated numeric,
        introduction text,
        info text)
    '''    # create the table
    c.execute(sql)    # run the SQL
    conn.commit()     # commit the transaction
    conn.close()      # close the connection


if __name__ == "__main__":    # entry point
    main()
    print("爬取完毕")
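# A quick read-back sketch, also not in the original: preview_db (a hypothetical helper,
# defaults assumed) prints a few saved rows so a run against movie.db can be verified.
def preview_db(dbpath="movie.db", limit=3):
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    cur.execute("select cname, score, rated from movie250 limit ?", (limit,))
    for row in cur.fetchall():
        print(row)    # one tuple per movie: (cname, score, rated)
    cur.close()
    conn.close()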