The crawler could already collect all of the Douban movie information I need. Today I mainly revised the Douban scraping code and made a few optimizations. The two main pieces of code are:
import json
import re
import time
import traceback

import requests
from bs4 import BeautifulSoup


def insert_data(data_beans, headers, cursor, conn):
    try:
        for data_bean in data_beans:        # one page of 20 movies
            for movie in data_bean:
                # fields available directly in the list API response
                score = movie["rate"].replace(" ", "")
                director_str = ""
                for j in movie["directors"]:   # list of director names
                    director_str = director_str + " " + j
                name = movie["title"].replace(" ", "")
                img = movie["cover"].replace(" ", "")
                star_str = ""
                for j in movie["casts"]:       # list of cast names
                    star_str = star_str + " " + j
                # URL of the movie's detail page
                url_details = movie["url"].replace(" ", "")
                r = requests.get(url_details, headers=headers)
                soup_bean = BeautifulSoup(r.text, "lxml")
                # fields scraped from the detail page
                span = soup_bean.find_all("span", {"property": "v:genre"})
                type_str = ""
                for s in span:
                    type_str = type_str + " " + s.text
                span = soup_bean.find_all("span", {"property": "v:runtime"})
                timelen = span[0].text.replace(" ", "")
                span = soup_bean.find_all("span", {"property": "v:initialReleaseDate"})
                date = span[0].text.replace(" ", "")
                span = soup_bean.find("a", {"class": "rating_people"})
                scorenum = span.text.replace(" ", "")
                span = soup_bean.find("span", {"property": "v:summary"})
                summary = span.text.replace(" ", "")   # strip whitespace
                # country/region and language are pulled from the raw HTML with regexes
                ex = ' <span class="pl">制片国家/地区:</span> (.*?)<br/>'
                test = re.findall(ex, r.text, re.S)
                area = test[0].replace(" ", "")
                ex2 = '<span class="pl">语言:</span> (.*?)<br/>'
                test = re.findall(ex2, r.text, re.S)
                language = test[0].replace(" / ", " ")
                print(url_details)
                # insert one row per movie and commit the transaction
                sql = "insert into moviebean values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql, [name, star_str, director_str, type_str, area, date,
                                     summary, score, language, img, scorenum, timelen])
                conn.commit()
        print(f"{time.asctime()} finished inserting data")
    except Exception:
        traceback.print_exc()
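The insert statement above expects a moviebean table with twelve columns in the order name, cast, director, genre, area, date, summary, score, language, cover image, rating count, runtime. The original table definition isn't shown here, so the following is only a minimal sketch of what it might look like (the column names and VARCHAR/TEXT types are my assumptions, not the real schema), written as a small Python helper:

def create_moviebean_table(cursor, conn):
    # Hypothetical schema inferred from the parameter order of the insert above;
    # adjust names and types to match the actual table.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS moviebean (
            `name`     VARCHAR(255),
            `star`     VARCHAR(1024),
            `director` VARCHAR(512),
            `type`     VARCHAR(255),
            `area`     VARCHAR(255),
            `date`     VARCHAR(255),
            `summary`  TEXT,
            `score`    VARCHAR(16),
            `language` VARCHAR(255),
            `img`      VARCHAR(512),
            `scorenum` VARCHAR(32),
            `timelen`  VARCHAR(32)
        ) DEFAULT CHARSET = utf8mb4
    """)
    conn.commit()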
def get_tencent_data():
    # Douban search API: each request returns 20 movies, paged via the "start" offset
    url_bean = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start='
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    }
    conn, cursor = get_conn()
    data_beans = []
    num = 3240   # resume offset; earlier runs stopped at 1440/3020/2760/3100/3180
    b = 0
    while b <= 500:
        a = 1
        b = b + 1
        while a <= 1:
            num_str = '%d' % num
            num = num + 20
            a = a + 1
            # fetch one page of the Douban movie list
            r = requests.get(url_bean + num_str, headers=headers)
            print(num_str)
            res_bean = json.loads(r.text)
            print(url_bean + num_str)
            data_beans.append(res_bean["data"])
        print(f"{time.asctime()} start inserting data")
        insert_data(data_beans, headers, cursor, conn)
        data_beans = []
    print(f"{time.asctime()} all data inserted")
    close_conn(conn, cursor)
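get_conn() and close_conn() are used above but not shown in this section. Below is a minimal sketch of the two helpers, assuming the rows go into a local MySQL database accessed through pymysql; the host, credentials and database name are placeholders, not the real configuration:

import pymysql


def get_conn():
    # open a connection and cursor to the MySQL database holding moviebean
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="123456",
                           database="movie",
                           charset="utf8mb4")
    return conn, conn.cursor()


def close_conn(conn, cursor):
    # release the cursor first, then the connection
    if cursor:
        cursor.close()
    if conn:
        conn.close()


if __name__ == "__main__":
    get_tencent_data()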