I've recently been working on an AI celebrity-recommendation project, and the first step is collecting data, so I've spent the past few days on crawlers. Part of the job is to complete the data my senior classmate already scraped: I crawl Baidu Baike myself to fill in the missing entries in the database.

When launching the spider I hit a problem. I need to iterate over a table, extract the celebrity name and the work title, use them in turn as the spider's search keyword, and start the spider to collect the data. In other words, the spider has to be started inside a loop. When I wrote this myself I kept running into ReactorNotRestartable, and searching online got me nowhere. I finally asked my senior classmate, and after a lot of fiddling we got it working.
I'm writing this post to share the approach, in the hope that it helps anyone who runs into the same problem later.
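For context, the error comes from starting Scrapy the naive way inside the loop, roughly like the following minimal sketch (the spider name and keywords here are made up; this is not the project's actual code):

# Naive approach (sketch): CrawlerProcess.start() runs the Twisted reactor,
# and a reactor can only be started once per process, so the second loop
# iteration raises twisted.internet.error.ReactorNotRestartable.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

for keyword in ['keyword 1', 'keyword 2']:
    process = CrawlerProcess(get_project_settings())
    process.crawl('baidubaike', keyword=keyword)  # spider name is an assumption
    process.start()  # works the first time, fails on every later iteration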
Solution:
For each task, schedule a separate crawl job on a CrawlerRunner, add the Deferred returned by runner.crawl() to a set dfs, then wrap dfs in a DeferredList that stops the reactor once every crawl has finished, and finally start the reactor.
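To see the DeferredList part of the pattern in isolation, here is a minimal Twisted-only sketch (no Scrapy; the deferLater calls merely stand in for the Deferreds that runner.crawl() returns):

# Twisted-only sketch: collect one Deferred per task and stop the reactor
# only after every Deferred in the set has fired.
from twisted.internet import reactor, defer, task

dfs = set()
for seconds in (1.0, 2.0):
    d = task.deferLater(reactor, seconds, print, 'task finished after', seconds)
    dfs.add(d)

defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
reactor.run()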
The core function, crawl_run, is as follows:
def crawl_run(proc, spider):
    # proc is a list of task dicts, spider is the spider class to run
    runner = CrawlerRunner(settings)
    dfs = set()
    for task in proc:
        try:
            print(task)
            d = runner.crawl(spider, task)  # schedule one crawl per task
            dfs.add(d)
        except Exception:
            info = sys.exc_info()
            print('{0}:{1}'.format(info[0], info[1]))
    # stop the reactor only after every scheduled crawl has finished
    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    reactor.run()
The core code for launching the spider. The key point is that crawl_run runs in a new child process for every row: each child process gets its own Twisted reactor, so reactor.run() is always a first start and ReactorNotRestartable never occurs:
# keyword: key_search changes with each row of the dataset
task = {'keyword': key_search,
        'tid': 200,
        'pid': 2001,
        }

# launch the spider
if need_find is True:
    proc = []
    proc.append(task)
    if len(proc) >= 1:
        # run crawl_run in a fresh child process with its own reactor;
        # the second argument is the spider class (BaidubaikeSpider in the full code below)
        p = Process(target=crawl_run, args=(proc, BaidubaikeSpider,))
        p.start()
        p.join(timeout=180)  # wait up to 180 seconds for the crawl to finish
        proc.clear()
        print("setup crawl!")
else:
    print('waiting...')
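One caveat about p.join(timeout=180): the timeout only limits how long the parent waits, it does not kill a crawl that is still running. If you want a hard time limit, you can terminate the child afterwards; this is an optional addition, not part of the original code:

# Optional hardening (not in the original code): join() with a timeout does not
# terminate the child process, so kill it explicitly if the crawl overran.
p.join(timeout=180)
if p.is_alive():
    p.terminate()  # force-stop a crawl that exceeded the time limit
    p.join()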
The complete implementation is as follows:
from baidu.spiders.baidus import *
from baidu.spiders.baidubaike import *
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.cmdline import execute
from twisted.internet import reactor, defer
from multiprocessing import Process
import mysql.connector
import os
import csv
import re
import sys

# scrapy.conf is deprecated, so load the project settings explicitly
settings = get_project_settings()


def crawl_run(proc, spider):
    # proc is a list of task dicts, spider is the spider class to run
    runner = CrawlerRunner(settings)
    dfs = set()
    for task in proc:
        try:
            print(task)
            d = runner.crawl(spider, task)  # schedule one crawl per task
            dfs.add(d)
        except Exception:
            info = sys.exc_info()
            print('{0}:{1}'.format(info[0], info[1]))
    # stop the reactor only after every scheduled crawl has finished
    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    reactor.run()


def connect_mysql():
    conn = mysql.connector.connect(host="localhost", user='root', password='123456',
                                   database="superstar", charset='utf8')
    cursor = conn.cursor()
    cursor.execute('select * from test_star')
    D = cursor.fetchall()
    for d in D:
        print(type(d))
        print(d[1])
        print(d[2])


# connect_mysql()

if __name__ == '__main__':
    runner = CrawlerRunner(settings)
    # access the database
    conn = mysql.connector.connect(host="localhost", user='root', password='123456',
                                   database="superstar", charset='utf8')
    cursor = conn.cursor()
    cursor.execute('select workname,star,role,id from tbl_works where id >= 4316 and id <= 6315')
    D = cursor.fetchall()
    # iterate over the rows read for the given id range
    for d in D:
        # decide whether this row needs to be crawled
        flag_workname = True
        flag_star = True
        flag_role = True
        work_name = ""
        star_name = ""
        if d[0] == "":
            flag_workname = False
        if d[1] == "":
            flag_star = False
        if d[2] == "":
            flag_role = False
        if flag_star:
            separate = re.findall(r'\S+', d[1])
            star_name = separate[0]
        if flag_workname:
            work_name = d[0]
        if flag_workname and flag_star and flag_role:
            need_find = True
        else:
            need_find = False
        if flag_role:
            role_separate = re.findall(r'\S+', d[2])
            if len(role_separate) > 2:
                if role_separate[1] == "(饰":
                    need_find = False
        # d[3] is the row id; convert it in case the column is an integer
        key_search = work_name + ' ' + star_name + ' ' + str(d[3])
        print(key_search)
        task = {'keyword': key_search,
                'tid': 200,
                'pid': 2001,
                }
        # launch the spider
        if need_find is True:
            proc = []
            proc.append(task)
            if len(proc) >= 1:
                # run the crawl in a fresh child process with its own reactor
                p = Process(target=crawl_run, args=(proc, BaidubaikeSpider,))
                p.start()
                p.join(timeout=180)
                proc.clear()
                print("setup crawl!")
        else:
            print('waiting...')
        print('waiting.1..')
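The spider itself is not shown above. Because crawl_run passes the whole task dict as a positional argument in runner.crawl(spider, task), the spider's __init__ has to accept it. Here is a hypothetical sketch of what that could look like; the real BaidubaikeSpider presumably builds its requests and parsing differently:

# Hypothetical sketch of a spider that accepts the task dict passed by
# runner.crawl(spider, task); the real BaidubaikeSpider may look quite different.
import scrapy
from urllib.parse import quote

class BaidubaikeSpiderSketch(scrapy.Spider):
    name = 'baidubaike_sketch'

    def __init__(self, task=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.task = task or {}
        # build the start URL from the keyword carried in the task dict
        keyword = self.task.get('keyword', '')
        self.start_urls = ['https://baike.baidu.com/item/' + quote(keyword)]

    def parse(self, response):
        # placeholder parse: carry tid/pid through so a pipeline could locate the source row
        yield {
            'keyword': self.task.get('keyword'),
            'tid': self.task.get('tid'),
            'pid': self.task.get('pid'),
            'title': response.css('h1::text').get(),
        }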