1.升级pip
pip install --upgrade pip
2.pip安装pyspider
pip install pyspider
3.安装phantomjs: https://phantomjs.org/download.html
wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
yum -y install bzip2
tar -jxvf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /opt/
mv /opt/phantomjs-2.1.1-linux-x86_64/ /opt/phantomjs
建立软链接
ln -s /opt/phantomjs/bin/phantomjs /usr/bin/
安装依赖
yum -y install fontconfig
启动验证
phantomjs
4.pyspider启动报错
ValueError: Invalid configuration:
- Deprecated option 'domaincontroller': use 'http_authenticator.domain_controller' instead.
在安装包中找到pyspider的资源包,然后找到webui目录里面的webdav.py文件打开,将第209行的 'domaincontroller' 配置(下面第一行)替换为其后的 'http_authenticator' 配置块(下面后三行)即可。
'domaincontroller': NeedAuthController(app),
'http_authenticator':{
'HTTPAuthenticator':NeedAuthController(app),
},
5.使用mysql数据库
启动pyspider
pyspider
默认会在启动目录生成data目录,存放数据,默认使用SQLite数据库
[root@iZbp1gg50ddqbgxf1jpqwwZ opt]# cd data/
[root@iZbp1gg50ddqbgxf1jpqwwZ data]# ll
total 16
-rw-r--r-- 1 root root 3072 Jan 21 17:39 project.db
-rw-r--r-- 1 root root 0 Jan 21 17:39 result.db
-rw-r--r-- 1 root root 6 Jan 21 17:39 scheduler.1d
-rw-r--r-- 1 root root 6 Jan 21 17:39 scheduler.1h
-rw-r--r-- 1 root root 6 Jan 21 17:39 scheduler.all
-rw-r--r-- 1 root root 0 Jan 21 17:39 task.db
6.创建mysql数据库
pyspider_taskdb
pyspider_projectdb
pyspider_resultdb
7.配置文件
touch /usr/lib/python2.7/site-packages/pyspider/config.json
{
"taskdb": "mysql+taskdb://root:123456@121.40.112.188:3306/pyspider_taskdb",
"projectdb": "mysql+projectdb://root:123456@121.40.112.188:3306/pyspider_projectdb",
"resultdb":"mysql+resultdb://root:123456@121.40.112.188:3306/pyspider_resultdb",
"message_queue": "redis://:123456@127.0.0.1:6379/0",
"webui": {
"port":5000,
"username": "evans",
"password": "123456",
"need-auth": true
}
}
8.安装组件
pip install mysql-connector
pip install redis  # 仅当配置中使用了redis(如message_queue)时才需要安装
9.通过配置启动
pyspider -c config.json all
10.启动脚本
#!/bin/sh
# Start pyspider (all components) in the background unless it is already
# running. The config.json next to this script is used as configuration.
#
# Exit status: 0 if pyspider was already running or has been launched,
#              1 if the script directory or the pyspider executable is missing.

# Work from the script's own directory so the relative config.json resolves.
script_dir=$(dirname -- "$0")
cd -- "$script_dir" || {
  echo "cannot change directory to $script_dir" >&2
  exit 1
}

# Match "pyspider" only as a whole command word (e.g. "/usr/bin/pyspider -c ...").
# A plain substring match would also hit this script's own process whenever the
# script file name contains "pyspider", and pyspider would then never be started.
# NOTE(review): an instance launched as "python -m pyspider.run" is not matched
# by this pattern - confirm no deployment starts it that way.
if ! pgrep -f '(^|/| )pyspider( |$)' >/dev/null 2>&1; then
  # Fail loudly instead of reporting a start that cannot have happened.
  if ! command -v pyspider >/dev/null 2>&1; then
    echo "pyspider executable not found in PATH" >&2
    exit 1
  fi
  nohup pyspider -c config.json all &
  echo "pyspider started"
fi