本人刚开始学习爬虫,从网上查询资料后,写了一个利用Selenium+Phantomjs动态获取网站数据信息的例子。当然,首先要安装Selenium和Phantomjs,具体安装步骤请看:
http://www.cnblogs.com/shaosks/p/6526817.html Selenium下载: https://pypi.python.org/pypi/selenium/
phantomjs使用参考:http://javascript.ruanyifeng.com/tool/phantomjs.html 及官网:http://phantomjs.org/quick-start.html
源代码如下:
# coding=utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
import os
class Crawler(object):
    """Print product entries scraped from a paginated JD.com listing.

    Pages are loaded in a PhantomJS browser driven through Selenium and the
    visible text of one container element per page is split into entries.

    NOTE(review): ``webdriver.PhantomJS`` and the ``find_element_by_*``
    helpers used here exist only in Selenium 3.x and earlier.
    """

    def __init__(self, firstUrl="https://list.jd.com/list.html?cat=9987,653,655",
                 nextUrl="https://list.jd.com/list.html?cat=9987,653,655&page=%d&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main"):
        """Remember which URLs to crawl.

        :param firstUrl: URL of the first listing page.
        :param nextUrl: URL template for the following pages; it must contain
            one ``%d`` placeholder that receives the page number.
        """
        self.firstUrl = firstUrl
        self.nextUrl = nextUrl
        # Set by CatchData(); getDetails() reads the page through it.
        self.driver = None

    @staticmethod
    def _emit(text):
        """Print one line of text on both Python 2 and Python 3."""
        if not isinstance(text, str):
            # Only reachable on Python 2, where ``text`` is a ``unicode``
            # object: encode explicitly so printing never falls back to ASCII.
            text = text.encode('utf8')
        print(text)

    def _createDriver(self):
        """Start the PhantomJS browser session used for the crawl."""
        return webdriver.PhantomJS()

    def _createWait(self, driver, timeout=10):
        """Build the explicit wait used to block until an element exists."""
        return ui.WebDriverWait(driver, timeout)

    def getDetails(self, pageIndex, id="plist"):
        """Print every entry of the currently loaded page.

        The text of the element with the given id is split on the currency
        sign; each non-empty piece is printed with the sign restored, followed
        by a spacer line. A page label is printed after the last entry.

        :param pageIndex: number of the page being printed (label only).
        :param id: id attribute of the element that holds the listing.
        :return: list of the entries that were printed, in page order.
        """
        pageText = self.driver.find_element_by_id(id).text
        entries = [u'¥' + piece for piece in pageText.split(u'¥') if piece]
        for entry in entries:
            self._emit(entry)
            self._emit(u' ')
        self._emit(u'第 ' + str(pageIndex) + u'页')
        return entries

    def CatchData(self, id="plist", totalpageCountLable="//span[@class='p-skip']/em/b",
                  maxPages=3, delay=5):
        """Crawl the listing page by page and print what was found.

        Every page is scraped right after it has been loaded, so each label
        printed by getDetails() matches the page whose entries precede it and
        the last page is no longer skipped.

        :param id: id attribute of the element that holds the listing.
        :param totalpageCountLable: XPath of the element whose text is the
            total number of pages.
        :param maxPages: upper bound on the number of pages to crawl; ``None``
            crawls all pages reported by the site. The first page is always
            scraped.
        :param delay: seconds to sleep before requesting each further page.
        :return: None
        :raises ValueError: if the page-count element does not contain an
            integer.
        """
        # Wall-clock time: the run is dominated by sleeping and network waits,
        # which a CPU-time clock would not count.
        started = time.time()
        self.driver = self._createDriver()
        try:
            wait = self._createWait(self.driver)
            self.driver.get(self.firstUrl)
            # Block until the page-count element exists; until() hands back
            # the element returned by the callable.
            countElement = wait.until(
                lambda drv: drv.find_element_by_xpath(totalpageCountLable))
            countText = countElement.text
            self._emit(u'总页数:' + countText)
            self._emit(u' ')
            pageNum = int(countText)
            if maxPages is not None:
                # Never request more pages than the site reports.
                pageNum = min(pageNum, maxPages)

            self.getDetails(1, id)
            for page in range(2, pageNum + 1):
                self._emit(u' ')
                # Pause between requests so the crawler is not banned for
                # fetching too quickly.
                time.sleep(delay)
                self.driver.get(self.nextUrl % page)
                # Make sure the listing container is present before reading it.
                wait.until(lambda drv: drv.find_element_by_id(id))
                self.getDetails(page, id)
            self._emit(u'Load Over')
            self._emit("Time: %f s" % (time.time() - started))
        finally:
            # Always shut the browser down, even when the crawl fails, so no
            # PhantomJS process is left behind.
            self.driver.quit()
def main():
    """Run the demo crawl of the JD.com listing with the default selectors."""
    # XPath of the element that shows the total number of pages.
    page_count_xpath = "//span[@class='p-skip']/em/b"
    # id of the element that holds the listing data.
    container_id = "plist"
    # URL of the first page and URL template for the pages after it.
    first_page = "https://list.jd.com/list.html?cat=9987,653,655"
    page_template = "https://list.jd.com/list.html?cat=9987,653,655&page=%d&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main"

    crawler = Crawler(first_page, page_template)
    crawler.CatchData(container_id, page_count_xpath)
# Demo run. NOTE(review): this call executes whenever the file is loaded, including on import; consider an `if __name__ == "__main__":` guard.
main()
参考:http://blog.csdn.net/eastmount/article/details/47907341