• selenium配合phantomjs实现爬虫功能,并把抓取的数据写入excel


    # -*- coding: UTF-8 -*-
    '''
    Created on 2016年5月13日
    
    @author: csxie
    '''
    import datetime
    from Base import BasePage
    import ExcelOperation as excel
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support.expected_conditions import text_to_be_present_in_element
    import unittest
    class JobLog(BasePage):
        url=""#目标url
        def setUp(self):
            BasePage.setUP(self)
        def tearDown(self):
            BasePage.tearDown(self)
            
                  
        def test_querysql(self):
            """
            查询到符合条件的信息并写入excel
            """
            driver=self.driver 
            driver.get(self.url)
            '''
            设置参数
            start:起始日期
            end:截止日期
            user:查询人员                
            '''
            end=datetime.date.today()
            start=end+datetime.timedelta(days=-6)
            user=None
            #起始日期        
            txtStart=driver.find_element_by_id("ctl01_txtSelectDateFrom")
            txtStart.clear()
            txtStart.send_keys(start)
            #截止日期
            txtEnd=driver.find_element_by_id("ctl01_txtSelectDateTo")
            txtEnd.clear()
            txtEnd.send_keys(end)
            #查询人员
            if(user!=None):
                txtuser=driver.find_element_by_id("ctl01_txtUser")
                txtuser.clear()
                txtuser.send_keys(user)
            #查询按钮
            btnQueryInfo=driver.find_element_by_id("ctl01_btnQueryInfo")
            btnQueryInfo.click()
            #总页数
            totalPages=int(driver.find_element_by_id("ctl01_uc_CommonPager_lb_TotalPages").text)
            i=1
            while(i<=totalPages):
                #table的xpath
                listTable=self.getTableData(".//*[@id='ctl01__wrLogList_gvlist']",colNO=9,loop=i)
                #当前页数
                currentPage=int(driver.find_element_by_id("ctl01_uc_CommonPager_lb_PageNum").text)
                if(currentPage==1):
                    excel.save_to_excel_newsheet(listTable,excelPath='joblog.xls')
                else:
                    excel.save_to_excel_oldsheet(listTable,excelPath='joblog.xls')
                if(i!=totalPages):
                    #下一页按钮
                    driver.find_element_by_id("ctl01_uc_CommonPager_NextPage").click()
                    WebDriverWait(driver,8).until(text_to_be_present_in_element(('id','ctl01_uc_CommonPager_lb_PageNum'),str(i+1)))
                i=i+1
                
                
        def getTableData(self,value,colTag='td',colNO=None,loop=None):
            '''
            获取表格数据                        
            colNO:存在合并行时,强制指定表格列数                
            '''
            driver=self.driver
            table=driver.find_element(by="xpath",value=value)                        
            rows=table.find_elements(by="tag name",value="tr")#行集合
            maxrowCount=len(rows)#表格行数
                
            #如果colNO=None不为空,则用传进来的值作为列数
            if(colNO==None): 
                cols = table.find_elements(by="tag name",value="td");
                if(cols!='td'):            
                    cols = table.find_elements(by="tag name",value="th")                          
                maxcolCount=len(cols)/len(rows)#表格列数
            else:
                maxcolCount=colNO      
                
            lists = [[] for i in range(maxrowCount)]
            #print  "列数"+str(maxcolCount)        
            #print  "行数"+str(maxrowCount)
            i=1       
            while i<=maxrowCount:   
                j=1;#初始化列
                while j<=maxcolCount:
                    try:
                        if(i==1):
                            try:
                                innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]").text
                            except:
                                colTag="th"
                                innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]").text
                        else:
                            colTag="td"            
                            if(j==3 or j==5):
                                innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]/a").get_attribute("title")
                            elif(j==8):
                                innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]").text
                                innerText=float(innerText.replace("h", ""))
                            else:
                                innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]").text
                    except:
                        innerText=""
                    #print "第"+str(i)+"行,第"+str(j)+"列"+str(innerText)
                    lists[i-1].append(innerText)                                           
                    j+=1
                i+=1
            if(loop!=1):#不是第一次循环,列头不要            
                return lists[1:]
            return lists
        
    if __name__ == "__main__":
        unittest.main()    
    # -*- coding: UTF-8 -*-
    '''
    Created on 2016年5月18日
    
    @author: csxie
    '''
    import xlwt
    from xlrd import open_workbook
    from xlutils.copy import copy
    import os
    
    def save_to_excel_newsheet(listTable,excelPath=r'C:demo.xls',sheetName='sheet1'):
        if(isinstance(listTable,list)):        
            rowNO=len(listTable);
            if(rowNO==0):
                raise ValueError,u'传入的是list是空的'            
            colNO=len(listTable[0]);
            
            wkb = xlwt.Workbook()
            sheet = wkb.add_sheet(sheetName)        
            for i in range(rowNO):
                for j in range(colNO):
                    #print listTable[i][j]
                    #print str(i)+'行'+str(j)+'列'
                    sheet.write(i,j,listTable[i][j])
            if(os.path.exists(excelPath)):
                os.remove(excelPath)       
            wkb.save(excelPath)
            return True
        else:
            raise TypeError,u'传入的类型不是list'
        
    def save_to_excel_oldsheet(listTable,excelPath=r'C:demo.xls',sheetName='sheet1'):
        if(isinstance(listTable,list)):
            rowNO=len(listTable); 
            if(rowNO==0):
                raise ValueError,u'传入的是list是空的'                
            colNO=len(listTable[0]);
            rb = open_workbook(excelPath)
            sheet_index=rb.sheet_names().index(sheetName)
            old_rows=rb.sheet_by_name(sheetName).nrows#已存在的excel中数据行数
            
            wkb = copy(rb)        
            sheet = wkb.get_sheet(sheet_index)        
            for i in range(rowNO):
                for j in range(colNO):
                    sheet.write(i+old_rows,j,listTable[i][j])
                    #print str(i)+'行'+str(j)+'列'
            wkb.save(excelPath)
            return True
        else:
            raise TypeError,u'传入的类型不是list'
            
    # -*- coding: UTF-8 -*-
    from selenium import webdriver
    import unittest
    
    class BasePage(unittest.TestCase):      
        def setUP(self):
            self.driver= webdriver.PhantomJS(executable_path="phantomjs.exe")
            #self.driver.set_window_size(1400, 1000)
            #self.driver.implicitly_wait(1)
            self.verificationErrors = []
            
        def tearDown(self):
            self.driver.quit()
            #self.driver.close()
            self.assertEqual([], self.verificationErrors)
  • 相关阅读:
    POJ1201 Intervals & TYVJ 1589 西瓜种植 差分约束
    洛谷 P3385 【模板】负环 SPFA
    Test 2018-07-19 二中集训
    《一出好戏》一中高三学子誓师大会后记
    [NOI2012] 迷失游乐园 概率 期望 基环树DP
    codeforces CF983E NN country 树上倍增
    「Luogu P3521 [POI2011]ROT-Tree Rotations」
    「高等数学学习笔记 DAY18」
    「高等数学学习笔记 DAY17」
    「高等数学学习笔记 DAY16」
  • 原文地址:https://www.cnblogs.com/changshuo/p/5722469.html
Copyright © 2020-2023  润新知