• Building a powerful crawler with Selenium and BeautifulSoup


    Running under Sublime Text

    1 Download and install the required packages

    BeautifulSoup

    selenium

    phantomjs

    The packages can be downloaded and installed manually; this article uses pip:

    pip install beautifulsoup4

    pip install selenium

    pip install lxml

    Note that pip install BeautifulSoup would pull in the obsolete Python 2 release; the code below imports from bs4, which is the beautifulsoup4 package, and lxml is the parser handed to BeautifulSoup. PhantomJS itself is not a Python package: it is a standalone browser binary, downloaded from phantomjs.org, whose path is passed to Selenium below.
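    To confirm the installs worked, a quick sanity check from the Python interpreter (the printed version numbers will vary):

    import bs4, selenium
    print(bs4.__version__)       # e.g. 4.x
    print(selenium.__version__)  # e.g. 3.x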

    2 Core code

    Rendering with PhantomJS

    def driver_open():
        # Spoof a desktop Chrome user agent so the site serves the normal page.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36")
        driver = webdriver.PhantomJS(
            executable_path=r'C:\Users\Administrator\AppData\Roaming\Sublime Text 3\Packages\Anaconda\phantomjs.exe',
            desired_capabilities=dcap)
        return driver
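    PhantomJS is no longer maintained, and Selenium 4 removed webdriver.PhantomJS entirely. If the code above fails on a modern Selenium, a rough headless-Chrome equivalent (assuming chromedriver is installed and on the PATH) looks like this:

    from selenium import webdriver

    def driver_open_chrome():
        opts = webdriver.ChromeOptions()
        opts.add_argument('--headless')
        # Same user-agent spoof as the PhantomJS version above.
        opts.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/60.0.3100.0 Safari/537.36')
        return webdriver.Chrome(options=opts)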

    Parsing with BeautifulSoup

    def get_content(driver, url):
        driver.get(url)
        # Fixed wait to give the page's JavaScript time to render.
        time.sleep(30)
        content = driver.page_source.encode('utf-8')
        driver.close()
        soup = BeautifulSoup(content, 'lxml')
        return soup
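    The fixed time.sleep(30) wastes time when the page renders quickly and still fails when it renders slowly. A sketch using Selenium's explicit waits instead; the CSS selector to wait for is an assumption and depends on the target page:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def get_content_waiting(driver, url, selector='.basic-td', timeout=30):
        driver.get(url)
        # Return as soon as the element appears, failing after `timeout` seconds.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        soup = BeautifulSoup(driver.page_source, 'lxml')
        driver.close()
        return soup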

    3 Full source

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from selenium import webdriver
    import time
    from bs4 import BeautifulSoup
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    def driver_open():
        # Spoof a desktop Chrome user agent so the site serves the normal page.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36")
        driver = webdriver.PhantomJS(
            executable_path=r'C:\Users\Administrator\AppData\Roaming\Sublime Text 3\Packages\Anaconda\phantomjs.exe',
            desired_capabilities=dcap)
        return driver

    def get_content(driver, url):
        driver.get(url)
        # Fixed wait to give the page's JavaScript time to render.
        time.sleep(30)
        content = driver.page_source.encode('utf-8')
        driver.close()
        soup = BeautifulSoup(content, 'lxml')
        return soup

    def get_basic_info(soup):
        basic_info = soup.select('.baseInfo_model2017')  # collected but not used below
        # The original referenced an undefined `company` variable; deriving it
        # from the page <title> here is an assumption about the page layout.
        company = soup.title.text if soup.title else ''
        zt = soup.select('.td-regStatus-value > p')[0].text.replace("\n", "").replace(" ", "")
        basics = soup.select('.basic-td > .c8 > .ng-binding')
        zzjgdm = basics[3].text
        tyshxydm = basics[7].text
        print(u'Company name: ' + company)
        print(u'Registration status: ' + zt)
        # print(basics)
        print(u'Organization code: ' + zzjgdm)
        print(u'Unified social credit code: ' + tyshxydm)

    if __name__ == '__main__':
        url = "http://www.tianyancha.com/company/2310290454"
        driver = driver_open()
        soup = get_content(driver, url)
        print(soup.body.text)
        print('---- Fetching basic info ----')
        get_basic_info(soup)
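    get_basic_info indexes select() results directly (basics[3], basics[7]), so any layout change on the site raises an IndexError. A small hypothetical helper that degrades gracefully instead:

    def safe_select_text(soup, selector, index=0, default=''):
        # select() returns an empty list when the selector no longer matches,
        # so check the length before indexing.
        matches = soup.select(selector)
        return matches[index].text.strip() if len(matches) > index else default

    # e.g. zzjgdm = safe_select_text(soup, '.basic-td > .c8 > .ng-binding', index=3)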