• Common XPath operations


    1. Getting all the text under a node:

    data = response.xpath('//div[@id="zoomcon"]')
    content = ''.join(data.xpath('string(.)').extract())

    This snippet extracts all of the text data under the div with the given id. An example page with this structure:


    http://www.nhfpc.gov.cn/fzs/s3576/200804/cdbda975a377456a82337dfe1cf176a1.shtml 

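    As a standalone illustration, here is a minimal sketch (the HTML snippet is made up): string(.) returns the string value of the whole node, so text inside nested tags is not split apart the way //text() would split it.

    # -*- coding: utf-8 -*-
    from scrapy.selector import Selector

    html = '<div id="zoomcon"><p>First <b>bold</b> part.</p><p>Second part.</p></div>'
    sel = Selector(text=html)

    # string(.) concatenates every descendant text node of the div.
    data = sel.xpath('//div[@id="zoomcon"]')
    content = ''.join(data.xpath('string(.)').extract())
    print content   # First bold part.Second part.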

    2. Getting the value of an HTML node attribute

    >>> response.xpath("//div[@id='zoomtime']").extract()
    [u'<div class="content_subtitle" id="zoomtime" title="\u53d1\u5e03\u65e5\u671f\uff1a2010-10-26"><span>\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd\u56fd\u5bb6\u536b\u751f\u548c\u8ba1\u5212\u751f\u80b2\u59d4\u5458\u4f1a</span><span class="wzurl_tt" style="margin-left:10px;"></span><span style="margin-left:10px;">2010-10-26</span>
                    <span style="margin-left:30px;"></span> </div>']
    >>> response.xpath("//div[@id='zoomtime']/@title").extract()
    [u'\u53d1\u5e03\u65e5\u671f\uff1a2010-10-26']

    What is needed here is the value of the title attribute on the div with that id; appending /@title to the XPath returns it. The escaped string decodes to 发布日期：2010-10-26, i.e. "publication date: 2010-10-26".

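    The same attribute lookup as a self-contained sketch (the HTML snippet is made up to mirror the page above):

    # -*- coding: utf-8 -*-
    from scrapy.selector import Selector

    html = u'<div id="zoomtime" title="发布日期：2010-10-26"><span>2010-10-26</span></div>'
    sel = Selector(text=html)

    # /@title selects the attribute node rather than the element itself.
    title = sel.xpath("//div[@id='zoomtime']/@title").extract()[0]
    print title.split(u'：')[1]   # -> 2010-10-26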

    The Scrapy project layout is shown below.

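    A typical structure for a project like this one (assumed; the actual project may differ in detail):

    nhfpc/
    ├── scrapy.cfg
    └── nhfpc/
        ├── __init__.py
        ├── items.py
        ├── pipelines.py
        ├── settings.py
        └── spiders/
            ├── __init__.py
            ├── common_lib.py
            └── nhfpc.py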

    nhfpc.py

    # -*- coding: utf-8 -*-
    import scrapy
    import sys
    import hashlib
    # scrapy.contrib paths work on Scrapy < 1.0; later releases moved these
    # to scrapy.spiders and scrapy.linkextractors.
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors import LinkExtractor
    from datetime import datetime
    from common_lib import insertDB
    
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    # Declared for completeness; parse_item below writes straight to MySQL
    # via insertDB instead of yielding these items.
    class NhfpcItem(scrapy.Item):
        url = scrapy.Field()
        name = scrapy.Field()
        description = scrapy.Field()
        size = scrapy.Field()
        dateTime = scrapy.Field()
        
    
    
    class NhfpcSpider(CrawlSpider):
        name = "nhfpc"
        allowed_domains = ["nhfpc.gov.cn"]
        # Seed the crawl with the paginated list pages (list.shtml ... list_7.shtml).
        start_urls = (
            'http://www.nhfpc.gov.cn/fzs/pzcfg/list.shtml',
            'http://www.nhfpc.gov.cn/fzs/pzcfg/list_2.shtml',
            'http://www.nhfpc.gov.cn/fzs/pzcfg/list_3.shtml',
            'http://www.nhfpc.gov.cn/fzs/pzcfg/list_4.shtml',
            'http://www.nhfpc.gov.cn/fzs/pzcfg/list_5.shtml',
            'http://www.nhfpc.gov.cn/fzs/pzcfg/list_6.shtml',
            'http://www.nhfpc.gov.cn/fzs/pzcfg/list_7.shtml',
        )
        
    
        rules = (
            # Detail pages contain a six-digit YYYYMM segment in the URL;
            # those are parsed as items.
            Rule(
                LinkExtractor(allow=r'.*\d{6}/.*'),
                callback='parse_item'
            ),
            # Links mentioning 201307 are followed but not parsed.
            Rule(
                LinkExtractor(allow='.*201307.*'),
                follow=True,
            ),
        )
    
        def parse_item(self, response):
            # The title usually sits in a div with id "zoomtitle"; some pages
            # use the shorter id "zoomtitl", so fall back to that one.
            retList = response.xpath("//div[@id='zoomtitle']/*/text()").extract()
            title = ""

            if len(retList) == 0:
                retList = response.xpath("//div[@id='zoomtitl']/*/text()").extract()

            if len(retList) > 0:
                title = retList[0].strip()
            
    
            # Body text lives in div#zoomcon on most pages and div#contentzoom
            # on others; string(.) flattens all descendant text nodes.
            content = ""
            data = response.xpath('//div[@id="zoomcon"]')

            if len(data) == 0:
                data = response.xpath('//div[@id="contentzoom"]')
            content = ''.join(data.xpath('string(.)').extract())
    
    
            # The publication date is in the title attribute of div#zoomtime,
            # formatted "发布日期：2010-10-26"; some pages expose it in a
            # <ucmspubtime> element instead.
            pubTime = "1970-01-01 00:00:00"
            time = response.xpath("//div[@id='zoomtime']/@title").extract()

            if len(time) == 0:
                time = response.xpath("//ucmspubtime/text()").extract()
            else:
                # Keep the date part after the full-width colon.
                time = ''.join(time).split(u'：')[1]

            pubTime = ''.join(time) + " 00:00:00"
    
            #insertTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            insertTime = datetime.now()
            webSite = "nhfpc.gov.cn"
            
            # Order must match the placeholders in common_lib.insertDB:
            # title, md5url, pub_time, inser_time, website, content, url.
            values = []
            values.append(title)

            # MD5 of the URL gives a fixed-length key for deduplication.
            md5Url = hashlib.md5(response.url.encode('utf-8')).hexdigest()
            
            values.append(md5Url)
            values.append(pubTime)
            values.append(insertTime)
            values.append(webSite)
            values.append(content)
            values.append(response.url)
            #print values
            insertDB(values)
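    With the project in place, the crawl is started from the project root with Scrapy's standard command:

    scrapy crawl nhfpc

    Writing to MySQL directly from parse_item works, but the more idiomatic Scrapy design would be to yield NhfpcItem objects and do the database write in an item pipeline; the direct insertDB call simply keeps this example short.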

    common_lib.py

    #!/usr/bin/python
    #-*-coding:utf-8-*-
    
    '''
    This file includes the common routines needed by the crawler project.
    Author: Justnzhang (uestczhangchao@qq.com)
    Time: 2014-07-28 15:03:44
    '''
    import sys
    import MySQLdb
    
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    def insertDB(values):
        '''Insert one crawled record. values must hold, in order:
        title, md5url, pub_time, insert_time, website, content, url.'''
        print "insertDB"
        print values
        try:
            conn_local = MySQLdb.connect(host='192.168.30.7', user='xxx', passwd='xxx', db='xxx', port=3306)
            conn_local.set_character_set('utf8')
            cur_local = conn_local.cursor()
            cur_local.execute('SET NAMES utf8;')
            cur_local.execute('SET CHARACTER SET utf8;')
            cur_local.execute('SET character_set_connection=utf8;')

            # NULL lets AUTO_INCREMENT assign hid; the seven placeholders map
            # to the remaining columns of health_policy.
            cur_local.execute(
                "insert into health_policy values(NULL,%s,%s,%s,%s,%s,%s,%s)",
                values)
            conn_local.commit()
            cur_local.close()
            conn_local.close()
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    
    
    if __name__ == '__main__':
        # Smoke test with dummy values for the seven insert columns.
        values = ['title', 'abcd1234', '2014-04-11 00:00:00',
                  '2014-04-11 00:00:00', 'nhfpc.gov.cn', 'content',
                  'http://www.nhfpc.gov.cn/']
        insertDB(values)

    The MySQL schema for the health_policy table:

    SET FOREIGN_KEY_CHECKS=0;
    
    -- ----------------------------
    -- Table structure for health_policy
    -- ----------------------------
    DROP TABLE IF EXISTS `health_policy`;
    CREATE TABLE `health_policy` (
      `hid` int(11) NOT NULL AUTO_INCREMENT,
      `title` varchar(1000) DEFAULT NULL COMMENT 'policy title',
      `md5url` varchar(1000) NOT NULL COMMENT 'MD5 hash of the URL',
      `pub_time` datetime DEFAULT NULL COMMENT 'publication time',
      `inser_time` datetime NOT NULL COMMENT 'insertion time',
      `website` varchar(1000) DEFAULT NULL COMMENT 'source website',
      `content` longtext COMMENT 'policy content',
      `url` varchar(1000) DEFAULT NULL,
      PRIMARY KEY (`hid`)
    ) ENGINE=InnoDB AUTO_INCREMENT=594 DEFAULT CHARSET=utf8;
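    After a crawl, a quick sanity check (a hypothetical query, only to confirm that rows arrived) could be:

    SELECT hid, title, pub_time, website
    FROM health_policy
    ORDER BY pub_time DESC
    LIMIT 10;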