• 运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中


    原文地址:运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中



    依赖包:
    
    1.jieba
    
    2.pymongo
    
    3.HTMLParser
    
    # -*- coding: utf-8 -*-
    """
    @author: jiangfuqiang
    """
    
    from HTMLParser import  HTMLParser
    import re
    import time
    from datetime import  date
    import pymongo
    import urllib2
    import sys
    import traceback
    import jieba
    
    # Python 2 hack: force the process-wide default string encoding to UTF-8
    # so implicit str<->unicode conversions on the Chinese page content do not
    # raise UnicodeDecodeError. reload(sys) is required because site.py
    # deletes sys.setdefaultencoding at startup.
    default_encoding = 'utf-8'
    if sys.getdefaultencoding() != default_encoding:
        reload(sys)
        sys.setdefaultencoding(default_encoding)
    # Module-level stop flag: set to True once an already-stored post is seen
    # (or a page yields nothing new), ending the paging loop in __main__.
    isExist = False
    
    class FetchCnblog(HTMLParser):
        """HTML parser for the cnblogs.com front-page post listing.

        Collects one dict per post (title, desc, readmoreLink, id, imgSrc,
        source metadata, jieba-segmented keywords) into self.result.
        Collection stops as soon as a post id <= `id` (the newest id already
        persisted) is encountered: the module-level `isExist` flag is raised
        so the caller's paging loop terminates.
        """

        def __init__(self, id):
            HTMLParser.__init__(self)
            self.result = []            # finished post dicts, newest first
            self.data = {}              # post currently being assembled
            self.isTitleLink = False    # inside <a class="titlelnk">
            self.id = id                # newest post id already persisted
            self.isSummary = False      # inside <p class="post_item_summary">
            self.isPostItem = False     # inside <div class="post_item">
            self.isArticleView = False  # inside <span class="article_view">

        def handle_data(self, data):
            # Text inside the title link is the post title; non-empty text
            # inside the summary paragraph is the post description.
            if self.isTitleLink and self.isPostItem:
                self.data['title'] = data
                self.isTitleLink = False
            elif self.isSummary and self.isPostItem:
                data = data.strip()
                if data:
                    self.data['desc'] = data

        def handle_starttag(self, tag, attrs):
            # BUG FIX: without this declaration, `isExist = True` below only
            # bound a method-local name, so the __main__ paging loop never
            # saw the stop signal and looped over every listing page.
            global isExist
            if tag == 'a':
                for key, value in attrs:
                    if key == 'class':
                        if value == 'titlelnk':
                            self.isTitleLink = True
                        elif value == 'gray' and self.isArticleView:
                            # The "gray" link inside the article_view span
                            # carries the post URL; its digits are the post id.
                            self.isArticleView = False
                            for key, value in attrs:
                                if key == 'href':
                                    self.data['readmoreLink'] = value
                                    # BUG FIX: was 'd+' (the literal letter d),
                                    # which never extracts the numeric id and
                                    # made int(result.group()) raise.
                                    reg = r'\d+'
                                    result = re.search(reg, value)
                                    self.isPostItem = False

                                    if result:
                                        self.data['id'] = int(result.group())
                                    else:
                                        self.data = {}
                                        return
                                    if self.data['id'] <= self.id:
                                        # Already stored this post: discard it
                                        # and signal the caller to stop paging.
                                        self.data = {}
                                        isExist = True
                                        return
                                    else:
                                        # BUG FIX: key was misspelled 'srouce'.
                                        # NOTE(review): existing DB documents
                                        # written under the old key are not
                                        # migrated here.
                                        self.data['source'] = "www.cnblogs.com"
                                        self.data['source_key'] = 'cnblogs'
                                        self.data['fetchTime'] = str(date.today())
                                        self.data['keyword'] = ",".join(jieba.cut(self.data['title']))
                                        self.result.append(self.data)
                                        self.data = {}

            elif tag == 'p':
                for key, value in attrs:
                    if key == 'class' and value == 'post_item_summary':
                        self.isSummary = True
            elif tag == 'img':
                for key, value in attrs:
                    if key == 'class' and value == 'pfs':
                        # Author avatar image for the post.
                        for key, value in attrs:
                            if key == 'src':
                                self.data['imgSrc'] = value

            elif tag == 'div':
                for key, value in attrs:
                    if key == 'class' and value == 'post_item_foot':
                        # Footer ends the summary section of the current post.
                        self.isSummary = False
                    elif key == 'class' and value == 'post_item':
                        self.isPostItem = True
            elif tag == 'span':
                for key, value in attrs:
                    if key == 'class' and value == 'article_view':
                        self.isArticleView = True

        def getResult(self):
            """Return the list of post dicts collected so far."""
            return self.result
    
    
    if __name__ == "__main__":
        con = pymongo.Connection('localhost', 27017)
        db = con.blog
        fetchblog = db.fetch_blog
        record = db.record
        url = "http://www.cnblogs.com/sitehome/p/%d"
        count = 1
        flag = False
        headers={
                 'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US。 rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        reco = record.find_one({"type":'cnblogs'})
        id = 0
        if reco:
            id = reco['maxId']
        while isExist == False:
            try:
                req = urllib2.Request(url%count,headers=headers)
                request = urllib2.urlopen(req)
                data = request.read()
                fj = FetchCnblog(id)
                fj.feed(data)
                result = fj.getResult()
                if len(result) < 1:
                    isExist = True
                else:
                    if flag == False:
                        flag = True
                        dic = result[0]
                        id = int(dic['id'])
                        record.update({"type":'cnblogs'},{"$set":{'maxId':id}},True,False)
                    result.reverse()
                    for doc in result:
                        fetchblog.insert(doc)
                    print "page is %d"%count
                    count += 1
    
                    time.sleep(5)
            except Exception, e:
                traceback.print_exc()
                print "parse error",e
    
    程序假设在 Linux、Mac 下运行,可在 crontab -e 中设置定时任务;假设在 Windows 下运行,则需自己在程序里加个定时器就可以


  • 相关阅读:
    IE678下,select 诡异的样式
    跟着我一步一步的搭建一个基于springcloud的微服务实例
    关于Future踩过的坑
    Apache下的SocketClient的使用
    Jaxb处理泛型,转化成xml字符串
    Linux Centos虚拟机扩容
    docker 搭建zookeeper集群和kafka集群
    sysbench 数据库性能测试工具的使用
    docker 容器技术
    自己手写实现Dubbo
  • 原文地址:https://www.cnblogs.com/mthoutai/p/6796214.html
Copyright © 2020-2023  润新知