• 使用 Python 抓取伯乐在线的全部文章,对标题分词后存入 MongoDB


    依赖包(以下代码为 Python 2 写法):

    1.pymongo

    2.jieba


    # -*- coding: utf-8 -*-

    """
    @author: jiangfuqiang
    """
    from HTMLParser import HTMLParser
    import urllib2
    import sys
    import pymongo
    import time
    import jieba
    import traceback

    # Python 2 only: make implicit str <-> unicode conversions use UTF-8 instead
    # of ASCII, so mixing byte strings from the page with unicode text does not
    # raise UnicodeDecodeError.
    default_encoding = 'utf-8'
    if not sys.getdefaultencoding() == default_encoding:
        # sys.setdefaultencoding is normally not exposed once the interpreter
        # has started; reloading the module makes it callable again.
        reload(sys)
        sys.setdefaultencoding(default_encoding)
    class FetchJobble(HTMLParser):
        """Collect post summaries from a Jobbole "all posts" listing page.

        Feed the page HTML through ``feed()``; every finished post is appended
        to the list returned by ``getResult()`` as a dict. The keys this parser
        writes are:

        * ``imgSrc``      - ``src`` of the thumbnail image
        * ``title``       - text found after a ``meta-title`` element
        * ``tag``         - category names joined with ","
        * ``comment``     - first space-separated token of the comment link text
        * ``desc``        - text of the ``excerpt`` span
        * ``redmoreLink`` - ``href`` of the read-more link (key spelling kept
          as-is because it is what gets stored)
        * ``keyword``     - the title segmented by jieba, joined with ","

        Keys other than ``redmoreLink`` and ``keyword`` are only present when
        the corresponding markup was seen for that post.
        """

        def __init__(self):
            HTMLParser.__init__(self)
            # State flags describing where in a post's markup the parser is.
            self.isPostThumb = False     # inside a post (seen div.post-thumb)
            self.isPostMeta = False      # never set by this class; kept for compatibility
            self.isMetaTitle = False     # next text chunk is the title
            self.isCategoryTag = False   # next text chunk is a category name
            self.isComment = False       # comment link seen for this post
            self.isexcerpt = False       # next text chunk is the excerpt
            self.isReadMore = False      # next <a href> is the read-more link
            self.isPicture = False       # unused flag; reset with the others
            self.data = {}               # post currently being assembled
            self.result = []             # finished posts

        def _finish_post(self, link):
            """Store the read-more link, derive keywords and start a new post."""
            self.data['redmoreLink'] = link
            # A post without a parsed title must not abort the whole page, so
            # fall back to an empty title instead of raising KeyError.
            self.data['keyword'] = ",".join(jieba.cut(self.data.get('title', '')))
            self.result.append(self.data)

            self.isPostThumb = False
            self.isMetaTitle = False
            self.isReadMore = False
            self.isCategoryTag = False
            self.isComment = False
            self.isexcerpt = False
            self.isPicture = False
            self.data = {}

        def handle_starttag(self, tag, attrs):
            """Update the parser state from an opening tag.

            ``attrs`` is the list of ``(name, value)`` pairs supplied by
            HTMLParser; ``value`` is ``None`` for attributes written without a
            value, so it is checked before any string method is called on it.
            """
            if tag == 'div':
                for key, value in attrs:
                    if key != 'class':
                        continue
                    if value == 'post-thumb':
                        self.isPostThumb = True
                    elif value == 'meta-title':
                        self.isMetaTitle = True
            elif tag == 'a' and self.isPostThumb:
                for key, value in attrs:
                    if self.isReadMore:
                        if key == 'href':
                            self._finish_post(value)
                            # The post is complete; the remaining attributes
                            # of this tag must not leak flags into the next one.
                            break
                    elif key == 'class':
                        if value == 'meta-title':
                            self.isMetaTitle = True
                    elif key == 'rel':
                        if value == 'category tag':
                            self.isCategoryTag = True
                    elif key == 'href':
                        # "> 0" (not ">= 0"): a bare "#respond" link is ignored.
                        if value is not None and value.find('#respond') > 0:
                            self.isComment = True
            elif tag == 'span' and self.isComment:
                for key, value in attrs:
                    if key != 'class':
                        continue
                    if value == 'excerpt':
                        self.isexcerpt = True
                    elif value == 'read-more':
                        self.isReadMore = True
            elif tag == 'img' and self.isPostThumb and not self.isPostMeta:
                for key, value in attrs:
                    if key == 'src':
                        self.data['imgSrc'] = value

        def handle_endtag(self, tag):
            """Closing tags carry no information this parser needs."""
            pass

        def handle_data(self, data):
            """Route a text chunk to the field selected by the current flags."""
            if self.isMetaTitle:
                self.data['title'] = data
                self.isMetaTitle = False
            elif self.isCategoryTag:
                # Several categories are accumulated as a comma-joined string.
                if 'tag' in self.data:
                    self.data['tag'] = self.data['tag'] + "," + data
                else:
                    self.data['tag'] = data
                self.isCategoryTag = False
            elif self.isComment and 'comment' not in self.data:
                # Only the first chunk after the comment link is used, and only
                # its first space-separated token.
                self.data['comment'] = data.split(" ")[0]
            elif self.isexcerpt:
                self.data['desc'] = data
                self.isexcerpt = False

        def getResult(self):
            """Return the list of post dicts collected so far."""
            return self.result

    if __name__ == "__main__":
        # Crawl the listing pages one by one and store every parsed post in the
        # "fetch_blog" collection of the local "blog" database. The crawl ends
        # at the first page that yields no posts.

        # pymongo.Connection was removed in PyMongo 3.0; use MongoClient when
        # the installed driver provides it and fall back for very old drivers.
        client_class = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
        con = client_class('localhost', 27017)
        db = con.blog

        fetchblog = db.fetch_blog

        url = "http://blog.jobbole.com/all-posts/page/%d"
        count = 1
        flag = False
        # Consecutive failures tolerated for one page before the crawl stops.
        # Without a bound, a persistent error would retry the same URL forever.
        max_failures = 3
        failures = 0
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        while not flag:
            try:
                req = urllib2.Request(url % count, headers=headers)
                response = urllib2.urlopen(req)
                try:
                    data = response.read()
                finally:
                    # Release the socket even if reading fails.
                    response.close()
                fj = FetchJobble()
                fj.feed(data)
                result = fj.getResult()
                if len(result) < 1:
                    flag = True
                else:
                    for doc in result:
                        fetchblog.insert(doc)
                    print("page is %d" % count)
                    count += 1
                    failures = 0

                    # Be polite to the server between pages.
                    time.sleep(5)
            except Exception as e:
                traceback.print_exc()
                print("parse error %s" % (e,))
                failures += 1
                if isinstance(e, urllib2.HTTPError) and e.code == 404:
                    # Presumably we paged past the last listing page; treat it
                    # like an empty page and stop.
                    flag = True
                elif failures >= max_failures:
                    print("giving up on page %d after %d failed attempts" % (count, failures))
                    flag = True
                else:
                    # Back off before retrying instead of hammering the server.
                    time.sleep(5)

  • 相关阅读:
    现在的技术QQ群为什么都变成了这样?效率高也是有弊端的?
    【php】php中mail()不可用,php中sendmail不能用的解决方法
    Cannot validate since no PHP executable is set. Use the setting 'php.validate.executablePath' to configure the PHP executable.无法使用PHP可执行的设置。设置php.validate。executablePath配置PHP可执行文件。
    20150907自动化测试之Appinum For Android(前篇)
    [摘]关于目标管理
    婚恋网站应该有视频功能
    GIS的双屏显示模式是一个实用的创新
    移动产品将越分越细
    基于开源GIS软件的电子政务地理信息应用解决方案
    手机长途话费应再降!
  • 原文地址:https://www.cnblogs.com/lcchuguo/p/4008352.html
Copyright © 2020-2023  润新知