Python登录人人网并抓取新鲜事

Python登录人人网并抓取新鲜事

from sgmllib import SGMLParser

import sys,urllib2,urllib,cookielib

class spider(SGMLParser):

    def __init__(self,email,password):

        SGMLParser.__init__(self)

        self.h3=False

        self.h3_is_ready=False

        self.div=False

        self.h3_and_div=False

        self.a=False

        self.depth=0

        self.names=""

        self.dic={}



        self.email=email

        self.password=password

        self.domain='renren.com'

        try:

            cookie=cookielib.CookieJar()

            cookieProc=urllib2.HTTPCookieProcessor(cookie)

        except:

            raise

        else:

            opener=urllib2.build_opener(cookieProc)

            urllib2.install_opener(opener)

    def login(self):

        url='http://www.renren.com/PLogin.do'

        postdata={

                  'email':self.email,

                  'password':self.password,

                  'domain':self.domain

                  }

        req=urllib2.Request(

                            url,

                            urllib.urlencode(postdata)

                            )



        self.file=urllib2.urlopen(req).read()

        #print self.file

    def start_h3(self,attrs):

        self.h3 = True

    def end_h3(self):

        self.h3=False

        self.h3_is_ready=True



    def start_a(self,attrs):

        if self.h3 or self.div:

            self.a=True

    def end_a(self):

        self.a=False



    def start_div(self,attrs):

        if self.h3_is_ready == False:

            return

        if self.div==True:

            self.depth += 1



        for k,v in attrs:

            if k == 'class' and v == 'content':

                self.div=True;

                self.h3_and_div=True   #h3 and div is connected

    def end_div(self):

        if self.depth == 0:

            self.div=False

            self.h3_and_div=False

            self.h3_is_ready=False

            self.names=""

        if self.div == True:

            self.depth-=1

    def handle_data(self,text):

        #record the name

        if self.h3 and self.a:

            self.names+=text

        #record says

        if self.h3 and (self.a==False):

            if not text:pass

            else: self.dic.setdefault(self.names,[]).append(text)

            return

        if self.h3_and_div:

            self.dic.setdefault(self.names,[]).append(text)



    def show(self):

        type = sys.getfilesystemencoding()

        for key in self.dic:

            print ( (''.join(key)).replace(' ','')).decode('utf-8').encode(type),

                  ( (''.join(self.dic[key])).replace(' ','')).decode('utf-8').encode(type)

# Demo driver: log in, parse the returned page, print what was collected.
# Replace the placeholder credentials before running.
renrenspider = spider('your email', 'your password')
renrenspider.login()
renrenspider.feed(renrenspider.file)
# feed() may leave trailing markup buffered; close() forces it to be
# processed as if followed by end-of-file, so nothing is lost before show().
renrenspider.close()
renrenspider.show()
相关阅读:
Python----定义
 [转载]Morris Traversal方法遍历二叉树（非递归，不用栈，O(1)空间）
彻底明白IP地址——计算相关地址
 [转载] 教你如何迅速秒杀掉：99%的海量数据处理面试题
 [转载]从B 树、B+ 树、B* 树谈到R 树
 [转载]Java抽象类和接口的学习
 [转载]字符串匹配的Boyer-Moore算法
 [转载]字符串匹配的KMP算法
 [转载]孤儿进程与僵尸进程[总结]
[转载]Huffman编码压缩算法
原文地址：https://www.cnblogs.com/hd-zg/p/4932844.html