• How to crawl Zhihu topics with a Python crawler?


    I built this because Guandian (观点, guandn.com) has "rooms" that work much like Zhihu's topics, so I needed a way to crawl them. After half a day of tinkering I finally got it working reliably. The code is written in Python (2.x); if you don't know Python, please go learn the basics first. If you do, just read the code below; it definitely works.

     

    #coding:utf-8
    """
    @author:haoning
    @create time:2015.8.5
    """
    from __future__ import division  # true division
    from Queue import Queue
    import json
    import os
    import re
    import platform
    import uuid
    import urllib
    import urllib2
    import sys
    import time
    import MySQLdb as mdb
    from bs4 import BeautifulSoup
    
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    headers = {
       'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
       'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
       'X-Requested-With':'XMLHttpRequest',
       'Referer':'https://www.zhihu.com/topics',
       'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
    }
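    # NOTE: the Cookie above is the author's own session and will have expired;
    # replace it with a fresh cookie from your own logged-in browser session.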
    
    DB_HOST = '127.0.0.1'
    DB_USER = 'root'
    DB_PASS = 'root'
    
    queue = Queue()  # work queue
    nodeSet=set()
    keywordSet=set()
    stop=0
    offset=-20
    level=0
    maxLevel=7
    counter=0
    base=""
    
    conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
    conn.autocommit(False)
    curr = conn.cursor()
    
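    # Fetch a URL with a 3-second timeout; returns None on any failure.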
    def get_html(url):
        try:
            req = urllib2.Request(url)
            response = urllib2.urlopen(req,None,3)  # a proxy should be added here
            html = response.read()
            return html
        except:
            pass
        return None
    
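    # One-off seeder: scrape the category list from /topics into classify_new.
    # It is not called from main(); run it once by hand to seed the DB.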
    def getTopics():
        url = 'https://www.zhihu.com/topics'
        print url
        try:
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)  # a proxy should be added here
            html = response.read().decode('utf-8')
            print html
            soup = BeautifulSoup(html)
            lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})
            
            for li in lis:
                data_id=li.get('data-id')
                name=li.text
            curr.execute('select id from classify_new where name=%s',(name,))
                y= curr.fetchone()
                if not y:
                    curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))
            conn.commit()
        except Exception as e:
            print "get topic error",e
            
    
    def get_extension(name):  
        where=name.rfind('.')
        if where!=-1:
            return name[where:len(name)]
        return None
    
    
    def which_platform():
        sys_str = platform.system()
        return sys_str
    
    def GetDateString():
        when=time.strftime('%Y-%m-%d',time.localtime(time.time()))
        foldername = str(when)
        return foldername 
    
    def makeDateFolder(par,classify):
        try:
            if os.path.isdir(par):
                newFolderName=par + '//' + GetDateString() + '//'  +str(classify)
                if which_platform()=="Linux":
                    newFolderName=par + '/' + GetDateString() + "/" +str(classify)
                if not os.path.isdir( newFolderName ):
                    os.makedirs( newFolderName )
                return newFolderName
            else:
                return None 
        except Exception,e:
            print "kk",e
        return None 
    
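    # Download a topic's avatar into a dated folder and return the site-relative path
    # for the rooms table; returns True for a known placeholder image, None on failure.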
    def download_img(url,classify):
        try:
            extention=get_extension(url)
            if(extention is None):
                return None
            req = urllib2.Request(url)
            resp = urllib2.urlopen(req,None,3)
            dataimg=resp.read()
            name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
            top="E://topic_pic"
            folder=makeDateFolder(top, classify)
            filename=None
            if folder is not None:
                filename  =folder+"//"+name
            try:
                if "e82bab09c_m" in str(url):
                    return True
                if not os.path.exists(filename):
                    file_object = open(filename,'w+b')
                    file_object.write(dataimg)
                    file_object.close()
                    return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name
                else:
                    print "file exist"
                    return None
            except IOError,e1:
                print "e1=",e1
                pass
        except Exception as e:
            print "eee",e
            pass
        return None  # if the download failed, the site falls back to the original image URL
    
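    # Fetch a topic's /hot page and enqueue any child topics not yet in the DB.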
    def getChildren(node,name):
        global queue,nodeSet
        try:
            url="https://www.zhihu.com/topic/"+str(node)+"/hot"
            html=get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            p_ch='父话题'
            node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
            topic_cla=soup.find('div', {'class' : 'child-topic'})
            if topic_cla is not None:
                try:
                    p_ch=str(topic_cla.text)
                    aList = soup.find_all('a', {'class' : 'zm-item-tag'})  # all child topic links
                    if u'子话题' in p_ch:
                        for a in aList:
                            token=a.get('data-token')
                            a=str(a).replace('\n','').replace('\t','').replace('\r','')
                            start=str(a).find('>')
                            end=str(a).rfind('</a>')
                            new_node=str(str(a)[start+1:end])
                            curr.execute('select id from rooms where name=%s',(new_node,))  # names must be unique
                            y= curr.fetchone()
                            if not y:
                                print "y=",y,"new_node=",new_node,"token=",token
                                queue.put((token,new_node,node_name))
                except Exception as e:
                    print "add queue error",e
        except Exception as e:
            print "get html error",e
            
        
    
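    # Scrape one topic page and, if its name is new, insert it into the rooms table.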
    def getContent(n,name,p,top_id):
        try:
            global counter
            curr.execute('select id from rooms where name=%s',(name,))  # names must be unique
            y= curr.fetchone()
            print "exist?? ",y,"n=",n
            if not y:
                url="https://www.zhihu.com/topic/"+str(n)+"/hot"
                html=get_html(url)
                if html is None:
                    return
                soup = BeautifulSoup(html)
                title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
                pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')
                description=soup.find('div',{'class':'zm-editable-content'})
                if description is not None:
                    description=description.text
                    
                if (u"未归类" in title or u"根话题" in title): #允许入库,避免死循环
                    description=None
                    
                tag_path=download_img(pic_path,top_id)
                print "tag_path=",tag_path
                if tag_path is not None:
                    if tag_path is True:  # placeholder image: keep the original link
                        tag_path=None
                    father_id=2  # default parent: 杂谈 (misc chat)
                    curr.execute('select id from rooms where name=%s',(p,))
                    results = curr.fetchall()
                    for r in results:
                        father_id=r[0]
                    name=title
                    curr.execute('select id from rooms where name=%s',(name,))  # names must be unique
                    y= curr.fetchone()
                    print "store see..",y
                    if not y:
                        friends_num=0
                        temp = time.time()
                        x = time.localtime(float(temp))
                        create_time = time.strftime("%Y-%m-%d %H:%M:%S",x)  # current time
                        creater_id=None
                        room_avatar=tag_path
                        is_pass=1
                        has_index=0
                        reason_id=None  
                        #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
                        # content that qualifies for insertion
                        counter=counter+1
                        curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))
                        conn.commit()  # commit right away, otherwise later parent lookups miss the new row
                        if counter % 200==0:
                            print "current node",name,"num",counter
        except Exception as e:
            print "get content error",e       
    
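    # Legacy entry point reading seeds from the old classify table; main() below
    # uses new_work() instead.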
    def work():
        global queue
        curr.execute('select id,node,parent,name from classify where status=1')
        results = curr.fetchall()
        for r in results:
            top_id=r[0]
            node=r[1]
            parent=r[2]
            name=r[3]
            try:
                queue.put((node,name,parent))  # enqueue the seed first
                while queue.qsize() >0:
                    n,na,p=queue.get()  # dequeue the head node
                    getContent(n,na,p,top_id)
                    getChildren(n,na)  # enqueue children of the dequeued node
                conn.commit()
            except Exception as e:
                print "what's wrong",e  
                
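    # Current entry point: walk every category row in classify_new_copy and crawl it.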
    def new_work():
        global queue
        curr.execute('select id,data_id,name from classify_new_copy where status=1')
        results = curr.fetchall()
        for r in results:
            top_id=r[0]
            data_id=r[1]
            name=r[2]
            try:
                get_topis(data_id,name,top_id)
            except:
                pass
    
    
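    # Page through Zhihu's TopicsPlazzaListV2 AJAX endpoint (20 topics per request)
    # and breadth-first crawl each topic it returns.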
    def get_topis(data_id,name,top_id):
        global queue
        url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
        isGet = True
        offset = -20
        data_id=str(data_id)
        while isGet:
            offset = offset + 20
            values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}
            try:
                msg=None
                try:
                    data = urllib.urlencode(values)
                    request = urllib2.Request(url,data,headers)
                    response = urllib2.urlopen(request,None,5)
                    html=response.read().decode('utf-8')
                    json_str = json.loads(html)
                    ms=json_str['msg']
                    if len(ms) <5:
                        break
                    msg=ms[0]
                except Exception as e:
                    print "eeeee",e
                #print msg
                if msg is not None:
                    soup = BeautifulSoup(str(msg))
                    blks = soup.find_all('div', {'class' : 'blk'})
                    for blk in blks:
                        page=blk.find('a').get('href')
                        if page is not None:
                            node=page.replace("/topic/","") #将更多的种子入库
                            parent=name
                            ne=blk.find('strong').text
                            try:
                                queue.put((node,ne,parent))  # enqueue the seed first
                                while queue.qsize() >0:
                                    n,name,p=queue.get()  # dequeue the head node
                                    size=queue.qsize()
                                    if size > 0:
                                        print size
                                    getContent(n,name,p,top_id)
                                    getChildren(n,name)  # enqueue children of the dequeued node
                                conn.commit()
                            except Exception as e:
                                print "what's wrong",e  
            except urllib2.URLError, e:
                print "error is",e
                pass 
                
            
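    # Repeat the sweep many times so topics skipped by transient errors get
    # another chance on a later pass.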
    if __name__ == '__main__':
        i=0
        while i<400:
            new_work()
            i=i+1
    

      

     

    A quick note on the database: I won't attach a schema file here. Just look at the fields used in the code and create the tables yourself; it really is that simple. I used MySQL, but build whatever fits your own needs.
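    For reference, here is a minimal sketch of the tables the script touches (classify_new, classify_new_copy, rooms, and the legacy classify), reconstructed from the INSERT and SELECT statements in the code above. The column types, lengths, and defaults are my assumptions, not the author's original schema, so adjust them to your needs.

    #coding:utf-8
    # Hypothetical schema inferred from the crawler's SQL statements; adjust as needed.
    import MySQLdb as mdb

    conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
    curr = conn.cursor()

    # filled by getTopics()
    curr.execute("""CREATE TABLE IF NOT EXISTS classify_new (
        id INT AUTO_INCREMENT PRIMARY KEY,
        data_id VARCHAR(32),
        name VARCHAR(255))""")

    # read by new_work(); a copy of classify_new plus a status flag
    curr.execute("""CREATE TABLE IF NOT EXISTS classify_new_copy (
        id INT AUTO_INCREMENT PRIMARY KEY,
        data_id VARCHAR(32),
        name VARCHAR(255),
        status TINYINT DEFAULT 1)""")

    # filled by getContent()
    curr.execute("""CREATE TABLE IF NOT EXISTS rooms (
        id INT AUTO_INCREMENT PRIMARY KEY,
        father_id INT,
        name VARCHAR(255),
        friends_num INT,
        description TEXT,
        create_time DATETIME,
        creater_id INT,
        room_avatar VARCHAR(255),
        is_pass TINYINT,
        has_index TINYINT,
        reason_id INT)""")

    # only needed for the legacy work() entry point
    curr.execute("""CREATE TABLE IF NOT EXISTS classify (
        id INT AUTO_INCREMENT PRIMARY KEY,
        node VARCHAR(32),
        parent VARCHAR(255),
        name VARCHAR(255),
        status TINYINT DEFAULT 1)""")
    conn.commit()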

    If anything is unclear, come find me through Quzhuanpan (去转盘网), another site I built. The current QQ group number is always posted there; I'm not leaving a QQ number here in case the system bans me for it.

• Original post: https://www.cnblogs.com/huangxie/p/8206460.html