• 正经Python汤不热爬虫


    转自:https://github.com/facert/tumblr_spider

    install

    pip install -r requirements.txt

    run

    python tumblr.py username (username 为任意一个热门博主的 username)

    snapshot

    爬取结果

    user.txt 是爬取的博主用户名结果, source.txt 是视频地址集

    原理

    根据一个热门博主的 username, 脚本会自动获取该博主转发过的文章所属的其他博主的 username,并放入爬取队列中,递归爬取。

    申明

    这是一个正经的爬虫(严肃脸),爬取的资源跟你第一个填入的 username 有很大关系,另外由于某些原因,导致 tumblr 被墙,所以最简单的方式就是用国外 vps 去跑。

    # -*- coding:utf-8 -*-
    import signal
    import sys
    import requests
    import threading
    import queue
    import time
    from bs4 import BeautifulSoup
    
    # Lock taken by Tumblr.download around its writes to the result files.
    mutex = threading.Lock()
    # Shutdown flag: set to True by the signal handler, polled by Tumblr.run.
    is_exit = False
    
    
    class Tumblr(threading.Thread):
        """Worker thread that crawls Tumblr blogs for video links and usernames.

        Each worker takes usernames from a shared queue, fetches that blog's
        front page, appends newly seen video iframe URLs to ``source.txt`` and
        newly seen reblogged usernames to ``user.txt``, and puts those
        usernames back on the queue so they get crawled in turn.
        """

        # Only iframes whose ``src`` contains this prefix count as videos.
        VIDEO_PREFIX = 'https://www.tumblr.com/video'
        # Seconds to wait for a blog page before giving up on it.
        REQUEST_TIMEOUT = 30
        # Seconds to block on an empty queue before re-checking the exit flag.
        POLL_INTERVAL = 1
        # Seconds to pause between two page fetches of the same worker.
        CRAWL_DELAY = 2

        def __init__(self, queue):
            """Create a worker bound to the shared username queue.

            Args:
                queue: a ``queue.Queue`` of usernames still to be crawled.
            """
            threading.Thread.__init__(self)
            self.user_queue = queue
            # Sets instead of lists: these are only used for membership tests.
            self.total_user = set()
            self.total_url = set()
            self.f_user = open('user.txt', 'a+', encoding='utf-8')
            self.f_source = open('source.txt', 'a+', encoding='utf-8')

        @staticmethod
        def _format_lines(items):
            """Return *items* joined one per line, with a trailing newline."""
            return '\n'.join(items) + '\n'

        def download(self, url):
            """Fetch *url* and record the new video links and usernames it holds.

            A failed request is reported and skipped so that one unreachable
            blog does not terminate the worker thread.

            Args:
                url: address of the blog page to fetch.
            """
            try:
                res = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print('request failed: %s (%s)' % (url, exc))
                return

            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(res.text, 'html.parser')

            tmp_source = []
            for frame in soup.find_all('iframe'):
                source = frame.get('src', '').strip()
                if self.VIDEO_PREFIX in source and source not in self.total_url:
                    # Remember the link, otherwise it is written again every
                    # time it shows up on a later page.
                    self.total_url.add(source)
                    tmp_source.append(source)
                    print(u'新增链接:' + source)

            tmp_user = []
            for link in soup.find_all(class_='reblog-link'):
                username = link.text.strip()
                if username and username not in self.total_user:
                    self.total_user.add(username)
                    self.user_queue.put(username)
                    tmp_user.append(username)
                    print(u'新增用户:' + username)

            # Serialize file writes across workers; flush right away because
            # daemon threads can be killed without their files being closed.
            with mutex:
                if tmp_user:
                    self.f_user.write(self._format_lines(tmp_user))
                    self.f_user.flush()
                if tmp_source:
                    self.f_source.write(self._format_lines(tmp_source))
                    self.f_source.flush()

        def run(self):
            """Crawl queued usernames until the module-level exit flag is set."""
            try:
                while not is_exit:
                    try:
                        # Wake up periodically so the exit flag is noticed
                        # even while the queue is empty.
                        user = self.user_queue.get(timeout=self.POLL_INTERVAL)
                    except queue.Empty:
                        continue
                    self.download('http://%s.tumblr.com/' % user)
                    time.sleep(self.CRAWL_DELAY)
            finally:
                self.f_user.close()
                self.f_source.close()
    
    
    def handler(signum, frame):
        """Signal callback: raise the exit flag, report it, and stop the process.

        Args:
            signum: number of the received signal.
            frame: interrupted stack frame (unused).

        Raises:
            SystemExit: always, with exit status 0.
        """
        global is_exit
        is_exit = True
        message = "receive a signal %d, is_exit = %d" % (signum, is_exit)
        print(message)
        raise SystemExit(0)
    
    
    def main():
        """Start the crawler from the username given on the command line.

        Seeds the shared queue with ``sys.argv[1]``, installs the SIGINT and
        SIGTERM handlers, starts the worker threads and then waits for as long
        as at least one worker is still alive. Prints a usage line and exits
        when no username was supplied.
        """
        if len(sys.argv) < 2:
            print('usage: python tumblr.py username')
            sys.exit()
        username = sys.argv[1]

        NUM_WORKERS = 10
        q = queue.Queue()
        # Seed the crawl with the username from the command line.
        q.put(username)

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        threads = []
        for _ in range(NUM_WORKERS):
            tumblr = Tumblr(q)
            # Daemon workers do not keep the process alive after main exits.
            tumblr.daemon = True
            tumblr.start()
            threads.append(tumblr)

        # Sleep in short steps rather than joining, so the main thread stays
        # free to run the signal handlers; stop once every worker has ended.
        while any(worker.is_alive() for worker in threads):
            time.sleep(1)
    
    
    # Start the crawler only when this file is executed as a script.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    正则表达式
    vim
    linux 6 安装 zabbix.3服务
    内核链表学习记录
    rpc-protobuff-实现
    Qedis实现
    try-catch 异常捕获学习
    协程的学习和使用
    惊群的学习研究 这人的博客还有其他干货
    互斥锁与自旋锁的区别测试代码
  • 原文地址:https://www.cnblogs.com/linmilove/p/9159825.html
Copyright © 2020-2023  润新知