• 多线程抓取邮箱


    # -*- coding: utf-8 -*-
    """
    @author: Dell Created on Sun Dec 29 17:26:43 2019
    """
    import re
    import time
    import queue
    import threading
    import requests
    
    
    def getpagesource(url, timeout=10):
        """Download a page and return its body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang a crawler thread forever.

        Returns:
            The response body decoded as UTF-8 (undecodable bytes are
            replaced rather than discarding the whole page), or "" when the
            request fails or the status code is not 200.
        """
        try:
            resp = requests.get(url, timeout=timeout)
        except (requests.RequestException, ValueError):
            # Network errors and malformed URLs are expected while crawling
            # arbitrary links; treat them as "no content".
            return ""
        if resp.status_code != 200:
            return ""
        return resp.content.decode("utf-8", errors="replace")
    
    def getemaillist(page_source):
        """Extract every e-mail address found in a page's source text.

        Args:
            page_source: Text to scan. Anything that is not a string is
                treated as an empty page.

        Returns:
            A list of the matched addresses in order of appearance
            (duplicates are kept); an empty list when nothing matches.
        """
        if not isinstance(page_source, str):
            return []
        # The domain is one or more dot-separated labels.  The dot is escaped
        # so that "foo@bar" is not accepted, and a label cannot end with a
        # dot so that sentence punctuation ("a@b.com.") is not captured.
        pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"
        return re.findall(pattern, page_source)
    
    def gethostname(url):
        """Return the scheme-plus-host prefix of a URL.

        Args:
            url: Text containing an http:// or https:// URL.

        Returns:
            The first "scheme://host" found in ``url`` (for example
            "http://bbs.tianya.cn"), without a trailing slash, or "" when
            ``url`` is not a string or contains no such prefix.
        """
        if not isinstance(url, str):
            return ""
        # The host ends at the first path, query or fragment delimiter, or at
        # whitespace; a URL with no path at all is matched up to its end.
        match = re.search(r"https?://[^/?#\s]+", url, re.IGNORECASE)
        return match.group(0) if match else ""
    
    def getabsurl(page_source):
        """Find every absolute http/https link in a page's source text.

        A link is recognised as "http://" or "https://" followed by
        non-whitespace characters up to (not including) the first
        terminator: a double quote, ``|``, ``>`` or ``)``.

        Args:
            page_source: Text to scan. Anything that is not a string is
                treated as an empty page.

        Returns:
            A list of the matched URLs in order of appearance (duplicates
            are kept); an empty list when nothing matches.
        """
        if not isinstance(page_source, str):
            return []
        # Single-quoted raw string so the double-quote terminator needs no
        # escaping inside the character class.
        pattern = r'(https?://\S*?)["|>)]'
        return re.findall(pattern, page_source, re.IGNORECASE)
    
    def getrelurl(url, page_source):
        """Collect a page's relative links and resolve them against a host.

        Args:
            url: A URL whose "scheme://host" prefix (as extracted by
                ``gethostname``) is used as the base for the relative links.
            page_source: Text to scan for ``href="..."`` attributes. Anything
                that is not a string is treated as an empty page.

        Returns:
            A list of links. Every ``href`` value that contains "http://",
            "https://", "javascript" or "#" is dropped; the rest are
            prefixed with the host. When no host can be extracted from
            ``url`` the relative links are returned unchanged.
        """
        if not isinstance(page_source, str):
            return []

        hrefs = re.findall(r'href="(.*?)"', page_source, re.IGNORECASE)
        # Absolute links, script pseudo-links and in-page anchors are not
        # relative paths, so they are filtered out.
        skip_markers = ("http://", "https://", "javascript", "#")
        relative = [
            href for href in hrefs
            if not any(marker in href for marker in skip_markers)
        ]

        hostname = gethostname(url)
        if not hostname:
            # gethostname signals failure with "", not None.
            return relative

        return [
            hostname + href if href.startswith("/") else hostname + "/" + href
            for href in relative
        ]
    
    def getallurl(page_source):
        """Gather every hyperlink found in a page's source.

        Absolute links come from ``getabsurl``. When at least one absolute
        link exists, the first one is handed to ``getrelurl`` as the base
        for the page's relative links.

        Returns:
            A new list holding the absolute links followed by the resolved
            relative links.
        """
        absolute_links = getabsurl(page_source)

        relative_links = []
        if len(absolute_links) > 0:
            relative_links = getrelurl(absolute_links[0], page_source)

        return [*absolute_links, *relative_links]
    
    def saveemail():
        """Append queued e-mail addresses to ``mail.txt`` every five seconds.

        Reads the module-level ``email_queue``. The function loops forever
        and never returns, so it is meant to run in its own thread. Each
        address is written on its own line, UTF-8 encoded.
        """
        while True:
            time.sleep(5)
            if email_queue.empty():
                continue
            # Open the file per batch so the handle is always closed (and
            # the data flushed) even though this loop never terminates.
            with open("mail.txt", "ab") as file:
                while not email_queue.empty():
                    email = email_queue.get()
                    file.write((email + "\n").encode("utf-8", "ignore"))
    
    def BFS(url, email_queue, url_queue):
        """Process one URL from the queue: harvest its e-mails and links.

        Args:
            url: Ignored; the URL to crawl is always taken from
                ``url_queue``. Kept so existing callers keep working.
            email_queue: Queue that receives every e-mail address found.
            url_queue: Queue of URLs still to crawl; one URL is taken from
                it and every link found on that page is put back on it.

        Returns:
            None. Returns immediately when ``url_queue`` is empty or the
            page could not be downloaded.
        """
        try:
            # Non-blocking: more workers may be started than there are URLs,
            # and a blocking get() would park those threads forever.
            url = url_queue.get_nowait()
        except queue.Empty:
            return

        page_source = getpagesource(url)
        if not page_source:
            return

        for email in getemaillist(page_source):
            email_queue.put(email)
            print(email)

        for myurl in getallurl(page_source):
            url_queue.put(myurl)
    
    
    def executeBFS(url, email_queue, url_queue):
        """Seed the crawl with ``url`` and keep dispatching worker threads.

        Each worker runs ``BFS`` once. The module-level semaphore ``sem``
        caps how many workers run at the same time: a slot is acquired
        before a worker starts and released when it finishes.

        Args:
            url: Start URL; it is put on ``url_queue`` before crawling begins.
            email_queue: Queue handed to every worker for found addresses.
            url_queue: Queue of URLs still to crawl.

        This function loops forever and never returns.
        """
        url_queue.put(url)

        def _worker():
            # Release the slot even if BFS raises, so a failing page cannot
            # permanently shrink the worker pool.
            try:
                BFS(url, email_queue, url_queue)
            finally:
                sem.release()

        while True:
            if url_queue.empty():
                # Wait briefly instead of spinning at full CPU while the
                # running workers discover more links.
                time.sleep(0.1)
                continue
            sem.acquire()
            threading.Thread(target=_worker).start()
    
    
    if __name__ == "__main__":
        # Queues shared by the crawler: URLs waiting to be fetched and
        # e-mail addresses waiting to be written to disk.
        url_queue = queue.Queue()
        email_queue = queue.Queue()

        # Semaphore with 20 slots, read as a global by executeBFS.
        sem = threading.Semaphore(20)

        # After a 5 second delay, run saveemail in a separate thread.
        timerthd = threading.Timer(5, saveemail)
        timerthd.start()

        # Seed page for the crawl.
        url = "http://bbs.tianya.cn/post-140-393974-1.shtml"
        executeBFS(url, email_queue, url_queue)
    
  • 相关阅读:
    Gradle Gretty进行runAppDebug的Listening for transport dt_socket at address: 5005 的后续配置
    Oracle :value too large for column "SCHEMA"."TABLE"."COLUMN" (actual: 519, maximum: 500)的解决方案
    js file对象 文件大小转换可视容易阅读的单位
    JS的Event各种属性级target/currentTarget/relatedTarget各种目录的解释
    浏览器控制台是否打开的一些措施的讨论
    eclipse启动指定jvm的版本
    IDEA terminal无法从vim的编辑模式转换为命令模式
    win7 64位系统在IronPython2.7 rc安装后运行出现"ipy64/ipy.exe"does not exist解决办法
    VS2010 插件 CSS3 IS 2.1.1 在win7 64位机子上安装小记
    Asp.net ajax 1.0 绑定drowdownlist时取值问题
  • 原文地址:https://www.cnblogs.com/zxfei/p/12116726.html
Copyright © 2020-2023  润新知