• Python 爬取SeeBug poc


     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 # @Date    : 2017-08-24 21:42:52
     4 # @Author  : EnderZhou (zptxwd@gmail.com)
     5 # @Link    : http://www.cnblogs.com/enderzhou/
     6 # @Version : $Id$
     7 
     8 import requests
     9 from bs4 import BeautifulSoup as bs
    10 import threading
    11 import Queue
    12 import urllib
    13 # import os
    14 import time
    15 
    # Listing of vulnerabilities that have a PoC (has_poc=true); the page number
    # is appended to this prefix.
    main_url = 'https://www.seebug.org/vuldb/vulnerabilities?category=&order_time=1&order_rank=1&has_all=default&has_vm=default&submitTime=all&has_affect=default&has_poc=true&has_detail=default&level=all&page='

    # Prefix of a single vulnerability URL; an SSV id such as 96358 is appended.
    # NOTE(review): not referenced by the code visible in this file.
    bug_url = 'https://www.seebug.org/vuldb/ssvid-'

    # Prefix of the PoC download URL; an SSV id such as 96358 is appended.
    dl_url = 'https://www.seebug.org/vuldb/downloadPoc/'


    # SeeBug's anti-crawling protection is fairly strict: if replacing only the
    # cookie does not work, replace the whole header set with the one your own
    # browser sends.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # 'br' is deliberately not advertised: requests/urllib3 only decode
        # Brotli when an optional brotli package is installed, otherwise the
        # body would arrive as undecoded bytes.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
        'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0.1',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Placeholder: replace with the cookie of your own logged-in session.
        'Cookie': '请填写自己的cookie',
        'Referer': 'https://www.seebug.org/vuldb/vulnerabilities',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
    }
    36 
    class SeeBugPoc(threading.Thread):
        """Worker thread that downloads the PoC files linked from listing pages.

        Every item taken from the queue is the URL of one listing page. For each
        ``<a class="vul-title">`` link found on that page the matching PoC is
        fetched from ``dl_url`` and saved as ``<name>.txt`` in the current
        working directory.
        """

        def __init__(self, queue):
            """Remember the shared work queue.

            :param queue: queue of listing-page URLs; ``get_nowait()`` must raise
                ``Queue.Empty`` once it has been drained.
            """
            threading.Thread.__init__(self)
            self._queue = queue

        def run(self):
            """Process listing-page URLs until the queue is drained."""
            while True:
                try:
                    url = self._queue.get_nowait()
                except Queue.Empty:
                    # With several workers another thread can take the last
                    # item between an empty() check and the get, so the
                    # exception is the only reliable "queue drained" signal.
                    break
                self.spidet(url)

        def spidet(self, url):
            """Download the PoC of every vulnerability linked on one listing page.

            :param url: URL of the listing page to fetch.
            """
            r = requests.get(url=url, headers=headers)
            soup = bs(r.content, 'html.parser')
            vullist = soup.find_all(name='a', attrs={'class': 'vul-title'})
            for u in vullist:
                href = u['href']
                # Last path segment of the link is used as the file name; the
                # text after the last '-' is appended to the download URL.
                name = href.split('/')[-1]
                l = dl_url + href.split('-')[-1]
                print(name + '----' + l)
                d = requests.get(url=l, headers=headers)
                # d.content is raw bytes: write in binary mode so nothing is
                # newline-translated, and let `with` close the file (the
                # original `f.close` without parentheses never closed it).
                with open(name + '.txt', 'wb') as f:
                    f.write(d.content)
                # SeeBug's anti-crawling protection is fairly strict, hence the
                # 7.5 second pause between downloads. Proxy support may be
                # added later.
                time.sleep(7.5)
    61 
    def main(first_page=1, last_page=2184, thread_count=1):
        """Queue the listing pages and run the download workers to completion.

        The defaults reproduce the original hard-coded behaviour: pages 1
        through 2184 handled by a single worker thread.

        :param first_page: number of the first listing page to crawl.
        :param last_page: number of the last listing page to crawl (inclusive).
        :param thread_count: how many ``SeeBugPoc`` worker threads to start.
        """
        queue = Queue.Queue()
        for page in range(first_page, last_page + 1):
            queue.put(main_url + str(page))

        threads = [SeeBugPoc(queue) for _ in range(thread_count)]

        for t in threads:
            t.start()

        # Block until every worker has drained the queue and finished.
        for t in threads:
            t.join()


    if __name__ == '__main__':
        main()
  • 相关阅读:
    Django基础篇
    转 枚举设备栈
    转 Windows串口过滤驱动程序的开发
    VS2010 + WinDDK 搭建驱动开发环境(转)
    cef 下载地址
    electron入门教程
    转:PHP 生成复杂JSON格式 简单快速方法
    CEF General Usage(CEF3预览)
    转:关于使用ImageMagick和Tesseract进行简单数字图像识别
    转 Tesseract-OCR 字符识别---样本训练
  • 原文地址:https://www.cnblogs.com/enderzhou/p/7444705.html
Copyright © 2020-2023  润新知