乌云漏洞爬虫的数据库版本（MySQL）

特别鸣谢阮思绮同学！虽然感觉这个冷冷的博客也没人看23333

import mysql.connector
import sys, os
import urllib.request
import re
import itertools
# MySQL connection settings passed to mysql.connector.connect() below.
user = 'root'
pwd  = ''
host = '127.0.0.1'
db   = 'test'
# NOTE(review): data_file is not referenced anywhere in the visible code.
data_file = 'wooyun.dat'
# Table with an auto-increment id plus four text columns (type, info, detail, repair).
create_table_sql = "CREATE TABLE IF NOT EXISTS mytable (id int(10) AUTO_INCREMENT PRIMARY KEY, type varchar(300) , info varchar(1000) , detail varchar(5000) , repair varchar(1000) )CHARACTER SET utf8"
# Parameterized insert: the four values are supplied separately at execute() time.
insert_sql = "INSERT INTO mytable (type, info, detail, repair) VALUES ( %s, %s, %s, %s)"
# Reads back every column of every stored row.
select_sql = "SELECT id, type, info, detail, repair FROM mytable"
# Module-level connection and cursor; the *_sql_api functions use this cursor directly.
cnx = mysql.connector.connect(user=user, password=pwd, host=host, database=db)
cursor = cnx.cursor()

def create_table_sql_api(a):
    """Execute the CREATE TABLE statement ``a`` on the module-level cursor.

    Args:
        a: SQL text of the statement to run.

    Raises:
        SystemExit: with status 1 when MySQL reports an error, after the
            error message has been printed.
    """
    try:
        cursor.execute(a)
    except mysql.connector.Error as err:
        print("create table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        # A bare sys.exit() would report success (status 0) to the caller's
        # shell even though the table could not be created.
        sys.exit(1)

def insert_sql_api(a,b):
    """Execute the parameterized INSERT statement ``a`` with the values ``b``.

    Args:
        a: SQL text containing ``%s`` placeholders.
        b: sequence of values bound to the placeholders by the driver.

    Raises:
        SystemExit: with status 1 when MySQL reports an error, after the
            error message has been printed.
    """
    try:
        cursor.execute(a, b)
    except mysql.connector.Error as err:
        print("insert table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        # Exit with a non-zero status so the failure is visible to the shell;
        # a bare sys.exit() exits with status 0.
        sys.exit(1)

def select_sql_api(a):
    """Run the SELECT statement ``a`` and print every returned row.

    Each row is expected to have five columns in the order
    (id, type, info, detail, repair), matching ``select_sql``.

    Args:
        a: SQL text of the query to run.

    Raises:
        SystemExit: with status 1 when MySQL reports an error, after the
            error message has been printed.
    """
    try:
        cursor.execute(a)
        # Local names avoid shadowing the builtins ``id`` and ``type``.
        for (row_id, vuln_type, info, detail, repair) in cursor:
            # The original format string had four placeholders for five
            # values, so ``detail`` was printed under the "repair" label and
            # ``repair`` was silently dropped.
            print("ID:{}  type:{}  info:{}  detail:{}  repair:{}".format(
                row_id, vuln_type, info, detail, repair))
    except mysql.connector.Error as err:
        print("query table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        # Non-zero status: a bare sys.exit() would signal success.
        sys.exit(1)

def get_html_response(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Args:
        url: address to open with ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: if the resource cannot be opened.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    # The context manager closes the response (and its socket) even when
    # read() or decode() raises; the original never closed it.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

def geturl(starturl):
    """Return every vulnerability-report path found on a listing page.

    Args:
        starturl: URL of a search-result (listing) page.

    Returns:
        A list of path strings shaped like ``/bugs/wooyun-<word>-<word>``,
        in page order and possibly containing duplicates.
    """
    page = get_html_response(starturl)
    # ``\w*`` is a word-character class. The original pattern had lost its
    # backslashes ("w*"), which only matches runs of the literal letter "w"
    # and therefore never matched a real report path.
    return re.findall(r'/bugs/wooyun-\w*-\w*', page)

def get_nextpage(starturl):
    """Return the relative links to the result pages referenced by a search page.

    Args:
        starturl: URL of the search page to scan.

    Returns:
        A list of strings shaped like ``searchbug.php?q=6YeR6J6N&pNO=<page>``,
        in page order and possibly containing duplicates.
    """
    page = get_html_response(starturl)
    # "." and "?" are escaped so they match literally: unescaped, "php?" means
    # "ph" plus an optional "p" and the "?" in the link is never consumed, so
    # nothing matched. "\w+" (instead of a single word character) keeps page
    # numbers with more than one digit intact.
    return re.findall(r'searchbug\.php\?q=6YeR6J6N&pNO=\w+', page)

# Search page from which the paginated listing links are discovered.
starturl = "http://www.wooyun.org/searchbug.php?q=6YeR6J6N"

# Scratch buffers used while assembling one database row per report page.
final = []
type_wooyun_n, info_n, detail_n, repair_n = [], [], [], []
# CSV output is disabled: output=open("D:\wooyun.csv","w+")

create_table_sql_api(create_table_sql)

# Collect the report paths from every listing page; using a set drops
# paths that appear on more than one page.
result = set()
for listing in get_nextpage(starturl):
    result.update(geturl('http://wooyun.org/' + re.sub('金融', '6YeR6J6N', listing)))

# Ordered (pattern, replacement) pairs applied to each raw regex match.
# The "\s", "\w" and "\t" escapes are restored here: without the backslash
# they only match the literal letters "s"/"w" and the patterns never fire.
_TYPE_SUBS = (
    (r'：\s', ':'),
    (r'\t', ''),
    (r'</h3>', ''),
)
_INFO_SUBS = (
    (r'：\s', ':'),
    (r'<h3>', ''),
    (r'</h3>', ''),
    (r'<a\shref=".*.">', ''),
    (r'</a>', ''),
    (r'<img\sheight=".*./>', ''),
)
_DETAIL_SUBS = (
    (r'：\s', ':'),
    (r'<p\sclass="detail">', ''),
    (r'</p>', ''),
    (r'"\starget="_blank"><img\ssrc="/upload/.*.width="600"/></a>', ','),
    (r'<a href="', ' http://www.wooyun.org'),
    (r'对本漏洞信息进行评价，.*.备学习价值', ''),
)
_REPAIR_SUBS = (
    (r'</br>', ','),
    (r'</p>', ','),
    (r'</h3>', ','),
    (r'<p\sclass="detail">', ''),
    (r'：', ':'),
)


def _clean(fragments, substitutions, drop_whitespace=False):
    """Apply ``substitutions`` in order to each fragment and concatenate the results.

    Args:
        fragments: iterable of raw strings returned by ``re.findall``.
        substitutions: ordered (pattern, replacement) pairs for ``re.sub``.
        drop_whitespace: when true, remove all whitespace from each cleaned
            fragment (equivalent to splitting on whitespace and rejoining).

    Returns:
        One string; empty when ``fragments`` is empty.
    """
    cleaned = []
    for text in fragments:
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        if drop_whitespace:
            text = "".join(text.split())
        cleaned.append(text)
    return "".join(cleaned)


# Download every collected report page, extract the four text fields and
# store them as one database row.
for i in result:
    k = get_html_response('http://wooyun.org/' + re.sub('金融', '%E9%87%91%E8%9E%8D', i))

    # Vulnerability type; kept for later categorisation in the database.
    type_wooyun_str = _clean(re.findall(r'漏洞类型：.*.</h3>', k), _TYPE_SUBS)
    # Summary lines: every "<h3>label：text</h3>" heading on the page.
    info_str = _clean(re.findall(r'<h3>\w*：.*.</h3>', k), _INFO_SUBS,
                      drop_whitespace=True)
    # Detail paragraphs.
    detail_str = _clean(re.findall(r'<p class="detail">.*.</p>', k), _DETAIL_SUBS)
    # Suggested fix: the paragraph that follows the "修复方案" heading.
    repair_str = _clean(
        re.findall(r'修复方案：</h3>\s*<p class="detail">.*.\s*</p>', k),
        _REPAIR_SUBS, drop_whitespace=True)

    insert_sql_api(insert_sql, (type_wooyun_str, info_str, detail_str, repair_str))
    # Prints the whole table after each insert, as the original script did.
    select_sql_api(select_sql)

    
    

# Commit the rows executed on the cursor, then release the cursor and the connection.
cnx.commit()
cursor.close()
cnx.close()
#output.close()

因为弱小，所以要变强，因为不想灭亡，所以选择战斗

相关阅读:
Minimum Cost POJ
SPFA费用流模板
 ISAP模板
 822D My pretty girl Noora
822C Hacker, pack your bags!
Dinic模板
 extjs最普通的grid
springmvc配置首页的方式
 JSTL select和checkbox的用法
 请教<context:component-scan/>和<mvc:annotation-driven/>的区别20
原文地址：https://www.cnblogs.com/cmjason/p/3918978.html

最新文章
小机房的树
 商务旅行
 最优贸易
 字串变换
 前缀和系列题目
 食物链
 货车运输
 白洋淀
 游泳
 美泉世界

热门文章
工具到位
 乔迁之喜
 团建（四）
团建（三）
团建（二）
团建（一）
A
Matrix POJ
UVa1451 数形结合
 UVa1471 LIS变种 nlgn