import mysql.connector import sys, os import urllib.request import re import itertools import base64 search_item='金融'#搜索项改这个就可以了 #以后只需要修改search_item就可以了 #转成bytes string bytesString = search_item.encode(encoding="utf-8") encodestr = base64.b64encode(bytesString) #base64 编码 user = 'root' pwd = '' host = '127.0.0.1' db = 'test' data_file = 'wooyun.dat' create_table_sql = "CREATE TABLE IF NOT EXISTS mytable (id int(10) AUTO_INCREMENT PRIMARY KEY,serial_number_sql varchar(100), title_sql varchar(100), loophole_type_sql varchar(100) , industry_sql varchar(100) , author_sql varchar(100) , yield_time_sql varchar(100), loophole_mood_sql varchar(100), hazard_rating_sql varchar(100), reveal_mood_sql varchar(200), detail_sql varchar(5000), repair_sql varchar(2000), path_sql varchar(50)) CHARACTER SET utf8" insert_sql = "INSERT INTO mytable (serial_number_sql, title_sql, loophole_type_sql, industry_sql, author_sql, yield_time_sql, loophole_mood_sql, hazard_rating_sql, reveal_mood_sql, detail_sql, repair_sql, path_sql) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" cnx = mysql.connector.connect(user=user, password=pwd, host=host, database=db) cursor = cnx.cursor() def create_table_sql_api(a): try: cursor.execute(a) except mysql.connector.Error as err: print("create table 'mytable' failed.") print("Error: {}".format(err.msg)) sys.exit() def insert_sql_api(a,b): try: cursor.execute(a,b) except mysql.connector.Error as err: print("insert table 'mytable' failed.") print("Error: {}".format(err.msg)) sys.exit() create_table_sql_api(create_table_sql) #mysql数据库 starturl="http://www.wooyun.org/searchbug.php?q="+encodestr.decode() loophole=[] nextpage=[] result=[] #定义页面跳转相关变量 def get_html_response(url): html_response = urllib.request.urlopen(url).read().decode('utf-8') return html_response def geturl(starturl): a=get_html_response(starturl) childurl=(re.findall(r'/bugs/wooyun-w*-w*',a)) return childurl def get_nextpage(starturl): d=get_html_response(starturl) num_p=0 num=re.findall(r'd*s页',d) for i in num: i=re.sub(r's页','',i) num_p=i for x in range(1,int(num_p)): x='searchbug.php?q='+encodestr.decode()+'&pNO='+str(x) nextpage.append(x) return nextpage def download_img(url): img_name=re.sub(r'http://wooyun.org/upload/d*/','',url) download_img=urllib.request.urlretrieve(url,'D:wooyun\%s'%img_name) def download_html(i,title): html_path='D:\wooyun_html\'+title+'.html' download_html=open(html_path,'w+',encoding='utf-8') download_html.write(i) download_html.close() return('wooyun_html\'+title+'.html') for i in get_nextpage(starturl): result+=geturl('http://wooyun.org/'+i) #扫描各种漏洞的url地址放入result中 result=set(result)#去除result中重复的地址 serial_number_p='' title_p='' refered_industry_p='' author_p='' yield_time_p='' loophole_type_p='' loophole_mood_p='' hazard_rating_p='' reveal_mood_p=[] detail_p=[] repair_p='' final=[] #定义漏洞相关变量 for i in result: k=get_html_response('http://wooyun.org/'+re.sub(search_item,encodestr,i))#下载页面到k #基础信息提取 serial_number=re.findall(r'">WooYun-w{4}-w*',k) title=re.findall(r'漏洞标题:.*.</h3>',k) refered_industry=re.findall(r'相关厂商:.*.',k) author=re.findall(r'<a href="http://www.wooyun.org/whitehats/S*">',k) yield_time=re.findall(r'提交时间:.*.',k) loophole_type=re.findall(r'漏洞类型:.*.',k) hazard_rating=re.findall(r'危害等级:.*.</h3>',k) loophole_mood=re.findall(r'漏洞状态:s*S*s*</h3>',k) #详细信息提取 reveal_mood=re.findall(r'd*-d*-d*:s*S*<br/>',k) detail=re.findall(r'<p class="detail">.*.</p>',k) repair=re.findall(r'修复方案:</h3>s*<p class="detail">.*.s*</p>',k) #基础信息处理 for j in serial_number: j=re.sub(r'">','',j) serial_number_p=j for j in title: j=re.sub('漏洞标题: ','',j) j=re.sub(r's</h3>','',j) title_p=j for j in refered_industry: j=re.sub(r'相关厂商: <a href="http://www.wooyun.org/corps/','',j) j=re.sub(r'"> ','',j) refered_industry_p=j for j in author: j=re.sub(r'<a href="http://www.wooyun.org/whitehats/','',j) j=re.sub(r'">','',j) author_p=j for j in yield_time: j=re.sub(r'提交时间: ','',j) j=re.sub(r'</h3> ','',j) yield_time_p=j for j in loophole_type: j=re.sub(r'漏洞类型: ','',j) j=re.sub(r'</h3> ','',j) loophole_type_p=j for j in hazard_rating: j=re.sub(r'危害等级: ','',j) j=re.sub(r'</h3>','',j) hazard_rating_p=j for j in loophole_mood: j=re.sub(r'漏洞状态:s*','',j) j=re.sub(r's*</h3>','',j) loophole_mood_p=j #详细信息处理 for j in reveal_mood: j=re.sub('<br/>','',j) reveal_mood_p.append(j) for j in detail:#处理详情 j=re.sub(r':s',':',j) j=re.sub(r'<p class="detail">','',j) j=re.sub(r'</p>','',j) j=re.sub(r'"starget="_blank"><imgssrc="/upload/.*.width="600"/></a>',',',j) j=re.sub(r'<a href="',' http://wooyun.org',j) j=re.sub(r'对本漏洞信息进行评价,.*.备学习价值','',j) detail_p.append(j) for j in repair:#处理回复方法 j=re.sub(r'</br>','',j) j=re.sub(r'</p>','',j) j=re.sub(r'修复方案:</h3>','',j) j=re.sub(r'<psclass="detail">','',j) j=re.sub(r':',':',j) j=j.split() repair_p=j serial_number_str= "".join(itertools.chain(*serial_number_p)) title_str="".join(itertools.chain(*title_p)) loophole_type_str="".join(itertools.chain(*loophole_type_p)) refered_industry_str="".join(itertools.chain(*refered_industry_p)) author_str="".join(itertools.chain(*author_p)) yield_time_str="".join(itertools.chain(*yield_time_p)) loophole_mood_str="".join(itertools.chain(*loophole_mood_p)) hazard_rating_str="".join(itertools.chain(*hazard_rating_p)) detail_str="".join(itertools.chain(*detail_p)) reveal_mood_str="".join(itertools.chain(*reveal_mood_p)) repair_str="".join(itertools.chain(*repair_p)) '''img=re.findall(r'http://wooyun.org/upload/d*/w*.w{3}',detail_str) for j in img: download_img(j)''' path=download_html(k,serial_number_str) final.append(serial_number_str) final.append(title_str) final.append(loophole_type_str) final.append(refered_industry_str) final.append(author_str) final.append(yield_time_str) final.append(loophole_mood_str) final.append(hazard_rating_str) final.append(reveal_mood_str) final.append(detail_str) final.append(repair_str) final.append(path) insert_sql_api(insert_sql,tuple(final)) detail_p.clear() reveal_mood_p.clear() final.clear() cnx.commit() cursor.close() cnx.close()