# - *- coding:utf-8-*-
import urllib2
import re
import os
import threading
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at interpreter startup,
# so the sys module is reloaded to get the function back. UTF-8 is then made
# the default codec, so implicit conversions between byte strings and unicode
# use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8') # set the default encoding
from bs4 import BeautifulSoup
# Create the top-level output directory on the first run and make it the
# working directory, so that every book directory is created inside it.
# os.mkdir raises OSError when the directory already exists, so it is only
# called when needed; this lets the script be run more than once.
OUTPUT_DIR = u'小说0'
if not os.path.isdir(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)
os.chdir(OUTPUT_DIR)
def get_url():
    """Fetch the first page of the book listing and return its book links.

    Returns:
        A list of ``(href, title)`` tuples, one per book heading that matches
        the expected markup. Both items are UTF-8 byte strings, and ``href``
        is returned exactly as it appears in the page. The list is empty when
        nothing matches.

    Raises:
        urllib2.URLError: If the listing page cannot be downloaded.
    """
    User_Agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
    url = "http://f.qidian.com/all?size=-1&sign=-1&tag=-1&chanId=-1&subCateId=-1&orderId=&update=-1&page=1&month=-1&style=1&action=1"
    headers = {'User-Agent': User_Agent}
    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')

    # Group 1 is the book URL, group 2 the book title.
    link_re = re.compile(r'<h4><a data-bid=".*?" data-eid=".*?" href="(.*?)" target="_blank">(.*?)</a></h4>')

    # Accumulate the matches of every entry; rebinding a single variable per
    # entry would hand back only one book.
    books = []
    for entry in soup.find_all('div', class_='book-mid-info'):
        heading = entry.find('h4')
        if heading is None:
            # Entry without a heading: nothing to extract.
            continue
        books.extend(link_re.findall(heading.encode('utf-8')))
    return books
# Catalogue entry on a book page: group 1 is the chapter URL, group 2 the
# chapter name.
_CHAPTER_LINK_RE = re.compile(r'<li data-rid=".*?"><a href="(.*?)" target="_blank" data-eid="qd_G55" data-cid=".*?" title=".*?">(.*?)</a>')
# Chapter text container. [\s\S] lets the capture span line breaks, and the
# surrounding \s* trims the whitespace around the captured text.
_CHAPTER_BODY_RE = re.compile(r'<div class="read-content j_readContent">\s*([\s\S]*?)\s*</div>')


def _to_gbk(text):
    """Return *text* as GBK bytes, decoding UTF-8 byte strings explicitly."""
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return text.encode('gbk')


def get_content(curl,title):
    """Download every chapter of one book into a directory named after it.

    Each chapter listed on the book page is saved as ``<chapter name>.txt``
    inside a directory named ``title``. The file holds the chapter name, a
    newline, and the chapter text with every ``<p>`` replaced by a newline.

    Args:
        curl: Book page URL without its scheme; ``'http:'`` is prepended.
        title: Book title, used as the directory name.

    Raises:
        urllib2.URLError: If the book page or a chapter page cannot be fetched.
        UnicodeEncodeError: If a name cannot be represented in GBK.
    """
    # File-system names are GBK-encoded (presumably for a Chinese-locale
    # Windows file system -- confirm before changing).
    book_dir = _to_gbk(title)
    if not os.path.isdir(book_dir):
        # os.mkdir raises OSError if the directory is left over from a
        # previous run.
        os.mkdir(book_dir)

    catalog_html = urllib2.urlopen('http:' + curl + '#Catalog').read()
    for link in _CHAPTER_LINK_RE.finditer(catalog_html):
        chapter_url = link.group(1)
        names = link.group(2)
        print("正在爬取%s本" % names)
        chapter_html = urllib2.urlopen('http:' + chapter_url).read()

        # The file is opened only after the download succeeded, and the
        # with-block closes it even if no content matched or a write fails.
        chapter_path = os.path.join(book_dir, _to_gbk(names) + '.txt')
        with open(chapter_path, 'wb') as fd:
            for body in _CHAPTER_BODY_RE.findall(chapter_html):
                fd.write(names + '\n' + body.replace('<p>', '\n'))
        print("已完成%s" % names)
# Every worker thread created by main(), in creation order.
threads=[]


def main(max_threads=10):
    """Start one download thread per book, throttling how many run at once.

    Args:
        max_threads: Upper bound on the number of live threads in the process.
            threading.enumerate() counts every live thread, including the one
            running main(), so fewer than ``max_threads`` workers run at once.
    """
    import time  # local import: only needed for the throttling sleep below

    # Only threads created by this call are started; starting a thread that
    # was already started by an earlier call would raise RuntimeError.
    started = []
    for curl, title in get_url() or []:
        worker = threading.Thread(target=get_content, args=(curl, title))
        threads.append(worker)
        started.append(worker)

    for worker in started:
        worker.start()
        # Wait for a free slot before starting the next thread. Sleeping
        # between polls avoids spinning and competing with the workers.
        while len(threading.enumerate()) >= max_threads:
            time.sleep(0.1)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()