和朋友在QQ上聊天时总觉得没有激情,于是突发奇想:写一个小爬虫,把表情包爬取下来随便挑,斗到他们吐血。
下面是爬取斗图表情包的代码,仅供参考:
#encoding:utf8
#模块
import re
import requests
from lxml import etree
import os
import random
import threading
import time
import hashlib
def makemake(path, base_dir=r'F:\11\斗图'):
    """Create the directory for one image set under ``base_dir`` if missing.

    Args:
        path: Raw directory name (the caller passes the title scraped from
            the page). Commas, colons, question marks, exclamation marks
            and spaces are removed before it is used as a directory name.
        base_dir: Root directory that holds all downloaded sets. Defaults
            to the location the script has always used.

    Returns:
        None. Progress is reported on stdout.
    """
    # A character class is used instead of the alternation 'a|b|?': a bare
    # '?' in an alternation is a regex syntax error ("nothing to repeat").
    path = re.sub(r'[,:?!]', '', path)
    path = path.replace(' ', '')
    target = os.path.join(base_dir, path)
    if os.path.isdir(target):
        print(path + '已经存在')
    else:
        print('开始创建' + path)
        # exist_ok=True: several worker threads may try to create the same
        # directory between the isdir() check above and this call.
        os.makedirs(target, exist_ok=True)
def make_files(path, source, base_dir=r'F:\11\斗图'):
    """Download one image and store it under ``base_dir`` unless it exists.

    Args:
        path: File path relative to ``base_dir``. Commas, colons, question
            marks, exclamation marks and spaces are removed from it first.
        source: Image address WITHOUT the scheme; ``'http://'`` is
            prepended before the request is made.
        base_dir: Root directory that holds all downloaded sets. Defaults
            to the location the script has always used.

    Returns:
        None. Progress is reported on stdout. If every attempt fails the
        download is abandoned and nothing is written.
    """
    # Character class instead of 'a|b|?': a bare '?' in an alternation is a
    # regex syntax error ("nothing to repeat").
    path = re.sub(r'[,:?!]', '', path)
    path = path.replace(' ', '')
    target = os.path.join(base_dir, path)
    if os.path.isfile(target):
        print(path + '已经存在')
        return

    url = 'http://' + source
    max_attempts = 7  # same budget as the original "give up when n > 6"
    content = None
    # The attempt counter lives outside the try body, so it really counts
    # up and the give-up branch below is reachable.
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=30)
            # Treat HTTP error statuses as failures so an error page is
            # never saved under an image file name.
            response.raise_for_status()
            content = response.content
            break
        except requests.RequestException:
            print(url + '连接出错正在重试当前次数:' + str(attempt))
            if attempt < max_attempts:
                time.sleep(1)
    if content is None:
        # Never fall through to the write with the URL string still in hand.
        print('放弃' + url)
        return

    print('正在下载' + path)
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(target, 'wb') as file:
        file.write(content)
def start_spider(g):
    """Scrape one listing page and download every image found on it.

    Args:
        g: Page number, appended to the listing URL as ``?page=<g>``.

    Returns:
        None. Progress is reported on stdout. If the listing page cannot
        be fetched after all attempts, the page is skipped.
    """
    print('当前下载页数为:' + str(g))
    page_url = 'https://www.doutula.com/article/list/?page=' + str(g)
    max_attempts = 7  # same retry budget make_files uses for images
    yuan = None
    # The counter is driven by the loop itself, so the printed attempt
    # number advances and a dead page cannot block the thread forever.
    for n in range(1, max_attempts + 1):
        try:
            yuan = requests.get(page_url, timeout=30).text
            break
        except requests.RequestException:
            print(page_url + '连接出错正在重试当前次数:' + str(n))
            if n < max_attempts:
                time.sleep(1)
    if yuan is None:
        print('放弃' + page_url)
        return

    # Each <a> under the "home" container is one image set.
    lists = etree.HTML(yuan).xpath('//*[@id="home"]/div/div/div/ul/a')
    for i in lists:
        titles = i.xpath('div/h4/text()')
        if not titles:
            # No title node: there is nothing to name the directory after.
            continue
        img_name = titles[0]
        makemake(img_name)
        for ii in i.xpath('div/div/img/@data-original'):
            # Drop the scheme ('//host/..' or 'http://host/..'); make_files
            # prepends 'http://' itself.
            img_url = ii.split('//', 1)[-1]
            wei = img_url[-4:].lower()
            # Compare the extension itself, not a hash of it: an MD5 digest
            # can never equal '.jpg', so nothing was ever downloaded.
            if wei in ('.jpg', '.gif', '.png'):
                # A name derived from the URL is stable across runs, which
                # lets make_files skip images that were already saved.
                file_name = hashlib.md5(img_url.encode('utf-8')).hexdigest() + wei
                make_files(os.path.join(img_name, file_name), img_url)
            else:
                print(img_url)
# Driver: crawl listing pages 1..498, one worker thread per page, keeping at
# most MAX_THREADS tracked workers at a time.
MAX_THREADS = 21
thread_list = []
for g in range(1, 499):
    while True:
        print(len(thread_list))
        if len(thread_list) < MAX_THREADS:
            break
        print('线程数为:' + str(len(thread_list)) + '等待清空')
        time.sleep(1)
        # Rebuild the list instead of calling remove() while iterating it;
        # removing during iteration skips the element after each removal.
        thread_list = [t for t in thread_list if t.is_alive()]
    # daemon=True replaces the deprecated setDaemon(True) call.
    the_thread = threading.Thread(target=start_spider, args=(g,), daemon=True)
    the_thread.start()
    thread_list.append(the_thread)
# Wait for the workers that are still running before reporting completion.
for t in thread_list:
    t.join()
print('完成')