import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the returned HTML
import re
import threading
base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, '3')
# collect the URLs of the first five list pages
url_heard = "http://pic.yesky.com"
url_start = "/c/6_20491_1.shtml"
response = requests.get(url=url_heard+url_start)
soup = BeautifulSoup(response.text, "html.parser")
div_obj = soup.find(name="div", attrs={"class": "flym"})
a_list = div_obj.find_all(name='a')
href_list = []
for a in a_list:
    if a.get('href') not in href_list:
        href_list.append(a.get('href'))
href_list.insert(0, url_start)  # make sure page 1 itself sits at the front of the list
def func(url_heard, img_path, u):
    response1 = requests.get(url=url_heard + u)
    soup1 = BeautifulSoup(response1.text, 'html.parser')  # hand the response to bs4 for parsing
    div_obj1 = soup1.find(name='div', attrs={"class": "lb_box"})  # target div located by inspecting the page
    list_dd = div_obj1.find_all(name='dd')
    for dd in list_dd:  # each <dd> is one photo album
        a_obj = dd.find('a')
        # build the album folder path and create the folder
        title = re.sub('[/:*?"<>|]', '_', a_obj.text)
        dir_path = os.path.join(img_path, title)
        os.makedirs(dir_path, exist_ok=True)  # create the folder (and any missing parents) if it does not exist
        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'  # the page is GBK-encoded, so Chinese titles decode correctly
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={"class": "overview"})
        print(div_obj2)  # debug: show the matched thumbnail container
        try:
            img_list = div_obj2.find_all(name='img')
            for img in img_list:
                img_src = img.get("src")
                # swap the thumbnail size for the full-size one (URL pattern observed on this site)
                img_response = requests.get(img_src.replace('113x113', '740x-'))
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception:
            pass  # skip albums whose page layout differs
t = []
n = 1
for u in href_list:
    t.append(threading.Thread(target=func, name="Thread-" + str(n), args=(url_heard, img_path, u)))
    n += 1
for i in t:
    i.start()
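# Optional (not in the original post): if you want the script to block until every
# download thread finishes, e.g. to print a summary at the end, you can join the
# threads after starting them. A minimal sketch reusing the `t` list built above:
for i in t:
    i.join()
print("all pages downloaded")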
The result looks like this: