The optimized code is below. It first creates 20 directories in a loop, then loops over those 20 directories when writing, saving at most 50,000 files to each.
#coding=utf-8
import os
import random
import re
import urllib.request
from bs4 import BeautifulSoup

# Category slugs on the target site; the original list repeated
# 'lianglichemo', so the duplicate is dropped here.
ls = ['meinv', 'zhenrenxiu', 'lianglichemo', 'rentiyishu', 'xiaohua']
# Root folder that holds the download directories (raw string so the
# backslashes are not treated as escapes).
ROOT = r'd:\craw'
file_list = os.listdir(ROOT)
def validateTitle(title):
    # Strip characters that are illegal in Windows file names: / \ : * ? " < > |
    rstr = r'[/\\:*?"<>|]'
    new_title = re.sub(rstr, "_", title)  # replace each with an underscore
    return new_title
def get_file_name():
    # Pick a download folder at random; if it already holds 50,000 files,
    # recurse to pick another one.
    file = random.sample(file_list, 1)[0]
    path = os.path.join(ROOT, str(file))
    if os.path.isdir(path):
        total_num = len(os.listdir(path))
        if total_num >= 50000:
            # Original bug: the recursive result was discarded; return it.
            return get_file_name()
    else:
        os.mkdir(path)
        print("Created directory " + path)
    return path + os.sep
for k in ls:
    for j in range(1, 101111):
        url_origin = "http://www.7160.com/" + str(k) + "/" + str(j)
        print(url_origin)
        try:
            page_obj = urllib.request.urlopen(url_origin)
            page_soup = BeautifulSoup(page_obj, 'lxml')
            # The gallery page shows "共N页"; extract N from that text node.
            total_page_obj = page_soup.find(text=re.compile('共')).string
            pattern = re.compile(r'\d+')  # original had r'd+', missing the backslash
            match = pattern.search(total_page_obj)
            if match is None:
                total_page = 0
            else:
                total_page = match.group()
            for i in range(1, int(total_page) + 1):
                # Page 1 is index.html; later pages are index_2.html, index_3.html, ...
                if i == 1:
                    url = url_origin + "/index.html"
                else:
                    url = url_origin + "/index_" + str(i) + ".html"
                request = urllib.request.Request(url)
                try:
                    res = urllib.request.urlopen(request)
                    soup = BeautifulSoup(res, 'lxml')
                    title_obj = soup.find(attrs={"class": "picmainer"})
                    if title_obj is not None:
                        print(url)
                        title = title_obj.h1.string
                        content = soup.find('img')
                        src = content.get("src")
                        file_name = validateTitle(title) + ".jpg"
                        # Resolve the target folder once so the file is saved
                        # to and reported under the same path.
                        save_path = get_file_name() + file_name
                        urllib.request.urlretrieve(src, save_path)
                        print(save_path + " saved")
                except Exception as e:
                    print("Exception: " + str(e))
        except Exception as e:
            print("Exception: " + str(e))