• 爬取7160


    优化后的代码如下:

    先用循环创建 20 个目录,然后在爬取循环中把图片写入这 20 个目录,每个目录最多写入 50000 个文件。

    #coding=utf-8
    import os
    import random
    import sys


    import urllib.request
    from bs4 import BeautifulSoup
    from urllib import error
    import re
    # Category slugs on www.7160.com to crawl, in this order. The list used to
    # name 'lianglichemo' twice, which crawled that whole category a second time.
    ls = ['meinv', 'zhenrenxiu', 'lianglichemo', 'rentiyishu', 'xiaohua']
    # Names of the entries that already exist under the download root; the
    # target directory for each image is drawn from this list. Backslashes are
    # doubled so the trailing one no longer escapes the closing quote.
    file_list = os.listdir("d:\\craw\\")

    def validateTitle(title):
        """Return *title* made safe for use as a Windows file name.

        Every character that Windows forbids in file names (slash, backslash,
        colon, asterisk, question mark, double quote, angle brackets and the
        vertical bar) is replaced by an underscore. All other characters are
        left untouched.

        Args:
            title: The text to sanitize.

        Returns:
            The sanitized text.
        """
        # Single-quoted raw string so the double quote can sit inside the
        # character class; the backslash itself is written as an escaped pair.
        rstr = r'[/\\:*?"<>|]'
        new_title = re.sub(rstr, "_", title)
        return new_title

    def get_file_name():
        """Pick a random download directory that still has room.

        The names in the module-level ``file_list`` are tried in random order.
        A candidate is used when it is a directory holding fewer than 50000
        entries, or when nothing exists at that path yet (the directory is then
        created). Candidates that are full, or that exist but are not
        directories, are skipped.

        Returns:
            The chosen directory path with a trailing path separator, ready to
            have a file name appended.

        Raises:
            RuntimeError: If ``file_list`` is empty or no candidate is usable.
        """
        base_dir = "d:\\craw"
        max_files = 50000
        # Sampling the whole list yields a random permutation, so each
        # candidate is tried at most once and the search always terminates.
        for name in random.sample(file_list, len(file_list)):
            path = os.path.join(base_dir, str(name))
            if os.path.isdir(path):
                if len(os.listdir(path)) >= max_files:
                    continue  # directory is full, try the next one
            elif os.path.exists(path):
                continue  # a plain file with that name; cannot be used as a directory
            else:
                os.mkdir(path)
                print("创建目录" + str(path))
            return path + os.sep
        raise RuntimeError("no usable download directory under " + base_dir)
    # Crawl every gallery of every category. For gallery number j of category k
    # the landing page is fetched, the page count is read from the text node
    # containing '共', and then each page's image is downloaded. Any failure is
    # reported and the crawl moves on to the next page or gallery.
    page_count_pattern = re.compile(r'\d+')  # first run of digits in the '共…' text
    for k in ls:
        for j in range(1, 101111):
            url_origin = "http://www.7160.com/" + str(k) + "/" + str(j)
            print(url_origin)
            try:
                # The context manager closes the HTTP response once it is parsed.
                with urllib.request.urlopen(url_origin) as page_obj:
                    page_soup = BeautifulSoup(page_obj, 'lxml')
                total_page_obj = page_soup.find(text=re.compile('共'))
                if total_page_obj is None:
                    # No page-count text on this page: nothing to download.
                    total_page = 0
                else:
                    match = page_count_pattern.search(total_page_obj.string)
                    total_page = int(match.group()) if match is not None else 0

                for i in range(1, total_page + 1):
                    # Page 1 is index.html, later pages are index_<n>.html.
                    if i == 1:
                        url = url_origin + "/index.html"
                    else:
                        url = url_origin + "/index_" + str(i) + ".html"
                    request = urllib.request.Request(url)
                    try:
                        with urllib.request.urlopen(request) as res:
                            soup = BeautifulSoup(res, 'lxml')
                        title_obj = soup.find(attrs={"class": "picmainer"})

                        if title_obj is not None:
                            print(url)
                            title = title_obj.h1.string
                            # NOTE(review): this takes the first <img> of the whole
                            # page, not of the 'picmainer' element - confirm intent.
                            content = soup.find('img')
                            src = content.get("src")
                            file_name = validateTitle(title) + ".jpg"
                            # Resolve the target directory once: get_file_name()
                            # picks at random, so calling it again for the log
                            # line could report a different path than was written.
                            save_path = str(get_file_name()) + file_name
                            urllib.request.urlretrieve(src, save_path)
                            print(save_path + "保存成功")
                    except Exception as e:
                        # Best-effort crawl: report and continue with the next page.
                        print("异常" + str(e))
            except Exception as e:
                # Best-effort crawl: report and continue with the next gallery.
                print("异常" + str(e))

      

  • 相关阅读:
    js三大弹出消息框
    HDU
    BZOJ 1101 Zap 莫比乌斯反演
    竞赛常用STL备忘录
    K-query SPOJ
    HDU 3333 Turing Tree 离线 线段树/树状数组 区间求和单点修改
    2018 Multi-University Training Contest
    多校补完计划 2017-02
    CodeForces 931C Laboratory Work 水题,构造
    CodeForces 937D 936B Sleepy Game 有向图判环,拆点,DFS
  • 原文地址:https://www.cnblogs.com/brady-wang/p/8370574.html
Copyright © 2020-2023  润新知