• python_crawler: batch-downloading files


    This is my first Python 3 web crawler, written with reference to the book "Web Scraping with Python" (《Python网络数据采集》). Its main job is to crawl a target site and batch-download the .rar, .doc, .docx and .zip files it links to.

    Planned improvements: identify and download files by their extension; for sites with a very large number of pages, a Bloom filter will be needed to deduplicate visited URLs; the site's anti-crawling mechanisms also need further study. A sketch of these ideas follows right after this paragraph.
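
    Below is a minimal, hypothetical sketch of those planned improvements (separate from the crawler that follows): it keeps only links with whitelisted extensions, deduplicates visited URLs with a small Bloom-filter-like structure built on hashlib, and sends a browser-like User-Agent header as a first, very basic step against anti-crawling checks. The names ALLOWED_EXTS, BloomSet, fetch and should_download are invented for this sketch; a real project would likely use a dedicated Bloom filter library.

    # Hypothetical sketch of the planned improvements; all names below are illustrative only.
    import hashlib
    from urllib.request import Request, urlopen

    ALLOWED_EXTS = (".rar", ".doc", ".docx", ".zip")  # download by extension only

    class BloomSet:
        # A tiny Bloom-filter-like set: k hashes over a fixed bit array.
        # It may report false positives, but never false negatives.
        def __init__(self, size=2**20, k=4):
            self.size, self.k = size, k
            self.bits = bytearray(size // 8)

        def _positions(self, item):
            for i in range(self.k):
                digest = hashlib.md5((str(i) + item).encode("utf-8")).hexdigest()
                yield int(digest, 16) % self.size

        def add(self, item):
            for pos in self._positions(item):
                self.bits[pos // 8] |= 1 << (pos % 8)

        def __contains__(self, item):
            return all(self.bits[pos // 8] & (1 << (pos % 8)) for pos in self._positions(item))

    seen = BloomSet()

    def fetch(url):
        # Send a browser-like User-Agent; many sites reject the default urllib agent.
        req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
        return urlopen(req)

    def should_download(href):
        # Keep only whitelisted extensions and URLs we have not visited before.
        if not href.lower().endswith(ALLOWED_EXTS):
            return False
        if href in seen:
            return False
        seen.add(href)
        return True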

    # -*- coding: utf-8 -*-

    import os
    import re
    from urllib.request import urlopen, urlretrieve
    from urllib.parse import quote
    from bs4 import BeautifulSoup

    downloadDirectory = "downloaded"
    baseUrl = "http://computer.hdu.edu.cn"

    def is_chinese(uchar):
        # True if the single character uchar falls in the CJK range U+2E80..U+FE4F
        return u'\u2e80' <= uchar <= u'\ufe4f'

    def getAbsoluteURL(baseUrl, source):
        # Normalize a (possibly relative) link into an absolute URL on the target site
        if source.startswith("http://www."):
            url = "http://" + source[11:]
        elif source.startswith("http://"):
            url = source
        elif source.startswith("www."):
            url = "http://" + source[4:]
        else:
            url = baseUrl + source
        if baseUrl not in url:
            return None  # skip links that leave the target site
        return url

    def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
        # Map an absolute URL to a local path under downloadDirectory, creating folders as needed
        path = absoluteUrl.replace("www.", "")
        path = path.replace(baseUrl, "")
        path = downloadDirectory + path
        directory = os.path.dirname(path)

        if not os.path.exists(directory):
            os.makedirs(directory)

        print(path)
        return path


    pages = set()
    def getLinks(pageUrl):
        global pages
        html = urlopen("http://computer.hdu.edu.cn" + pageUrl)
        bsObj = BeautifulSoup(html, "html.parser")
        try:
            print(bsObj.h1.get_text())
            print(bsObj.h2.get_text())
            print(bsObj.h3.get_text())
            # my_docs = bsObj.findAll("a", {"href": re.compile(r"/uploads/attachments/.*\.doc")})
            my_files = bsObj.findAll("a", {"href": re.compile("/uploads/attachments/")})

            for my_file in my_files:
                # Percent-encode hrefs that contain Chinese characters so urlretrieve accepts them
                if any(is_chinese(ch) for ch in my_file["href"]):
                    my_file["href"] = quote(my_file["href"])
                    print("quoted href: " + my_file["href"])
                url = getAbsoluteURL(baseUrl, my_file["href"])
                print(url)
                if url is not None:
                    urlretrieve(url, getDownloadPath(baseUrl, url, downloadDirectory))
        except AttributeError:
            print("This page is missing something! No worries though!")

        for link in bsObj.findAll("a", href=re.compile(r"^(/index\.php/)")):
            if 'href' in link.attrs:
                if link.attrs['href'] not in pages:
                    # We have encountered a new page
                    newPage = link.attrs['href']
                    print("---------------- " + newPage)
                    pages.add(newPage)
                    getLinks(newPage)

    getLinks("")
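
    To make the two helper functions easier to follow, here is a small hypothetical example (the attachment path is invented) showing how a relative href is turned into an absolute URL and then mapped to a local path under downloaded/:

    # Hypothetical usage; "/uploads/attachments/2017/test.doc" is an invented href.
    href = "/uploads/attachments/2017/test.doc"
    url = getAbsoluteURL(baseUrl, href)
    # -> "http://computer.hdu.edu.cn/uploads/attachments/2017/test.doc"
    path = getDownloadPath(baseUrl, url, downloadDirectory)
    # -> "downloaded/uploads/attachments/2017/test.doc" (parent folders are created and the path is printed)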

    Always have something to strive for in life!