• Python crawler for fetching resources from a file-listing website (based on Python 3.6)


    import urllib.request
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin
    from Cat.findLinks import get_link
    from Cat.Load import Schedule
    import os
    import time
    import errno

    -------code of the helper modules imported above (Cat.findLinks, Cat.Load)----------------
    def get_link(page):  # collect the href of every <a> in the listing table (a short usage demo follows this helper block)
        linkData = []
        for td in page.find_all('td'):
            links = td.select("a")
            for each in links:
                # if str(each.get('href'))[:1] == '/':  filter for absolute paths (disabled)
                data = each.get('href')
                linkData.append(data)
        return linkData

    def Schedule(a, b, c):  # progress display hook for large downloads
        '''
        a: number of blocks already downloaded
        b: size of one block
        c: total size of the remote file
        '''
        per = 100.0 * a * b / c
        if per > 100:
            per = 100
        print('%.2f%%' % per)
    ----------end-------------------
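
    # Quick demonstration of what get_link returns, using a hand-written table
    # fragment (demo_soup and the HTML string are made up for illustration only;
    # they are not fetched from the site and can be deleted):
    demo_soup = BeautifulSoup(
        '<table><tr><td><a href="fix/">fix/</a></td>'
        '<td><a href="README">README</a></td></tr></table>', 'lxml')
    print(get_link(demo_soup))  # prints ['fix/', 'README']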


    def mkdir_p(path):  # recursively create nested directories
        try:
            os.makedirs(path)
        except OSError as exc:  # Python >2.5 (except OSError, exc: for Python <2.5)
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
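
    # On Python 3.2+ the same effect is available in a single call; a sketch of an
    # equivalent helper (mkdir_p_simple is a hypothetical name, not used below):
    def mkdir_p_simple(path):
        os.makedirs(path, exist_ok=True)  # no error if the directory already exists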

    def file_Down(connet, file):  # download one file, reporting progress via Schedule
        urllib.request.urlretrieve(connet, file, Schedule)
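
    # urllib.request.urlretrieve is documented as a legacy interface; a streaming
    # alternative sketch using urlopen + shutil (file_down_stream is a hypothetical
    # name and is not called elsewhere in this script):
    import shutil
    def file_down_stream(connet, file):
        with urllib.request.urlopen(connet) as resp, open(file, 'wb') as out:
            shutil.copyfileobj(resp, out)  # stream the response body to disk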

    def decice(data):  # return 1 if the href looks like a directory (contains '/')
        a = '/'
        if a in data:
            return 1
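
    # decice() flags any href containing '/', which would also match nested relative
    # paths; if the server marks directories with a trailing slash (as Apache-style
    # listings usually do), a stricter test could be used (is_dir_link is hypothetical):
    def is_dir_link(data):
        return data.endswith('/')  # True only for directory-style hrefs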



    def findAll():  # main function
        url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
        page = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(page, 'lxml')  # parse the page with BeautifulSoup
        links = get_link(soup)
        # print(links)

        for childLink in range(len(links) - 1):
            childLink = childLink + 1  # skip index 0 (the parent-directory link)
            connet = urljoin(url, links[childLink])  # join the base URL and the relative path
            page_next = urllib.request.urlopen(connet).read()
            soup_next = BeautifulSoup(page_next, 'lxml')
            link_next = get_link(soup_next)  # <a href> entries on the second level
            file = os.path.join('D:/test/Index', links[childLink])
            # decice(links[childLink])
            # file_cre = os.path.join('D:/test/Index', links[childLink])
            if decice(links[childLink]):
                mkdir_p(file)
            else:
                file_Down(connet, file)

            print(connet)
            for child_next in range(len(link_next) - 1):
                child_next = child_next + 1
                connet_next = urljoin(connet, link_next[child_next])
                page_next = urllib.request.urlopen(connet_next).read()
                soup_nextF = BeautifulSoup(page_next, 'lxml')
                link_nextF = get_link(soup_nextF)  # <a href> entries on the third level
                fileF = os.path.join('D:/test/Index', links[childLink] + link_next[child_next])
                if decice(link_next[child_next]):
                    mkdir_p(fileF)
                else:
                    file_Down(connet_next, fileF)
                print("Start : %s" % time.ctime())
                time.sleep(4)
                print("End : %s" % time.ctime())
                print(connet_next)
                for child_nextT in range(len(link_nextF) - 1):
                    child_nextT = child_nextT + 1
                    connet_nextT = urljoin(connet_next, link_nextF[child_nextT])
                    fileT = os.path.join('D:/test/Index', links[childLink] + link_next[child_next] + link_nextF[child_nextT])
                    if decice(link_nextF[child_nextT]) == 1:
                        mkdir_p(fileT)
                    else:
                        file_Down(connet_nextT, fileT)
                    print(connet_nextT)


    if __name__ == '__main__':
        findAll()
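
The three nested loops in findAll() stop at three directory levels. A recursive variant built on the same helpers could walk the listing to any depth; the sketch below is illustrative only, and crawl / save_root are hypothetical names that do not appear in the original script:

    def crawl(url, save_root):
        page = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(page, 'lxml')
        for href in get_link(soup):
            # skip empty hrefs, sort links ("?C=N;O=D") and the absolute
            # "Parent Directory" link, mirroring the filter commented out in get_link
            if not href or href.startswith('?') or href.startswith('/'):
                continue
            child_url = urljoin(url, href)
            target = os.path.join(save_root, href.rstrip('/'))
            if decice(href):    # directory entry: create it and recurse
                mkdir_p(target)
                crawl(child_url, target)
            else:               # ordinary file: download it
                file_Down(child_url, target)
            print(child_url)

    # usage sketch:
    # crawl('http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/', 'D:/test/Index')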


  • Original post: https://www.cnblogs.com/setname/p/7366989.html