• Python: fetching web page content, now with page classification and removal of array elements by suffix (based on Python 3.6)


    from urllib.parse import urljoin
    import urllib.request

    from bs4 import BeautifulSoup
    import time
    import os
    import re
    import errno


    def mkdir_p(path):  # recursively create nested directories, like `mkdir -p`
        try:
            os.makedirs(path)
        except OSError as exc:  # Python >2.5 (use `except OSError, exc:` on Python <2.5)
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
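    # Note: on Python 3.2+ the standard library covers this directly:
    # os.makedirs(path, exist_ok=True) succeeds silently if the directory already exists.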


    def get_link(page):  # collect the href of every link inside the page's <td> cells
        linkData = []
        for td in page.find_all('td'):
            links = td.select("a")
            for each in links:
                # if str(each.get('href'))[:1] == '/':  # optional filter, left disabled
                data = each.get('href')
                if data is not None:  # skip <a> tags without an href attribute
                    linkData.append(data)
        return linkData


    def gain(url):  # fetch a page and return the links it contains
        try:
            page = urllib.request.urlopen(url).read()
            soup = BeautifulSoup(page, 'lxml')  # parse the page with BeautifulSoup
            links = get_link(soup)  # extract the <a href=...> targets
            return links
        except Exception:
            print('Could not fetch the link: ' + url)
            return []  # an empty list (rather than 1) so callers can still iterate over the result




    def main():
        url = 'http://weather.unisys.com/hurricane/index.php'
        Download_dir = r'E:\Typhoon_data\Data'  # download path (raw string so the backslashes survive)
        Web_Link = gain(url)

        for Link in range(len(Web_Link)):
            Link_Add = Web_Link[Link]
            Link_One = re.split("/", Link_Add)  # split Link_Add on '/' into a list
            Ocean_Folder = Link_One[0]  # first element: the ocean-basin folder
            Ocean_Time = Link_One[1]  # second element: the year (or index.php)
            url_Typhoon = 'http://weather.unisys.com/hurricane/'
            _connet = urljoin(url_Typhoon, Link_Add)
            Web_Link_ = gain(_connet)

            # drop the redundant .gif links
            Gifdata = []
            for Gif in range(len(Web_Link_)):
                Gifdata_ = Web_Link_[Gif]
                findGif = re.findall(r'\.gif$', Gifdata_, re.I)  # the dot is escaped to match a literal '.'
                if findGif:
                    Gifdata.append(Gifdata_)
                    # print(Gifdata)
                else:
                    continue
            for _Gif in range(len(Gifdata)):
                Web_Link_.remove(Gifdata[_Gif])
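            # The .gif links are collected first and removed in a second pass so that
            # Web_Link_ is never modified while it is still being iterated.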

            time.sleep(3)
            if Ocean_Time != 'index.php':
                for Link_A in range(len(Web_Link_)):
                    Link_Add_ = Web_Link_[Link_A]
                    Link_part = re.split("/", Link_Add_)  # split on '/' into a list
                    Ocean_dataName = Link_part[0]  # the data file name
                    url_Data = 'http://weather.unisys.com/hurricane/' + Ocean_Folder + "/" + Ocean_Time + "/"
                    connet_ = urljoin(url_Data, Link_Add_)

                    time.sleep(1)
                    # download the data
                    Ocean_dataFile = Ocean_dataName + '.json'
                    file = os.path.join(Download_dir, Ocean_Folder, Ocean_Time, '')  # target directory, with trailing separator
                    mkdir_p(file)
                    print(connet_)
                    if os.path.isfile(file + Ocean_dataFile):
                        print('File already exists')
                    else:
                        try:
                            url = connet_
                            wp = urllib.request.urlopen(url)  # open the data URL
                            content = wp.read()

                            fp = open(file + Ocean_dataFile, "wb")  # write into the target folder
                            fp.write(content)  # write the data
                            fp.close()  # close the file
                        except Exception:
                            print('Could not fetch the link: ' + url)
                            continue
            else:
                for Link_B in range(len(Web_Link_)):
                    _Link_Add = Web_Link_[Link_B]
                    Link_part_ = re.split("/", _Link_Add)  # split on '/' into a list
                    Ocean_Time_ = Link_part_[1]  # second element: the year
                    url_Typhoon_ = 'http://weather.unisys.com/hurricane/' + Ocean_Folder + "/"
                    Connet = urljoin(url_Typhoon_, _Link_Add)
                    _Web_Link = gain(Connet)
                    time.sleep(3)

                    # drop the redundant .gif links
                    _Gifdata = []
                    for _Gif_ in range(len(_Web_Link)):
                        _Gifdata_ = _Web_Link[_Gif_]
                        findGif = re.findall(r'\.gif$', _Gifdata_, re.I)
                        if findGif:
                            _Gifdata.append(_Gifdata_)
                            # print(_Gifdata)
                        else:
                            continue
                    for _Gif in range(len(_Gifdata)):
                        _Web_Link.remove(_Gifdata[_Gif])
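                    # Same collect-then-remove pattern as above; a reusable version is sketched after the script.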

                    for Link_B_ in range(len(_Web_Link)):
                        _Link_Add_ = _Web_Link[Link_B_]
                        _Link_part_ = re.split("/", _Link_Add_)  # split on '/' into a list
                        _Ocean_dataName_ = _Link_part_[0]  # first element: the data file name
                        url_Data_ = 'http://weather.unisys.com/hurricane/' + Ocean_Folder + "/" + Ocean_Time_ + "/"
                        Connet_ = urljoin(url_Data_, _Link_Add_)
                        time.sleep(1)
                        # download the data

                        Ocean_dataName = _Ocean_dataName_ + '.json'
                        file = os.path.join(Download_dir, Ocean_Folder, Ocean_Time_, '')  # target directory, with trailing separator
                        mkdir_p(file)
                        print(Connet_)
                        if os.path.isfile(file + Ocean_dataName):
                            print('File already exists')
                        else:
                            try:
                                url = Connet_
                                wp = urllib.request.urlopen(url)  # open the data URL
                                content = wp.read()
                                fp = open(file + Ocean_dataName, "wb")  # write into the target folder
                                fp.write(content)  # write the data
                                fp.close()  # close the file
                            except Exception:
                                print('Could not fetch the link: ' + url)
                                continue

    if __name__ == '__main__':
        main()
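
The suffix filtering above is written out twice, once per branch. As a minimal sketch (the helper name `drop_suffix` is my own, not part of the original script), the "remove array elements with a given suffix" step can be factored into one reusable function:

    import re

    def drop_suffix(links, suffix='.gif'):
        """Return a new list without the entries ending in `suffix` (case-insensitive)."""
        pattern = re.compile(re.escape(suffix) + '$', re.I)
        return [link for link in links if not pattern.search(link)]

    # usage: replaces each collect-then-remove pair above
    # Web_Link_ = drop_suffix(Web_Link_)

A plain `link.lower().endswith(suffix)` test would do the same job without a regular expression.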
     


  • Original post: https://www.cnblogs.com/setname/p/8556977.html