• 获取网页内容生成html,并将某些标签属性进行修改 (基于python3.6)


    #!/usr/bin/python3
    # -*- coding: utf-8 -*-

    import urllib.request
    import os

    from bs4 import BeautifulSoup

    # URL of the index page that is passed to get_web() at the bottom of the script.
    # NOTE: the value is left blank in this listing and must be filled in before running.
    url = 
    # Prefix string that get_web() and change_web() concatenate in front of relative links.
    # NOTE: the value is left blank in this listing and must be filled in before running.
    Splicing = 


    def get_web(get_url):
        """Download the pages linked from the menu of the page at ``get_url``.

        The page at ``get_url`` is fetched and parsed. Every ``<a href>`` found
        inside a ``<div class="col_menu_con">`` that has non-empty text and does
        not start with ``http://`` or ``https://`` is collected. Each collected
        href is fetched from ``Splicing + href`` and the raw response bytes are
        written to ``<second "/"-separated piece of the href>.html`` in the
        current working directory.

        Args:
            get_url: URL of the index page whose menu links are followed.

        Raises:
            UnicodeDecodeError: if the index page is not valid UTF-8.
            urllib.error.URLError: presumably, when a page cannot be fetched
                (raised by ``urllib.request.urlopen``).
        """
        # Use the response as a context manager so the connection is released
        # even if reading or decoding fails.
        with urllib.request.urlopen(get_url) as page:
            html = page.read().decode("utf-8")

        # ``html`` is already a str, so ``from_encoding`` must not be passed:
        # BeautifulSoup ignores it for decoded input and emits a warning.
        menu_soup = BeautifulSoup(html, 'html.parser')

        all_url = []
        for list_div in menu_soup.find_all('div', class_='col_menu_con'):
            for a in list_div.find_all('a', href=True):
                if not a.get_text(strip=True):
                    continue
                href = a['href']
                # Test the scheme prefix rather than a substring, so relative
                # links that merely contain "http" somewhere are still followed.
                if href.strip().lower().startswith(('http://', 'https://')):
                    continue
                all_url.append(href)

        for want_url in all_url:
            name_split = want_url.split('/')
            # An href without a second "/"-separated piece cannot be mapped to
            # a file name; skip it instead of raising IndexError or creating a
            # file that is literally named ".html".
            if len(name_split) < 2 or not name_split[1]:
                print(want_url + ' ' + 'skipped!')
                continue
            file_name = name_split[1] + '.html'

            with urllib.request.urlopen(Splicing + want_url) as down_page:
                down_html = down_page.read()
            # The body is stored as raw bytes, exactly as received.
            with open(file_name, "wb") as write_html:
                write_html.write(down_html)
            print(file_name + ' ' + 'done!')


    # (tag name, attribute name) pairs whose relative URLs are prefixed with
    # ``Splicing`` by change_web().
    _REWRITE_TARGETS = (
        ('a', 'href'),
        ('link', 'href'),
        ('script', 'src'),
        ('form', 'action'),
        ('img', 'src'),
        ('img', 'original_src'),
    )


    def _is_absolute_http(value):
        """Return True if ``value`` starts with an ``http://`` or ``https://`` scheme."""
        return value.strip().lower().startswith(('http://', 'https://'))


    def change_web(html_file):
        """Rewrite relative URLs inside ``html_file`` by prefixing ``Splicing``.

        The file is read as UTF-8 and parsed with the ``lxml`` parser. For every
        (tag, attribute) pair in ``_REWRITE_TARGETS``, each attribute value that
        is not already an absolute http(s) URL is replaced by
        ``Splicing + value``. The modified document is then written back to the
        same path, encoded as UTF-8.

        Args:
            html_file: path of the HTML file to rewrite in place.
        """
        # Read through a context manager so the handle is closed before the
        # same path is reopened for writing below.
        with open(html_file, 'r', encoding="utf-8") as content:
            html_cont = content.read()
        find_content = BeautifulSoup(html_cont, 'lxml')

        # One table-driven loop replaces six copies of the same rewrite logic.
        for tag_name, attr_name in _REWRITE_TARGETS:
            for tag in find_content.find_all(tag_name, attrs={attr_name: True}):
                current = tag[attr_name]
                if _is_absolute_http(current):
                    continue
                tag[attr_name] = Splicing + current

        # The parsed document has to be converted back to a str and encoded as
        # UTF-8 before it can be written out in binary mode.
        change_content = str(find_content).encode(encoding='utf-8')
        with open(html_file, "wb") as change_html:
            change_html.write(change_content)
        print(html_file + ' ' + 'changed!')


    # Step 1: download the menu pages into the current working directory.
    get_web(url)

    # Step 2: collect every entry of the working directory whose extension is
    # exactly '.html', then rewrite the links in each of them.
    file_list = os.listdir(os.getcwd())
    filearray = [
        entry for entry in file_list
        if os.path.splitext(entry)[1] == '.html'
    ]
    for html_path in filearray:
        change_web(html_path)

  • 相关阅读:
    1048 石子归并
    高精度算法小结
    3117 高精度练习之乘法
    UVa 11809
    3115 高精度练习之减法
    3116 高精度练习之加法
    “da shen” in my heart
    爱是怀疑!
    普通disco
    崇拜
  • 原文地址:https://www.cnblogs.com/setname/p/9261396.html
Copyright © 2020-2023  润新知