爬取大半导体网新闻内容保存到word（基于python3.6）

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @File    : Spider
# @Author  : moucong
# @Date    : 2018/12/25 16:36
# @Software: PyCharm

from urllib import request
from bs4 import BeautifulSoup
from urllib.parse import quote
from docx.shared import Inches
from docx.oxml.ns import qn
import string
import time
import re
import docx
import os





def spider(url="http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128",
           pic_dir="E:/SEMI_job/SEMI_Spider/pic/",
           out_path="E:/SEMI_job/SEMI_Spider/writeResult.docx"):
    """Download one news page and save its text and images into a Word file.

    The page is fetched and decoded as UTF-8, the content of the first
    non-empty ``<p>...</p>`` element is split on ``<br />``, and every
    fragment is appended to a new document:

    * a fragment containing an ``<img ... src="...">`` tag is downloaded
      into ``pic_dir`` (as ``0.jpg``, ``1.jpg``, ...) and inserted as a
      3-inch wide picture;
    * a fragment containing ``<strong>`` is added as a bold paragraph made
      of the text found inside the ``<strong>`` tags;
    * any other fragment is added as a plain paragraph with CR/LF/TAB
      characters removed.

    Args:
        url: Address of the article page to scrape.
        pic_dir: Directory that receives the downloaded pictures; it is
            created when the first picture is saved.
        out_path: Path of the ``.docx`` file to write; its parent directory
            is created if it does not exist.

    Raises:
        ValueError: If the page contains no non-empty ``<p>`` element.
    """
    # Local import so the block does not depend on a change to the
    # file-level import list.
    from urllib.parse import urljoin

    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(url) as response:
        page = response.read().decode('utf-8')

    # Look for <p> and <img> tags. findall() yields (p_content, img_attr)
    # tuples because the pattern has two groups.
    patt = re.compile(r'<p>(.*?)</p>|<img (src = ".*?")>', re.S)
    matches = patt.findall(page)
    body = next((para for para, _img in matches if para), None)
    if body is None:
        raise ValueError("no <p>...</p> content found at %s" % url)
    content_list = body.split("<br />")

    # Compile the per-fragment patterns once, outside the loop.
    pic_src = re.compile('src="(.*?)"')
    strong_font = re.compile('<strong>(.*?)</strong>')

    file = docx.Document()
    # Font setup for the Word document: the East-Asian font has to be set on
    # the underlying XML element as well as on the style itself.
    file.styles['Normal'].font.name = u'宋体'
    file.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Running picture index; kept outside the loop so file names are unique.
    pic_count = 0
    for fragment in content_list:
        src_match = pic_src.search(fragment) if "img" in fragment else None
        if src_match:
            os.makedirs(pic_dir, exist_ok=True)
            # Resolve the src against the page URL so both root-relative
            # and absolute image addresses work.
            picurl = urljoin(url, src_match.group(1))
            # string.printable contains the space character, so quote()
            # below leaves spaces alone; encode them explicitly.
            picurl = picurl.replace(' ', '%20')
            # Percent-encode only non-ASCII characters of the URL.
            picurl = quote(picurl, safe=string.printable)
            pic_path = os.path.join(pic_dir, "%s.jpg" % pic_count)
            request.urlretrieve(picurl, pic_path)
            pic_count += 1
            file.add_picture(pic_path, width=Inches(3.0))
        elif "strong" in fragment:
            # findall() returns a list; add_run() needs a single string.
            strong_text = "".join(strong_font.findall(fragment))
            run = file.add_paragraph().add_run(strong_text)
            # Bold
            run.font.bold = True
        else:
            content_part = fragment.replace('\r', '').replace('\n', '').replace('\t', '')
            file.add_paragraph(content_part)

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    file.save(out_path)
    print("已处理好！")

相关阅读:
linux内核启动分析（2）
linux内核启动分析
 U-Boot启动过程完全分析<转>
linux网桥浅析
 centeros --- dockerfile
centeros--- docker 操作命令
 java SSM项目搭建-- The server time zone value '中国标准时间' is unrecognized or represents more than one time zone
java 环境配置 -- url is not registered
java 环境配置 -- 配置Tomcat
java 环境配置--破解IDEA
原文地址：https://www.cnblogs.com/setname/p/10195397.html