#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @File : Spider
# @Author : moucong
# @Date : 2018/12/25 16:36
# @Software: PyCharm
from urllib import request
from bs4 import BeautifulSoup
from urllib.parse import quote
from docx.shared import Inches
from docx.oxml.ns import qn
import string
import time
import re
import docx
import os
def spider(url="http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128",
           main_url="http://www.semi.org.cn",
           pic_dir="E:/SEMI_job/SEMI_Spider/pic/",
           output_path="E:/SEMI_job/SEMI_Spider/writeResult.docx"):
    """Download one news article and save its text and pictures as a .docx.

    The page is fetched and decoded as UTF-8, the first ``<p>...</p>`` body
    is split on ``<br />``, and each resulting segment is written to the
    document as a picture, a bold paragraph or a plain paragraph.

    Args:
        url: Address of the article page to fetch.
        main_url: Site root prepended to picture ``src`` values that are not
            already absolute ``http(s)`` URLs.
        pic_dir: Directory that receives the downloaded pictures; created
            on demand.
        output_path: Where the resulting Word document is written.

    Raises:
        ValueError: If the page contains nothing matching the content
            pattern.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(url) as response:
        page = response.read().decode('utf-8')

    # Find <p> bodies and <img src = "..."> tags.
    patt = re.compile(r'<p>(.*?)</p>|<img (src = ".*?")>', re.S)
    matches = patt.findall(page)
    if not matches:
        raise ValueError("no article content found at %s" % url)
    # findall() returns (p_text, img_src) tuples in which at most one member
    # is non-empty. Use that text directly: str() of the tuple would leak
    # repr punctuation and escaped "\r\n" sequences into the document.
    body = matches[0][0] or matches[0][1]

    pic_src = re.compile('src="(.*?)"')
    strong_font = re.compile('<strong>(.*?)</strong>')

    document = docx.Document()
    # Set the Latin and East Asian fonts of the Normal style once, up front,
    # so every paragraph (including bold ones) inherits them.
    document.styles['Normal'].font.name = u'宋体'
    document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Counter lives outside the loop so each picture gets its own file name.
    pic_count = 0
    for segment in body.split("<br />"):
        src_match = pic_src.search(segment) if "img" in segment else None
        if src_match:
            os.makedirs(pic_dir, exist_ok=True)
            src = src_match.group(1)
            if src.startswith(("http://", "https://")):
                picurl = src
            else:
                picurl = main_url + src
            # string.printable keeps spaces "safe", so encode them by hand;
            # quote() then only escapes the non-ASCII characters.
            picurl = quote(picurl.replace(' ', '%20'), safe=string.printable)
            pic_path = os.path.join(pic_dir, "%s.jpg" % pic_count)
            request.urlretrieve(picurl, pic_path)
            pic_count += 1
            document.add_picture(pic_path, width=Inches(3.0))
        elif "strong" in segment:
            # findall() returns a list; add_run() needs a single string.
            # Only the text inside <strong> tags is written for this segment.
            bold_text = "".join(strong_font.findall(segment))
            run = document.add_paragraph().add_run(bold_text)
            # Bold
            run.font.bold = True
        else:
            content_part = segment.replace('\r', '').replace('\n', '').replace('\t', '')
            document.add_paragraph(content_part)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    document.save(output_path)
    print("已处理好!")