依赖包如下所示:
pip install python-docx==0.8.10
pip install lxml==4.6.3
pip install requests
pip install mysql-connector-python
代码如下所示:
"""Build a bilingual (English / Chinese) news report as a Word document.

The module wraps a python-docx document in the small ``report`` helper
class, translates every article through ``Baidu_Text.baiduTrans`` and writes
the result to ``<script dir>/report/news<timestamp>.docx``.
"""
import datetime
import json
import os
import subprocess
import sys
import time
import uuid

import docx
import lxml
import lxml.etree  # imported explicitly: ``import lxml`` alone does not load it
import mysql.connector
import requests
from docx import Document
from docx import shared
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

import Baidu_Text

sys.path.append(os.getcwd())

# Texts at least this long are split into several translation requests.
_MAX_QUERY_CHARS = 5000


class report:
    """Convenience wrapper around a python-docx ``Document`` instance."""

    def __init__(self, doc):
        """Store *doc*, the python-docx document every method writes to."""
        self.doc = doc

    def setHeading(self, lv, s):
        """Append a centred heading with text *s* at heading level *lv*."""
        heading = self.doc.add_heading(s, lv)
        heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    def setParagraph(self, s, alig):
        """Append a paragraph with text *s*.

        *alig* selects the alignment: ``'R'`` right, ``'C'`` centre; any
        other value leaves the paragraph's alignment untouched.
        """
        paragraph = self.doc.add_paragraph(s)
        if alig == 'R':
            paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
        elif alig == 'C':
            paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    def addPicture(self, path):
        """Append the image at *path*, 3 inches wide, in a centred paragraph."""
        # shared.Inches(n) sizes in inches, shared.Cm(n) in centimetres.
        paragraph = self.doc.add_paragraph()
        paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        run = paragraph.add_run("")
        run.add_picture(path, width=shared.Inches(3))

    def setTOC(self):
        """Append a table-of-contents field followed by a page break.

        Only the field code is written; the visible placeholder text stays
        until the field is updated.
        """
        paragraph = self.doc.add_paragraph()
        run = paragraph.add_run()

        field_begin = OxmlElement('w:fldChar')
        field_begin.set(qn('w:fldCharType'), 'begin')

        instruction = OxmlElement('w:instrText')
        instruction.set(qn('xml:space'), 'preserve')
        # Raw string: the switches contain literal backslashes, and a bare
        # "\u" in a normal string literal is a SyntaxError on Python 3.
        # Change "1-4" to match the heading levels you need.
        instruction.text = r'TOC \o "1-4" \h \z \u'

        field_separate = OxmlElement('w:fldChar')
        field_separate.set(qn('w:fldCharType'), 'separate')
        placeholder = OxmlElement('w:t')
        # Placeholder shown before the field is updated
        # ("Right-click to update field.").
        placeholder.text = "右键单击以更新字段。"
        field_separate.append(placeholder)

        field_end = OxmlElement('w:fldChar')
        field_end.set(qn('w:fldCharType'), 'end')

        run_element = run._r
        for element in (field_begin, instruction, field_separate, field_end):
            run_element.append(element)

        # Start the report body on a new page.
        paragraph.add_run().add_break(docx.enum.text.WD_BREAK.PAGE)

    def writeDoc(self, fileName):
        """Save the wrapped document to *fileName*."""
        self.doc.save(fileName)

    def set_updatefields_true(self, docx_path):
        """Add ``<w:updateFields w:val="true"/>`` to the settings of *docx_path*.

        The file is re-opened from disk and saved back in place. Passing the
        path (instead of an open handle) lets python-docx manage the file, so
        no handle is left open while the same path is overwritten.
        """
        namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
        doc = Document(docx_path)
        update_fields = lxml.etree.SubElement(
            doc.settings.element, f"{namespace}updateFields"
        )
        update_fields.set(f"{namespace}val", "true")
        doc.save(docx_path)


def _split_into_chunks(text, limit):
    """Group the space-separated words of *text* into chunks shorter than *limit*.

    A single word that is itself *limit* characters or longer becomes its
    own chunk.
    """
    chunks = []
    current = ""
    for word in text.split(" "):
        if not word:
            continue
        if current and len(current) + 1 + len(word) >= limit:
            chunks.append(current)
            current = word
        elif current:
            current = current + " " + word
        else:
            current = word
    if current:
        chunks.append(current)
    return chunks


def _translate_chunk(to_lang, text):
    """Translate one request-sized *text*; return ``(error_code, pieces)``.

    ``error_code`` is ``None`` on success, and ``pieces`` holds the ``dst``
    strings of the response's ``trans_result`` entries.
    """
    time.sleep(1)  # pause between consecutive API requests
    rsp = Baidu_Text.baiduTrans("en", to_lang, text)
    error_code = rsp.get("error_code")
    if error_code is not None:
        return error_code, []
    # ``or []`` guards against a response carrying neither key.
    entries = rsp.get("trans_result") or []
    return None, [entry.get("dst") for entry in entries if entry.get("dst")]


def translation(_lang, _con):
    """Translate the English text *_con* and return a result dict.

    Args:
        _lang: Target language code. ``"en"`` is mapped to ``"zh"``; the
            source language sent to the API is always ``"en"``.
        _con: Text to translate. Surrounding whitespace is ignored.

    Returns:
        ``{"code": 0, "translation": <text>}`` on success (empty text for
        blank input), or ``{"code": <error_code>, "translation": <partial>}``
        when a request fails; ``<partial>`` holds whatever was translated
        before the failure.

    Texts of ``_MAX_QUERY_CHARS`` characters or more are split on spaces and
    sent as several requests, each shorter than that limit, rather than one
    request per word.
    """
    if _lang == "en":
        _lang = "zh"
    _con = _con.strip()
    print(len(_con))
    if not _con:
        return {"code": 0, "translation": ""}

    if len(_con) >= _MAX_QUERY_CHARS:
        chunks = _split_into_chunks(_con, _MAX_QUERY_CHARS)
    else:
        chunks = [_con]

    code = 0
    translated = []
    for chunk in chunks:
        error_code, pieces = _translate_chunk(_lang, chunk)
        if error_code is not None:
            # Stop at the first failure and report what we have so far.
            code = error_code
            break
        translated.extend(pieces)

    result = {"code": code, "translation": " ".join(translated)}
    print(result)
    return result


def javaTOC(doc):
    """Run ``toc.jar`` (expected next to this script) on the document *doc*.

    *doc* may be absolute or relative to the script directory; an absolute
    path is used unchanged instead of being prefixed a second time. The call
    blocks until the Java process exits, so later steps never touch the file
    while the jar may still be writing it. Failures are printed, not raised.
    NOTE(review): presumably the jar rebuilds the table of contents; its
    behaviour is not visible from this file.
    """
    script_dir = sys.path[0]
    doc_path = os.path.join(script_dir, doc)
    jar_path = os.path.join(script_dir, 'toc.jar')
    # Argument list + no shell: paths with spaces or shell metacharacters
    # are passed through verbatim.
    command = ['java', '-jar', jar_path, doc_path]
    print(' '.join(command))
    try:
        completed = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
    except OSError as err:  # e.g. no ``java`` executable on PATH
        print(err)
        return
    if completed.returncode != 0:
        print(completed.stdout.decode(errors='replace'))


def entry(lalala):
    """Generate the report document from the collected news.

    Args:
        lalala: Iterable of dicts, each with a ``"website"`` name and a
            ``"list"`` of article dicts providing ``"title"``,
            ``"publish_time"`` and ``"content"``.

    Returns:
        ``{"code": 1, ...}`` when at least one article was written,
        ``{"code": 2, ...}`` when every list was empty. The document is
        written in both cases.
    """
    print(lalala)
    doc = report(docx.Document())
    doc.setTOC()

    now = datetime.datetime.now()
    # Each directive is formatted separately, so the non-ASCII characters
    # never go through strftime itself.
    curr_time = f"{now:%Y-%m-%d}日{now:%H}时推送"
    report_dir = os.path.join(sys.path[0], 'report')
    os.makedirs(report_dir, exist_ok=True)  # saving fails if it is missing
    fileName = os.path.join(report_dir, 'news{}.docx'.format(curr_time))

    arrg = []
    for lala in lalala:
        print(type(lala["list"]))
        if not len(lala["list"]):
            continue
        arrg += lala["list"]
        doc.setHeading(1, lala["website"])
        for obj in lala["list"]:
            title = obj["title"]
            auth_time = str(obj["publish_time"])
            content = str(obj["content"])
            print(title)

            # Original-language article.
            # NOTE: images can be embedded here with doc.addPicture(path)
            # after downloading them to a local file.
            doc.setHeading(2, title)
            doc.setParagraph(auth_time, 'C')
            doc.setParagraph(content, '')

            # Translated article.
            chn_title = translation("en", title)["translation"]
            doc.setHeading(2, chn_title)
            doc.setParagraph(auth_time, 'C')
            chn_content = translation("en", content)["translation"]
            doc.setParagraph(chn_content, '')

    doc.writeDoc(fileName)
    javaTOC(str(fileName))
    doc.set_updatefields_true(fileName)

    if len(arrg):
        result = {
            "code": 1,
            "message": "生成成功"
        }
    else:
        result = {
            "code": 2,
            "message": "没有最新消息"
        }
    print(result)
    return result


def _main(argv):
    """Command-line entry point; returns the process exit status.

    NOTE(review): the previous code called ``entry`` with five positional
    arguments although it accepts exactly one, so every run ended in a
    TypeError. The script now takes the path of a UTF-8 JSON file holding
    the list that ``entry`` expects — confirm this matches how the parent
    process hands over its data.
    """
    # Echo the arguments received from the parent process.
    print(argv)
    if len(argv) < 2:
        print('usage: python {} <news.json>'.format(argv[0] if argv else 'script'))
        return 2
    with open(argv[1], encoding='utf-8') as handle:
        news = json.load(handle)
    entry(news)
    return 0


if __name__ == '__main__':
    sys.exit(_main(sys.argv))
上面的代码包含翻译功能;如果不需要翻译,可将与翻译相关的代码(translation 函数及其调用、Baidu_Text 的导入)全部去掉。
Baidu_Text.py代码:
# -*- coding: utf-8 -*-

# This code shows an example of text translation from English to Simplified-Chinese.
# This code runs on Python 2.7.x and Python 3.x.
# You may install `requests` to run this code: pip install requests
# Please refer to `https://api.fanyi.baidu.com/doc/21` for complete api document

import json
import random
from hashlib import md5

import requests

# Set your own appid/appkey.
appid = 'xxxxxx'  # put your own appid here
appkey = 'xxxxxx'  # put your own appkey here

# For the list of language codes, please refer to `https://api.fanyi.baidu.com/doc/21`
# from_lang = 'en'
# to_lang = 'zh'

endpoint = 'http://api.fanyi.baidu.com'
path = '/api/trans/vip/translate'
url = endpoint + path

# Seconds to wait for the API before giving up; without a timeout a stalled
# connection would block the caller forever.
_REQUEST_TIMEOUT = 10


def make_md5(s, encoding='utf-8'):
    """Return the hexadecimal MD5 digest of the string *s*."""
    return md5(s.encode(encoding)).hexdigest()


def baiduTrans(from_lang, to_lang, query):
    """Translate *query* from *from_lang* to *to_lang* via the Baidu API.

    Returns the decoded JSON response. When the request fails or the body is
    not valid JSON, returns ``{"error_code": 500, "err": <exception>}``
    instead of raising.
    """
    # Generate salt and sign.
    salt = random.randint(32768, 65536)
    sign = make_md5(appid + query + str(salt) + appkey)

    # Build request.
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {'appid': appid, 'q': query, 'from': from_lang, 'to': to_lang,
               'salt': salt, 'sign': sign}

    # Send request.
    print(payload)
    try:
        # ``data=`` puts the fields in the form-encoded body, matching the
        # declared Content-Type; ``params=`` would append them to the URL,
        # which grows very long for large texts.
        r = requests.post(url, data=payload, headers=headers,
                          timeout=_REQUEST_TIMEOUT)
        result = r.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        result = {
            "error_code": 500,
            "err": err
        }
    return result


if __name__ == '__main__':
    from_lang = 'en'
    to_lang = 'zh'
    query = 'Hello World! This is 1st paragraph.This is 2nd paragraph.'
    aa = baiduTrans(from_lang, to_lang, query)
    print(aa.get("error_code"))
    # ``trans_result`` is absent when the call failed.
    if aa.get("trans_result"):
        print(aa.get("trans_result")[0].get("dst"))