• Python 识别文档中的英语单词，查找翻译并保存为 xlsx


    代码:

    # -*- coding: utf-8 -*-
    """
    Created on Fri Aug  5 17:11:50 2022
    
    @author: koneko
    """
    import requests
    import docx
    import re
    import sqlite3
    import openpyxl
    
       
    
    def translate(keyword):
        """Look up ``keyword`` through Baidu Fanyi's suggestion endpoint.

        Posts the form field ``kw`` to ``https://fanyi.baidu.com/sug`` and
        reads the ``data`` list from the JSON reply.

        Args:
            keyword: Text to look up.

        Returns:
            The ``'v'`` field of the first entry in ``data`` (used by this
            script as the translation text), or an empty list when ``data``
            is missing or empty.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        url = 'https://fanyi.baidu.com/sug'
        data = {'kw': keyword}
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(url, data, timeout=10)
        payload = response.json()
        # A reply without 'data' is treated as "nothing found" so the caller's
        # empty-list check (and its retry) applies instead of a KeyError.
        suggestions = payload.get('data')
        if not suggestions:
            return []
        return suggestions[0]['v']
    
    
    def lang_detect(keyword):
        """Ask Baidu Fanyi's language-detection endpoint about ``keyword``.

        Posts the form field ``query`` to
        ``https://fanyi.baidu.com/langdetect``.

        Args:
            keyword: Text whose language should be detected.

        Returns:
            The value of the ``'lan'`` field of the JSON reply (this script
            compares it against ``'en'``).

        Raises:
            requests.RequestException: On connection failure or timeout.
            KeyError: If the reply carries no ``'lan'`` field.
        """
        url = 'https://fanyi.baidu.com/langdetect'
        data = {'query': keyword}
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(url, data, timeout=10)
        payload = response.json()
        return payload['lan']
    
    
    def load_docx_and_get_words(fileName, max_words=10):
        """Read a .docx file and return the distinct lowercase words in it.

        Args:
            fileName: Path of the .docx document to open.
            max_words: Only the first ``max_words`` regex matches are
                considered before de-duplication. Defaults to 10, the cap the
                script has always used; pass ``None`` to use every match.

        Returns:
            A list of unique lowercase ASCII-letter words. The order is
            unspecified because duplicates are removed through a ``set``.
        """
        doc = docx.Document(fileName)

        # Join paragraphs with a newline: concatenating them directly would
        # fuse the last word of one paragraph with the first word of the next.
        text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)

        text = text.lower()
        # Slicing with None keeps the whole list, so None means "no cap".
        words = re.findall(r'[A-Za-z]+', text)[:max_words]
        words = list(set(words))
        print('总共解析出'+str(len(words))+'个单词')
        return words
        
    
    def words_filter(words):
        """Remove words that are too short or not detected as English.

        A word is dropped when its length is 2 or less, or when
        ``lang_detect`` returns anything other than ``'en'`` for it. Short
        words are rejected before the language check, so no request is made
        for them.

        Args:
            words: List of candidate words. It is updated in place.

        Returns:
            The same list object, now holding only the surviving words in
            their original relative order.
        """
        # Survivors are collected separately: removing from ``words`` while
        # iterating it shifts later items left and makes the loop skip the
        # element that follows every removal.
        kept = []
        for i, word in enumerate(words):
            print(i, word)
            if len(word) <= 2:
                print('remove '+ word +' for length <= 2')
                continue
            if lang_detect(word) != 'en':
                print('remove '+ word + ' for not english' )
                continue
            kept.append(word)

        # Write back in place so callers holding a reference to the original
        # list still observe the filtered content.
        words[:] = kept
        print('清理后共'+str(len(words))+'个单词')
        return words
    
    
    
    def words_to_dictionary(words):
        """Translate each word and return the results keyed by word.

        ``translate`` is called once per word and, while it keeps returning
        an empty list, up to three more times. Words that still have no
        translation are reported and left out.

        Args:
            words: Iterable of words to translate.

        Returns:
            A dict mapping word -> translation, with keys inserted in
            alphabetical order.
        """
        found = {}
        for word in words:
            trans = translate(word)
            retries_left = 3
            # Retry only while the lookup keeps coming back empty.
            while trans == [] and retries_left:
                trans = translate(word)
                retries_left -= 1
            if trans == []:
                print(word,'找不到翻译')
                continue
            print(word)
            print(trans)
            found[word] = trans

        # Rebuild the mapping with keys in alphabetical order.
        return {key: found[key] for key in sorted(found)}
    
    
    def save_to_xlsx(fileName, dictionary):
        """Write the dictionary to ``<fileName>.xlsx``, one entry per row.

        The active sheet is titled ``vocabulary``. Column 1 receives the key
        and column 2 the value, starting at row 1.

        Args:
            fileName: Output path without the ``.xlsx`` extension.
            dictionary: Mapping whose items are written in iteration order.
        """
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'vocabulary'

        # Worksheet rows are numbered from 1, so start counting there.
        for row_number, (word, meaning) in enumerate(dictionary.items(), start=1):
            sheet.cell(row_number, 1).value = word
            sheet.cell(row_number, 2).value = meaning

        workbook.save(fileName+'.xlsx')
    
    
    # Script entry: extract words from the document, filter them, translate
    # the survivors and write the result to myVocabulary.xlsx.
    words = words_filter(load_docx_and_get_words('cet4-1.docx'))
    dictionary = words_to_dictionary(words)
    save_to_xlsx('myVocabulary', dictionary)
    
    
    
    
        
    
  • 相关阅读:
    计网第一章——基本概念
    计网第二章——应用层
    命令行测试邮件发送工具mailsend-go
    CentOS-7-x86_64-DVD-2009 rpm包列表(centos7.9)
    CentOS-7-x86_64-Everything-2009 rpm包列表(CentOS7.9)
    Centos发行版ISO镜像中rpm包列表
    nginx使用记录
    centos resolv.conf
    python cookbook
    ansible中变量和主机名
  • 原文地址:https://www.cnblogs.com/urahyou/p/16555208.html
Copyright © 2020-2023  润新知