import os
import re
import zipfile
import logging

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.utils import get_column_letter

logging.basicConfig(
    level=logging.INFO,  # minimum severity that gets recorded
    filename='new.log',
    # 'w' rewrites the log on every run; 'a' appends (also the default when omitted).
    filemode='a',
    # Layout of each log record.
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
)


def Readzip(file_name):
    """Read the non-empty ``guokanzhiguang`` members of a zip archive.

    Every member name in the archive is printed. Members whose name contains
    ``'guokanzhiguang'`` are read, and those with non-empty content are
    collected in archive order.

    Args:
        file_name: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        A list of ``bytes`` objects, one per matching non-empty member.
        On any failure the string ``'Readzip Running Faild!!'`` is returned
        instead (kept for backward compatibility with existing callers) and
        the traceback is written to the log.
    """
    guokanzhiguang_folder = 'guokanzhiguang'
    guokanzhiguang_list = []
    try:
        # The context manager guarantees the archive handle is closed, even
        # when reading a member fails part-way through.
        with zipfile.ZipFile(file_name, 'r') as z:
            for filename in z.namelist():
                print(filename)
                if guokanzhiguang_folder not in filename:
                    continue
                content = z.read(filename)
                # Directory entries and empty files carry no data to parse.
                if content:
                    guokanzhiguang_list.append(content)
    except Exception:
        # Best-effort contract: report failure via the sentinel string, but
        # keep the traceback so the cause is not lost.
        logging.exception('Readzip failed for %r', file_name)
        return 'Readzip Running Faild!!'
    return guokanzhiguang_list


def getBookList(letter_lst):
    """Parse HTML documents and collect their book result elements.

    Args:
        letter_lst: Iterable of HTML documents (``bytes`` or ``str``), such
            as the list returned by ``Readzip``.

    Returns:
        A list with one entry per document; each entry is the result of
        ``soup.find_all`` for ``div`` elements whose class is
        ``book-result-item-warp``. On failure, or when ``letter_lst`` is a
        bare string/bytes value (e.g. the failure message returned by
        ``Readzip``) rather than a collection of documents, the string
        ``'getBookList Running Faild!!'`` is returned and the problem is
        logged.
    """
    # A bare str/bytes would be iterated element by element, which is never
    # what the caller wants; in practice it is an upstream failure message.
    if isinstance(letter_lst, (str, bytes)):
        logging.error('getBookList expected a list of documents, got: %r', letter_lst)
        return 'getBookList Running Faild!!'
    try:
        tag = []
        for html in letter_lst:
            soup = BeautifulSoup(html, 'html.parser')
            tag.append(soup.find_all('div', attrs={'class': "book-result-item-warp"}))
        return tag
    except Exception:
        logging.exception('getBookList failed')
        return 'getBookList Running Faild!!'
# Matches "(...)", "{...}" and "[...]" groups so they can be stripped from titles.
_BRACKETED_RE = re.compile(r"\(.*?\)|\{.*?}|\[.*?]")


def _parse_book(bookinfo):
    """Extract the seven spreadsheet fields from one book result element.

    The numeric slices below drop a fixed number of leading characters from
    each field's text -- presumably a label prefix in the source HTML;
    verify against a sample page before changing them.
    """
    center = bookinfo.find('div', attrs={'class': 'center'})
    # Journal title, with any bracketed annotations removed.
    raw_title = center.find("div", attrs={"class": "title"}).get_text()
    title = _BRACKETED_RE.sub("", raw_title)
    allinfo = center.find_all('div', attrs={'class': "info"})
    # Country.
    country = allinfo[0].get_text()[4:]
    # Factor: text of the "ifs" and "diff" spans joined by a space.
    ifs = allinfo[1].find('span', class_='ifs').get_text()
    diff = allinfo[1].find('span', class_='diff').get_text()
    # Period.
    period = allinfo[2].get_text()[4:]
    # Ratio.
    ratio = allinfo[3].get_text()[6:]
    # Address: href of the first link in the fifth info block.
    addre = allinfo[4].find('a').get('href')
    # Self-citation rate.
    cited_rate = allinfo[5].get_text()[5:]
    return [title, country, ifs + " " + diff, period, ratio, addre, cited_rate]


def getBookElementInfo(letter_lst):
    """Build one row of field values per book found in the given documents.

    Args:
        letter_lst: HTML documents, passed straight through to
            ``getBookList``.

    Returns:
        A list of 7-item lists ordered as title, country, factor, period,
        ratio, address, self-citation rate. Returns ``''`` when
        ``getBookList`` yields nothing, and the string
        ``'getBookElementInfo Running Faild!!'`` when ``getBookList``
        reports failure or any record cannot be parsed (the cause is
        logged).
    """
    try:
        alllist = getBookList(letter_lst)
        # getBookList signals failure with a message string; stop here rather
        # than iterating that string as if it were a list of results.
        if isinstance(alllist, str):
            logging.error('getBookList reported failure: %s', alllist)
            return 'getBookElementInfo Running Faild!!'
        if not alllist:
            print('txt文件不存在或内容为空!!!')
            return ''
        print('文件个数:%d' % len(alllist))
        alldetialbookinfolst = []
        for lst in alllist:
            for bookinfo in lst:
                alldetialbookinfolst.append(_parse_book(bookinfo))
        return alldetialbookinfolst
    except Exception:
        # All-or-nothing: a single malformed record fails the whole batch,
        # but the traceback is kept in the log.
        logging.exception('getBookElementInfo failed')
        return 'getBookElementInfo Running Faild!!'


def Insert2Excel(bookinfo):
    """Write the book rows to ``gkbookinfolist.xlsx`` in the working directory.

    A header row is written first and each header column is set to width 15.

    Args:
        bookinfo: Iterable of row sequences as produced by
            ``getBookElementInfo``. An empty value produces a header-only
            workbook.

    Returns:
        ``'Insert Excel succcessfully!'`` once the workbook is saved,
        otherwise ``'Insert Excel failed!'`` (the cause is logged).
    """
    # A non-empty string is an upstream failure message, not row data.
    if isinstance(bookinfo, str) and bookinfo:
        logging.error('Insert2Excel received an error message instead of rows: %s', bookinfo)
        return 'Insert Excel failed!'
    try:
        tableTitle = ['杂志', '国家', '因子', '周期', '占比', '地址', '自引']
        wb = Workbook()
        ws = wb.active
        ws.title = 'gk_sheet'
        ws.append(tableTitle)
        work_name = 'gkbookinfolist.xlsx'
        # Only the header row exists at this point, so max_column equals the
        # number of header cells.
        for i in range(1, ws.max_column + 1):
            ws.column_dimensions[get_column_letter(i)].width = 15
        for info in bookinfo:
            ws.append(info)
        wb.save(work_name)
        return 'Insert Excel succcessfully!'
    except Exception:
        logging.exception('Insert2Excel failed')
        return 'Insert Excel failed!'


if __name__ == '__main__':
    # NOTE(review): `path` is not used by any code visible in this file.
    path = os.getcwd()
    letter_lst = Readzip('bookinfo.zip')
    bookinfo = getBookElementInfo(letter_lst)
    # Write the collected rows to the Excel workbook.
    print(Insert2Excel(bookinfo))