"""Build a text corpus from a directory tree and tokenize it with jieba.

Produces two module-level DataFrames:

* ``corpos``   -- one row per file, columns ``filePath`` and ``content``.
* ``segments`` -- one row per token, columns ``filePath`` and ``segment``.

Both frames use 1-based integer index labels.
"""
import codecs
import os
import os.path

import jieba
import numpy
import pandas

# Raw string so backslashes in the Windows path are taken literally
# (a plain literal would treat "\2" as an octal escape, and a trailing
# backslash would escape the closing quote).
CORPUS_DIR = r"D:\PDM\2.1\SogouC.mini\Sample\C000007"

# Build the corpus: read every file under CORPUS_DIR as UTF-8 text.
# Rows are gathered in a list and turned into a DataFrame once, because
# growing a DataFrame row by row copies it on every insertion.
corpus_rows = []
for root, dirs, files in os.walk(CORPUS_DIR):
    for name in files:
        filePath = os.path.join(root, name)
        # The context manager closes the file even if read() raises.
        with codecs.open(filePath, 'r', 'utf-8') as f:
            content = f.read()
        corpus_rows.append([filePath, content.strip()])

corpos = pandas.DataFrame(corpus_rows, columns=['filePath', 'content'])
# Keep index labels starting at 1 rather than the default 0.
corpos.index = range(1, len(corpos) + 1)

# Perform word segmentation: one output row per token, tagged with the
# path of the file the token came from. Path and content are iterated
# together so each token is attributed to its own file.
segment_rows = []
for filePath, content in zip(corpos['filePath'], corpos['content']):
    for seg in jieba.cut(content):
        segment_rows.append([filePath, seg])

segments = pandas.DataFrame(segment_rows, columns=['filePath', 'segment'])
segments.index = range(1, len(segments) + 1)