首先来看全部代码
import collections
# Sentinel characters wrapped around every poem so a model can tell where
# a poem begins and ends.
start_token = 'G'
end_token = 'E'


def process_poems(file_name):
    """Load a poem corpus and encode every poem as a list of integer IDs.

    Each line of the file is expected to look like ``title:content``.
    Lines that do not split into exactly two parts on ``':'`` are skipped,
    as are poems that contain annotation/markup characters or the sentinel
    tokens themselves, and poems whose content (after removing spaces) is
    shorter than 5 or longer than 79 characters.

    Args:
        file_name: Path of the UTF-8 encoded corpus file.

    Returns:
        A tuple ``(poems_vector, word_int_map, words)`` where

        * ``poems_vector`` is a list of poems, each a list of character
          IDs, ordered from the shortest poem to the longest;
        * ``word_int_map`` maps each character to its integer ID;
        * ``words`` is a tuple of all characters ordered by descending
          frequency, with a single space ``' '`` appended as the last
          entry.

        For a corpus without any usable poem the result is
        ``([], {' ': 0}, (' ',))``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Characters that disqualify a poem. Both the ASCII and the full-width
    # opening parenthesis are listed: the original condition tested the
    # ASCII one twice, which only makes sense if one of them was meant to
    # be the full-width variant.
    forbidden = ('_', '(', '(', '《', '[', start_token, end_token)

    poems = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(':')
            if len(parts) != 2:
                # Malformed line (no colon, or more than one): skip it on
                # purpose instead of aborting the whole load.
                continue
            content = parts[1].replace(' ', '')
            if any(mark in content for mark in forbidden):
                continue
            if not 5 <= len(content) <= 79:
                continue
            poems.append(start_token + content + end_token)

    # Sort by the length of each poem. The key must measure the element
    # being sorted, not the last line read from the file.
    poems.sort(key=len)

    # Count how often every character occurs across all kept poems.
    counter = collections.Counter()
    for poem in poems:
        counter.update(poem)

    # Most frequent characters first; the trailing space gets the last ID.
    words = tuple(word for word, _ in counter.most_common()) + (' ',)

    # Map every character to a numeric ID and encode the poems with it.
    word_int_map = {word: index for index, word in enumerate(words)}
    poems_vector = [[word_int_map[word] for word in poem] for poem in poems]
    return poems_vector, word_int_map, words
之后看一下数据集
最后来一点点分析
定义一个数据预处理函数:
def process_poems(file_name):
首先用一个list来保存处理好的结果:
poems = []
打开文件:首先指定好文件路径,然后以只读的方式打开;因为诗是中文的,所以编码方式指定为'utf-8':
with open(file_name, "r", encoding='utf-8', ) as f:
一行一行去读
for line in f.readlines():
用冒号将文本分割为诗的题目和内容:
title, content = line.strip().split(':')
如果训练数据集中古诗存在问题,应该舍弃该诗:
if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content or
start_token in content or end_token in content:
continue
if len(content) < 5 or len(content) > 79:
continue
对诗的内容进行处理,加上开始和结束符号,然后才能将诗的内容传进结果的list里:
content = start_token + content + end_token
poems.append(content)
对得到的结果list按每首诗的长度进行排序(注意:key函数应当使用lambda自己的参数,即len(l);写成len(line)时所有诗的排序键都相同,顺序实际上不会改变):
poems = sorted(poems, key=lambda l: len(line))
统计每个字出现的次数,两层循环,第一层是循环每一首诗,第二层是循环每首诗里的每一个字:
all_words = []
for poem in poems:
all_words += [word for word in poem]
计算词频:
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)
取前多少个常用字(这里words[:len(words)]实际上取了全部的字),并在末尾追加一个空格' ':
words = words[:len(words)] + (' ',)
每个字映射为一个数字ID:
word_int_map = dict(zip(words, range(len(words))))
poems_vector = [list(map(lambda word: word_int_map.get(word, len(words)), poem)) for poem in poems]
返回所需要的值:
return poems_vector, word_int_map, words