Preprocessing code for Amazon review data, used for sentiment analysis. The code is adapted from
https://github.com/PaddlePaddle/Paddle/tree/develop/demo/quick_start/data
The Amazon product review data is available at:
http://jmcauley.ucsd.edu/data/amazon/
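
For orientation: each line of the downloaded .gz file is one JSON object whose "overall" field is the star rating and whose "reviewText" field is the review body. Below is a minimal sketch for peeking at a few raw records, assuming reviews_Digital_Music_5.json.gz has already been downloaded into the working directory:

# -*- coding: UTF-8 -*-
# Minimal sketch: peek at a few raw Amazon review records.
# Assumes reviews_Digital_Music_5.json.gz is in the current directory
# (downloaded from http://jmcauley.ucsd.edu/data/amazon/).
import gzip
import json


def peek(path, limit=3):
    """Print the rating and the first 80 characters of the first few reviews."""
    with gzip.open(path, 'rb') as g:
        for i, line in enumerate(g):
            if i >= limit:
                break
            record = json.loads(line)
            print('%s\t%s' % (record['overall'], record['reviewText'][:80]))


if __name__ == '__main__':
    peek('reviews_Digital_Music_5.json.gz')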
Bash script file
get_data.sh:
#!/bin/bash
# 1. size of pos : neg = 1:1.
# 2. size of testing set = min(25k, len(all_data) * 0.1); the rest is the training set.
# 3. train set and test set are disjoint.
set -e

# Download data
echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
# wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
# wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Digital_Music_5.json.gz

echo "Downloading mosesdecoder..."
# https://github.com/moses-smt/mosesdecoder
# wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
# unzip master.zip
# rm master.zip

##################
# Preprocess data
echo "Preprocess data..."
export LC_ALL=C
UNAME_STR=`uname`

if [ ${UNAME_STR} == 'Linux' ]; then
  SHUF_PROG='shuf'
else
  SHUF_PROG='gshuf'
fi

mkdir -p tmp
# python preprocess.py -i reviews_Electronics_5.json.gz
python preprocess.py -i reviews_Digital_Music_5.json.gz

# uniq and shuffle
cd tmp
echo 'Uniq and shuffle...'
cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed

min_len=`sed -n '$=' neg.shuffed`
echo `sed -n '$=' neg.shuffed`
test_num=$((min_len/10))
if [ $test_num -gt 12500 ];then
  test_num=12500
fi
train_num=$((min_len-test_num))

head -n$train_num pos.shuffed >train.pos
head -n$train_num neg.shuffed >train.neg
tail -n$test_num pos.shuffed >test.pos
tail -n$test_num neg.shuffed >test.neg

cat train.pos train.neg | ${SHUF_PROG} >../train.txt
cat test.pos test.neg | ${SHUF_PROG} >../test.txt

cd -
echo 'train.txt' > train.list
echo 'test.txt' > test.list
# use 30k dict
# rm -rf tmp
mv dict.txt dict_all.txt
cat dict_all.txt | head -n 30001 > dict.txt

echo 'Done.'
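
After the script finishes, train.txt and test.txt contain one "label<TAB>tokenized review" per line, dict.txt keeps the 30k most frequent words (one "word<TAB>count" per line, with "unk -1" as the first line), and labels.list records the label names. The following is a small, hypothetical loader sketch for these files; the function names load_dict and load_dataset are illustrative and not part of the original demo:

# -*- coding: UTF-8 -*-
# Hypothetical loader for the files produced by get_data.sh.
# Format assumptions (taken from the scripts in this section):
#   dict.txt           : one "word<TAB>count" per line, first line is "unk<TAB>-1"
#   train.txt/test.txt : one "label<TAB>tokenized review" per line


def load_dict(dict_path):
    """Map each word to its line index; index 0 is the 'unk' token."""
    word_to_id = {}
    with open(dict_path) as f:
        for idx, line in enumerate(f):
            word = line.split('\t')[0]
            word_to_id[word] = idx
    return word_to_id


def load_dataset(data_path, word_to_id):
    """Yield (label, word-id list) pairs; unknown words map to 'unk' (id 0)."""
    unk_id = word_to_id.get('unk', 0)
    with open(data_path) as f:
        for line in f:
            label, _, text = line.rstrip('\n').partition('\t')
            ids = [word_to_id.get(w, unk_id) for w in text.split()]
            yield int(label), ids


if __name__ == '__main__':
    vocab = load_dict('dict.txt')
    for label, ids in load_dataset('train.txt', vocab):
        print(label, ids[:10])
        break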
Data processing file preprocess.py:
# -*- coding: UTF-8 -*-
"""
1. Tokenize the words and punctuation
Usage:
    python preprocess.py -i data_file [random seed]
"""

import sys
import os
import operator
import gzip
from subprocess import Popen, PIPE
from optparse import OptionParser
import json
from multiprocessing import Queue
from multiprocessing import Pool
import multiprocessing

batch_size = 5000
word_count = {}
num_tokenize = max(1, multiprocessing.cpu_count() - 2)  # parse + tokenize + save
max_queue_size = 8
parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)


def create_dict(data):
    """
    Create dictionary based on data, and save it in data_dir/dict.txt.
    The first line is "unk\t-1".
    data: list, input data by batch.
    """
    for seq in data:
        try:
            for w in seq.lower().split():
                if w not in word_count:
                    word_count[w] = 1
                else:
                    word_count[w] += 1
        except:
            sys.stderr.write(seq + "\tERROR\n")


def parse(path):
    """
    Open .gz file.
    """
    sys.stderr.write(path)
    g = gzip.open(path, 'r')
    for l in g:
        yield json.loads(l)
    g.close()


def tokenize(sentences):
    """
    Use tokenizer.perl to tokenize input sentences.
    tokenizer.perl is a tool of Moses.
    sentences : a list of input sentences.
    return: a list of processed text.
    """
    dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
    if not os.path.exists(dir):
        sys.exit(
            "./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exist."
        )
    tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
    assert isinstance(sentences, list)
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    toks = tok_text.split('\n')[:-1]
    return toks


def save_data(instance, data_dir, pre_fix, batch_num):
    """
    save data by batch
    """
    label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
    lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
    file(file_name, 'w').write('\n'.join(lines) + '\n')


def tokenize_batch(id):
    """
    tokenize data by batch
    """
    while True:
        num_batch, instance, pre_fix = parse_queue.get()
        if num_batch == -1:  ### parse_queue finished
            tokenize_queue.put((-1, None, None))
            sys.stderr.write("\nThread %s finished\n" % (id))
            break
        tokenize_instance = tokenize(instance)
        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
        sys.stderr.write('.')


def save_batch(data_dir, num_tokenize, data_dir_dict):
    """
    save data by batch
    build dict.txt
    """
    token_count = 0
    while True:
        num_batch, instance, pre_fix = tokenize_queue.get()
        if num_batch == -1:
            token_count += 1
            if token_count == num_tokenize:  #### tokenize finished.
                break
            else:
                continue
        save_data(instance, data_dir, pre_fix, num_batch)
        create_dict(instance)  ## update dict

    sys.stderr.write("save file finished\n")
    f = open(data_dir_dict, 'w')
    f.write('%s\t%s\n' % ('unk', '-1'))
    for k, v in sorted(
            word_count.items(), key=operator.itemgetter(1), reverse=True):
        f.write('%s\t%s\n' % (k, v))
    f.close()
    sys.stderr.write("build dict finished\n")


def parse_batch(data, num_tokenize):
    """
    parse data by batch
    parse -> tokenize -> save
    """
    raw_txt = parse(data)
    neg, pos = [], []
    count = 0
    sys.stderr.write("extract raw data\n")
    for l in raw_txt:
        rating = l["overall"]
        text = l["reviewText"].lower()  # convert words to lower case
        if rating == 5.0 and text:
            pos.append(text)
        if rating < 3.0 and text:
            neg.append(text)
        if len(pos) == batch_size or len(neg) == batch_size:
            if len(pos) == batch_size:
                batch = pos
                pre_fix = 'pos'
            else:
                batch = neg
                pre_fix = 'neg'

            parse_queue.put((count, batch, pre_fix))
            count += 1
            if pre_fix == 'pos':
                pos = []
            else:
                neg = []

    if len(pos) > 0:
        parse_queue.put((count, pos, 'pos'))
        count += 1
    if len(neg) > 0:
        parse_queue.put((count, neg, 'neg'))
        count += 1
    for i in range(num_tokenize):
        parse_queue.put((-1, None, None))  #### signal tokenize workers that input is finished
    sys.stderr.write("parsing finished\n")


def option_parser():
    parser = OptionParser(usage="usage: python preprocess.py "
                          "-i data_path [options]")
    parser.add_option(
        "-i", "--data", action="store", dest="input", help="Input data path.")
    parser.add_option(
        "-s",
        "--seed",
        action="store",
        dest="seed",
        default=1024,
        help="Set random seed.")
    return parser.parse_args()


def main():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    options, args = option_parser()
    data = options.input
    seed = options.seed
    data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
    data_dir = os.path.join(os.path.dirname(data), 'tmp')
    pool = Pool(processes=num_tokenize + 2)
    pool.apply_async(parse_batch, args=(data, num_tokenize))
    for i in range(num_tokenize):
        pool.apply_async(tokenize_batch, args=(str(i), ))
    pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
    pool.close()
    pool.join()

    file(os.path.join(os.path.dirname(data), 'labels.list'),
         'w').write('neg\t0\npos\t1\n')


if __name__ == '__main__':
    main()
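
Note that preprocess.py targets Python 2 (file(), reload(sys), and str-based pipes are Python 2 constructs). Structurally it is a producer/consumer pipeline: parse_batch fills parse_queue, num_tokenize worker processes run tokenize_batch, and save_batch drains tokenize_queue, with (-1, None, None) sentinels signalling that a stage has finished. Below is a stripped-down sketch of that queue-and-sentinel pattern with placeholder work, for readers unfamiliar with the idiom; the names and the dummy "work" are illustrative, not part of the original script:

# -*- coding: UTF-8 -*-
# Stripped-down sketch of the producer / worker / consumer pattern used in
# preprocess.py: a sentinel (-1, None) tells each stage its input is finished.
from multiprocessing import Process, Queue

NUM_WORKERS = 2


def producer(in_queue):
    for i in range(10):
        in_queue.put((i, 'item-%d' % i))
    for _ in range(NUM_WORKERS):
        in_queue.put((-1, None))             # one sentinel per worker


def worker(in_queue, out_queue):
    while True:
        idx, item = in_queue.get()
        if idx == -1:                        # producer finished
            out_queue.put((-1, None))
            break
        out_queue.put((idx, item.upper()))   # placeholder "work"


def consumer(out_queue):
    done = 0
    while done < NUM_WORKERS:
        idx, item = out_queue.get()
        if idx == -1:                        # one worker finished
            done += 1
            continue
        print(idx, item)


if __name__ == '__main__':
    in_q, out_q = Queue(), Queue()
    procs = [Process(target=producer, args=(in_q, ))]
    procs += [Process(target=worker, args=(in_q, out_q))
              for _ in range(NUM_WORKERS)]
    procs += [Process(target=consumer, args=(out_q, ))]
    for p in procs:
        p.start()
    for p in procs:
        p.join()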