• Preprocessing code for Amazon review data (Positive & Negative)


    Preprocessing code for Amazon review data, used for sentiment analysis. The code is adapted from

    https://github.com/PaddlePaddle/Paddle/tree/develop/demo/quick_start/data

    The Amazon product review data is available at:

    http://jmcauley.ucsd.edu/data/amazon/
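
    Each line of the reviews_*_5.json.gz files is a single JSON record; preprocess.py only relies on the "overall" (star rating) and "reviewText" fields. A minimal sketch for peeking at one record (the file name is only an example):

    import gzip
    import json

    # read the first record of a 5-core review file
    g = gzip.open('reviews_Digital_Music_5.json.gz', 'r')
    record = json.loads(g.readline())
    g.close()
    print(record['overall'])      # star rating, e.g. 5.0
    print(record['reviewText'])   # raw review text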

    Bash script file

    get_data.sh:

    #!/bin/bash
    
    # 1. size of pos : neg = 1:1.
    # 2. size of testing set = min(25k, len(all_data) * 0.1); the rest is the training set.
    # 3. train set and test set are disjoint.
    
    set -e
    
    # Download data
    echo "Downloading Amazon Electronics reviews data..."
    # http://jmcauley.ucsd.edu/data/amazon/
    # wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
    # wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Digital_Music_5.json.gz
    echo "Downloading mosesdecoder..."
    # https://github.com/moses-smt/mosesdecoder
    # wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
    
    # unzip master.zip
    # rm master.zip
    
    ##################
    # Preprocess data 
    echo "Preprocess data..."
    export LC_ALL=C
    UNAME_STR=`uname`
    
    if [ "${UNAME_STR}" == 'Linux' ]; then
      SHUF_PROG='shuf'
    else
      # on macOS, 'gshuf' is provided by GNU coreutils (e.g. installed via Homebrew)
      SHUF_PROG='gshuf'
    fi
    
    mkdir -p tmp
    # python preprocess.py -i reviews_Electronics_5.json.gz
    python preprocess.py -i reviews_Digital_Music_5.json.gz
    # uniq and shuffle
    cd tmp
    echo 'Uniq and shuffle...'
    cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
    cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
    
    # the negative set is assumed to be the smaller class; its size bounds both classes so that pos:neg stays 1:1
    min_len=`sed -n '$=' neg.shuffed`
    echo "negative samples: ${min_len}"
    test_num=$((min_len/10))
    if [ ${test_num} -gt 12500 ]; then
      test_num=12500
    fi
    train_num=$((min_len-test_num))
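    # head/tail take the same number of lines from each class, so the split stays
    # balanced at pos:neg = 1:1; if pos.shuffed is longer than neg.shuffed, the
    # surplus positive samples in the middle of the file are simply dropped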
    
    head -n$train_num pos.shuffed >train.pos
    head -n$train_num neg.shuffed >train.neg
    tail -n$test_num pos.shuffed >test.pos
    tail -n$test_num neg.shuffed >test.neg
    
    cat train.pos train.neg | ${SHUF_PROG} >../train.txt
    cat test.pos test.neg | ${SHUF_PROG} >../test.txt
    
    cd -
    echo 'train.txt' > train.list
    echo 'test.txt' > test.list
    
    # keep only the 30k most frequent words ('unk' on the first line makes it 30001 lines)
    # rm -rf tmp
    mv dict.txt dict_all.txt
    head -n 30001 dict_all.txt > dict.txt
    echo 'Done.'
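
    After the script finishes, train.txt and test.txt hold one sample per line in the form "label<TAB>tokenized text" (1 = positive, 0 = negative), and dict.txt lists words by descending frequency with 'unk' on the first line. A minimal sketch, not part of the original demo (the helper names are made up), of how such a line could be mapped to word ids with this dictionary:

    # Illustrative helpers: build a word -> id mapping from dict.txt and encode one train.txt line.
    def load_dict(dict_path):
        word2id = {}
        with open(dict_path) as f:
            for i, line in enumerate(f):
                word2id[line.split('\t')[0]] = i   # 'unk' ends up with id 0
        return word2id

    def encode_line(line, word2id):
        label, text = line.rstrip('\n').split('\t', 1)
        ids = [word2id.get(w, word2id['unk']) for w in text.split()]
        return int(label), ids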
    

    Data processing file: preprocess.py:

    # -*- coding: UTF-8 -*-
    
    """
    1. Tokenize the words and punctuation.
    Usage:
        python preprocess.py -i data_file [-s random_seed]
    """
    
    import sys
    import os
    import operator
    import gzip
    from subprocess import Popen, PIPE
    from optparse import OptionParser
    import json
    from multiprocessing import Queue
    from multiprocessing import Pool
    import multiprocessing
    
    batch_size = 5000
    word_count = {}
    num_tokenize = max(1,
                       multiprocessing.cpu_count() - 2)  # parse + tokenize + save
    max_queue_size = 8
    parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
    tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
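    # Pipeline layout: parse_batch fills parse_queue, the tokenize_batch workers move
    # batches from parse_queue to tokenize_queue, and save_batch drains tokenize_queue.
    # A batch number of -1 is the end-of-stream sentinel passed from stage to stage.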
    
    
    def create_dict(data):
        """
        Update the global word count from the data; it is later written to dict.txt.
        The first line of dict.txt is 'unk\t-1'.
        data: list, one batch of input sequences.
        """
        for seq in data:
            try:
                for w in seq.lower().split():
                    if w not in word_count:
                        word_count[w] = 1
                    else:
                        word_count[w] += 1
            except:
                sys.stderr.write(seq + "\tERROR\n")
    
    
    def parse(path):
        """
        Open a .gz file and yield one parsed JSON record per line.
        """
        sys.stderr.write(path)
        g = gzip.open(path, 'r')
        for l in g:
            yield json.loads(l)
        g.close()
    
    
    def tokenize(sentences):
        """
        Use tokenizer.perl to tokenize input sentences.
        tokenizer.perl is the Moses tokenizer script.
        sentences : a list of input sentences.
        return: a list of processed text.
        """
        tokenizer_path = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
        if not os.path.exists(tokenizer_path):
            sys.exit(
                "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exist."
            )
        tokenizer_cmd = [tokenizer_path, '-l', 'en', '-q', '-']
        assert isinstance(sentences, list)
        text = "
    ".join(sentences)
        tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
        tok_text, _ = tokenizer.communicate(text)
        toks = tok_text.split('\n')[:-1]
        return toks
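    # For example, assuming mosesdecoder-master has been unpacked next to this script,
    # tokenize(["it's great!"]) should return something like ["it 's great !"].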
    
    
    def save_data(instance, data_dir, pre_fix, batch_num):
        """
        save data by batch
        """
        label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
        lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
        file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
        open(file_name, 'w').write('\n'.join(lines) + '\n')
    
    
    def tokenize_batch(id):
        """
        tokenize data by batch
        """
        while True:
            num_batch, instance, pre_fix = parse_queue.get()
            if num_batch == -1:  ### parse_queue finished
                tokenize_queue.put((-1, None, None))
                sys.stderr.write("Thread %s finish
    " % (id))
                break
            tokenize_instance = tokenize(instance)
            tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
            sys.stderr.write('.')
    
    
    def save_batch(data_dir, num_tokenize, data_dir_dict):
        """
        save data by batch
        build dict.txt
        """
        token_count = 0
        while True:
            num_batch, instance, pre_fix = tokenize_queue.get()
            if num_batch == -1:
                token_count += 1
                if token_count == num_tokenize:  #### tokenize finished.
                    break
                else:
                    continue
            save_data(instance, data_dir, pre_fix, num_batch)
            create_dict(instance)  ## update dict
    
        sys.stderr.write("save file finish
    ")
        f = open(data_dir_dict, 'w')
        f.write('%s\t%s\n' % ('unk', '-1'))
        for k, v in sorted(word_count.items(), key=operator.itemgetter(1), 
                           reverse=True):
            f.write('%s\t%s\n' % (k, v))
        f.close()
        sys.stderr.write("build dict finish
    ")
    
    
    def parse_batch(data, num_tokenize):
        """
        parse data by batch
        parse -> tokenize -> save
        """
        raw_txt = parse(data)
        neg, pos = [], []
        count = 0
        sys.stderr.write("extract raw data
    ")
        for l in raw_txt:
            rating = l["overall"]
            text = l["reviewText"].lower()  # # convert words to lower case
            if rating == 5.0 and text:
                pos.append(text)
            if rating < 3.0 and text:
                neg.append(text)
            if len(pos) == batch_size or len(neg) == batch_size:
                if len(pos) == batch_size:
                    batch = pos
                    pre_fix = 'pos'
                else:
                    batch = neg
                    pre_fix = 'neg'
    
                parse_queue.put((count, batch, pre_fix))
                count += 1
                if pre_fix == 'pos':
                    pos = []
                else:
                    neg = []
    
        if len(pos) > 0:
            parse_queue.put((count, pos, 'pos'))
            count += 1
        if len(neg) > 0:
            parse_queue.put((count, neg, 'neg'))
            count += 1
        for i in range(num_tokenize):
            parse_queue.put((-1, None, None))  # signal each tokenize worker that the input is finished
        sys.stderr.write("parsing finish
    ")
    
    
    def option_parser():
        parser = OptionParser(usage="usage: python preprocess.py "
                                    "-i data_path [options]")
        parser.add_option(
            "-i", "--data", action="store", dest="input", help="Input data path.")
        parser.add_option(
            "-s",
            "--seed",
            action="store",
            dest="seed",
            default=1024,
            help="Set random seed.")
        return parser.parse_args()
    
    
    def main():
        reload(sys)  # Python 2 only: allow resetting the default encoding
        sys.setdefaultencoding('utf-8')
        options, args = option_parser()
        data = options.input
        seed = options.seed
        data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
        data_dir = os.path.join(os.path.dirname(data), 'tmp')
        pool = Pool(processes=num_tokenize + 2)
        pool.apply_async(parse_batch, args=(data, num_tokenize))
        for i in range(num_tokenize):
            pool.apply_async(tokenize_batch, args=(str(i), ))
        pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
        pool.close()
        pool.join()
    
        open(os.path.join(os.path.dirname(data), 'labels.list'),
             'w').write('neg\t0\npos\t1\n')
    
    
    if __name__ == '__main__':
        main()
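
    Running get_data.sh end to end (with the review .json.gz file and the unpacked mosesdecoder-master directory sitting next to the scripts) should leave train.txt, test.txt, train.list, test.list, dict_all.txt, dict.txt and labels.list in the working directory, plus the per-batch files under tmp/.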
    
  • Original post: https://www.cnblogs.com/huadongw/p/6165119.html