• 代码


    #!/usr/bin/python
    # -*- coding:utf-8 -*-
    
    import pandas as pd
    import numpy as np
    import matplotlib as mpl
    import math
    import warnings
    import gc
    from gensim import corpora, models, similarities
    from sklearn.preprocessing import LabelEncoder
    import datetime as dt
    from pandas.tseries.offsets import Day,MonthEnd,MonthBegin
    from multiprocessing import Pool
    from dask import dataframe as dd
    from dask.multiprocessing import get
    from multiprocessing import cpu_count
    import  jieba
    
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['font.serif'] = ['SimHei']
    warnings.filterwarnings("ignore")
    
    
    
    def getlda(doc_topics, x, num_show_topic, col):
        """Return the topic distribution of one document as a one-row DataFrame.

        :param doc_topics: indexable collection; ``doc_topics[x]`` must be an
            iterable of ``(topic_id, probability)`` pairs. Topics that are not
            listed for a document are treated as probability 0.
        :param x: position of the document inside ``doc_topics``
        :param num_show_topic: number of topic columns to produce
        :param col: prefix used for the generated column names
        :return: DataFrame with a single row (index 0), float32 values and
            columns ``<col>lda0`` .. ``<col>lda<num_show_topic - 1>``
        """
        # Start from an all-zero vector so topics missing from the document
        # (including the "no topic at all" case) need no special handling.
        probabilities = np.zeros(num_show_topic, dtype=np.float32)
        for topic_id, probability in doc_topics[x]:
            topic_id = int(topic_id)
            # Ids outside the requested range are skipped; only the first
            # num_show_topic topics are reported.
            if 0 <= topic_id < num_show_topic:
                probabilities[topic_id] = probability
        columns = [col + 'lda' + str(i) for i in range(num_show_topic)]
        # A 2-D array keeps the float32 dtype and yields exactly one row.
        return pd.DataFrame(probabilities.reshape(1, -1), columns=columns)
    
    # Load the raw tables; the CSV files are read without a header row,
    # so the column names are assigned explicitly afterwards.
    test = pd.read_csv('../data/age_test.csv', header=None)
    test.columns = ['uid']
    train = pd.read_csv('../data/age_train.csv', header=None)
    train.columns = ['uid', 'label']
    app_actived = pd.read_csv('../data/app_actived.csv', header=None)
    app_actived.columns = ['uid', 'appid']
    for _table in (app_actived, train, test):
        print(_table.shape)
    # Split app_actived into a test part and a train part: a left join keeps
    # every uid of test/train even when it has no row in app_actived.
    test_actived = test.merge(app_actived, on='uid', how='left')
    train_actived = train.merge(app_actived, on='uid', how='left')
    for _table in (test_actived, train_actived):
        print(_table.shape)
    
    #
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    # Preprocess the user_taglist table.
    user_taglist = pd.read_csv('/home/sxtj/han/PPAI/data/user_taglist.csv', parse_dates=['insertdate'], )
    print(user_taglist.shape)
    # TF-IDF derives one weight per feature from the whole data set,
    # so this step leaks information (data leakage)!
    columstfidf = ['taglist']

    def fundic(x):
        """Split a '|'-separated tag string into a list of tags."""
        return x.split('|')

    print('processing taglist')
    for index, item in enumerate(columstfidf):
        # Turn the column into documents: one list of tags per row.
        testdata = list(user_taglist[item].map(fundic))
        user_taglist.drop(item, axis=1, inplace=True)
        dictionary = corpora.Dictionary(testdata)
        corpus = list(map(dictionary.doc2bow, testdata))
        # Re-weight the bag-of-words corpus with TF-IDF.
        corpus_tfidf = models.TfidfModel(corpus)[corpus]
        # weight = corpus_tfidf.obj.idfs
        lda = models.LdaMulticore(
            corpus_tfidf,
            num_topics=100,
            id2word=dictionary,
            chunksize=2000,
            passes=1,
            random_state=0,
            minimum_probability=0.005,
            workers=11,
        )
        # lda.save('./model/' + item + '_ldanew.model')  # kept for the test set
        doc_topics = lda.get_document_topics(corpus_tfidf)
        # Build one row of topic probabilities per document (100 topic columns).
        print('num_topicsuese……')
        dfjoin = pd.concat(
            [getlda(doc_topics, row_id, 100, item) for row_id in user_taglist.index],
            ignore_index=True,
        )

    print(dfjoin.head())
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
        # print(dfjoin.head(5))
        # print('saving taglist')
        # # dfjoin.to_hdf('dfjoin.h5','dfjoin')
        # del testdata,dictionary,corpus,corpus_tfidf,doc_topics
        # gc.collect()
        # user_taglistfe = user_taglist.join(dfjoin)
        # del dfjoin,user_taglist
        # gc.collect()
  • 相关阅读:
    使用XWAF框架(1)——Web项目的代码分层
    Web框架——XWAF的代码结构和运行机制(4)
    理解Web应用程序的代码结构和运行原理(3)
    获取并安装XWAF框架压缩包(2)
    Angular7教程-06-页面与数据交互
    Angular7教程-05-搭建项目环境
    Angular7教程-04-Angular常用操作(下)
    Angular7教程-03-Angular常用操作(上)
    Angular7教程-02-Angular项目目录及基本文件说明
    Angular7教程-01-Angular开发环境配置
  • 原文地址:https://www.cnblogs.com/xxswkl/p/10949010.html
Copyright © 2020-2023  润新知