• 学习数据分析day01


    数据分析的常用库很多,为了方便,重新安装了 Anaconda 版本的 Python

    用 PyCharm + Anaconda 可以节约大量时间

    参考了《数据分析与挖掘实践》,是比较少有的使用 Python 3 的参考资料

    先对几个常用库进行了测试

    numpy,matplotlib

    import matplotlib.pyplot as plt
    import numpy as np
    # Evaluate two example signals on 1000 evenly spaced points over [0, 10].
    x = np.linspace(0, 10, num=1000)
    y = 1 + np.sin(x)
    z = 1 + np.cos(x ** 2)
    # Draw both signals on one 8x4-inch figure and display it.
    plt.figure(figsize=(8, 4))
    # Raw strings keep the backslashes so mathtext typesets `sin`/`cos` as
    # upright function names; without them the legend showed "sinx"/"cosx"
    # as a product of italic variables.
    plt.plot(x, y, label=r'$\sin x+1$', color='red', linewidth=2)
    plt.plot(x, z, 'b--', label=r'$\cos x^2+1$')
    plt.xlabel('Time(s)')
    plt.ylabel('Volt')
    plt.title('A Simple Example')
    # Both curves are a sine/cosine shifted up by 1, so they stay within
    # [0, 2]; the upper limit leaves a little headroom above them.
    plt.ylim(0, 2.2)
    plt.legend()
    plt.show()

    pandas

    import pandas as pd
    
    # A labelled Series, a 2x3 DataFrame with named columns, and a DataFrame
    # built directly from the Series.
    s = pd.Series(data=[1, 2, 3], index=list('abc'))
    d = pd.DataFrame(
        data=[[1, 2, 3], [4, 5, 6]],
        columns=list('abc'),
    )
    d2 = pd.DataFrame(data=s)

    # These two calls only display something in an interactive session;
    # when run as a script their return values are discarded.
    d.head()
    d.describe()

    # Print the summary statistics of `d`.
    print(d.describe())

    statsmodels

    from statsmodels.tsa.stattools import adfuller as ADF
    import numpy as np
    # Run the adfuller test (imported above as ADF) on 100 random samples
    # and print the raw result it returns.
    noise = np.random.rand(100)
    a = ADF(noise)
    print(a)

    scikit-learn

    from sklearn import datasets, svm
    from sklearn.linear_model import LinearRegression

    # Instantiate an (unfitted) linear regression model and show its repr.
    model = LinearRegression()
    print(model)

    # Load the bundled iris dataset and report the shape of its feature matrix.
    iris = datasets.load_iris()
    print(iris.data.shape)

    # Fit a linear SVM classifier on the full dataset.
    clf = svm.LinearSVC().fit(iris.data, iris.target)
    # Predict the class of one new 4-feature sample; the result is discarded
    # here, so it is only visible in an interactive session.
    clf.predict([[5.0, 3.6, 1.3, 0.25]])
    # Show the learned coefficients.
    print(clf.coef_)

    keras

    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation
    from keras.optimizers import SGD
    
    import numpy as np

    # The snippet never defined its training/evaluation arrays, so running it
    # raised NameError. Random stand-in data makes it runnable end to end:
    # 20 features per sample (matches input_dim=20 below) and one-hot labels
    # over 10 classes (matches the final Dense(10) + softmax layer).
    X_train = np.random.random((1000, 20))
    y_train = np.eye(10)[np.random.randint(10, size=1000)]
    X_test = np.random.random((100, 20))
    y_test = np.eye(10)[np.random.randint(10, size=100)]

    model = Sequential()
    # Dense(64) is a fully-connected layer with 64 hidden units.
    # In the first layer, you must specify the expected input data shape:
    # here, 20-dimensional vectors.
    # `kernel_initializer` is the Keras 2 name of the Keras 1 `init` argument.
    model.add(Dense(64, input_dim=20, kernel_initializer='uniform'))
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(64, kernel_initializer='uniform'))
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(10, kernel_initializer='uniform'))
    model.add(Activation('softmax'))

    # NOTE(review): newer Keras releases spell `lr` as `learning_rate` and may
    # reject `decay` — confirm against the installed Keras version.
    sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    # `epochs` is the Keras 2 name of the Keras 1 `nb_epoch` argument.
    model.fit(X_train, y_train,
              epochs=20,
              batch_size=16)
    score = model.evaluate(X_test, y_test, batch_size=16)

    gensim

    import hashlib
    import logging

    import gensim
    import pymongo

    # Configure root logging at INFO level with timestamped records.
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )

    # Handle to the `text_articles_words` collection of the `weixin` database
    # on the MongoDB host 172.16.0.101.
    db = (
        pymongo.MongoClient('172.16.0.101')
        .weixin
        .text_articles_words
    )
    def md5(s):
        """Return the hexadecimal MD5 digest of the bytes-like object ``s``."""
        return hashlib.md5(s).hexdigest()
    class sentences:
        """Iterable over the `words` field of the documents in `db`.

        Every call to ``iter()`` starts a fresh query, so an instance can be
        iterated more than once. Within one pass, a document is skipped when
        the MD5 digest of its UTF-8 encoded `text` field has already been
        seen in that pass.
        """

        def __iter__(self):
            texts_set = set()  # digests of the texts already yielded this pass
            cursor = db.find(no_cursor_timeout=True)
            try:
                for a in cursor:
                    # Hash once and reuse it for both the lookup and the insert.
                    digest = md5(a['text'].encode('utf-8'))
                    if digest in texts_set:
                        continue
                    texts_set.add(digest)
                    yield a['words']
            finally:
                # A no_cursor_timeout cursor is not reaped by the server, so
                # release it explicitly, including when the consumer stops
                # iterating early or an error occurs.
                cursor.close()
            # Reached only after a complete pass over the collection.
            print (u'最终计算了%s篇文章'%len(texts_set))
    
    # Train a Word2Vec model on the article stream produced by `sentences()`
    # and save it under the relative path 'word2vec_wx'.
    # NOTE(review): `size` and `iter` are believed to be renamed `vector_size`
    # and `epochs` in gensim 4 — confirm against the installed version.
    word2vec = gensim.models.word2vec.Word2Vec(
        sentences(),
        size=256,
        window=10,
        min_count=64,
        sg=1,
        hs=1,
        iter=10,
        workers=25,
    )
    word2vec.save('word2vec_wx')

    基础测试,慢慢开始吧

  • 相关阅读:
    4、现有的命名方式有多少种?请举例说明。
    第二次作业
    第一次作业
    RateLimiter源码
    使用ASM字节码框架实现动态代理
    Java流机制学习
    Java8 Stream 学习总结
    XML实体解析器的作用
    DefaultResouceLoader的设计
    RSA 非对称加密 数字签名 数字证书
  • 原文地址:https://www.cnblogs.com/zsc329/p/9280466.html
Copyright © 2020-2023  润新知