There are many commonly used libraries for data analysis, so for convenience I reinstalled Python via the Anaconda distribution.
Using PyCharm + Anaconda saves a lot of time.
I referred to the book 数据分析与挖掘实践 (Data Analysis and Mining in Practice), one of the relatively few that uses Python 3.
First, I tested a few of the common libraries.
numpy, matplotlib
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 1000)   # 1000 points on [0, 10]
y = np.sin(x) + 1
z = np.cos(x**2) + 1

plt.figure(figsize=(8, 4))
plt.plot(x, y, label=r'$\sin x+1$', color='red', linewidth=2)
plt.plot(x, z, 'b--', label=r'$\cos x^2+1$')   # blue dashed line
plt.xlabel('Time(s)')
plt.ylabel('Volt')
plt.title('A Simple Example')
plt.ylim(0, 2.2)
plt.legend()
plt.show()
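Since the header also names numpy, a few core array operations are worth exercising on their own; a minimal sketch of my own additions (not from the book):

import numpy as np

a = np.arange(6).reshape(2, 3)   # [[0, 1, 2], [3, 4, 5]]
print(a.sum(axis=0))             # column sums -> [3 5 7]
print(a * 2)                     # elementwise broadcasting
print(np.dot(a, a.T))            # 2x2 matrix product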
pandas
import pandas as pd

s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])   # labelled 1-D array
d = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])
d2 = pd.DataFrame(s)        # a Series becomes a one-column DataFrame
d.head()                    # first rows
print(d.describe())         # summary statistics per column
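Beyond construction, indexing is worth a quick check too; a minimal sketch of my own additions:

import pandas as pd

d = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])
print(d.loc[0, 'b'])        # label-based access -> 2
print(d[d['a'] > 1])        # boolean filter: rows where column 'a' > 1
d['d'] = d['a'] + d['c']    # derived column from existing ones
print(d)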
statsmodels
from statsmodels.tsa.stattools import adfuller as ADF
import numpy as np

# ADF unit-root test on white noise; a stationary series should give a small p-value
a = ADF(np.random.rand(100))
print(a)
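The raw tuple is easier to read unpacked; the return order below follows the statsmodels documentation for adfuller (the interpretation comment is my own):

from statsmodels.tsa.stattools import adfuller as ADF
import numpy as np

stat, pvalue, usedlag, nobs, crit, icbest = ADF(np.random.rand(100))
print('ADF statistic: %s' % stat)
print('p-value: %s' % pvalue)
print('critical values:', crit)   # thresholds at the 1%, 5% and 10% levels
# A small p-value rejects the unit-root hypothesis, i.e. the series looks stationary.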
scikit-learn
from sklearn.linear_model import LinearRegression

model = LinearRegression()
print(model)   # printing the estimator shows its default parameters
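Printing the estimator only confirms the import; a toy fit/predict round trip (my own sketch, with made-up numbers) checks that it actually runs:

import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1], [2], [3], [4]])
y = np.array([2.0, 4.1, 5.9, 8.2])     # roughly y = 2x
model = LinearRegression().fit(X, y)
print(model.coef_, model.intercept_)   # slope near 2, intercept near 0
print(model.predict([[5]]))            # prediction near 10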
from sklearn import datasets, svm

iris = datasets.load_iris()
print(iris.data.shape)   # (150, 4): 150 samples, 4 features

clf = svm.LinearSVC()    # linear support vector classifier
clf.fit(iris.data, iris.target)
print(clf.predict([[5.0, 3.6, 1.3, 0.25]]))   # classify one new sample
print(clf.coef_)
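fit() succeeding on the training data says nothing about generalization; a sketch of my own adds a train/test split and a held-out accuracy (train_test_split lives in sklearn.model_selection in recent versions, sklearn.cross_validation in older ones):

from sklearn import datasets, svm
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=0)
clf = svm.LinearSVC().fit(X_train, y_train)
print(clf.score(X_test, y_test))   # accuracy on the held-out 30%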
keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

# Placeholder data so the script actually runs: 20-dimensional inputs,
# 10 one-hot-encoded classes (the original snippet left these undefined).
X_train = np.random.random((1000, 20))
y_train = np.eye(10)[np.random.randint(0, 10, 1000)]
X_test = np.random.random((100, 20))
y_test = np.eye(10)[np.random.randint(0, 10, 100)]

model = Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# In the first layer, you must specify the expected input data shape:
# here, 20-dimensional vectors.
model.add(Dense(64, input_dim=20, init='uniform'))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(64, init='uniform'))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(10, init='uniform'))
model.add(Activation('softmax'))

sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

model.fit(X_train, y_train, nb_epoch=20, batch_size=16)
score = model.evaluate(X_test, y_test, batch_size=16)
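Note that this snippet uses Keras 1.x argument names. If a Keras 2.x install rejects them, the renamed equivalents are as below (a sketch of my own, showing only the calls that change):

model.add(Dense(64, input_dim=20, kernel_initializer='uniform'))  # init -> kernel_initializer
model.fit(X_train, y_train, epochs=20, batch_size=16)             # nb_epoch -> epochs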
gensim
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pymongo
import hashlib

# Word corpus lives in MongoDB; deduplicate articles by the MD5 of their text.
db = pymongo.MongoClient('172.16.0.101').weixin.text_articles_words
md5 = lambda s: hashlib.md5(s).hexdigest()

class sentences:
    def __iter__(self):
        texts_set = set()
        for a in db.find(no_cursor_timeout=True):
            if md5(a['text'].encode('utf-8')) in texts_set:
                continue   # skip duplicate articles
            else:
                texts_set.add(md5(a['text'].encode('utf-8')))
                yield a['words']
        print(u'Processed %s articles in total' % len(texts_set))

word2vec = gensim.models.word2vec.Word2Vec(
    sentences(), size=256, window=10, min_count=64,
    sg=1, hs=1, iter=10, workers=25)
word2vec.save('word2vec_wx')
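Once trained and saved, the model can be reloaded and queried. A minimal sketch of my own (gensim < 4.0 API, to match the size=/iter= arguments above; the query words are placeholders that must exist in the training vocabulary):

import gensim

word2vec = gensim.models.word2vec.Word2Vec.load('word2vec_wx')
print(word2vec.most_similar(u'微信'))            # nearest neighbours of a word
print(word2vec.similarity(u'微信', u'公众号'))   # cosine similarity of two words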
That covers the basic tests; now to get started, step by step.