提纲
numpy:
# --- numpy notes ------------------------------------------------------------
# Walk-through of basic numpy usage: array creation, boolean masks, axis
# reductions, element-wise vs. matrix products, reshaping, aliasing vs. view
# vs. copy, argmax/argsort and tiling.
import numpy as np

# genfromtxt: load a delimited text file into an array.
# genfromtxtdata = np.genfromtxt("genfromtxtdata")
# print(help(np.genfromtxt))

# A matrix is built from a list of lists.
matrix = np.array([[12, 12], [12, 12], [1, 13]])
print(matrix)

# Mixed element types are coerced to one common dtype (float here, because
# of the 4.0).
dataa = np.array([1, 2, 4.0, 1])

# Slicing / comparisons
# datab = dataa
# A comparison returns an array of True/False values:
# booldata = (datab == 1)
# Indexing with that mask extracts the elements equal to the value:
# print(datab[booldata])

# Select the rows whose second column equals 13.
boolmatrix = (matrix[:, 1] == 13)
print(matrix[boolmatrix, :])
# Masks can be combined with & (and) and | (or).

# Type conversion (astype returns a new array; dataa itself is unchanged).
dataa.astype(float)
# Extreme values
dataa.min()
# Sum along an axis (axis=1 sums across each row).
matrix.sum(axis=1)

# np.zeros((3, 4))                -> the shape is passed as a tuple
# np.arange(12).reshape(3, 4)     -> element count must equal 3 * 4
# np.random.random()              -> lives in the np.random module; samples from [0.0, 1.0)
# np.linspace(0, 2 * np.pi, 100)  -> 100 evenly spaced values
# np.exp()
# Subtraction: arrays of the same shape subtract element-wise; array - scalar
# subtracts the scalar from every element.
A = np.array([[1, 2], [1, 1]])
B = np.array([[1, 2], [1, 1]])
print(A * B)         # element-wise product
print(A.dot(B))      # matrix product
print(np.dot(A, B))  # matrix product, function form

# Matrix manipulation
# floor rounds down
a = np.floor(10 * np.random.random((3, 4)))
b = np.floor(10 * np.random.random((3, 4)))
# Flatten the matrix into a vector.
print(a)
print(a.ravel())
# Concatenation
# print(np.hstack((a, b)))
# print(np.vstack((a, b)))
# Splitting (the section count must divide the axis length: 4 columns, 3 rows)
# print(np.hsplit(a, 2))
# print(np.vsplit(a, 3))

# Plain assignment does not copy: b becomes another name for the same array.
b = a
b.shape = 4, 3  # reshaping through b reshapes a as well
print(a)
# a and b have the same id, i.e. they are the same object.
print(id(a), id(b))

# Shallow copy (view): c is a different object from a but shares a's data,
# so writing through c changes a, while reshaping c leaves a's shape alone.
c = a.view()
c.shape = 2, 6
c[1, 1] = 11
print(a.shape)
print(a)

# Deep copy: d owns its data and is independent of a.
d = a.copy()

# Indexing helpers
# Row index of the maximum in each column.
intt = a.argmax(axis=0)
print(intt)

# Repeat an array to build a larger one (2 times down, 3 times across).
a = np.arange(1, 20, 10)
b = np.tile(a, (2, 3))
print(b)

# Sorting
a = np.array([[1, 2, 3], [3, 2, 1]])
# argsort gives the indices that would sort each row in ascending order.
j = np.argsort(a)
a.sort(axis=1)  # in-place sort of every row
print(j)
print(a)
pandas:
# --- pandas notes: DataFrame basics -----------------------------------------
# Reading CSV files, inspecting a DataFrame, sorting, missing values,
# pivot tables, dropna and label-based lookup.
import os

import numpy as np
import pandas as pd

# `%pwd` only exists inside IPython/Jupyter; os.getcwd() gives the same
# working directory in plain Python.
current_path = os.getcwd()
print(current_path)

# food_info = pd.read_csv("food_info.csv")
# read_csv returns a DataFrame
# print(type(food_info))
# print(food_info.dtypes)
# food_info.head()
# food_info.tail(4)
# print(food_info.columns)
# print(food_info.shape)

# Indexing and computation
# print(food_info.loc[0])
# Passing a list selects several columns
# print(food_info[["NDB_No", "Shrt_Desc"]])
# column_list = food_info.columns.tolist()
# print(column_list)

# Preprocessing
# food_info.sort_values("NDB_No", inplace=True)
# After sorting, missing values are placed last.
# Ascending order:
# print(food_info["NDB_No"])
# Descending order:
# food_info.sort_values("NDB_No", inplace=True, ascending=False)
# print(food_info["NDB_No"])

titanic_train_info = pd.read_csv("titanic_train.csv")
# print(titanic_train_info.head())
# age = titanic_train_info["Age"]
# print(age.loc[0:10])
# age_is_null = pd.isnull(age)
# print(age_is_null)
# age_null_true = age[age_is_null]
# age_null_count = len(age_null_true)
# print(age_null_count)

# Mean with the missing values removed
# age_null_false = titanic_train_info["Age"][age_is_null == False]
# average_age = sum(age_null_false) / len(age_null_false)
# average_age1 = titanic_train_info["Age"].mean()
# print(average_age, average_age1)

# Pivot tables: index = grouping key, values = column(s) to aggregate,
# aggfunc = aggregation method (mean is the default).
# passager_survival = titanic_train_info.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
# print(passager_survival)
# passager_age = titanic_train_info.pivot_table(index="Pclass", values="Age", aggfunc="mean")
# print(passager_age)
# port_stats = titanic_train_info.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
# print(port_stats)

# Drop rows with missing values.
# This assignment was commented out in the original notes although the name
# is used below, which raised NameError. It is an alias, not a copy; dropna
# returns a new DataFrame so the source is not modified.
titanic_train_info1 = titanic_train_info
drop_na_columns = titanic_train_info1.dropna(axis=0, subset=["Age", "Sex"])
drop_na_columns.head()

# Look up one specific cell by row label and column name.
row_index_83_age = titanic_train_info1.loc[83, "Age"]
print(row_index_83_age)

# Custom functions are applied with DataFrame.apply:
# titanic_train_info1.apply(function_name)
# --- pandas notes: Series -----------------------------------------------------
# A single DataFrame column is a Series; a Series can also be built directly
# from an array of values plus an index.
import pandas as pd
from pandas import Series

score_csv = pd.read_csv("fandango_score_comparison.csv")
series_FILM = score_csv["FILM"]
# print(type(series_FILM))

# .values exposes the underlying array of a Series.
film_names = series_FILM.values
# print(type(film_names))
series_rt = score_csv["RottenTomatoes"]
# print(series_rt)
rt_scores = series_rt.values
print(rt_scores)

# Use the film names as the index of a new Series of scores.
series_customer = Series(rt_scores, index=film_names)
# Label-based lookup by film name.
series_customer["Minions (2015)"]
# Position-based slice; .iloc makes the positional intent explicit on a
# label-indexed Series.
series_customer.iloc[5:10]
matplotlib:
# --- matplotlib notes: line, bar and scatter plots ----------------------------
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import arange

# ---- Line charts ----
unrate = pd.read_csv("UNRATE.csv")
unrate["DATE"] = pd.to_datetime(unrate["DATE"])
# print(unrate.head(12))

# A single line with rotated tick labels, axis labels and a title:
# first_twelve = unrate[0:100]
# plt.plot(first_twelve["DATE"], first_twelve["VALUE"])
# plt.xticks(rotation=45)
# plt.xlabel("month")
# plt.ylabel("rate")
# plt.title("失业率")
# plt.show()

# add_subplot(rows, cols, position) places an Axes on a grid:
# fig = plt.figure()
# ax1 = fig.add_subplot(4, 3, 1)
# ax2 = fig.add_subplot(4, 3, 2)
# ax2 = fig.add_subplot(4, 3, 6)

# fig = plt.figure(figsize=(10, 6))
# ax1 = fig.add_subplot(2, 1, 1)
# ax2 = fig.add_subplot(2, 1, 2)
# ax1.plot(np.random.randint(1, 5, 5), np.arange(5))
# ax2.plot(np.arange(10) * 3, np.arange(10))
# plt.show()

unrate["Month"] = unrate["DATE"].dt.month
# Two lines on one figure, written out by hand:
# fig = plt.figure(figsize=(6, 3))
# plt.plot(unrate[0:12]["Month"], unrate[0:12]["VALUE"], c="red")
# plt.plot(unrate[12:24]["Month"], unrate[12:24]["VALUE"], c="blue")

# One line per consecutive 12-row slice, each with its own colour and a
# legend label counting up from 1948.
fig = plt.figure(figsize=(10, 5))
colors = ["red", "blue", "green", "orange", "black"]
for i, color in enumerate(colors):
    start_index = i * 12
    end_index = (i + 1) * 12
    subset = unrate[start_index:end_index]
    label = str(1948 + i)
    plt.plot(subset["Month"], subset["VALUE"], c=color, label=label)
plt.legend(loc="best")
plt.show()

# ---- Bar charts ----
reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
        'Fandango_Ratingvalue', 'Fandango_Stars']
norm_reviews = reviews[cols]
# print(norm_reviews[:1])

# Axes.bar() has two required parameters: the x coordinates of the bars
# (named `left` in old matplotlib releases) and their heights.
# align="edge" makes each x coordinate the LEFT edge of its bar, which is what
# the 0.75 offset below relies on to centre 0.5-wide bars on ticks 1..5.
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
            'Fandango_Ratingvalue', 'Fandango_Stars']
# .ix has been removed from pandas; .loc does the same label-based lookup.
bar_heights = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)

fig, ax = plt.subplots()
ax.bar(bar_positions, bar_heights, 0.5, align="edge")
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=45)
ax.set_xlabel('Rating Source')
ax.set_ylabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()

# Horizontal version of the same chart, on its own figure so it does not
# overlap the vertical bars; the category labels move to the y axis.
fig, ax = plt.subplots()
ax.barh(bar_positions, bar_heights, 0.5, align="edge")
ax.set_yticks(tick_positions)
ax.set_yticklabels(num_cols)
ax.set_ylabel('Rating Source')
ax.set_xlabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()

# ---- Scatter plots ----
# Let's look at a plot that can help us visualize many points.
# (plt.subplots() returns a Figure together with its Axes; here the two Axes
# are instead added one at a time with fig.add_subplot.)
fig = plt.figure(figsize=(10, 5))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)
ax1.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])
ax1.set_xlabel('Fandango')
ax1.set_ylabel('Rotten Tomatoes')
# Same data with the axes swapped.
ax2.scatter(norm_reviews['RT_user_norm'], norm_reviews['Fandango_Ratingvalue'])
ax2.set_xlabel('Rotten Tomatoes')
ax2.set_ylabel('Fandango')
plt.show()
# --- matplotlib notes: histograms and box plots -------------------------------
import pandas as pd
import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
        'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
# print(norm_reviews[:5])

# Count how often each rating value occurs ...
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()
# ... and order the counts by rating value (the index), ascending.
fandango_distribution = fandango_distribution.sort_index()
imdb_distribution = norm_reviews['IMDB_norm'].value_counts()
imdb_distribution = imdb_distribution.sort_index()
# print(fandango_distribution)
# print(imdb_distribution)

# ---- Histogram ----
fig, ax = plt.subplots()
# ax.hist(norm_reviews['Fandango_Ratingvalue'])
# bins sets the number of bins, range restricts the interval that is binned.
# Both histograms are drawn on the same Axes.
ax.hist(norm_reviews['Fandango_Ratingvalue'], bins=20)
ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5), bins=20)
ax.set_ylim(0, 20)
plt.show()

# ---- Box plot (quartiles) ----
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
            'Fandango_Ratingvalue']
fig, ax = plt.subplots()
# One box per column of the 2-D value array.
ax.boxplot(norm_reviews[num_cols].values)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_ylim(0, 5)
plt.show()
# --- matplotlib notes: presentation details -----------------------------------
# Hiding tick marks and spines, fixing axis limits, custom RGB colours,
# line width, and in-plot text labels instead of a legend.
import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): the original notes never loaded `women_degrees` (the cell only
# said "Add your code here"), so every plot below raised NameError. The file
# name here is an assumption -- point it at the CSV that has a 'Year' column
# plus one percentage column per degree category.
women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')


def _plot_gender_gap(ax, category, women_color, men_color, **line_kwargs):
    """Draw the women/men percentage lines for *category* on *ax* and tidy it.

    The men's share is plotted as 100 minus the women's share. All four
    spines and all tick marks are hidden, the axes are fixed to 1968-2011 by
    0-100, and the category name is used as the title. Extra keyword
    arguments (e.g. ``linewidth``) are forwarded to both ``ax.plot`` calls.
    """
    ax.plot(women_degrees['Year'], women_degrees[category],
            c=women_color, label='Women', **line_kwargs)
    ax.plot(women_degrees['Year'], 100 - women_degrees[category],
            c=men_color, label='Men', **line_kwargs)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    # tick_params expects booleans; the old "off" strings are not accepted as
    # False by current matplotlib.
    ax.tick_params(bottom=False, top=False, left=False, right=False)


# ---- Single chart with the tick marks removed ----
fig, ax = plt.subplots()
ax.plot(women_degrees['Year'], women_degrees['Biology'], label='Women')
ax.plot(women_degrees['Year'], 100 - women_degrees['Biology'], label='Men')
# Remove the small tick marks on all four sides.
ax.tick_params(bottom=False, top=False, left=False, right=False)
ax.set_title('Percentage of Biology Degrees Awarded By Gender')
ax.legend(loc="upper right")

# ---- 2x2 grid, spines hidden ----
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))
for sp, category in enumerate(major_cats):
    ax = fig.add_subplot(2, 2, sp + 1)
    _plot_gender_gap(ax, category, 'blue', 'green')
# Calling pyplot.legend() here will add the legend to the last subplot that was created.
plt.legend(loc='upper right')
plt.show()

# ---- Setting line width and custom colours ----
# Colours given as RGB tuples with each channel scaled to 0..1.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))
for sp, category in enumerate(major_cats):
    ax = fig.add_subplot(2, 2, sp + 1)
    # Set the line width when specifying how each line should look.
    _plot_gender_gap(ax, category, cb_dark_blue, cb_orange, linewidth=10)
plt.legend(loc='upper right')
plt.show()

# ---- One row of six charts, labelled with text instead of a legend ----
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology',
             'Physical Sciences', 'Math and Statistics']
fig = plt.figure(figsize=(18, 3))
for sp, category in enumerate(stem_cats):
    ax = fig.add_subplot(1, 6, sp + 1)
    _plot_gender_gap(ax, category, cb_dark_blue, cb_orange, linewidth=3)
    # Only the first and last panels carry the text labels; the coordinates
    # are in data units (year, percentage).
    if sp == 0:
        ax.text(2005, 87, 'Men')
        ax.text(2002, 8, 'Women')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')
plt.show()
seaborn:
# --- seaborn notes: style templates and colour palettes -----------------------
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# In a Jupyter notebook, enable inline figures with:  %matplotlib inline
# (an IPython magic, not valid in a plain .py file)


def sinplot(flip=1):
    """Plot six phase-shifted sine curves of decreasing amplitude.

    Curve ``i`` (1..6) is ``sin(x + 0.5 * i) * (7 - i) * flip`` over 100
    points on [0, 14]; pass ``flip=-1`` to mirror the curves vertically.
    """
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        plt.plot(x, np.sin(x + i * 0.5) * (7 - i) * flip)


# seaborn default style (there are five built-in themes)
# sns.set()
# sinplot()
# sns.set_style("whitegrid")
# sns.set_style("dark")
# sns.set_style("white")
# sns.set_style("ticks")
# data = np.random.normal(size=(20, 6)) + np.arange(6) / 2
# sns.boxplot(data=data)
# Remove the top and right spines
# sns.despine()
# sns.despine(offset=10)
# sns.despine(left=True)
# Everything drawn inside the with-block uses that style only
# with sns.axes_style("darkgrid"):
#     plt.subplot(211)
#     sinplot()
# plt.subplot(212)
# sinplot(-1)

# Overall layout / scaling context
sns.set_style("whitegrid")
# Other contexts include "poster" and "notebook".
sns.set_context("paper", font_scale=2.5, rc=({"lines.linewidth": 4.5}))
plt.figure(figsize=(8, 6))
sinplot()

# ---- Colours (discrete and continuous) ----
# * Colour matters a lot.
# * color_palette() accepts any colour that matplotlib supports.
# * color_palette() without arguments returns the default colours.
# * set_palette() sets the colours for all plots.

# Categorical palettes
# The default plotting colours
current_palette = sns.color_palette()
sns.palplot(current_palette)
# "hls" colour space, 8 colours
sns.palplot(sns.color_palette("hls", 8))

# Apply a palette to data
fig = plt.figure(figsize=(10, 6))
data = np.random.normal(size=(20, 6)) + np.arange(6) / 2
sns.boxplot(data=data, palette=sns.color_palette("hls", 8))

# Change the lightness (l) and saturation (s) of the palette
# fig = plt.figure(figsize=(10, 6))
# sns.palplot(sns.hls_palette(8, l=.2, s=.9))
# sns.boxplot(data=data, palette=sns.hls_palette(8, l=.2, s=.9))

# Colours that come in pairs
sns.palplot(sns.color_palette("Paired", 8))

# Naming colours with xkcd:
# xkcd ran a crowd-sourced effort to name random RGB colours, which produced
# 954 named colours that can be looked up in the sns.xkcd_rgb dictionary.
plt.plot([0, 1], [0, 1], sns.xkcd_rgb["pale red"], lw=3)
plt.plot([0, 1], [0, 2], sns.xkcd_rgb["medium green"], lw=3)
plt.plot([0, 1], [0, 3], sns.xkcd_rgb["denim blue"], lw=3)

# Sequential palettes
# The colour varies gradually, e.g. to encode how important a value is.
sns.palplot(sns.color_palette("Blues"))
# The "_r" suffix reverses the palette order.
sns.palplot(sns.color_palette("Blues_r"))

# cubehelix palettes
sns.palplot(sns.color_palette("cubehelix", 8))
sns.palplot(sns.cubehelix_palette(8, start=.5, rot=-0.75))

# Light / dark ramps of a chosen colour
sns.palplot(sns.light_palette("green"))
sns.palplot(sns.dark_palette("purple"))
# --- seaborn notes: bivariate density with a custom colormap ------------------
# Imports repeated so this cell runs on its own.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# 300 samples from a 2-D normal with negatively correlated components;
# the transpose splits the (300, 2) sample array into x and y.
x, y = np.random.multivariate_normal([0, 0], [[1, -.5], [-.5, 1]], size=300).T
# plt.scatter(x, y)
fig = plt.figure(figsize=(10, 6))
# as_cmap=True returns a matplotlib colormap instead of a list of colours.
pal = sns.dark_palette("green", as_cmap=True)
# Current seaborn requires the data vectors to be passed by keyword.
sns.kdeplot(x=x, y=y, cmap=pal)