Python Data Analysis Library, or pandas, is a tool built on top of NumPy that was created for data analysis tasks. Pandas pulls in a large number of libraries and several standard data models, and provides everything needed to work with large datasets efficiently. It offers a wealth of functions and methods that let us process data quickly and conveniently. You will soon find it is one of the key reasons Python makes such a powerful and efficient data analysis environment.
This article covers the basics of using Pandas.
GitHub: https://github.com/yingzk/pandas_learning
PDF download of this article: https://img.yingjoy.cn/attachment/2018/04/pandas_pdf.zip
Importing the libraries
# import numpy and alias it as np
import numpy as np
# import pandas and alias it as pd
import pandas as pd
Importing data
pd.read_csv(filename) # read from a CSV file
pd.read_table(filename) # read from a delimited text file (e.g. TSV)
pd.read_excel(filename) # read from an Excel file
pd.read_sql(query, connection_object) # read from a SQL table/database
pd.read_json(json_string) # read from a JSON string, URL or file
pd.read_html(url) # parse the tables contained in a URL, string or file into a list of DataFrames
pd.read_clipboard() # read data from the system clipboard
pd.DataFrame(dict) # build a DataFrame from a Python dict, where the keys become the column names and the values become the column contents
Exporting data
df.to_csv(filename) # write the DataFrame to a CSV file
df.to_excel(filename) # write the DataFrame to an Excel file
df.to_sql(table_name, connection_object) # write the DataFrame to a SQL table/database
df.to_json(filename) # write the DataFrame to a JSON file
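As a quick sanity check, the import and export helpers can be exercised with a tiny round trip, building on the imports above (a minimal sketch; the file name data.csv is only an illustration):
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df.to_csv('data.csv', index=False)  # write the DataFrame out as CSV, without the index
df2 = pd.read_csv('data.csv')       # read it back into a new DataFrame
print(df2)
#    a  b
# 0  1  3
# 1  2  4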
Creating test objects
pd.DataFrame(np.random.rand(5, 10)) # create a DataFrame of random floats with 5 rows and 10 columns
          0         1         2         3         4         5         6         7         8         9
0  0.016860  0.855994  0.992872  0.652278  0.517510  0.742986  0.452981  0.568701  0.795436  0.622609
1  0.476801  0.190823  0.450436  0.912401  0.335651  0.197766  0.042523  0.580323  0.498982  0.473128
2  0.029820  0.886500  0.902864  0.465084  0.380933  0.033583  0.928827  0.501687  0.857512  0.671840
3  0.897254  0.413717  0.991061  0.393033  0.388630  0.661025  0.635417  0.695609  0.305378  0.147508
4  0.573882  0.786888  0.177782  0.864474  0.594416  0.765678  0.217279  0.446570  0.930604  0.686823
pd.Series(my_list) # create a Series from an iterable my_list
my_list = ['abc',123,'HelloWorld', 5.7]
pd.Series(my_list)
0 abc
1 123
2 HelloWorld
3 5.7
dtype: object
df = pd.DataFrame(np.random.rand(10, 5))
df.index = pd.date_range('2017/1/1', periods=df.shape[0]) # use a daily date range starting 2017-01-01 as the index
df
                   0         1         2         3         4
2017-01-01  0.011762  0.634116  0.045220  0.452117  0.879969
2017-01-02  0.802262  0.661908  0.214822  0.444259  0.200370
2017-01-03  0.301050  0.004534  0.881042  0.825632  0.331118
2017-01-04  0.095324  0.916430  0.177795  0.191502  0.546973
2017-01-05  0.482868  0.953719  0.615461  0.868984  0.639286
2017-01-06  0.958404  0.155357  0.293012  0.115218  0.177846
2017-01-07  0.915488  0.486922  0.440474  0.584764  0.271243
2017-01-08  0.480413  0.600622  0.325212  0.532259  0.687718
2017-01-09  0.859887  0.236677  0.635073  0.811840  0.497289
2017-01-10  0.024623  0.635122  0.346393  0.860260  0.325502
Inspecting data
df.head(n) # view the first n rows
df.tail(n) # view the last n rows
df = pd.DataFrame(np.random.rand(10, 5))
df.head(3)
          0         1         2         3         4
0  0.996381  0.440502  0.583701  0.120444  0.241775
1  0.126877  0.646841  0.740163  0.764182  0.810129
2  0.254386  0.451341  0.288513  0.515995  0.146529
df = pd.DataFrame(np.random.rand(10, 5))
df.tail(3)
          0         1         2         3         4
7  0.466316  0.747013  0.568442  0.562552  0.949529
8  0.243633  0.605133  0.114011  0.898604  0.024648
9  0.155605  0.799580  0.160883  0.986743  0.446114
Checking the shape of the data
df.shape # shape of the data as a (number of rows, number of columns) tuple
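For instance, on a small random frame (a quick sketch):
df = pd.DataFrame(np.random.rand(10, 5))
df.shape
(10, 5)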
Checking summary information about the data
df.info() # view the index, column dtypes and memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
0 10 non-null float64
1 10 non-null float64
2 10 non-null float64
3 10 non-null float64
4 10 non-null float64
dtypes: float64(5)
memory usage: 480.0 bytes
df.describe() # descriptive statistics for the numeric columns
               0          1          2          3          4
count  10.000000  10.000000  10.000000  10.000000  10.000000
mean    0.432296   0.601318   0.444123   0.543053   0.413944
std     0.268632   0.216483   0.298076   0.284759   0.345554
min     0.155605   0.252712   0.114011   0.126172   0.024648
25%     0.245740   0.461876   0.191090   0.295224   0.094655
50%     0.353927   0.609304   0.401004   0.565072   0.389053
75%     0.559964   0.762463   0.556721   0.693000   0.668621
max     0.985457   0.932679   0.990827   0.986743   0.949529
Counting value occurrences
s.value_counts(dropna=False) # count how often each distinct value occurs (including NaN)
s = pd.Series([1,2,3,3,4,np.nan,5,5,5,6,7])
s.value_counts(dropna=False)
5.0 3
3.0 2
7.0 1
6.0 1
NaN 1
4.0 1
2.0 1
1.0 1
dtype: int64
s = s.apply(lambda x: x+1) # apply a function to every element of the Series
s
0 3.0
1 4.0
2 5.0
3 5.0
4 6.0
5 NaN
6 7.0
7 7.0
8 7.0
9 8.0
10 9.0
dtype: float64
df.apply(pd.Series.value_counts) # count the occurrences of each distinct value in every column of the DataFrame
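A small sketch of what this looks like on a toy frame (the column values here are made up purely for illustration):
df = pd.DataFrame({'A': ['x', 'x', 'y'], 'B': ['y', 'y', 'y']})
df.apply(pd.Series.value_counts)
# one count column per original column, NaN where a value never occurs (row order may vary):
#      A    B
# x  2.0  NaN
# y  1.0  3.0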
Selecting data
df[col] # return the selected column as a Series
df = pd.DataFrame(np.random.rand(5, 5), columns=list('ABCDE'))
df['C']
0 0.452717
1 0.407755
2 0.549391
3 0.759433
4 0.153871
Name: C, dtype: float64
df[[col1, col2]] # select multiple columns
df = pd.DataFrame(np.random.rand(5, 5), columns=list('ABCDE'))
df[['C', 'D']]
          C         D
0  0.431885  0.304796
1  0.028960  0.187738
2  0.176520  0.102980
3  0.370277  0.098031
4  0.247122  0.345735
s.iloc[0] # select by position
s = pd.Series(np.array(['I', 'Love', 'China']))
s.iloc[0]
'I'
s.loc['index_one'] # select by index label
s = pd.Series(np.array(['I', 'Love', 'China']))
s.loc[0] # with the default integer index, the label of the first element is 0
'I'
df.iloc[n, :] # select the n-th row
df = pd.DataFrame(np.array([['I', 'Love', 'China'], ['I', 'Love', 'Data']]))
df.iloc[1, :]
0 I
1 Love
2 Data
Name: 1, dtype: object
df.iloc[0, 0] # select the first element (row 0, column 0)
df = pd.DataFrame(np.random.rand(5, 5))
df
          0         1         2         3         4
0  0.786709  0.405902  0.151383  0.384778  0.871664
1  0.491006  0.774710  0.388011  0.758102  0.762115
2  0.085647  0.543243  0.582565  0.664243  0.379896
3  0.806211  0.794284  0.968755  0.883923  0.354820
4  0.463902  0.481756  0.131181  0.590878  0.801769
df.iloc[0, 0]
0.78670886755075187
Data cleaning
df.columns = ['a', 'b'] # rename the columns
df = pd.DataFrame({'A':np.array([1,np.nan,2,3,6,np.nan]),
'B':np.array([np.nan,4,np.nan,5,9,np.nan]),
'C':'foo'})
df
     A    B    C
0  1.0  NaN  foo
1  NaN  4.0  foo
2  2.0  NaN  foo
3  3.0  5.0  foo
4  6.0  9.0  foo
5  NaN  NaN  foo
df.columns = ['a', 'b', 'c']
df
     a    b    c
0  1.0  NaN  foo
1  NaN  4.0  foo
2  2.0  NaN  foo
3  3.0  5.0  foo
4  6.0  9.0  foo
5  NaN  NaN  foo
pd.isnull() # check for null values; returns a boolean mask of the same shape
pd.notnull() # the opposite of isnull; marks the values that are not null
df = pd.DataFrame({'A':np.array([1,np.nan,2,3,6,np.nan]),
'B':np.array([np.nan,4,np.nan,5,9,np.nan]),
'C':'foo'})
df.isnull()
       A      B      C
0  False   True  False
1   True  False  False
2  False   True  False
3  False  False  False
4  False  False  False
5   True   True  False
df.isnull().sum() # count the null values in each column
A 2
B 3
C 0
dtype: int64
df.dropna(axis = 0, thresh=n) # drop rows that contain missing values; axis = 1 drops columns instead # thresh = n keeps only the rows that have at least n non-null values
df
     A    B    C
0  1.0  NaN  foo
1  NaN  4.0  foo
2  2.0  NaN  foo
3  3.0  5.0  foo
4  6.0  9.0  foo
5  NaN  NaN  foo
df.dropna(axis = 0)
     A    B    C
3  3.0  5.0  foo
4  6.0  9.0  foo
df.dropna(axis = 1)
     C
0  foo
1  foo
2  foo
3  foo
4  foo
5  foo
df.dropna(axis = 0, thresh = 2)
     A    B    C
0  1.0  NaN  foo
1  NaN  4.0  foo
2  2.0  NaN  foo
3  3.0  5.0  foo
4  6.0  9.0  foo
df.fillna(df.mean()) # fill null values with the column means
s = pd.Series([1,3,5,np.nan,7,9,9])
s.fillna(s.mean())
0 1.000000
1 3.000000
2 5.000000
3 5.666667
4 7.000000
5 9.000000
6 9.000000
dtype: float64
s.astype(type) # convert the type of the column
s = pd.Series([1,3,5,np.nan,7,9,9])
s.fillna(s.mean()).astype(int)
0 1
1 3
2 5
3 5
4 7
5 9
6 9
dtype: int32
s.replace(1, 'one') # replace every 1 in the Series with 'one'
s = pd.Series([1,3,5,np.nan,7,9,9])
s.replace(1,'one')
0 one
1 3
2 5
3 NaN
4 7
5 9
6 9
dtype: object
s.replace([1,3],['one','three']) # replace every 1 in the Series with 'one' and every 3 with 'three'
s = pd.Series([1,3,5,np.nan,7,9,9])
s.replace([1,3],['one','three'])
0 one
1 three
2 5
3 NaN
4 7
5 9
6 9
dtype: object
df.rename(columns=lambda x: x + 2) # rename all columns with a function
df = pd.DataFrame(np.random.rand(4,4))
df
          0         1         2         3
0  0.669102  0.548996  0.512802  0.449220
1  0.108840  0.974720  0.665050  0.271009
2  0.146804  0.060744  0.637770  0.383380
3  0.108163  0.893999  0.216907  0.730504
df.rename(columns=lambda x: x+ 2)
          2         3         4         5
0  0.669102  0.548996  0.512802  0.449220
1  0.108840  0.974720  0.665050  0.271009
2  0.146804  0.060744  0.637770  0.383380
3  0.108163  0.893999  0.216907  0.730504
df.rename(columns={'old_name': 'new_name'}) # rename selected columns
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.rename(columns={'A':'New A', 'B':'New B'})
      New A     New B         C         D         E
0  0.680941  0.766561  0.486226  0.301537  0.289970
1  0.917036  0.100054  0.464342  0.181454  0.933591
2  0.253549  0.766181  0.085607  0.969627  0.630674
3  0.377840  0.909920  0.214338  0.011844  0.392257
4  0.608564  0.587614  0.039867  0.630492  0.402101
5  0.361074  0.937618  0.787055  0.054157  0.300325
6  0.605472  0.608429  0.052152  0.669343  0.745648
7  0.660738  0.158713  0.352756  0.028325  0.195899
8  0.855695  0.578177  0.447043  0.093923  0.316234
9  0.337392  0.645260  0.140221  0.616652  0.727144
df.set_index('column_one') # use a column as the index
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.set_index('B')
                 A         C         D         E
B
0.824403  0.013092  0.055626  0.895268  0.837350
0.310559  0.689064  0.541375  0.275461  0.808554
0.533495  0.072835  0.563758  0.695029  0.524957
0.541211  0.820817  0.591130  0.268978  0.546996
0.286587  0.936692  0.343227  0.383610  0.811302
0.878391  0.938883  0.636148  0.776493  0.025840
0.156482  0.918591  0.030869  0.235020  0.096212
0.857049  0.613991  0.810541  0.917927  0.921329
0.713271  0.949683  0.811386  0.920452  0.213173
0.686945  0.522276  0.881299  0.936260  0.030993
df.rename(index = lambda x: x + 1) # rename all index values with a function
df = pd.DataFrame(np.random.rand(10,5))
df.rename(index = lambda x: x+ 1)
           0         1         2         3         4
1   0.382337  0.185501  0.457958  0.009713  0.628963
2   0.024175  0.223274  0.698171  0.071715  0.063272
3   0.913995  0.713092  0.269621  0.575365  0.805266
4   0.612708  0.220953  0.090858  0.425472  0.018996
5   0.045363  0.153343  0.730828  0.323554  0.364821
6   0.462096  0.614072  0.993130  0.988894  0.788648
7   0.887381  0.802119  0.191248  0.980064  0.628450
8   0.138270  0.922870  0.250827  0.297472  0.289915
9   0.258687  0.807993  0.930009  0.811335  0.609763
10  0.588020  0.392127  0.590799  0.923180  0.722801
Filtering, sorting and grouping
df[df[col] > 0.5] # select the rows where the value in column col is greater than 0.5; conditions can be combined with logical operators
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df
          A         B         C         D         E
0  0.051900  0.548808  0.744936  0.848002  0.299505
1  0.979053  0.216078  0.394286  0.520654  0.584194
2  0.185679  0.453151  0.839947  0.730177  0.392377
3  0.161267  0.981833  0.890858  0.613972  0.467528
4  0.091140  0.369805  0.600035  0.372857  0.897063
5  0.612195  0.981150  0.578304  0.220064  0.488182
6  0.898736  0.626289  0.788306  0.747086  0.386097
7  0.568531  0.362593  0.644950  0.510410  0.092556
8  0.872898  0.771917  0.853365  0.227531  0.045184
9  0.898296  0.683850  0.138142  0.956854  0.335476
df[df['A'] > 0.5]
          A         B         C         D         E
1  0.979053  0.216078  0.394286  0.520654  0.584194
5  0.612195  0.981150  0.578304  0.220064  0.488182
6  0.898736  0.626289  0.788306  0.747086  0.386097
7  0.568531  0.362593  0.644950  0.510410  0.092556
8  0.872898  0.771917  0.853365  0.227531  0.045184
9  0.898296  0.683850  0.138142  0.956854  0.335476
df[(df['A'] > 0.5) & (df['B'] < 0.7)]
          A         B         C         D         E
1  0.979053  0.216078  0.394286  0.520654  0.584194
6  0.898736  0.626289  0.788306  0.747086  0.386097
7  0.568531  0.362593  0.644950  0.510410  0.092556
9  0.898296  0.683850  0.138142  0.956854  0.335476
Sorting
df.sort_values(col, ascending=True) # sort by a column # ascending: True for ascending order, False for descending
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.sort_values('A', ascending=True)
          A         B         C         D         E
3  0.015834  0.758417  0.123415  0.802403  0.782450
1  0.068046  0.373240  0.414358  0.105285  0.759001
7  0.134238  0.104416  0.551595  0.472277  0.015997
4  0.236628  0.391852  0.390275  0.904988  0.650108
8  0.469382  0.426359  0.137109  0.253183  0.894667
2  0.508937  0.443894  0.147076  0.149885  0.434802
0  0.572640  0.369032  0.412343  0.402019  0.445365
9  0.663964  0.533604  0.217605  0.602667  0.637232
6  0.765109  0.646277  0.885381  0.743307  0.649711
5  0.962494  0.650830  0.754514  0.578115  0.659846
df.sort_values([col1,col2],ascending=[True,False]) # sort df by col1 in ascending order and col2 in descending order
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.sort_values(['A','E'],ascending=[True,False])
          A         B         C         D         E
1  0.083526  0.969267  0.012550  0.672851  0.866501
7  0.107839  0.383900  0.982337  0.390914  0.308559
0  0.148742  0.306888  0.853949  0.144650  0.414474
4  0.190011  0.794060  0.514756  0.272207  0.086894
6  0.423982  0.229873  0.992318  0.495706  0.971735
9  0.532263  0.106900  0.528114  0.456583  0.362642
2  0.619678  0.800373  0.927766  0.742667  0.645809
3  0.815312  0.920682  0.833351  0.266840  0.132698
5  0.842293  0.049499  0.780198  0.343752  0.341800
8  0.932379  0.398721  0.080358  0.200681  0.549237
Grouping
df.groupby(col) # group df by a column # often combined with count() to tally the occurrences in each group
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df
     A      B       C  D
0  foo    one   small  1
1  foo    one  medium  2
2  foo    two   large  2
3  foo    two   large  3
4  bar  three   small  3
5  bar  three   small  5
df.groupby('B').count()
       A  C  D
B
one    2  2  2
three  2  2  2
two    2  2  2
df.groupby([col1,col2]) # group df by columns col1 and col2
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df.groupby(['A', 'B']).sum()
           D
A   B
bar three  8
foo one    3
    two    5
df.groupby(col1)[col2].mean() # group df by col1 and return the mean of col2 for each group
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df.groupby('A')['D'].mean()
A
bar 4
foo 2
Name: D, dtype: int32
df.pivot_table(index=col1,values=[col2,col3],aggfunc=mean) # build a pivot table indexed by col1 that aggregates the values of col2 and col3 with the mean
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
pd.pivot_table(df, index=['A', 'B'], columns=['C'], aggfunc=np.sum)
               D
C          large medium small
A   B
bar three    NaN    NaN   8.0
foo one      NaN    2.0   1.0
    two      5.0    NaN   NaN
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df.groupby('A').agg(np.mean) # group by column A and take the mean of each numeric column (here, for D: bar -> 4.0, foo -> 2.0)
df.apply(np.mean, axis=0) # apply a function to each column of df # axis: 0 works column by column, 1 works row by row
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df
          A         B         C         D         E
0  0.800902  0.933676  0.461338  0.353398  0.057885
1  0.988580  0.681318  0.533361  0.486016  0.220004
2  0.695034  0.643920  0.694040  0.280063  0.641867
3  0.925290  0.084906  0.120247  0.880991  0.399596
4  0.697742  0.372860  0.881456  0.565627  0.272549
5  0.614245  0.658123  0.797487  0.609511  0.544633
6  0.153517  0.354870  0.910838  0.416895  0.098821
7  0.088223  0.501401  0.702754  0.334938  0.182708
8  0.737348  0.569340  0.291342  0.847058  0.193331
9  0.083915  0.396210  0.589415  0.806525  0.598841
df.apply(np.mean, axis=0)
A 0.578480
B 0.519662
C 0.598228
D 0.558102
E 0.321024
dtype: float64
Joining and combining data
df1.append(df2) # append the rows of df2 to the end of df1 (the two frames should have the same columns); this is a row-wise concatenation (newer pandas versions prefer pd.concat for this)
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df1.append(df2)
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
4  A4  B4  C4  D4
5  A5  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7
pd.concat([df1, df2], axis=1) # append the columns of df2 to df1 (the two frames should have the same number of rows) # the list can also be passed as a tuple # axis: 0 concatenates rows, 1 concatenates columns
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
pd.concat([df1, df2], axis=0)
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
4  A4  B4  C4  D4
5  A5  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
'F': ['B4', 'B5', 'B6', 'B7'],
'G': ['C4', 'C5', 'C6', 'C7'],
'H': ['D4', 'D5', 'D6', 'D7']},
index=[0, 1, 2, 3])
pd.concat((df1, df2), axis=1)
    A   B   C   D   E   F   G   H
0  A0  B0  C0  D0  A4  B4  C4  D4
1  A1  B1  C1  D1  A5  B5  C5  D5
2  A2  B2  C2  D2  A6  B6  C6  D6
3  A3  B3  C3  D3  A7  B7  C7  D7
df1.join(df2,on=col1,how='inner') # inner-join df1 and df2 on the column col1
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
df2 = pd.DataFrame({'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
df1.join(df2, on='key', how='inner')
    A   B key   C   D
0  A0  B0  K0  C0  D0
2  A2  B2  K0  C0  D0
1  A1  B1  K1  C1  D1
3  A3  B3  K1  C1  D1
Statistics
df.mean() # mean of each column of df
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.mean()
A 0.362158
B 0.432248
C 0.554478
D 0.331155
E 0.438283
dtype: float64
df.corr() # correlation coefficients between each column of df and the other columns
df.corr()
          A         B         C         D         E
A  1.000000 -0.167715  0.198216  0.036939  0.113714
B -0.167715  1.000000  0.449789  0.015883 -0.236658
C  0.198216  0.449789  1.000000 -0.296943  0.386206
D  0.036939  0.015883 -0.296943  1.000000 -0.777327
E  0.113714 -0.236658  0.386206 -0.777327  1.000000
df.count() # number of non-null values in each column of df
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
# df.loc[0][0] = np.nan
df.iloc[0, 0] = np.nan # introduce one missing value so the counts differ
df.count()
A 9
B 10
C 10
D 10
E 10
dtype: int64
df.max() # maximum of each column of df
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.max()
A 0.812708
B 0.886171
C 0.987035
D 0.977146
E 0.959625
dtype: float64
df.min() # minimum of each column of df
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.min()
A 0.128560
B 0.135905
C 0.167476
D 0.137062
E 0.050306
dtype: float64
df.median() # median of each column of df
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.median()
A 0.409373
B 0.597418
C 0.678203
D 0.705762
E 0.519713
dtype: float64
df.std() # standard deviation of each column of df
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.std()
A 0.272847
B 0.254870
C 0.258956
D 0.301168
E 0.295753
dtype: float64