1 快速入门Pandas

安装Pandas

!pip install pandas
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting pandas
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/f8/7f/5b047effafbdd34e52c9e2d7e44f729a0655efafb22198c45cf692cdc157/pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.4/12.4 MB 6.0 MB/s eta 0:00:0000:0100:01
[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from pandas) (2024.1)
Collecting tzdata>=2022.1 (from pandas)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Requirement already satisfied: numpy>=1.20.3 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from pandas) (1.24.4)
Requirement already satisfied: six>=1.5 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.3 tzdata-2024.2
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

import numpy as np
import pandas as pd

1.生成对象

s = pd.Series([1,2,3,4,np.nan,5,6])
s
0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
5    5.0
6    6.0
dtype: float64

np.nan 就是 ndarray中的 None 对象

Series 是序列的意思

df = pd.DataFrame({
    'A':1,
    'B':pd.Timestamp('20240101'),
    'C':pd.Series(1,index = list(range(4)) , dtype = 'float32'),
    'D':np.array([3]*4 , dtype = 'int32'),
    'E':pd.Categorical(['test','train','test','train']),
    'F':'foo'
})

df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E F
0 1 2024-01-01 1.0 3 test foo
1 1 2024-01-01 1.0 3 train foo
2 1 2024-01-01 1.0 3 test foo
3 1 2024-01-01 1.0 3 train foo
# 用含日期的时间索引 , 标签 , NumPy数组生成DataFrame
d = pd.date_range('20240101' , periods = 6)
d
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

根据时间向后延伸 periods 位

df2 = pd.DataFrame(np.random.randn(6,4) , index = d , columns = list('ABCD'))
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-01 -0.215153 2.176335 -2.150249 -0.377296
2024-01-02 0.886135 -1.040961 -0.322376 -0.522593
2024-01-03 -2.004391 0.078001 -0.622707 -0.714978
2024-01-04 0.536811 0.348313 -1.278939 -1.554145
2024-01-05 -0.609511 -2.019143 -0.871984 1.337654
2024-01-06 -0.340597 -1.329190 0.155433 0.481517

2. 查看数据

# 查看 DataFrame 列的数据类型
df.dtypes
A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
# 查看 DataFrame 头部和尾部数据
df2.head(1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E F
0 1 2024-01-01 1.0 3 test foo
df2.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-01 0.563029 -1.020525 0.230456 1.345755
2024-01-02 1.303791 -0.793388 -0.456521 1.052202
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 -0.451163 0.727276

df.head(null) 查看5条

df2.tail(1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-06 -0.236926 0.556183 1.293092 2.213492
df2.tail()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-02 1.303791 -0.793388 -0.456521 1.052202
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 -0.451163 0.727276
2024-01-06 -0.236926 0.556183 1.293092 2.213492

df.tail(null) 查看5条

# 查看索引
print(df.index , end = '\n \n')

print(df2.index)
Index([0, 1, 2, 3], dtype='int64')
 
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')
# 查看列名
df2.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
# 查看数据的统计概要
df2.describe()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.371758 0.698415 0.222825 1.139684
std 0.579717 1.509786 0.669168 0.601573
min -0.236926 -1.020525 -0.456521 0.492788
25% -0.078050 -0.459958 -0.314308 0.797105
50% 0.330994 0.548257 0.163356 1.029397
75% 0.619817 1.931883 0.526236 1.272367
max 1.303791 2.517436 1.293092 2.213492

3. 排序

# 按轴排序
df.sort_index(axis = 1, ascending = False)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
F E D C B A
0 foo test 3 1.0 2024-01-01 1
1 foo train 3 1.0 2024-01-01 1
2 foo test 3 1.0 2024-01-01 1
3 foo train 3 1.0 2024-01-01 1

按照列名排序

df.sort_index(axis = 0, ascending = False)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E F
3 1 2024-01-01 1.0 3 train foo
2 1 2024-01-01 1.0 3 test foo
1 1 2024-01-01 1.0 3 train foo
0 1 2024-01-01 1.0 3 test foo

按照行索引排序

# 按值排序
df2.sort_values(by = 'B' , ascending = False)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-06 -0.236926 0.556183 1.293092 2.213492
2024-01-05 0.638746 0.540331 -0.451163 0.727276
2024-01-02 1.303791 -0.793388 -0.456521 1.052202
2024-01-01 0.563029 -1.020525 0.230456 1.345755

4. 选择数据

# 选择单列
df['A']
0    1
1    1
2    1
3    1
Name: A, dtype: int64
# 切片按行选择
df[2:]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E F
2 1 2024-01-01 1.0 3 test foo
3 1 2024-01-01 1.0 3 train foo
df[::-1]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E F
3 1 2024-01-01 1.0 3 train foo
2 1 2024-01-01 1.0 3 test foo
1 1 2024-01-01 1.0 3 train foo
0 1 2024-01-01 1.0 3 test foo
df[3:0:-1]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E F
3 1 2024-01-01 1.0 3 train foo
2 1 2024-01-01 1.0 3 test foo
1 1 2024-01-01 1.0 3 train foo
# 对索引范围切片
df2['2024-01-03':'2024-01-05':1]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 -0.451163 0.727276

对索引范围切片最终结果会是左闭右闭

1. 按标签选择

# index 也就是标签
df2.loc['2024-01-01']
A    0.563029
B   -1.020525
C    0.230456
D    1.345755
Name: 2024-01-01 00:00:00, dtype: float64
df.loc[1]
A                      1
B    2024-01-01 00:00:00
C                    1.0
D                      3
E                  train
F                    foo
Name: 1, dtype: object
# 用标签选择多列数据

df.loc[:,['A','B']]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B
0 1 2024-01-01
1 1 2024-01-01
2 1 2024-01-01
3 1 2024-01-01
# 用标签选择多列数据

df.loc[2:1:-1,['A','B']]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B
2 1 2024-01-01
1 1 2024-01-01
# 只取莫一行莫一个值
df.loc[2,'B']
Timestamp('2024-01-01 00:00:00')
# 快速某个标签的值
df.at[1,'B']
Timestamp('2024-01-01 00:00:00')

2. 按位置选择

# 按位置选择
df.iloc[3]
A                      1
B    2024-01-01 00:00:00
C                    1.0
D                      3
E                  train
F                    foo
Name: 3, dtype: object

df.iloc[N] 取第N行的数据

# 用整数的切片
df2.iloc[3:5 , 0:2]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B
2024-01-04 -0.137053 2.390449
2024-01-05 0.638746 0.540331

df2.iloc[i:j , x:y]

i ~ j 行

x ~ y列

# 精确的按位切片
df2.iloc[[1,2,3],[0,2]]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A C
2024-01-02 1.303791 -0.456521
2024-01-03 0.098958 0.096255
2024-01-04 -0.137053 0.624829
# 行切片
df2.iloc[1:3,::]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-02 1.303791 -0.793388 -0.456521 1.052202
2024-01-03 0.098958 2.517436 0.096255 0.492788
# 列切片
df2.iloc[::,2:3]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
C
2024-01-01 0.230456
2024-01-02 -0.456521
2024-01-03 0.096255
2024-01-04 0.624829
2024-01-05 -0.451163
2024-01-06 1.293092
# 显式精确取值
df.iloc[1,1]
Timestamp('2024-01-01 00:00:00')
# 快速精确取值
df.iat[1,1]
Timestamp('2024-01-01 00:00:00')

3. 筛选

df2[df2['B'] > 0]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 -0.451163 0.727276
2024-01-06 -0.236926 0.556183 1.293092 2.213492
df2[df2 > 0]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-01 0.563029 NaN 0.230456 1.345755
2024-01-02 1.303791 NaN NaN 1.052202
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 NaN 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 NaN 0.727276
2024-01-06 NaN 0.556183 1.293092 2.213492
# 用isin()筛选
df3 = df2.copy()
df3['E'] = ['one','one','two','three','four','three']
df3
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E
2024-01-01 0.563029 -1.020525 0.230456 1.345755 one
2024-01-02 1.303791 -0.793388 -0.456521 1.052202 one
2024-01-03 0.098958 2.517436 0.096255 0.492788 two
2024-01-04 -0.137053 2.390449 0.624829 1.006591 three
2024-01-05 0.638746 0.540331 -0.451163 0.727276 four
2024-01-06 -0.236926 0.556183 1.293092 2.213492 three
df3[df3['E'].isin(['two','four'])]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E
2024-01-03 0.098958 2.517436 0.096255 0.492788 two
2024-01-05 0.638746 0.540331 -0.451163 0.727276 four

5. 赋值

# 用索引自动对齐新增列的数据
s1 = pd.Series([1,2,3,4,5,6] , index = d)
s1
2024-01-01    1
2024-01-02    2
2024-01-03    3
2024-01-04    4
2024-01-05    5
2024-01-06    6
Freq: D, dtype: int64
df2['X'] = s1
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D X
2024-01-01 0.563029 -1.020525 0.230456 1.345755 1
2024-01-02 1.303791 -0.793388 -0.456521 1.052202 2
2024-01-03 0.098958 2.517436 0.096255 0.492788 3
2024-01-04 -0.137053 2.390449 0.624829 1.006591 4
2024-01-05 0.638746 0.540331 -0.451163 0.727276 5
2024-01-06 -0.236926 0.556183 1.293092 2.213492 6

数据s1 的index 需要和被添加的的表中的index数据类型和数量一样

# 按标签赋值
df2.at['2024-01-01','A'] = 0
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-01 0.000000 2.176335 -2.150249 -0.377296
2024-01-02 0.886135 -1.040961 -0.322376 -0.522593
2024-01-03 -2.004391 0.078001 -0.622707 -0.714978
2024-01-04 0.536811 0.348313 -1.278939 -1.554145
2024-01-05 -0.609511 -2.019143 -0.871984 1.337654
2024-01-06 -0.340597 -1.329190 0.155433 0.481517
# 按标签赋值
df2.iat[1,2] = 0
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2024-01-01 0.000000 2.176335 -2.150249 -0.377296
2024-01-02 0.886135 -1.040961 0.000000 -0.522593
2024-01-03 -2.004391 0.078001 -0.622707 -0.714978
2024-01-04 0.536811 0.348313 -1.278939 -1.554145
2024-01-05 -0.609511 -2.019143 -0.871984 1.337654
2024-01-06 -0.340597 -1.329190 0.155433 0.481517
# 按数组赋值
df2['G'] = [1,2,3,4,5,6]
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D G
2024-01-01 0.000000 2.176335 -2.150249 -0.377296 1
2024-01-02 0.886135 -1.040961 0.000000 -0.522593 2
2024-01-03 -2.004391 0.078001 -0.622707 -0.714978 3
2024-01-04 0.536811 0.348313 -1.278939 -1.554145 4
2024-01-05 -0.609511 -2.019143 -0.871984 1.337654 5
2024-01-06 -0.340597 -1.329190 0.155433 0.481517 6
# 按 NumPy 数组赋值
df2.loc[:,'D'] = np.array([5] * len(df2))
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D G
2024-01-01 0.000000 2.176335 -2.150249 5.0 1
2024-01-02 0.886135 -1.040961 0.000000 5.0 2
2024-01-03 -2.004391 0.078001 -0.622707 5.0 3
2024-01-04 0.536811 0.348313 -1.278939 5.0 4
2024-01-05 -0.609511 -2.019143 -0.871984 5.0 5
2024-01-06 -0.340597 -1.329190 0.155433 5.0 6
# where 条件赋值
df3 = df2.copy()
df3[df3 < 0] = 0
df3
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D G
2024-01-01 0.000000 2.176335 0.000000 5.0 1
2024-01-02 0.886135 0.000000 0.000000 5.0 2
2024-01-03 0.000000 0.078001 0.000000 5.0 3
2024-01-04 0.536811 0.348313 0.000000 5.0 4
2024-01-05 0.000000 0.000000 0.000000 5.0 5
2024-01-06 0.000000 0.000000 0.155433 5.0 6

6. 空值

df1 = df2.reindex(index = d[0:4] , columns = list(df2.columns) + ['E'])
df1.loc[d[0] : d[1] , 'E'] = 1
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D G E
2024-01-01 0.000000 2.176335 -2.150249 5.0 1 1.0
2024-01-02 0.886135 -1.040961 0.000000 5.0 2 1.0
2024-01-03 -2.004391 0.078001 -0.622707 5.0 3 NaN
2024-01-04 0.536811 0.348313 -1.278939 5.0 4 NaN
# 去除空值行

df1.dropna(how = 'any')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D G E
2024-01-01 0.000000 2.176335 -2.150249 5.0 1 1.0
2024-01-02 0.886135 -1.040961 0.000000 5.0 2 1.0
# 将空值统一赋值 为value中的内容
df1.fillna(value = 5)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D G E
2024-01-01 0.000000 2.176335 -2.150249 5.0 1 1.0
2024-01-02 0.886135 -1.040961 0.000000 5.0 2 1.0
2024-01-03 -2.004391 0.078001 -0.622707 5.0 3 5.0
2024-01-04 0.536811 0.348313 -1.278939 5.0 4 5.0
# 查询空值
pd.isna(df1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D G E
2024-01-01 False False False False False False
2024-01-02 False False False False False False
2024-01-03 False False False False False True
2024-01-04 False False False False False True

7. 运算

1. 算术运算

df1 = pd.DataFrame(np.random.randn(2,5))
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 2 3 4
0 1.055586 -0.771184 2.027215 1.352444 0.075864
1 -0.058019 0.431261 -0.500285 0.493584 0.056665
df2 = pd.DataFrame(np.random.randn(3,4))
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 2 3
0 0.619248 -0.580760 0.439126 0.379799
1 -0.190684 -0.399529 -1.400112 0.469585
2 0.265781 0.863437 1.174504 1.351802
# 加法+
print(df1 + df2)
          0         1         2         3   4
0  1.674834 -1.351944  2.466340  1.732242 NaN
1 -0.248703  0.031732 -1.900397  0.963169 NaN
2       NaN       NaN       NaN       NaN NaN
# 减法-
print(df1 - df2)
          0         1         2         3   4
0  0.436338 -0.190424  1.588089  0.972645 NaN
1  0.132665  0.830789  0.899827  0.023998 NaN
2       NaN       NaN       NaN       NaN NaN
# 乘法*
print(df1 * df2)
          0         1         2         3   4
0  0.653669  0.447873  0.890202  0.513656 NaN
1  0.011063 -0.172301  0.700455  0.231780 NaN
2       NaN       NaN       NaN       NaN NaN
# 除法 / 
print(df1 / df2)
          0         1         2         3   4
0  1.704625  1.327888  4.616482  3.560949 NaN
1  0.304268 -1.079424  0.357318  1.051105 NaN
2       NaN       NaN       NaN       NaN NaN

2. 比较操作

# 等于
print('df1 等于 df2\n' ,df1.eq(df2)) 
df1 等于 df2
        0      1      2      3      4
0  False  False  False  False  False
1  False  False  False  False  False
2  False  False  False  False  False
# 不等于
print('\n df1 不等于 df2\n',df1.ne(df2))
 df1 不等于 df2
       0     1     2     3     4
0  True  True  True  True  True
1  True  True  True  True  True
2  True  True  True  True  True
# 大于
print('\n df1 大于 df2\n',df1.gt(df2)) 
 df1 大于 df2
        0      1      2      3      4
0   True  False   True   True  False
1   True   True   True   True  False
2  False  False  False  False  False
# 小于
print('\n df1 小于 df2\n',df1.lt(df2))
 df1 小于 df2
        0      1      2      3      4
0  False   True  False  False  False
1  False  False  False  False  False
2  False  False  False  False  False
# 大于等于
print('\n df1 大于等于 df2\n',df1.ge(df2)) 
 df1 大于等于 df2
        0      1      2      3      4
0   True  False   True   True  False
1   True   True   True   True  False
2  False  False  False  False  False
#小于等于
print('\n df1 小于等于 df2\n',df1.le (df2) )
 df1 小于等于 df2
        0      1      2      3      4
0  False   True  False  False  False
1  False  False  False  False  False
2  False  False  False  False  False

3. 统计

函数描述函数描述函数描述函数描述
count统计非空值数量sum汇总值mean平均值mad平均绝对偏差
median算数中位数min最小值max最大值mode众数
abs绝对值prod乘积std标准偏差var无偏方差
sem平均值的标准误差skew样本偏度 (第三阶)kurt样本峰度 (第四阶)quantile样本分位数 (不同 % 的值)
cumsum累加cumprod累乘cummax累积最大值cummin累积最小值
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 2 3 4
0 1.055586 -0.771184 2.027215 1.352444 0.075864
1 -0.058019 0.431261 -0.500285 0.493584 0.056665
# 列均值
df1.mean()
0    0.498783
1   -0.169962
2    0.763465
3    0.923014
4    0.066265
dtype: float64
# 按行累加
df1.cumsum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 2 3 4
0 1.055586 -0.771184 2.027215 1.352444 0.075864
1 0.997567 -0.339923 1.526930 1.846027 0.132529

4. 合并concat

# 直接按行堆叠了
pd.concat([df1 , df2])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 2 3 4
0 1.055586 -0.771184 2.027215 1.352444 0.075864
1 -0.058019 0.431261 -0.500285 0.493584 0.056665
0 0.619248 -0.580760 0.439126 0.379799 NaN
1 -0.190684 -0.399529 -1.400112 0.469585 NaN
2 0.265781 0.863437 1.174504 1.351802 NaN

5. 连接join

left = pd.DataFrame({
    'key':['foo','foo','bar'],
    'val':[1,2,2]
})
left
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key val
0 foo 1
1 foo 2
2 bar 2
right = pd.DataFrame({
    'key':['foo','foo','bar'],
    'val':[3,4,5]
})
right
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key val
0 foo 3
1 foo 4
2 bar 5
# on = 'key' ,key 相同的地卡尔集
pd.merge(left , right , on = 'key')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key val_x val_y
0 foo 1 3
1 foo 1 4
2 foo 2 3
3 foo 2 4
4 bar 2 5

6. 追加Append

pandas > 2.0 通过concat进行追加

df = pd.DataFrame(np.random.randn(8,4),columns = ['A' , 'B' , 'C' , 'D'])
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
0 0.605013 0.011193 -1.909824 -1.296454
1 -0.288646 1.062910 -0.715485 0.628659
2 0.370543 -0.184787 1.694653 -0.238653
3 2.037362 -0.529822 -0.349057 1.085996
4 -0.055223 0.897539 0.509561 -0.687590
5 0.673449 -0.333648 -0.405807 0.218246
6 0.158412 2.809229 -0.754137 -0.157016
7 -1.408664 0.121101 -0.253810 -1.405964
s = df.iloc[3]
s
A    2.037362
B   -0.529822
C   -0.349057
D    1.085996
Name: 3, dtype: float64
# 通过concat进行追加
df = pd.concat([df, pd.DataFrame([s])], ignore_index=True)
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
0 0.605013 0.011193 -1.909824 -1.296454
1 -0.288646 1.062910 -0.715485 0.628659
2 0.370543 -0.184787 1.694653 -0.238653
3 2.037362 -0.529822 -0.349057 1.085996
4 -0.055223 0.897539 0.509561 -0.687590
5 0.673449 -0.333648 -0.405807 0.218246
6 0.158412 2.809229 -0.754137 -0.157016
7 -1.408664 0.121101 -0.253810 -1.405964
8 2.037362 -0.529822 -0.349057 1.085996

7. 分组group

df = pd.DataFrame({
    'A':['foo' , 'bar' , 'foo' , 'bar'],
    'B':['one' , 'two' , 'one' , 'two'],
    'C':np.random.randn(4),
    'D':np.random.randn(4)
})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
0 foo one 0.117230 0.913722
1 bar two 0.174091 0.878413
2 foo one -0.325810 -1.498865
3 bar two 0.198171 0.861958
df.groupby('A').sum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
B C D
A
bar twotwo 0.372262 1.740370
foo oneone -0.208580 -0.585143
df.groupby(['A','B']).sum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
C D
A B
bar two 0.372262 1.740370
foo one -0.208580 -0.585143

8. 数据透视表

df = pd.DataFrame({
    'A': ['one', 'two', 'three', 'four'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12)
})

df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D E
0 one A foo -0.715140 -0.074482
1 two B foo -0.272014 0.922678
2 three C foo -0.939558 0.990516
3 four A bar 0.479970 0.240101
4 one B bar 0.834005 -0.865389
5 two C bar -0.454982 -0.322976
6 three A foo -0.012628 -0.645079
7 four B foo -1.638204 -0.268406
8 one C foo 1.543030 1.132784
9 two A bar -0.225850 -1.284192
10 three B bar 0.439350 0.549397
11 four C bar 0.908596 -0.441868
pd.pivot_table(df , values = 'D' , index = ['A' , 'B'] , columns = ['C'])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
C bar foo
A B
four A 0.479970 NaN
B NaN -1.638204
C 0.908596 NaN
one A NaN -0.715140
B 0.834005 NaN
C NaN 1.543030
three A NaN -0.012628
B 0.439350 NaN
C NaN -0.939558
two A -0.225850 NaN
B NaN -0.272014
C -0.454982 NaN
  • values: 作为透视表中的值
  • index: 作为透视表的行key
  • columns:作为透视表的列key

9. 时间序列

# 生成时间序列数据并重采样为5分钟频率
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
2012-01-01 00:00:00    293
2012-01-01 00:00:01    291
2012-01-01 00:00:02    271
2012-01-01 00:00:03    184
2012-01-01 00:00:04    278
                      ... 
2012-01-01 00:01:35    339
2012-01-01 00:01:36     12
2012-01-01 00:01:37    289
2012-01-01 00:01:38    238
2012-01-01 00:01:39    313
Freq: S, Length: 100, dtype: int64
# 将数据重采样为5分钟间隔并求和
ts.resample('5Min').sum()
2012-01-01    26100
Freq: 5T, dtype: int64
# 时区操作
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
2012-03-06    1.484014
2012-03-07   -0.826452
2012-03-08    0.347875
2012-03-09   -0.652534
2012-03-10   -0.648735
Freq: D, dtype: float64
# 本地化为UTC时区
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-03-06 00:00:00+00:00    1.484014
2012-03-07 00:00:00+00:00   -0.826452
2012-03-08 00:00:00+00:00    0.347875
2012-03-09 00:00:00+00:00   -0.652534
2012-03-10 00:00:00+00:00   -0.648735
Freq: D, dtype: float64
# 转换时区为美国东部时区
ts_utc.tz_convert('US/Eastern')
2012-03-05 19:00:00-05:00    1.484014
2012-03-06 19:00:00-05:00   -0.826452
2012-03-07 19:00:00-05:00    0.347875
2012-03-08 19:00:00-05:00   -0.652534
2012-03-09 19:00:00-05:00   -0.648735
Freq: D, dtype: float64
# 转换为时间段
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
2012-01-31    0.997888
2012-02-29   -0.498015
2012-03-31    1.065982
2012-04-30   -0.123414
2012-05-31    0.088834
Freq: M, dtype: float64
# 转换为时间段类型并再转回时间戳
ps = ts.to_period()
ps
2012-01    0.997888
2012-02   -0.498015
2012-03    1.065982
2012-04   -0.123414
2012-05    0.088834
Freq: M, dtype: float64
# 转换回时间戳
ps.to_timestamp()
2012-01-01    0.997888
2012-02-01   -0.498015
2012-03-01    1.065982
2012-04-01   -0.123414
2012-05-01    0.088834
Freq: MS, dtype: float64

10. 可视化

!pip install matplotlib
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: matplotlib in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (3.7.5)
Requirement already satisfied: contourpy>=1.0.1 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (1.1.1)
Requirement already satisfied: cycler>=0.10 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (4.54.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (1.4.7)
Requirement already satisfied: numpy<2,>=1.20 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (1.24.4)
Requirement already satisfied: packaging>=20.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (24.1)
Requirement already satisfied: pillow>=6.2.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (2.9.0.post0)
Requirement already satisfied: importlib-resources>=3.2.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (6.4.0)
Requirement already satisfied: zipp>=3.1.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.20.2)
Requirement already satisfied: six>=1.5 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

ts = pd.Series(np.random.randn(1000), \
               index = pd.date_range('1/1/2000',periods = 1000)
              )
ts = ts.cumsum()
ts.plot()
<Axes: >




png

import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randn(1000,4) , \
                  index = ts.index, \
                  columns = ['A' , 'B' , 'C' , 'D']
                 )
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc = 'best')
<matplotlib.legend.Legend at 0x7fcc745b7850>




<Figure size 640x480 with 0 Axes>



png

11. 数据输入/输出

1. csv

# 输出csv
df.to_csv('foo.csv')
# 输入csv
mid = pd.read_csv('foo.csv')
mid[:5]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Unnamed: 0 A B C D
0 2000-01-01 1.935169 -0.357413 -0.610720 -0.888047
1 2000-01-02 1.716798 0.895592 -0.827020 -1.707031
2 2000-01-03 0.769426 1.589204 -0.992573 -3.305788
3 2000-01-04 1.602683 2.225518 -1.860329 -2.416657
4 2000-01-05 0.337869 4.107487 -2.186224 -3.856869

2. HDF5

!pip install tables
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting tables
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/05/49/e392c92132b950c3a8bdbb66c687a82e808edd89024e73358084bf96e044/tables-3.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.5/6.5 MB 1.3 MB/s eta 0:00:00a 0:00:01m
[?25hCollecting cython>=0.29.21 (from tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/b2/52/eda119f98071ccde04a9a1c9c9a18fd6def025651c9d0cd01ad51d0dba36/Cython-3.0.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB 2.0 MB/s eta 0:00:00a 0:00:01
[?25hRequirement already satisfied: numpy>=1.19.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from tables) (1.24.4)
Collecting numexpr>=2.6.2 (from tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/0d/47/a2ede0e136a8ddc288b447c260aa035f3e75251f808aa61f6454b16dfd04/numexpr-2.8.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (384 kB)
Collecting blosc2~=2.0.0 (from tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a6/dd/7e214303c2acb0807fd1ad86d78112456c0fc9397f565ab86ebd147f2a9b/blosc2-2.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.9/3.9 MB 1.9 MB/s eta 0:00:00a 0:00:01m
[?25hRequirement already satisfied: packaging in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from tables) (24.1)
Collecting py-cpuinfo (from tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Collecting msgpack (from blosc2~=2.0.0->tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/55/f6/d4859a158a915be52eecd52dee9761ab3a5d84c834a1d13ffc198e068a48/msgpack-1.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (381 kB)
Installing collected packages: py-cpuinfo, numexpr, msgpack, cython, blosc2, tables
Successfully installed blosc2-2.0.0 cython-3.0.11 msgpack-1.1.0 numexpr-2.8.6 py-cpuinfo-9.0.0 tables-3.8.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

# 输出h5文件
df.to_hdf('foo.h5','df')
# 输入csv
mid = pd.read_hdf('foo.h5','df')
mid[:5]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C D
2000-01-01 1.935169 -0.357413 -0.610720 -0.888047
2000-01-02 1.716798 0.895592 -0.827020 -1.707031
2000-01-03 0.769426 1.589204 -0.992573 -3.305788
2000-01-04 1.602683 2.225518 -1.860329 -2.416657
2000-01-05 0.337869 4.107487 -2.186224 -3.856869

3. Excel

!pip install openpyxl
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting openpyxl
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

# 将 DataFrame 保存到 Excel 文件
df.to_excel('foo.xlsx', sheet_name='Sheet1')
# 从 Excel 文件读取数据
mid = pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
mid[:5]
Unnamed: 0 A B C D
0 2000-01-01 1.935169 -0.357413 -0.610720 -0.888047
1 2000-01-02 1.716798 0.895592 -0.827020 -1.707031
2 2000-01-03 0.769426 1.589204 -0.992573 -3.305788
3 2000-01-04 1.602683 2.225518 -1.860329 -2.416657
4 2000-01-05 0.337869 4.107487 -2.186224 -3.856869