1 快速入门Pandas

安装Pandas

!pip install pandas
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting pandas
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/f8/7f/5b047effafbdd34e52c9e2d7e44f729a0655efafb22198c45cf692cdc157/pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.4/12.4 MB 6.0 MB/s eta 0:00:0000:0100:01
[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from pandas) (2024.1)
Collecting tzdata>=2022.1 (from pandas)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Requirement already satisfied: numpy>=1.20.3 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from pandas) (1.24.4)
Requirement already satisfied: six>=1.5 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.3 tzdata-2024.2
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

import numpy as np
import pandas as pd

1.生成对象

s = pd.Series([1,2,3,4,np.nan,5,6])
s
0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
5    5.0
6    6.0
dtype: float64

np.nan 就是 ndarray中的 None 对象

Series 是序列的意思

df = pd.DataFrame({
    'A':1,
    'B':pd.Timestamp('20240101'),
    'C':pd.Series(1,index = list(range(4)) , dtype = 'float32'),
    'D':np.array([3]*4 , dtype = 'int32'),
    'E':pd.Categorical(['test','train','test','train']),
    'F':'foo'
})

df
A B C D E F
0 1 2024-01-01 1.0 3 test foo
1 1 2024-01-01 1.0 3 train foo
2 1 2024-01-01 1.0 3 test foo
3 1 2024-01-01 1.0 3 train foo
# 用含日期的时间索引 , 标签 , NumPy数组生成DataFrame
d = pd.date_range('20240101' , periods = 6)
d
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

根据时间向后延伸 periods 位

df2 = pd.DataFrame(np.random.randn(6,4) , index = d , columns = list('ABCD'))
df2
A B C D
2024-01-01 -0.215153 2.176335 -2.150249 -0.377296
2024-01-02 0.886135 -1.040961 -0.322376 -0.522593
2024-01-03 -2.004391 0.078001 -0.622707 -0.714978
2024-01-04 0.536811 0.348313 -1.278939 -1.554145
2024-01-05 -0.609511 -2.019143 -0.871984 1.337654
2024-01-06 -0.340597 -1.329190 0.155433 0.481517

2. 查看数据

# 查看 DataFrame 列的数据类型
df.dtypes
A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
# 查看 DataFrame 头部和尾部数据
df2.head(1)
A B C D E F
0 1 2024-01-01 1.0 3 test foo
df2.head()
A B C D
2024-01-01 0.563029 -1.020525 0.230456 1.345755
2024-01-02 1.303791 -0.793388 -0.456521 1.052202
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 -0.451163 0.727276

df.head(null) 查看5条

df2.tail(1)
A B C D
2024-01-06 -0.236926 0.556183 1.293092 2.213492
df2.tail()
A B C D
2024-01-02 1.303791 -0.793388 -0.456521 1.052202
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 -0.451163 0.727276
2024-01-06 -0.236926 0.556183 1.293092 2.213492

df.tail(null) 查看5条

# 查看索引
print(df.index , end = '\n \n')

print(df2.index)
Index([0, 1, 2, 3], dtype='int64')
 
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')
# 查看列名
df2.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
# 查看数据的统计概要
df2.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.371758 0.698415 0.222825 1.139684
std 0.579717 1.509786 0.669168 0.601573
min -0.236926 -1.020525 -0.456521 0.492788
25% -0.078050 -0.459958 -0.314308 0.797105
50% 0.330994 0.548257 0.163356 1.029397
75% 0.619817 1.931883 0.526236 1.272367
max 1.303791 2.517436 1.293092 2.213492

3. 排序

# 按轴排序
df.sort_index(axis = 1, ascending = False)
F E D C B A
0 foo test 3 1.0 2024-01-01 1
1 foo train 3 1.0 2024-01-01 1
2 foo test 3 1.0 2024-01-01 1
3 foo train 3 1.0 2024-01-01 1

按照列名排序

df.sort_index(axis = 0, ascending = False)
A B C D E F
3 1 2024-01-01 1.0 3 train foo
2 1 2024-01-01 1.0 3 test foo
1 1 2024-01-01 1.0 3 train foo
0 1 2024-01-01 1.0 3 test foo

按照行索引排序

# 按值排序
df2.sort_values(by = 'B' , ascending = False)
A B C D
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-06 -0.236926 0.556183 1.293092 2.213492
2024-01-05 0.638746 0.540331 -0.451163 0.727276
2024-01-02 1.303791 -0.793388 -0.456521 1.052202
2024-01-01 0.563029 -1.020525 0.230456 1.345755

4. 选择数据

# 选择单列
df['A']
0    1
1    1
2    1
3    1
Name: A, dtype: int64
# 切片按行选择
df[2:]
A B C D E F
2 1 2024-01-01 1.0 3 test foo
3 1 2024-01-01 1.0 3 train foo
df[::-1]
A B C D E F
3 1 2024-01-01 1.0 3 train foo
2 1 2024-01-01 1.0 3 test foo
1 1 2024-01-01 1.0 3 train foo
0 1 2024-01-01 1.0 3 test foo
df[3:0:-1]
A B C D E F
3 1 2024-01-01 1.0 3 train foo
2 1 2024-01-01 1.0 3 test foo
1 1 2024-01-01 1.0 3 train foo
# 对索引范围切片
df2['2024-01-03':'2024-01-05':1]
A B C D
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 -0.451163 0.727276

对索引范围切片最终结果会是左闭右闭

1. 按标签选择

# index 也就是标签
df2.loc['2024-01-01']
A    0.563029
B   -1.020525
C    0.230456
D    1.345755
Name: 2024-01-01 00:00:00, dtype: float64
df.loc[1]
A                      1
B    2024-01-01 00:00:00
C                    1.0
D                      3
E                  train
F                    foo
Name: 1, dtype: object
# 用标签选择多列数据

df.loc[:,['A','B']]
A B
0 1 2024-01-01
1 1 2024-01-01
2 1 2024-01-01
3 1 2024-01-01
# 用标签选择多列数据

df.loc[2:1:-1,['A','B']]
A B
2 1 2024-01-01
1 1 2024-01-01
# 只取莫一行莫一个值
df.loc[2,'B']
Timestamp('2024-01-01 00:00:00')
# 快速某个标签的值
df.at[1,'B']
Timestamp('2024-01-01 00:00:00')

2. 按位置选择

# 按位置选择
df.iloc[3]
A                      1
B    2024-01-01 00:00:00
C                    1.0
D                      3
E                  train
F                    foo
Name: 3, dtype: object

df.iloc[N] 取第N行的数据

# 用整数的切片
df2.iloc[3:5 , 0:2]
A B
2024-01-04 -0.137053 2.390449
2024-01-05 0.638746 0.540331

df2.iloc[i:j , x:y]

i ~ j 行

x ~ y列

# 精确的按位切片
df2.iloc[[1,2,3],[0,2]]
A C
2024-01-02 1.303791 -0.456521
2024-01-03 0.098958 0.096255
2024-01-04 -0.137053 0.624829
# 行切片
df2.iloc[1:3,::]
A B C D
2024-01-02 1.303791 -0.793388 -0.456521 1.052202
2024-01-03 0.098958 2.517436 0.096255 0.492788
# 列切片
df2.iloc[::,2:3]
C
2024-01-01 0.230456
2024-01-02 -0.456521
2024-01-03 0.096255
2024-01-04 0.624829
2024-01-05 -0.451163
2024-01-06 1.293092
# 显式精确取值
df.iloc[1,1]
Timestamp('2024-01-01 00:00:00')
# 快速精确取值
df.iat[1,1]
Timestamp('2024-01-01 00:00:00')

3. 筛选

df2[df2['B'] > 0]
A B C D
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 -0.137053 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 -0.451163 0.727276
2024-01-06 -0.236926 0.556183 1.293092 2.213492
df2[df2 > 0]
A B C D
2024-01-01 0.563029 NaN 0.230456 1.345755
2024-01-02 1.303791 NaN NaN 1.052202
2024-01-03 0.098958 2.517436 0.096255 0.492788
2024-01-04 NaN 2.390449 0.624829 1.006591
2024-01-05 0.638746 0.540331 NaN 0.727276
2024-01-06 NaN 0.556183 1.293092 2.213492
# 用isin()筛选
df3 = df2.copy()
df3['E'] = ['one','one','two','three','four','three']
df3
A B C D E
2024-01-01 0.563029 -1.020525 0.230456 1.345755 one
2024-01-02 1.303791 -0.793388 -0.456521 1.052202 one
2024-01-03 0.098958 2.517436 0.096255 0.492788 two
2024-01-04 -0.137053 2.390449 0.624829 1.006591 three
2024-01-05 0.638746 0.540331 -0.451163 0.727276 four
2024-01-06 -0.236926 0.556183 1.293092 2.213492 three
df3[df3['E'].isin(['two','four'])]
A B C D E
2024-01-03 0.098958 2.517436 0.096255 0.492788 two
2024-01-05 0.638746 0.540331 -0.451163 0.727276 four

5. 赋值

# 用索引自动对齐新增列的数据
s1 = pd.Series([1,2,3,4,5,6] , index = d)
s1
2024-01-01    1
2024-01-02    2
2024-01-03    3
2024-01-04    4
2024-01-05    5
2024-01-06    6
Freq: D, dtype: int64
df2['X'] = s1
df2
A B C D X
2024-01-01 0.563029 -1.020525 0.230456 1.345755 1
2024-01-02 1.303791 -0.793388 -0.456521 1.052202 2
2024-01-03 0.098958 2.517436 0.096255 0.492788 3
2024-01-04 -0.137053 2.390449 0.624829 1.006591 4
2024-01-05 0.638746 0.540331 -0.451163 0.727276 5
2024-01-06 -0.236926 0.556183 1.293092 2.213492 6

数据s1 的index 需要和被添加的的表中的index数据类型和数量一样

# 按标签赋值
df2.at['2024-01-01','A'] = 0
df2
A B C D
2024-01-01 0.000000 2.176335 -2.150249 -0.377296
2024-01-02 0.886135 -1.040961 -0.322376 -0.522593
2024-01-03 -2.004391 0.078001 -0.622707 -0.714978
2024-01-04 0.536811 0.348313 -1.278939 -1.554145
2024-01-05 -0.609511 -2.019143 -0.871984 1.337654
2024-01-06 -0.340597 -1.329190 0.155433 0.481517
# 按标签赋值
df2.iat[1,2] = 0
df2
A B C D
2024-01-01 0.000000 2.176335 -2.150249 -0.377296
2024-01-02 0.886135 -1.040961 0.000000 -0.522593
2024-01-03 -2.004391 0.078001 -0.622707 -0.714978
2024-01-04 0.536811 0.348313 -1.278939 -1.554145
2024-01-05 -0.609511 -2.019143 -0.871984 1.337654
2024-01-06 -0.340597 -1.329190 0.155433 0.481517
# 按数组赋值
df2['G'] = [1,2,3,4,5,6]
df2
A B C D G
2024-01-01 0.000000 2.176335 -2.150249 -0.377296 1
2024-01-02 0.886135 -1.040961 0.000000 -0.522593 2
2024-01-03 -2.004391 0.078001 -0.622707 -0.714978 3
2024-01-04 0.536811 0.348313 -1.278939 -1.554145 4
2024-01-05 -0.609511 -2.019143 -0.871984 1.337654 5
2024-01-06 -0.340597 -1.329190 0.155433 0.481517 6
# 按 NumPy 数组赋值
df2.loc[:,'D'] = np.array([5] * len(df2))
df2
A B C D G
2024-01-01 0.000000 2.176335 -2.150249 5.0 1
2024-01-02 0.886135 -1.040961 0.000000 5.0 2
2024-01-03 -2.004391 0.078001 -0.622707 5.0 3
2024-01-04 0.536811 0.348313 -1.278939 5.0 4
2024-01-05 -0.609511 -2.019143 -0.871984 5.0 5
2024-01-06 -0.340597 -1.329190 0.155433 5.0 6
# where 条件赋值
df3 = df2.copy()
df3[df3 < 0] = 0
df3
A B C D G
2024-01-01 0.000000 2.176335 0.000000 5.0 1
2024-01-02 0.886135 0.000000 0.000000 5.0 2
2024-01-03 0.000000 0.078001 0.000000 5.0 3
2024-01-04 0.536811 0.348313 0.000000 5.0 4
2024-01-05 0.000000 0.000000 0.000000 5.0 5
2024-01-06 0.000000 0.000000 0.155433 5.0 6

6. 空值

df1 = df2.reindex(index = d[0:4] , columns = list(df2.columns) + ['E'])
df1.loc[d[0] : d[1] , 'E'] = 1
df1
A B C D G E
2024-01-01 0.000000 2.176335 -2.150249 5.0 1 1.0
2024-01-02 0.886135 -1.040961 0.000000 5.0 2 1.0
2024-01-03 -2.004391 0.078001 -0.622707 5.0 3 NaN
2024-01-04 0.536811 0.348313 -1.278939 5.0 4 NaN
# 去除空值行

df1.dropna(how = 'any')
A B C D G E
2024-01-01 0.000000 2.176335 -2.150249 5.0 1 1.0
2024-01-02 0.886135 -1.040961 0.000000 5.0 2 1.0
# 将空值统一赋值 为value中的内容
df1.fillna(value = 5)
A B C D G E
2024-01-01 0.000000 2.176335 -2.150249 5.0 1 1.0
2024-01-02 0.886135 -1.040961 0.000000 5.0 2 1.0
2024-01-03 -2.004391 0.078001 -0.622707 5.0 3 5.0
2024-01-04 0.536811 0.348313 -1.278939 5.0 4 5.0
# 查询空值
pd.isna(df1)
A B C D G E
2024-01-01 False False False False False False
2024-01-02 False False False False False False
2024-01-03 False False False False False True
2024-01-04 False False False False False True

7. 运算

1. 算术运算

df1 = pd.DataFrame(np.random.randn(2,5))
df1
0 1 2 3 4
0 1.055586 -0.771184 2.027215 1.352444 0.075864
1 -0.058019 0.431261 -0.500285 0.493584 0.056665
df2 = pd.DataFrame(np.random.randn(3,4))
df2
0 1 2 3
0 0.619248 -0.580760 0.439126 0.379799
1 -0.190684 -0.399529 -1.400112 0.469585
2 0.265781 0.863437 1.174504 1.351802
# 加法+
print(df1 + df2)
          0         1         2         3   4
0  1.674834 -1.351944  2.466340  1.732242 NaN
1 -0.248703  0.031732 -1.900397  0.963169 NaN
2       NaN       NaN       NaN       NaN NaN
# 减法-
print(df1 - df2)
          0         1         2         3   4
0  0.436338 -0.190424  1.588089  0.972645 NaN
1  0.132665  0.830789  0.899827  0.023998 NaN
2       NaN       NaN       NaN       NaN NaN
# 乘法*
print(df1 * df2)
          0         1         2         3   4
0  0.653669  0.447873  0.890202  0.513656 NaN
1  0.011063 -0.172301  0.700455  0.231780 NaN
2       NaN       NaN       NaN       NaN NaN
# 除法 / 
print(df1 / df2)
          0         1         2         3   4
0  1.704625  1.327888  4.616482  3.560949 NaN
1  0.304268 -1.079424  0.357318  1.051105 NaN
2       NaN       NaN       NaN       NaN NaN

2. 比较操作

# 等于
print('df1 等于 df2\n' ,df1.eq(df2)) 
df1 等于 df2
        0      1      2      3      4
0  False  False  False  False  False
1  False  False  False  False  False
2  False  False  False  False  False
# 不等于
print('\n df1 不等于 df2\n',df1.ne(df2))
 df1 不等于 df2
       0     1     2     3     4
0  True  True  True  True  True
1  True  True  True  True  True
2  True  True  True  True  True
# 大于
print('\n df1 大于 df2\n',df1.gt(df2)) 
 df1 大于 df2
        0      1      2      3      4
0   True  False   True   True  False
1   True   True   True   True  False
2  False  False  False  False  False
# 小于
print('\n df1 小于 df2\n',df1.lt(df2))
 df1 小于 df2
        0      1      2      3      4
0  False   True  False  False  False
1  False  False  False  False  False
2  False  False  False  False  False
# 大于等于
print('\n df1 大于等于 df2\n',df1.ge(df2)) 
 df1 大于等于 df2
        0      1      2      3      4
0   True  False   True   True  False
1   True   True   True   True  False
2  False  False  False  False  False
#小于等于
print('\n df1 小于等于 df2\n',df1.le (df2) )
 df1 小于等于 df2
        0      1      2      3      4
0  False   True  False  False  False
1  False  False  False  False  False
2  False  False  False  False  False

3. 统计

函数 描述 函数 描述 函数 描述 函数 描述
count 统计非空值数量 sum 汇总值 mean 平均值 mad 平均绝对偏差
median 算数中位数 min 最小值 max 最大值 mode 众数
abs 绝对值 prod 乘积 std 标准偏差 var 无偏方差
sem 平均值的标准误差 skew 样本偏度 (第三阶) kurt 样本峰度 (第四阶) quantile 样本分位数 (不同 % 的值)
cumsum 累加 cumprod 累乘 cummax 累积最大值 cummin 累积最小值
df1
0 1 2 3 4
0 1.055586 -0.771184 2.027215 1.352444 0.075864
1 -0.058019 0.431261 -0.500285 0.493584 0.056665
# 列均值
df1.mean()
0    0.498783
1   -0.169962
2    0.763465
3    0.923014
4    0.066265
dtype: float64
# 按行累加
df1.cumsum()
0 1 2 3 4
0 1.055586 -0.771184 2.027215 1.352444 0.075864
1 0.997567 -0.339923 1.526930 1.846027 0.132529

4. 合并concat

# 直接按行堆叠了
pd.concat([df1 , df2])
0 1 2 3 4
0 1.055586 -0.771184 2.027215 1.352444 0.075864
1 -0.058019 0.431261 -0.500285 0.493584 0.056665
0 0.619248 -0.580760 0.439126 0.379799 NaN
1 -0.190684 -0.399529 -1.400112 0.469585 NaN
2 0.265781 0.863437 1.174504 1.351802 NaN

5. 连接join

left = pd.DataFrame({
    'key':['foo','foo','bar'],
    'val':[1,2,2]
})
left
key val
0 foo 1
1 foo 2
2 bar 2
right = pd.DataFrame({
    'key':['foo','foo','bar'],
    'val':[3,4,5]
})
right
key val
0 foo 3
1 foo 4
2 bar 5
# on = 'key' ,key 相同的地卡尔集
pd.merge(left , right , on = 'key')
key val_x val_y
0 foo 1 3
1 foo 1 4
2 foo 2 3
3 foo 2 4
4 bar 2 5

6. 追加Append

pandas > 2.0 通过concat进行追加

df = pd.DataFrame(np.random.randn(8,4),columns = ['A' , 'B' , 'C' , 'D'])
df
A B C D
0 0.605013 0.011193 -1.909824 -1.296454
1 -0.288646 1.062910 -0.715485 0.628659
2 0.370543 -0.184787 1.694653 -0.238653
3 2.037362 -0.529822 -0.349057 1.085996
4 -0.055223 0.897539 0.509561 -0.687590
5 0.673449 -0.333648 -0.405807 0.218246
6 0.158412 2.809229 -0.754137 -0.157016
7 -1.408664 0.121101 -0.253810 -1.405964
s = df.iloc[3]
s
A    2.037362
B   -0.529822
C   -0.349057
D    1.085996
Name: 3, dtype: float64
# 通过concat进行追加
df = pd.concat([df, pd.DataFrame([s])], ignore_index=True)
df
A B C D
0 0.605013 0.011193 -1.909824 -1.296454
1 -0.288646 1.062910 -0.715485 0.628659
2 0.370543 -0.184787 1.694653 -0.238653
3 2.037362 -0.529822 -0.349057 1.085996
4 -0.055223 0.897539 0.509561 -0.687590
5 0.673449 -0.333648 -0.405807 0.218246
6 0.158412 2.809229 -0.754137 -0.157016
7 -1.408664 0.121101 -0.253810 -1.405964
8 2.037362 -0.529822 -0.349057 1.085996

7. 分组group

df = pd.DataFrame({
    'A':['foo' , 'bar' , 'foo' , 'bar'],
    'B':['one' , 'two' , 'one' , 'two'],
    'C':np.random.randn(4),
    'D':np.random.randn(4)
})
df
A B C D
0 foo one 0.117230 0.913722
1 bar two 0.174091 0.878413
2 foo one -0.325810 -1.498865
3 bar two 0.198171 0.861958
df.groupby('A').sum()
B C D
A
bar twotwo 0.372262 1.740370
foo oneone -0.208580 -0.585143
df.groupby(['A','B']).sum()
C D
A B
bar two 0.372262 1.740370
foo one -0.208580 -0.585143

8. 数据透视表

df = pd.DataFrame({
    'A': ['one', 'two', 'three', 'four'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12)
})

df
A B C D E
0 one A foo -0.715140 -0.074482
1 two B foo -0.272014 0.922678
2 three C foo -0.939558 0.990516
3 four A bar 0.479970 0.240101
4 one B bar 0.834005 -0.865389
5 two C bar -0.454982 -0.322976
6 three A foo -0.012628 -0.645079
7 four B foo -1.638204 -0.268406
8 one C foo 1.543030 1.132784
9 two A bar -0.225850 -1.284192
10 three B bar 0.439350 0.549397
11 four C bar 0.908596 -0.441868
pd.pivot_table(df , values = 'D' , index = ['A' , 'B'] , columns = ['C'])
C bar foo
A B
four A 0.479970 NaN
B NaN -1.638204
C 0.908596 NaN
one A NaN -0.715140
B 0.834005 NaN
C NaN 1.543030
three A NaN -0.012628
B 0.439350 NaN
C NaN -0.939558
two A -0.225850 NaN
B NaN -0.272014
C -0.454982 NaN
  • values: 作为透视表中的值
  • index: 作为透视表的行key
  • columns:作为透视表的列key

9. 时间序列

# 生成时间序列数据并重采样为5分钟频率
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
2012-01-01 00:00:00    293
2012-01-01 00:00:01    291
2012-01-01 00:00:02    271
2012-01-01 00:00:03    184
2012-01-01 00:00:04    278
                      ... 
2012-01-01 00:01:35    339
2012-01-01 00:01:36     12
2012-01-01 00:01:37    289
2012-01-01 00:01:38    238
2012-01-01 00:01:39    313
Freq: S, Length: 100, dtype: int64
# 将数据重采样为5分钟间隔并求和
ts.resample('5Min').sum()
2012-01-01    26100
Freq: 5T, dtype: int64
# 时区操作
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
2012-03-06    1.484014
2012-03-07   -0.826452
2012-03-08    0.347875
2012-03-09   -0.652534
2012-03-10   -0.648735
Freq: D, dtype: float64
# 本地化为UTC时区
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-03-06 00:00:00+00:00    1.484014
2012-03-07 00:00:00+00:00   -0.826452
2012-03-08 00:00:00+00:00    0.347875
2012-03-09 00:00:00+00:00   -0.652534
2012-03-10 00:00:00+00:00   -0.648735
Freq: D, dtype: float64
# 转换时区为美国东部时区
ts_utc.tz_convert('US/Eastern')
2012-03-05 19:00:00-05:00    1.484014
2012-03-06 19:00:00-05:00   -0.826452
2012-03-07 19:00:00-05:00    0.347875
2012-03-08 19:00:00-05:00   -0.652534
2012-03-09 19:00:00-05:00   -0.648735
Freq: D, dtype: float64
# 转换为时间段
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
2012-01-31    0.997888
2012-02-29   -0.498015
2012-03-31    1.065982
2012-04-30   -0.123414
2012-05-31    0.088834
Freq: M, dtype: float64
# 转换为时间段类型并再转回时间戳
ps = ts.to_period()
ps
2012-01    0.997888
2012-02   -0.498015
2012-03    1.065982
2012-04   -0.123414
2012-05    0.088834
Freq: M, dtype: float64
# 转换回时间戳
ps.to_timestamp()
2012-01-01    0.997888
2012-02-01   -0.498015
2012-03-01    1.065982
2012-04-01   -0.123414
2012-05-01    0.088834
Freq: MS, dtype: float64

10. 可视化

!pip install matplotlib
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: matplotlib in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (3.7.5)
Requirement already satisfied: contourpy>=1.0.1 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (1.1.1)
Requirement already satisfied: cycler>=0.10 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (4.54.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (1.4.7)
Requirement already satisfied: numpy<2,>=1.20 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (1.24.4)
Requirement already satisfied: packaging>=20.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (24.1)
Requirement already satisfied: pillow>=6.2.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (2.9.0.post0)
Requirement already satisfied: importlib-resources>=3.2.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from matplotlib) (6.4.0)
Requirement already satisfied: zipp>=3.1.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.20.2)
Requirement already satisfied: six>=1.5 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

ts = pd.Series(np.random.randn(1000), \
               index = pd.date_range('1/1/2000',periods = 1000)
              )
ts = ts.cumsum()
ts.plot()
<Axes: >

png

import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randn(1000,4) , \
                  index = ts.index, \
                  columns = ['A' , 'B' , 'C' , 'D']
                 )
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc = 'best')
<matplotlib.legend.Legend at 0x7fcc745b7850>




<Figure size 640x480 with 0 Axes>

png

11. 数据输入/输出

1. csv

# 输出csv
df.to_csv('foo.csv')
# 输入csv
mid = pd.read_csv('foo.csv')
mid[:5]
Unnamed: 0 A B C D
0 2000-01-01 1.935169 -0.357413 -0.610720 -0.888047
1 2000-01-02 1.716798 0.895592 -0.827020 -1.707031
2 2000-01-03 0.769426 1.589204 -0.992573 -3.305788
3 2000-01-04 1.602683 2.225518 -1.860329 -2.416657
4 2000-01-05 0.337869 4.107487 -2.186224 -3.856869

2. HDF5

!pip install tables
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting tables
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/05/49/e392c92132b950c3a8bdbb66c687a82e808edd89024e73358084bf96e044/tables-3.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.5/6.5 MB 1.3 MB/s eta 0:00:00a 0:00:01m
[?25hCollecting cython>=0.29.21 (from tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/b2/52/eda119f98071ccde04a9a1c9c9a18fd6def025651c9d0cd01ad51d0dba36/Cython-3.0.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB 2.0 MB/s eta 0:00:00a 0:00:01
[?25hRequirement already satisfied: numpy>=1.19.0 in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from tables) (1.24.4)
Collecting numexpr>=2.6.2 (from tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/0d/47/a2ede0e136a8ddc288b447c260aa035f3e75251f808aa61f6454b16dfd04/numexpr-2.8.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (384 kB)
Collecting blosc2~=2.0.0 (from tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a6/dd/7e214303c2acb0807fd1ad86d78112456c0fc9397f565ab86ebd147f2a9b/blosc2-2.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.9/3.9 MB 1.9 MB/s eta 0:00:00a 0:00:01m
[?25hRequirement already satisfied: packaging in /root/anaconda3/envs/pyspark/lib/python3.8/site-packages (from tables) (24.1)
Collecting py-cpuinfo (from tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Collecting msgpack (from blosc2~=2.0.0->tables)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/55/f6/d4859a158a915be52eecd52dee9761ab3a5d84c834a1d13ffc198e068a48/msgpack-1.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (381 kB)
Installing collected packages: py-cpuinfo, numexpr, msgpack, cython, blosc2, tables
Successfully installed blosc2-2.0.0 cython-3.0.11 msgpack-1.1.0 numexpr-2.8.6 py-cpuinfo-9.0.0 tables-3.8.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

# 输出h5文件
df.to_hdf('foo.h5','df')
# 输入csv
mid = pd.read_hdf('foo.h5','df')
mid[:5]
A B C D
2000-01-01 1.935169 -0.357413 -0.610720 -0.888047
2000-01-02 1.716798 0.895592 -0.827020 -1.707031
2000-01-03 0.769426 1.589204 -0.992573 -3.305788
2000-01-04 1.602683 2.225518 -1.860329 -2.416657
2000-01-05 0.337869 4.107487 -2.186224 -3.856869

3. Excel

!pip install openpyxl
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting openpyxl
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

# 将 DataFrame 保存到 Excel 文件
df.to_excel('foo.xlsx', sheet_name='Sheet1')
# 从 Excel 文件读取数据
mid = pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
mid[:5]
Unnamed: 0 A B C D
0 2000-01-01 1.935169 -0.357413 -0.610720 -0.888047
1 2000-01-02 1.716798 0.895592 -0.827020 -1.707031
2 2000-01-03 0.769426 1.589204 -0.992573 -3.305788
3 2000-01-04 1.602683 2.225518 -1.860329 -2.416657
4 2000-01-05 0.337869 4.107487 -2.186224 -3.856869