DataFrame是一个表格型的数据结构,可以看作是由Series组成的字典(共用同一个索引)。DataFrame由按一定顺序排列的多列数据组成。DataFrame既有行索引,也有列索引。
- 行索引:index
- 列索引:columns
- 值:values
1.DataFrame的创建
pd.DataFrame(data,index,columns)
- 字典:以字典的键作为每一列的名称;以字典的值(一个数组)作为每一列;会自动加上每一行的索引;若传入的列与字典的键不匹配,填充NaN
- 基本属性和方法
- values:值
- columns:列索引
- index:行索引
- shape:形状
- head():查看前几条数据,默认5条
- tail():查看后几条数据,默认5条
- 基本属性和方法的示例代码
import numpy as np
import pandas as pd
# Build a DataFrame from a dict: each key becomes a column name and each
# list becomes that column's values; a default integer row index is added.
_RULE = '###########################'

d = {
    "品种": ['暹罗猫', '阿比西尼亚猫', '斯芬克斯猫'],
    "价格": ['2800', '3500', '10000'],
}
df = pd.DataFrame(data=d)
print(df)

# Walk through the basic attributes/methods, one separator line before each.
for _label, _value in (
    ('value值:', df.values),
    ('列索引:', df.columns),
    ('行索引:', df.index),
    ('形状:', df.shape),
    ('前两条数据:\n', df.head(2)),
    ('后两条数据:\n', df.tail(2)),
):
    print(_RULE)
    print(_label, _value)
- 其他方式
import numpy as np
import pandas as pd
# Build a 4x6 DataFrame of random integers in [10, 100) with explicit
# row labels (price tiers) and column labels (cat breeds).
_price_tiers = ['原价', '活动价', '会员价', '团购价']
_breeds = ['暹罗猫', '阿比西尼亚猫', '斯芬克斯猫', '美短猫', '布偶猫', '缅因猫']
_cells = np.random.randint(10, 100, size=(4, 6))
df = pd.DataFrame(_cells, index=_price_tiers, columns=_breeds)
print(df)
2.DataFrame的索引(优先对列进行操作)
- 对列进行索引
- 通过类似字典的方式
- 通过属性的方式
import numpy as np
import pandas as pd
# Column indexing: attribute access, dict-style access with one label,
# and dict-style access with a list of labels.
_RULE = '###############################'

df = pd.DataFrame(
    np.random.randint(10, 100, size=(4, 6)),
    index=['暹罗猫', '阿比西尼亚猫', '斯芬克斯猫', '美短猫'],
    columns=['原价', '活动价', '会员价', '团购价', '秒杀价', '促销价'],
)
print(df)

for _selection in (
    df.原价,                  # attribute style -> Series
    df['原价'],               # dict style, single label -> Series
    df[['原价', '促销价']],   # dict style, list of labels -> DataFrame
):
    print(_RULE)
    print(_selection)
- 对行进行索引
- 使用.loc[]加index来进行行索引
- 使用.iloc[]加整数来进行行索引
import numpy as np
import pandas as pd
# Row indexing: .loc takes index labels, .iloc takes integer positions;
# both accept a single key or a list of keys.
_RULE = '###############################'

df = pd.DataFrame(
    np.random.randint(10, 100, size=(4, 6)),
    index=['暹罗猫', '阿比西尼亚猫', '斯芬克斯猫', '美短猫'],
    columns=['原价', '活动价', '会员价', '团购价', '秒杀价', '促销价'],
)
print(df)

for _rows in (
    df.loc['暹罗猫'],                       # one row by label
    df.iloc[3],                             # one row by position
    df.loc[['暹罗猫', '阿比西尼亚猫']],     # several rows by label
    df.iloc[[1, 3]],                        # several rows by position
):
    print(_RULE)
    print(_rows)
- 对元素进行索引
- 使用列索引
- 使用行索引
- 使用values属性
import numpy as np
import pandas as pd
# Element indexing: read a single cell either column-first or row-first.
df = pd.DataFrame(
    data=np.random.randint(10, 100, size=(4, 6)),
    index=['暹罗猫', '阿比西尼亚猫', '斯芬克斯猫', '美短猫'],
    columns=['原价', '活动价', '会员价', '团购价', '秒杀价', '促销价'],
)
print(df)
print('###############################')
# Column first, then row.
print('暹罗猫的原价:', df['原价']['暹罗猫'])
# Positional access on a string-labelled Series must go through .iloc:
# using a bare integer key with [] is deprecated (FutureWarning since
# pandas 2.1) and is treated as a label lookup in later versions.
print('暹罗猫的原价:', df['原价'].iloc[0])
print('暹罗猫的原价:', df.原价.iloc[0])
print('暹罗猫的原价:', df.原价.暹罗猫)
print('###############################')
# Row first, then column.
print('阿比西尼亚猫的会员价:', df.loc['阿比西尼亚猫']['会员价'])
# Same rule as above: the row Series is labelled by column names, so an
# integer position needs .iloc.
print('阿比西尼亚猫的会员价:', df.loc['阿比西尼亚猫'].iloc[2])
print('阿比西尼亚猫的会员价:', df.iloc[1].iloc[2])
print('阿比西尼亚猫的会员价:', df.iloc[1, 2])
print('阿比西尼亚猫的会员价:', df.iloc[1]['会员价'])
3.DataFrame的切片(优先对行进行操作)
- 行切片
import numpy as np
import pandas as pd
# Row slicing: positional slices exclude the end, label slices include it.
_RULE = '###############################'

df = pd.DataFrame(
    np.random.randint(10, 100, size=(4, 6)),
    index=['暹罗猫', '阿比西尼亚猫', '斯芬克斯猫', '美短猫'],
    columns=['原价', '活动价', '会员价', '团购价', '秒杀价', '促销价'],
)
print(df)
print(_RULE)

for _rows in (
    df[1:3],                              # positional: start included, end excluded
    df['阿比西尼亚猫':'美短猫'],          # by label: both ends included
    df.iloc[1:3],                         # positional: start included, end excluded
    df.loc['阿比西尼亚猫':'美短猫'],      # by label: both ends included
):
    print(_rows)
    print(_RULE)
- 列切片:必须先对行进行切片(用 `:` 选取全部行),再对列进行切片
import numpy as np
import pandas as pd
# Column slicing: take every row with ":" and slice the columns by
# position (.iloc, end excluded) or by label (.loc, end included).
_RULE = '###############################'

df = pd.DataFrame(
    np.random.randint(10, 100, size=(4, 6)),
    index=['暹罗猫', '阿比西尼亚猫', '斯芬克斯猫', '美短猫'],
    columns=['原价', '活动价', '会员价', '团购价', '秒杀价', '促销价'],
)
print(df)
print(_RULE)

_by_position = df.iloc[:, 1:3]
print(_by_position)
print(_RULE)

_by_label = df.loc[:, "活动价":"团购价"]
print(_by_label)
print(_RULE)
- 同时切片
import numpy as np
import pandas as pd
# Slice rows and columns in one step; rows may be given as a slice or as
# an explicit list of labels/positions.
_RULE = '###############################'

df = pd.DataFrame(
    np.random.randint(10, 100, size=(4, 6)),
    index=['暹罗猫', '阿比西尼亚猫', '斯芬克斯猫', '美短猫'],
    columns=['原价', '活动价', '会员价', '团购价', '秒杀价', '促销价'],
)
print(df)
print(_RULE)

for _window in (
    df.iloc[1:3, 1:3],
    df.loc["阿比西尼亚猫":"斯芬克斯猫", "活动价":"团购价"],
    df.loc[["暹罗猫", "斯芬克斯猫"], "活动价":"团购价"],
    df.iloc[[0, 2], 1:3],
):
    print(_window)
    print(_RULE)
4.DataFrame的运算
示例数据
import numpy as np
import pandas as pd
# Sample data for the arithmetic examples: two 3x3 score tables that share
# the same labels, and one 4x4 table with an extra student and subject.
_RULE = '#################################'

# Raw score tables.
a = [[90, 80, 70], [100, 89, 78], [78, 99, 65]]
b = [[45, 88, 92], [74, 81, 96], [67, 80, 77]]
c = [[45, 88, 92, 61], [74, 81, 96, 88], [67, 80, 77, 78], [56, 84, 69, 93]]

# ndarray versions used as the DataFrame payloads.
n, m, x = np.array(a), np.array(b), np.array(c)

df1 = pd.DataFrame(n, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])
df2 = pd.DataFrame(m, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])
df3 = pd.DataFrame(
    x,
    index=['学生1', '学生2', '学生3', '学生4'],
    columns=['语文', '数学', '英语', '物理'],
)

print(df1)
for _frame in (df2, df3):
    print(_RULE)
    print(_frame)
- DataFrame和标量之间的运算
import numpy as np
import pandas as pd
# DataFrame-with-scalar arithmetic: the scalar is applied to every cell.
_RULE = '#################################'

a = [[90, 80, 70], [100, 89, 78], [78, 99, 65]]
n = np.array(a)
df1 = pd.DataFrame(n, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])

_results = (
    ('加法:\n', df1 + 100),
    ('减法:\n', df1 - 10),
    ('乘法:\n', df1 * 2),
    ('除法:\n', df1 / 2),
    ('取余:\n', df1 % 10),
    ('次方:\n', df1 ** 2),
)
for _pos, (_label, _frame) in enumerate(_results):
    # A separator goes between results, not before the first one.
    if _pos:
        print(_RULE)
    print(_label, _frame)
- DataFrame之间的运算
在运算中自动对齐不同索引的数据
import numpy as np
import pandas as pd
# Adding two DataFrames with identical labels: cells are matched by
# (row label, column label) and summed.
a = [[90, 80, 70], [100, 89, 78], [78, 99, 65]]
b = [[45, 88, 92], [74, 81, 96], [67, 80, 77]]
n = np.array(a)
m = np.array(b)

df1 = pd.DataFrame(n, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])
df2 = pd.DataFrame(m, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])

_total = df1 + df2
print(_total)
如果索引不对应,用NaN补齐;没有广播机制
import numpy as np
import pandas as pd
# Adding DataFrames whose labels differ: the result covers the union of
# labels, and any cell missing from either operand becomes NaN.
a = [[90, 80, 70], [100, 89, 78], [78, 99, 65]]
c = [[45, 88, 92, 61], [74, 81, 96, 88], [67, 80, 77, 78], [56, 84, 69, 93]]
n = np.array(a)
x = np.array(c)

df1 = pd.DataFrame(n, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])
df3 = pd.DataFrame(
    x,
    index=['学生1', '学生2', '学生3', '学生4'],
    columns=['语文', '数学', '英语', '物理'],
)

_total = df1 + df3
print(_total)
使用.add()方法的fill_value参数:先用fill_value填充某一方缺失的数据,再相加(若两个DataFrame在同一位置都缺失,结果仍为NaN)
import numpy as np
import pandas as pd
# .add() with fill_value: a cell missing from one operand is replaced by
# fill_value before the addition, so the result has no NaN here.
a = [[90, 80, 70], [100, 89, 78], [78, 99, 65]]
c = [[45, 88, 92, 61], [74, 81, 96, 88], [67, 80, 77, 78], [56, 84, 69, 93]]
n = np.array(a)
x = np.array(c)

df1 = pd.DataFrame(n, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])
df3 = pd.DataFrame(
    x,
    index=['学生1', '学生2', '学生3', '学生4'],
    columns=['语文', '数学', '英语', '物理'],
)

_filled_total = df1.add(df3, fill_value=0)
print(_filled_total)
- Series与DataFrame之间的运算
列运算(Series的索引与DataFrame的列索引对齐,每一行都加上该Series)
import numpy as np
import pandas as pd
# Series + DataFrame, matching the Series index against the column labels:
# the "+" operator and .add(axis='columns') / .add(axis=1) are equivalent.
_RULE = '#############################'

a = [[90, 80, 70], [100, 89, 78], [78, 99, 65]]
n = np.array(a)
df1 = pd.DataFrame(n, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])

s = pd.Series([10, 20, 30], index=df1.columns)
print('s:\n', s)

for _summed in (
    df1 + s,
    df1.add(s, axis='columns'),
    df1.add(s, axis=1),
):
    print(_RULE)
    print(_summed)
行运算(Series的索引与DataFrame的行索引对齐,每一列都加上该Series)
import numpy as np
import pandas as pd
# Series + DataFrame, matching the Series index against the row labels:
# .add(axis='index') and .add(axis=0) are equivalent.
_RULE = '#############################'

a = [[90, 80, 70], [100, 89, 78], [78, 99, 65]]
n = np.array(a)
df1 = pd.DataFrame(n, index=['学生1', '学生2', '学生3'], columns=['语文', '数学', '英语'])

s = pd.Series([10, 20, 30], index=df1.index)
print('s:\n', s)

for _summed in (df1.add(s, axis='index'), df1.add(s, axis=0)):
    print(_RULE)
    print(_summed)
知识点为听课总结笔记,课程为B站“千锋教育Pandas数据分析从入门到实战,零基础小白保姆级Python数据分析教程”:001_Pandas_Pandas介绍_哔哩哔哩_bilibili