pandas
1.数据结构(series,numpy数组,列表)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# The same five values held in three containers.
# Plain Python list
lst01 = [1, 2, 3, 4, 5]
# NumPy array built from the list
arr01 = np.array(lst01)
# pandas Series built from the array
ser01 = pd.Series(arr01)

# NumPy array operations
arr01.mean()
arr01.max()
arr01.min()
arr01.var()
arr01.std()
arr01.cumsum()  # cumulative sum

# Series operations
ser02 = pd.Series(
    [10, 20, 30, 40],
    index=["one", "two", "three", "four"],
    name="age",
)
ser02
ser02.values
# Assigning to .index relabels the existing values in place.
ser02.index = ['i', 'ii', 'iii', 'iiii']
ser02.index
ser02.count()
ser02.median()
ser02.mode()  # most frequent value(s)
ser02.isnull()  # element-wise check: is the value missing?
ser02.notnull()  # element-wise check: is the value present?
ser02.value_counts()  # number of occurrences of each value
ser02.value_counts(normalize=True)  # relative frequencies instead of counts
2.DataFrame
# Build a DataFrame from a 2-D NumPy array.
arr2d = np.array([
    [1, 1, 2],
    [2, 1, 5],
])
df1 = pd.DataFrame(arr2d)

# Column labels are passed with `columns=`; row labels are passed with `index=`.
df2 = pd.DataFrame(arr2d, columns=["努力值", "运气值", "收获值"])

# Rename the columns by assigning a new list of labels.
df2.columns = ["effort", "luck", "harvest"]
读入数据
# Load the whole CSV file into a DataFrame.
weather = pd.read_csv("./data/harbin_2019_aqi.csv")
weather.shape  # (number of rows, number of columns)
weather[:5]  # first five rows
weather[-5:]  # last five rows
# Replace the column labels; the list must contain one name per column.
cols = [
    "date",
    "high",
    "low",
    "weather",
    "wind_direction",
    "wind_power",
    "aqi",
    "quality",
    "aqi_level",
]
weather.columns = cols
weather[:5]
weather[:10]
# Single brackets select the aqi_level column as a Series.
weather['aqi_level']
# Double brackets select the same column as a one-column DataFrame.
weather[['aqi_level']]
# Count how many times each aqi_level value occurs.
weather[['aqi_level']].value_counts()
# Relative frequencies show whether the classes are balanced.
weather[['quality']].value_counts(normalize=True)
# DataFrame has no unique() method, but Series does.
weather['quality'].unique()  # the distinct values
weather['quality'].nunique()  # how many distinct values there are
空值处理
空值有两种来源:一种是定义数据时显式写入 np.nan(NaN,即 not a number,表示空值);另一种是导入的数据表本身含有空值。
# A DataFrame with one missing value; np.nan (NaN, "not a number") marks it.
arr2d = np.array([
    [1, np.nan, 2],
    [2, 1, 5],
])
df02 = pd.DataFrame(arr2d)
df02
pandas数据操作
1. 数据访问
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the data.
# Alternative that skips the header row and reads without column names:
# weather = pd.read_csv("./data/harbin_2019_aqi.csv", nrows=10, skiprows=1, header=None)
get_cols = ['ymd', 'bWendu', 'yWendu', 'tianqi', 'fengxiang', 'fengli', 'aqi']
# Only the first 10 data rows and the listed columns are loaded.
weather = pd.read_csv("./data/harbin_2019_aqi.csv", nrows=10, usecols=get_cols)

# Rename the columns.
cols = ["date", "high", "low", "weather", "wind_direction", "wind_power", "aqi"]
weather.columns = cols

# Inspect the data.
weather
# Row access by slicing
weather[:5]  # first five rows
weather[5:10]  # sixth through tenth rows
weather[6:]  # seventh row through the last row

# Column access
weather['aqi']  # the aqi column as a Series
c = ['aqi']
weather[c]  # the aqi column as a DataFrame

# loc[] selects by row label and column label ("location").
# Label slices include both endpoints.
weather.loc[0:2, ['date', 'aqi']]  # rows labelled 0-2, columns date and aqi

# iloc[] selects by integer position ("index location").
# Position slices exclude the end.
weather.iloc[0:2, [1, 3]]  # rows 0-1, columns 1 and 3
weather.iloc[0:2, [1, -1]]  # rows 0-1, column 1 and the last column
weather.iloc[0:2, 1:4]  # rows 0-1, columns 1-3
2.删除数据和增加数据
# With header=None the columns get integer labels, like the rows.
wh = pd.read_csv("./data/harbin_2019_aqi.csv", nrows=10, skiprows=1, header=None)

# Deleting data: drop() returns a new DataFrame and leaves `wh` unchanged.
wh.drop([8], axis=0)  # drop the row labelled 8
wh.drop([8], axis=1)  # drop the column labelled 8
# Typical use: separate one column from the remaining columns.
y = weather[['aqi']]
x = weather.drop(columns=['aqi'])

# Use inplace=True with care: it removes the column from `weather` itself.
weather.drop(columns=['aqi'], inplace=True)
weather
3.合并数据
# Read the 51job table of Python postings, keeping only the listed columns.
file_path = "./data/jobs/"
get_cols = ['address', 'com_name', 'max_salary', 'min_salary', 'name', 'pub_date']
python_jobs = pd.read_csv(file_path + "job51_python_2019_10_15.csv", usecols=get_cols)
python_jobs.shape
# Read the 51job table of Java postings, keeping only the listed columns.
java_jobs = pd.read_csv(file_path + "job51_java_2019_10_15.csv", usecols=get_cols)
java_jobs.shape
# Read the 51job table of web postings, keeping only the listed columns.
web_jobs = pd.read_csv(file_path + "job51_web_2019_10_15.csv", usecols=get_cols)
web_jobs.shape
# Stack the three tables; ignore_index=True renumbers the rows from 0.
all_jobs = pd.concat([python_jobs, java_jobs, web_jobs], ignore_index=True)
all_jobs.shape  # the row count should equal the sum of the three tables
all_jobs[-5:]
# Combined version: load every CSV file in the jobs directory into one table.
import os

file_path = "./data/jobs/"
get_cols = ['address', 'com_name', 'max_salary', 'min_salary', 'name', 'pub_date']

# A missing directory is handled like an empty one instead of raising.
# Sorting makes the row order reproducible, since os.listdir returns
# entries in arbitrary order.
files = sorted(os.listdir(file_path)) if os.path.isdir(file_path) else []

# Stays None when the directory has no CSV files.
jobs_total = None
if files:
    # Read each table once and concatenate a single time; calling pd.concat
    # inside the loop would re-copy the accumulated rows on every pass.
    frames = [
        pd.read_csv(file_path + file, usecols=get_cols)
        for file in files
        if file.endswith('.csv')
    ]
    if frames:
        jobs_total = pd.concat(frames, ignore_index=True)
else:
    print("目标目录为空")